This file is indexed.

/usr/bin/maf-join is in last-align 921-1.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#! /usr/bin/env python

# Copyright 2009, 2010, 2011 Martin C. Frith

# Join two or more sets of MAF-format multiple alignments into bigger
# multiple alignments.  The 'join field' is the top genome, which
# should be the same for each input.  Each input should be sorted by
# position in the top genome.

# WARNING: Alignment columns with a gap in the top genome are joined
# arbitrarily!!!

import sys, os, fileinput, optparse, signal

signal.signal(signal.SIGPIPE, signal.SIG_DFL)  # stop spurious error message

class peekable:  # Adapted from Python Cookbook 2nd edition
    """An iterator that supports a peek operation."""
    def __init__(self, iterable):
        self.it = iter(iterable)
        self.cache = []
    def __iter__(self):
        return self
    def next(self):
        if self.cache: return self.cache.pop()
        else: return self.it.next()
    def peek(self):
        if not self.cache: self.cache.append(self.it.next())
        return self.cache[0]

def maxLen(things): return max(map(len, things))

class MafBlock:
    def __init__(self, chr, beg, end, strand, chrSize, seq, prob):
        self.chr = chr  # chromosome names
        self.beg = beg  # alignment begin coordinates
        self.end = end  # alignment end coordinates
        self.strand = strand
        self.chrSize = chrSize  # chromosome sizes
        self.seq = seq  # aligned sequences, including gaps
        self.prob = prob  # probabilities (may be empty)

    def __nonzero__(self):
        return len(self.seq) > 0

    def __cmp__(self, other):
        return cmp(self.chr[:1] + self.beg[:1], other.chr[:1] + other.beg[:1])

    def before(self, other):
        return (self.chr[0], self.end[0]) <= (other.chr[0], other.beg[0])

    def after(self, other):
        return (self.chr[0], self.beg[0]) >= (other.chr[0], other.end[0])

    def addLine(self, line):
        words = line.split()
        if line.startswith('s'):
            self.chr.append(words[1])
            self.beg.append(int(words[2]))
            self.end.append(int(words[2]) + int(words[3]))
            self.strand.append(words[4])
            self.chrSize.append(words[5])
            self.seq.append(list(words[6]))
        elif line.startswith('p'):
            self.prob.append(words[1])

    def write(self):
        beg = map(str, self.beg)
        size = [str(e-b) for b, e in zip(self.beg, self.end)]
        seq = [''.join(i) for i in self.seq]
        columns = self.chr, beg, size, self.strand, self.chrSize, seq
        widths = map(maxLen, columns)
        print 'a'
        for row in zip(*columns):
            widthsAndFields = zip(widths, row)
            field0 = "%-*s" % widthsAndFields[0]  # left-justify
            fields = ["%*s" % i for i in widthsAndFields[1:]]  # right-justify
            print 's', field0, ' '.join(fields)
        pad = ' '.join(' ' * i for i in widths[:-1])
        for i in self.prob:
            print 'p', pad, i
        print  # blank line afterwards

def topSeqBeg(maf): return maf.beg[0]
def topSeqEnd(maf): return maf.end[0]
def emptyMaf(): return MafBlock([], [], [], [], [], [], [])

def joinOnFirstItem(x, y):
    if x[0] != y[0]:
        raise ValueError('join fields not equal:\n'+str(x[0])+'\n'+str(y[0]))
    return x + y[1:]

def mafEasyJoin(x, y):
    '''Join two MAF blocks on the top sequence.'''
    xJoin = zip(x.chr, x.beg, x.end, x.strand, x.chrSize, x.seq)
    yJoin = zip(y.chr, y.beg, y.end, y.strand, y.chrSize, y.seq)
    joined = joinOnFirstItem(xJoin, yJoin)
    chr, beg, end, strand, chrSize, seq = zip(*joined)
    prob = x.prob + y.prob
    return MafBlock(chr, beg, end, strand, chrSize, seq, prob)

def countNonGaps(s): return len(s) - s.count('-')

def nthNonGap(s, n):
    '''Get the start position of the n-th non-gap.'''
    for i, x in enumerate(s):
        if x != '-':
            if n == 0: return i
            n -= 1
    raise ValueError('non-gap not found')

def nthLastNonGap(s, n):
    '''Get the end position of the n-th last non-gap.'''
    return len(s) - nthNonGap(s[::-1], n)

def mafSlice(maf, alnBeg, alnEnd):
    '''Return a slice of a MAF block, using coordinates in the alignment.'''
    beg = [b + countNonGaps(s[:alnBeg]) for b, s in zip(maf.beg, maf.seq)]
    end = [e - countNonGaps(s[alnEnd:]) for e, s in zip(maf.end, maf.seq)]
    seq = [i[alnBeg:alnEnd] for i in maf.seq]
    prob = [i[alnBeg:alnEnd] for i in maf.prob]
    return MafBlock(maf.chr, beg, end, maf.strand, maf.chrSize, seq, prob)

def mafSliceTopSeq(maf, newTopSeqBeg, newTopSeqEnd):
    '''Return a slice of a MAF block, using coordinates in the top sequence.'''
    lettersFromBeg = newTopSeqBeg - topSeqBeg(maf)
    lettersFromEnd = topSeqEnd(maf) - newTopSeqEnd
    alnBeg = nthNonGap(maf.seq[0], lettersFromBeg)
    alnEnd = nthLastNonGap(maf.seq[0], lettersFromEnd)
    return mafSlice(maf, alnBeg, alnEnd)

def jumpGaps(sequence, index):
    '''Return the next index of the sequence where there is a non-gap.'''
    nextIndex = index
    while sequence[nextIndex] == '-': nextIndex += 1
    return nextIndex

def gapsToAdd(sequences):
    '''Return new gaps and their positions, needed to align the non-gaps.'''
    gapInfo = [[] for i in sequences]
    gapBeg = [0 for i in sequences]
    try:
        while True:
            gapEnd = [jumpGaps(s, p) for s, p in zip(sequences, gapBeg)]
            gapSize = [e-b for b, e in zip(gapBeg, gapEnd)]
            maxGapSize = max(gapSize)
            for s, e, i in zip(gapSize, gapEnd, gapInfo):
                if s < maxGapSize:
                    newGap = maxGapSize - s
                    i.append((newGap, e))
            gapBeg = [e+1 for e in gapEnd]
    except IndexError: return gapInfo

def chunksAndGaps(s, gapsAndPositions, oneGap):
    '''Yield chunks of "s" interspersed with gaps at given positions.'''
    oldPosition = 0
    for gapLen, position in gapsAndPositions:
        yield s[oldPosition:position]
        yield oneGap * gapLen
        oldPosition = position
    yield s[oldPosition:]

def mafAddGaps(maf, gapsAndPositions):
    '''Add the given gaps at the given positions to a MAF block.'''
    maf.seq = [sum(chunksAndGaps(i, gapsAndPositions, ['-']), [])
               for i in maf.seq]
    maf.prob = [''.join(chunksAndGaps(i, gapsAndPositions, '~'))
                for i in maf.prob]

def mafJoin(mafs):
    '''Intersect and join overlapping MAF blocks.'''
    newTopSeqBeg = max(map(topSeqBeg, mafs))
    newTopSeqEnd = min(map(topSeqEnd, mafs))
    mafs = [mafSliceTopSeq(i, newTopSeqBeg, newTopSeqEnd) for i in mafs]
    topSeqs = [i.seq[0] for i in mafs]
    gapInfo = gapsToAdd(topSeqs)
    for maf, gapsAndPositions in zip(mafs, gapInfo):
        mafAddGaps(maf, gapsAndPositions)
    return reduce(mafEasyJoin, mafs)

def mafInput(lines):
    '''Read lines and yield MAF blocks.'''
    maf = emptyMaf()
    for line in lines:
        if line.isspace():
            if maf: yield maf
            maf = emptyMaf()
        else:
            maf.addLine(line)
    if maf: yield maf

def sortedMafInput(lines):
    '''Read lines and yield MAF blocks, checking that they are in order.'''
    old = emptyMaf()
    for maf in mafInput(lines):
        if maf < old: sys.exit(progName + ": MAF blocks not sorted properly")
        yield maf
        old = maf

def allOverlaps(sequences, beg, end):
    '''Yield all combinations of MAF blocks that overlap in the top genome.'''
    assert beg < end
    if not sequences:
        yield ()
        return
    for i in sequences[0]:
        if topSeqEnd(i) <= beg: continue
        if topSeqBeg(i) >= end: break  # assumes they're sorted by position
        newBeg = max(beg, topSeqBeg(i))
        newEnd = min(end, topSeqEnd(i))
        for j in allOverlaps(sequences[1:], newBeg, newEnd):
            yield (i,) + j

def nextWindow(window, input, referenceMaf):
    '''Yield "relevant" MAF blocks, based on overlap with referenceMaf.'''
    for maf in window:
        if not maf.before(referenceMaf): yield maf
    try:
        while True:
            maf = input.peek()
            if maf.after(referenceMaf): break
            maf = input.next()
            if not maf.before(referenceMaf): yield maf
    except StopIteration: pass

def overlappingMafs(sortedMafInputs):
    '''Yield all combinations of MAF blocks that overlap in the top genome.'''
    if not sortedMafInputs: return
    head, tail = sortedMafInputs[0], sortedMafInputs[1:]
    windows = [[] for t in tail]
    for h in head:  # iterate over MAF blocks in the first input
        windows = [list(nextWindow(w, t, h)) for w, t in zip(windows, tail)]
        for i in allOverlaps(windows, topSeqBeg(h), topSeqEnd(h)):
            yield (h,) + i

op = optparse.OptionParser(usage="%prog sorted-file1.maf sorted-file2.maf ...")
(opts, args) = op.parse_args()

progName = os.path.basename(sys.argv[0])

inputs = [peekable(sortedMafInput(fileinput.input(i))) for i in args]

for mafs in overlappingMafs(inputs):
    mafJoin(mafs).write()