This file is indexed.

/usr/lib/python2.7/dist-packages/biotools/analysis/predict.py is in python-biotools 1.2.12-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
#!/usr/bin/env python
import biotools.IO as io
import biotools.BLAST as BLAST
import biotools.analysis.options as options
from biotools.sequence import Sequence, annotation as ann
from biotools.align import OptimalCTether as align
from biotools.translate import translate
from biotools.complement import complement
try:
    import Queue as queue
except ImportError:
    import queue
import threading
from os import sep, mkdir

PIPING = True


def ORFGenerator(sequ):
    '''
    Scans both strands of the given sequence and yields the longest subsequence
    that starts with a start codon and contains no stop codon other than the
    final codon.
    '''
    comp = complement(sequ[::-1])
    seq = sequ.seq
    cseq = comp.seq
    slen = len(sequ)
    starts = [-1, 0, 1, -1, 0, 1]  # locations of start codons in each frame
    stops = [0, 1, 2,  0, 1, 2]  # locations of stop  codons in each frame
    mlen = options.MIN_ORFLEN

    for i in xrange(slen - 2):
        fcodon, rcodon = seq[i:i + 3], cseq[i:i + 3]
        if fcodon in options.STOP_CODONS:
            if i - mlen >= starts[i % 3 + 3] >= stops[i % 3 + 3]:
                yield sequ[starts[i % 3 + 3]:i + 3]
            stops[i % 3 + 3] = i + 3
        elif fcodon in options.START_CODONS:
            if starts[i % 3 + 3] < stops[i % 3 + 3]:
                starts[i % 3 + 3] = i
        if rcodon in options.STOP_CODONS:
            if i - mlen >= starts[i % 3] >= stops[i % 3]:
                yield comp[starts[i % 3]:i + 3]
            stops[i % 3] = i + 3
        elif rcodon in options.START_CODONS:
            if starts[i % 3] < stops[i % 3]:
                starts[i % 3] = i
    raise StopIteration()


class ThreadQueue(queue.Queue):

    def __init__(self, target):
        queue.Queue.__init__(self)
        self.threadcount = 0
        self.target = target

    def put(self, item):
        options.lock.acquire()
        queue.Queue.put(self, item)
        if self.threadcount < options.NUM_THREADS - 1:
            thread = threading.Thread(target=self.target)
            thread.start()
            self.threadcount += 1
        options.lock.release()


def GeneFromBLAST(db, sequences, pref, names):
    '''
    BLASTs database against sequences, and for those results that pass the
    length and percent identity requirements, attempt to locate the full gene
    that corresponds to that BLAST hit. Genes that are found are saved in the
    subdirectory sequences under the given directory, divided depending on
    whether the sequnece is amino acid or nucleotide.
    '''
    PIPING = True
    wd = options.DIRECTORY + 'sequences' + sep

    for d in [options.DIRECTORY, wd]:
        try:
            mkdir(d)
        except OSError:
            pass

    subj = dict((s.name, s) for s in io.open(db, 'r'))
    options.debug("Database sequences loaded from file %s." % db)

    try:
        orfs = dict((s.name, [orf for orf in ORFGenerator(s)])
                    for s in io.open(sequences, 'r'))
        options.debug("ORFs loaded from file %s." % sequences)
    except IOError:
        options.debug("No file \"" + sequences + ",\" skipping.")
        return

    def target():
        while 1:
            try:
                res = qin.get(PIPING, 1)
            except queue.Empty:
                if not PIPING:
                    break
                else:
                    continue

            qname, sname = res['query']['name'], res['subject']['name']
            start, end = res['query']['start'], res['query']['end']
            alignments = []
            max_match = (options.MIN_IDENTITY, None)

            if subj[sname].type == 'nucl':
                subject = translate(subj[sname])
            else:
                subject = subj[sname]

            while qname:
                try:
                    o = orfs[qname]
                    break
                except KeyError:
                    qname = qname[:-1]
            if not qname:
                qin.task_done()
                continue

            for orf in o:
                if in_range(orf, start, end, res['frame']):
                    orf = orf[:-3]
                    query = translate(orf)
                    options.debug("Aligning %33s v. %33s." % (qname, sname))
                    alignment = align(subject.seq, query.seq)
                    alignments.append((orf, sname, alignment))

            for orf, refname, aln in alignments:
                hitlen = aln['sublength']
                region = orf[-3 * hitlen:]
                identity = float(aln['identities']) / aln['length']
                if identity >= max_match[0]:
                    max_match = (identity, (region, sname, aln))

            if max_match[1]:
                seq, name, _ = max_match[1]
                odl = subject.defline.split('[')[0].strip()
                src = seq.original.name
                start, end, strand = seq.start, seq.end, seq.step
                defline = '%s[source=%s] [start=%d] [end=%d] [strand=%d]' % \
                    (odl + (' ' if odl else ''), src, start, end, strand)

                new = Sequence(name.strip(), seq.seq, defline=defline,
                               original=seq.original, type=seq.type,
                               start=seq.start, end=seq.end, step=seq.step)
                qout.put(new)
            qin.task_done()

    def in_range(seq, start, end, frame):
        ss, se = sorted((seq.start, seq.end))
        os, oe = sorted((start, end))
        frame = int(frame)

        return (ss < oe and se > os and
                (se % 3 == oe % 3 or ss % 3 == oe % 3) and
                ((frame < 0 and seq.step < 0) or
                 (frame > 0 and seq.step > 0)))

    qout = queue.Queue()
    qin = ThreadQueue(target)

    blastopts = {
        'evalue': options.MAX_EVALUE,
        'num_threads': options.NUM_THREADS
    }

    for res in BLAST.run(db, sequences, **blastopts):
        if float(res['expect']) > options.MAX_EVALUE:
            continue

        sbjl = len(subj[res['subject']['name']])
        ident = float(res['identities'].split('(')[1][:-2]) / 100
        lerr = float(res['subject']['length']) / sbjl

        if ident >= options.MIN_IDENTITY:
            if lerr >= (1.0 - options.LENGTH_ERR):
                qin.put(res)
    PIPING = False
    options.debug("BLAST done.")

    target()
    qin.join()
    options.debug("Done Aligning sequences.")

    options.debug("Now writing sequences (%d)." % qout.qsize())
    seqs = {}
    nuc_file = io.open(wd + pref + '.fasta', 'w')
    count = 0
    while 1:
        try:
            seq = qout.get(False)
            if seq.seq not in seqs:
                seqs[seq.seq] = set()
            seqs[seq.seq].add(seq)
            nuc_file.write(seq)
            count += 1
            options.debug("Wrote %s (%d)." % (seq.name, count));
        except queue.Empty:
            break
    nuc_file.close()
    options.debug("Done Aligning sequences.")

    gh = io.open(wd + pref + '.gff3', 'w')
    names.append(wd + pref + '.fasta')

    for id in seqs:
        gh.write(ann(seqs[id].copy().pop(), pref, 'gene',
                     homologs=','.join(s.name for s in seqs[id])))
    gh.close()


def run(subject, query, prefix, names):
    GeneFromBLAST(subject, query, prefix, names)


if __name__ == '__main__':
    options.START_CODONS = ['TTG']
    import sys
    f = io.open(sys.argv[1], 'r')
    for seq in f:
        print(seq.name + ' ' + seq.defline)
        for orf in ORFGenerator(seq):
            print('%d ... %d' % (orf.start, orf.end))