This file is indexed.

/usr/share/pyshared/cogent/parse/fasta.py is in python-cogent 1.5.3-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
#/usr/bin/env python
"""Parsers for FASTA and related formats.
"""
from cogent.parse.record_finder import LabeledRecordFinder
from cogent.parse.record import RecordError
from cogent.core.info import Info, DbRef
from cogent.core.moltype import BYTES, ASCII

from string import strip
import cogent
import re

__author__ = "Rob Knight"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Rob Knight","Peter Maxwell", "Gavin Huttley"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Rob Knight"
__email__ = "rob@spot.colorado.edu"
__status__ = "Development"

Sequence = BYTES.Sequence

def is_fasta_label(x):
    """Checks if x looks like a FASTA label line."""
    return x.startswith('>')

def is_gde_label(x):
    """Checks if x looks like a GDE label line."""
    return x and x[0] in '%#'

def is_blank_or_comment(x):
    """Checks if x is blank or a FASTA comment line."""
    return (not x) or x.startswith('#') or x.isspace()

def is_blank(x):
    """Checks if x is blank."""
    return (not x) or x.isspace()

FastaFinder = LabeledRecordFinder(is_fasta_label, ignore=is_blank_or_comment)

def MinimalFastaParser(infile, strict=True, \
    label_to_name=str, finder=FastaFinder, \
    is_label=None, label_characters='>'):
    """Yields successive sequences from infile as (label, seq) tuples.

    If strict is True (default), raises RecordError when label or seq missing.
    """
    
    for rec in finder(infile):
        #first line must be a label line
        if not rec[0][0] in label_characters:
            if strict:
                raise RecordError, "Found Fasta record without label line: %s"%\
                    rec
            else:
                continue
        #record must have at least one sequence
        if len(rec) < 2:
            if strict:
                raise RecordError, "Found label line without sequences: %s" % \
                    rec
            else:
                continue
            
        label = rec[0][1:].strip()
        label = label_to_name(label)
        seq = ''.join(rec[1:])

        yield label, seq

GdeFinder = LabeledRecordFinder(is_gde_label, ignore=is_blank) 

def MinimalGdeParser(infile, strict=True, label_to_name=str):
    return MinimalFastaParser(infile, strict, label_to_name, finder=GdeFinder,\
        label_characters='%#')

def xmfa_label_to_name(line):
    (loc, strand, contig) = line.split()
    (sp, loc) = loc.split(':')
    (lo, hi) = [int(x) for x in loc.split('-')]
    if strand == '-':
        (lo, hi) = (hi, lo)
    else:
        assert strand == '+'
    name = '%s:%s:%s-%s' % (sp, contig, lo, hi)
    return name
   
def is_xmfa_blank_or_comment(x):
    """Checks if x is blank or an XMFA comment line."""
    return (not x) or x.startswith('=') or x.isspace()

XmfaFinder = LabeledRecordFinder(is_fasta_label, \
    ignore=is_xmfa_blank_or_comment)

def MinimalXmfaParser(infile, strict=True):
    # Fasta-like but with header info like ">1:10-1000 + chr1"
    return MinimalFastaParser(infile, strict, label_to_name=xmfa_label_to_name,
        finder=XmfaFinder)

def MinimalInfo(label):
    """Minimal info data maker: returns Name, and empty dict for info{}."""
    return label, {}

def NameLabelInfo(label):
    """Returns name as label split on whitespace, and Label in Info."""
    return label.split()[0], {'Label':label}

def FastaParser(infile,seq_maker=None,info_maker=MinimalInfo,strict=True):
    """Yields successive sequences from infile as (name, sequence) tuples.

    Constructs the sequence using seq_maker(seq, info=Info(info_maker(label))).

    If strict is True (default), raises RecordError when label or seq missing.
    Also raises RecordError if seq_maker fails.

    It is info_maker's responsibility to raise the appropriate RecordError or
    FieldError on failure.

    Result of info_maker need not actually be an info object, but can just be
    a dict or other data that Info can use in its constructor.
    """
    if seq_maker is None:
        seq_maker = Sequence
    for label, seq in MinimalFastaParser(infile, strict=strict):
        if strict:
            #need to do error checking when constructing info and sequence
            try:
                name, info = info_maker(label) #will raise exception if bad
                yield name, seq_maker(seq, Name=name, Info=info)
            except Exception, e:
                raise RecordError, \
                "Sequence construction failed on record with label %s" % label
        else:
            #not strict: just skip any record that raises an exception
            try:
                name, info = info_maker(label)
                yield(name, seq_maker(seq, Name=name, Info=info))
            except Exception, e:
                continue

#labeled fields in the NCBI FASTA records
NcbiLabels = {
'dbj':'DDBJ',
'emb':'EMBL',
'gb':'GenBank',
'ref':'RefSeq',
}

def NcbiFastaLabelParser(line):
    """Creates an Info object and populates it with the line contents.
    
    As of 11/12/03, all records in genpept.fsa and the human RefSeq fasta
    files were consistent with this format.
    """
    info = Info()
    try:
        ignore, gi, db, db_ref, description = map(strip, line.split('|', 4))
    except ValueError:  #probably got wrong value
        raise RecordError, "Unable to parse label line %s" % line
    info.GI = gi
    info[NcbiLabels[db]] = db_ref
    info.Description = description
    return gi, info

def NcbiFastaParser(infile, seq_maker=None, strict=True):
    return FastaParser(infile, seq_maker=seq_maker, 
        info_maker=NcbiFastaLabelParser, strict=strict)

class RichLabel(str):
    """Object for overloaded Fasta labels. Holds an Info object storing keyed
    attributes from the fasta label. The str is created from a provided format
    template that uses the keys from the Info object."""
    
    def __new__(cls, info, template="%s"):
        """Arguments:
        
            - info: a cogent.core.info.Info instance
            - template: a string template, using a subset of the keys in info.
              Defaults to just '%s'.
        
        Example:
            label = RichLabel(Info(name='rat', species='Rattus norvegicus'),
                        '%(name)s')"""
        label = template % info
        new = str.__new__(cls, label)
        new.Info = info
        return new
    

def LabelParser(display_template, field_formatters, split_with=":", DEBUG=False):
    """returns a function for creating a RichLabel's from a string
    
    Arguments;
        - display_template: string format template
        - field_formatters: series of 
                (field index, field name, coverter function)
        - split_with: characters separating fields in the label.
          The display_template must use at least one of the assigned field
          names."""
    indexed = False
    for index, field, converter in field_formatters:
        if field in display_template:
            indexed = True
    assert indexed, "display_template [%s] does not use a field name"\
                    % display_template
    sep = re.compile("[%s]" % split_with)
    def call(label):
        label = [label, label[1:]][label[0] == ">"]
        label = sep.split(label)
        if DEBUG:
            print label
        info = Info()
        for index, name, converter in field_formatters:
            if callable(converter):
                try:
                    info[name] = converter(label[index])
                except IndexError:
                    print label, index, name
                    raise
            else:
                info[name] = label[index]
        return RichLabel(info, display_template)
    return call

def GroupFastaParser(data, label_to_name, group_key="Group", aligned=False,
        moltype=ASCII, done_groups=None, DEBUG=False):
    """yields related sequences as a separate seq collection
    
    Arguments:
        - data: line iterable data source
        - label_to_name: LabelParser callback
        - group_key: name of group key in RichLabel.Info object
        - aligned: whether sequences are to be considered aligned
        - moltype: default is ASCII
        - done_groups: series of group keys to be excluded
        """
    
    done_groups = [[], done_groups][done_groups is not None]
    parser = MinimalFastaParser(data, label_to_name=label_to_name, finder=XmfaFinder)
    group_ids = []
    current_collection = {}
    for label, seq in parser:
        seq = moltype.makeSequence(seq, Name=label, Info=label.Info)
        if DEBUG:
            print "str(label) ",str(label), "repr(label)", repr(label)
        if not group_ids or label.Info[group_key] in group_ids:
            current_collection[label] = seq
            if not group_ids:
                group_ids.append(label.Info[group_key])
        else:
            # we finish off check of current before creating a collection
            if group_ids[-1] not in done_groups:
                info = Info(Group=group_ids[-1])
                if DEBUG:
                    print "GroupParser collection keys", current_collection.keys()
                seqs = cogent.LoadSeqs(data=current_collection, moltype=moltype,
                                aligned=aligned)
                seqs.Info = info
                yield seqs
            current_collection = {label: seq}
            group_ids.append(label.Info[group_key])
    info = Info(Group=group_ids[-1])
    seqs = cogent.LoadSeqs(data=current_collection, moltype=moltype,
                    aligned=aligned)
    seqs.Info = info
    yield seqs