/usr/share/pyshared/cogent/parse/unigene.py is in python-cogent 1.5.1-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
#!/usr/bin/env python
"""Parsers for the various files in the UniGene database.
"""
from cogent.parse.record import MappedRecord, ByPairs, semi_splitter, \
equal_pairs, LineOrientedConstructor, list_adder, int_setter
from cogent.parse.record_finder import GbFinder
from string import maketrans, strip
# Module metadata: authorship, licensing, release version and maintainer
# contact details for this parser module.
__author__ = "Rob Knight"
__copyright__ = "Copyright 2007-2011, The Cogent Project"
__credits__ = ["Rob Knight"]
__license__ = "GPL"
__version__ = "1.5.1"
__maintainer__ = "Rob Knight"
__email__ = "rob@spot.colorado.edu"
__status__ = "Development"
def _read_sts(line):
    """Builds a MappedRecord from an STS line whose label was removed.

    Infuriatingly, STS lines are not semicolon-delimited and contain stray
    spaces, so they cannot be parsed like the other multi-field lines.
    Instead every '=' is turned into a space, the line is split on
    whitespace, and the resulting tokens are handed to ByPairs before being
    wrapped in a MappedRecord.

    This matched the data as of 10/9/03; expect the format to be unstable.
    """
    # Normalise the key/value separator so whitespace is the only delimiter.
    tokens = line.replace('=', ' ').split()
    # NOTE(review): ByPairs presumably groups the tokens two at a time into
    # (key, value) pairs -- confirm against cogent.parse.record.
    pairs = list(ByPairs(tokens))
    return MappedRecord(pairs)
def _read_expression(line):
    """Splits a semicolon-delimited expression line into its expressions.

    The splitting is delegated entirely to semi_splitter; its result is
    returned unchanged.
    """
    expressions = semi_splitter(line)
    return expressions
class UniGeneSeqRecord(MappedRecord):
    """Record holding the sub-fields of one UniGene SEQUENCE line.

    Aliases pairs each upper-case sub-field label with a descriptive name.
    NOTE(review): how MappedRecord applies Aliases is defined in
    cogent.parse.record, not in this module.
    """
    Aliases = {
        'ACC': 'Accession',
        'CLONE': 'CloneId',
        'END': 'End',
        'EST': 'EstId',
        'LID': 'LibraryId',
        'NID': 'NucleotideId',
        'PID': 'ProteinId',
        'SEQTYPE': 'SequenceType',
        'TRACE': 'Trace',
    }
class UniGeneProtSimRecord(MappedRecord):
    """Record holding the sub-fields of one UniGene PROTSIM line.

    Aliases pairs each sub-field label with a descriptive name.
    NOTE(review): how MappedRecord applies Aliases is defined in
    cogent.parse.record, not in this module.
    """
    Aliases = {
        'ORG': 'Species',
        'PROTGI': 'ProteinGi',
        # Upper-case label, matching the style of every other key in this
        # module; UniGene PROTSIM lines spell the sub-field 'PROTID'.
        'PROTID': 'ProteinId',
        # Original mixed-case spelling, retained so that any caller relying
        # on it keeps working.
        'ProtId': 'ProteinId',
        'PCT': 'PercentSimilarity',
        'ALN': 'AlignmentScore',
    }
def _read_seq(line):
    """Builds a UniGeneSeqRecord from a sequence line.

    The line is first split into fields with semi_splitter (';' is the
    first-level delimiter), then each field is passed through equal_pairs
    ('=' is the second-level delimiter).

    BEWARE: '=' can also appear inside the _value_ of a field, so the fields
    cannot simply be split on every '='.
    """
    pairs = [equal_pairs(field) for field in semi_splitter(line)]
    return UniGeneSeqRecord(pairs)
def _read_protsim(line):
    """Builds a UniGeneProtSimRecord from a protsim line.

    The line is first split into fields with semi_splitter (';' is the
    first-level delimiter), then each field is passed through equal_pairs
    ('=' is the second-level delimiter).

    BEWARE: '=' can also appear inside the _value_ of a field, so the fields
    cannot simply be split on every '='.
    """
    pairs = [equal_pairs(field) for field in semi_splitter(line)]
    return UniGeneProtSimRecord(pairs)
class UniGene(MappedRecord):
    """Holds data for a UniGene record.

    Required lists the multi-valued fields together with their empty-list
    defaults; Aliases pairs each upper-case UniGene line label with a
    descriptive name.
    NOTE(review): how MappedRecord applies Required and Aliases is defined
    in cogent.parse.record, not in this module.
    """
    Required = {'STS': [], 'PROTSIM': [], 'SEQUENCE': [], 'EXPRESS': []}
    Aliases = {
        'ID': 'UniGeneId',
        'TITLE': 'UniGeneTitle',
        'LOCUSLINK': 'LocusLinkId',
        'CHROMOSOME': 'Chromosome',
        # Correct spelling of the UniGene label; without this entry the
        # CYTOBAND line is never associated with 'CytoBand'.
        'CYTOBAND': 'CytoBand',
        # Historical misspelling, retained so that any caller relying on it
        # keeps working.
        'CTYOBAND': 'CytoBand',
        'EXPRESS': 'ExpressedIn',
        'PROTSIM': 'ProteinSimilarities',
        'SCOUNT': 'SequenceCount',
        'SEQUENCE': 'SequenceIds',
        'STS': 'Sts',
    }
def _expressions_setter(obj, field, val):
    """Stores the expressions parsed from val as attribute `field` of obj.

    val is a semicolon-delimited expression line; it is split by
    _read_expression and the result replaces any previous value of the
    attribute.
    """
    expressions = _read_expression(val)
    setattr(obj, field, expressions)
def _sts_adder(obj, field, val):
    """Parses val as an STS line and adds the record to `field` of obj.

    Parsing is done by _read_sts; the addition is delegated to list_adder.
    """
    sts_record = _read_sts(val)
    list_adder(obj, field, sts_record)
def _seq_adder(obj, field, val):
    """Parses val as a sequence line and adds the record to `field` of obj.

    Parsing is done by _read_seq; the addition is delegated to list_adder.
    """
    seq_record = _read_seq(val)
    list_adder(obj, field, seq_record)
def _protsim_adder(obj, field, val):
    """Parses val as a protsim line and adds the record to `field` of obj.

    Parsing is done by _read_protsim; the addition is delegated to
    list_adder.
    """
    protsim_record = _read_protsim(val)
    list_adder(obj, field, protsim_record)
# Constructor object that turns the lines of one record into a UniGene
# instance. FieldMap pairs a line label with the handler for that label;
# every handler here takes (obj, field, val).
# NOTE(review): how labels missing from FieldMap are treated is decided by
# LineOrientedConstructor in cogent.parse.record -- not visible here.
LinesToUniGene = LineOrientedConstructor()
LinesToUniGene.Constructor = UniGene
LinesToUniGene.FieldMap = dict(
    # Single-valued integer fields.
    LOCUSLINK=int_setter,
    SCOUNT=int_setter,
    # Semicolon-delimited list of expressions.
    EXPRESS=_expressions_setter,
    # Repeated lines, each parsed into its own sub-record and accumulated.
    PROTSIM=_protsim_adder,
    SEQUENCE=_seq_adder,
    STS=_sts_adder,
)
def UniGeneParser(lines):
    """Generator yielding one UniGene record per record found in lines.

    GbFinder groups the incoming lines into records, and each group is
    converted by LinesToUniGene. The '//' delimiter entry is deleted from
    every record before it is yielded.
    """
    for chunk in GbFinder(lines):
        unigene = LinesToUniGene(chunk)
        # The delimiter ends up as a key of the parsed record; remove it so
        # only real fields remain.
        del unigene['//']
        yield unigene
if __name__ == '__main__':
    # Command-line smoke test: parse the UniGene file named by the first
    # argument, writing one dot per record and then the record total.
    from sys import argv, stdout
    if len(argv) < 2:
        # Exit with a usage message rather than an IndexError traceback.
        raise SystemExit("usage: %s UNIGENE_FILE" % argv[0])
    filename = argv[1]
    count = 0
    handle = open(filename)
    # try/finally guarantees the file is closed even if parsing fails; it is
    # used instead of a `with` block so no newer syntax is required than the
    # rest of this file already needs.
    try:
        for _ in UniGeneParser(handle):
            stdout.write('.')
            stdout.flush()
            count += 1
    finally:
        handle.close()
    # Emits the same text as the former print statement, without depending
    # on statement-form print.
    stdout.write("read %s records\n" % count)
|