/usr/share/pyshared/cogent/parse/cutg.py is in python-cogent 1.5.3-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 | #/usr/bin/env python
"""Parsers for CUTG codon and spsum files.
Notes
The CUTG format specifiers are highly inaccurate. For the species sum records,
colons can appear in the organism name (e.g. for some bacterial strains), so
it's important to split on the _last_ colon.
For the codon records, the 'nt..nt' field now stores the GenBank location line
in all its complexity. The length field precedes the PID field. There is not
really a separator between the title and the descriptions.
Some of the database _names_ contain '/', the field delimiter. For now, these
are just skipped, since writing something sufficiently general to detect
whether they are quoted is considerably more effort than the reward warrants.
"""
from cogent.parse.record_finder import LabeledRecordFinder
from cogent.parse.record import RecordError, DelimitedSplitter
from cogent.core.info import Info, DbRef
from cogent.util.misc import caps_from_underscores as cfu
from cogent.core.usage import CodonUsage
from string import strip
__author__ = "Rob Knight"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Rob Knight"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Rob Knight"
__email__ = "rob@spot.colorado.edu"
__status__ = "Development"
def is_cutg_label(x):
"""Checks if x looks like a CUTG label line."""
return x.startswith('>')
def is_cutg_species_label(x):
"""Checks if x looks like a CUTG label line."""
return ':' in x
def is_blank(x):
"""Checks if x is blank."""
return (not x) or x.isspace()
CutgSpeciesFinder = LabeledRecordFinder(is_cutg_species_label, ignore=is_blank)
CutgFinder = LabeledRecordFinder(is_cutg_label, ignore=is_blank)
codon_order = "CGA CGC CGG CGU AGA AGG CUA CUC CUG CUU UUA UUG UCA UCC UCG UCU AGC AGU ACA ACC ACG ACU CCA CCC CCG CCU GCA GCC GCG GCU GGA GGC GGG GGU GUA GUC GUG GUU AAA AAG AAC AAU CAA CAG CAC CAU GAA GAG GAC GAU UAC UAU UGC UGU UUC UUU AUA AUC AUU AUG UGG UAA UAG UGA".split()
#NOTE: following field order omits Locus/CDS (first field), which needs further
#processing. Use zip(field_order, fields[1:]) and handle first field specially.
field_order = "GenBank Location Length GenPept Species Description".split()
species_label_splitter = DelimitedSplitter(':', -1)
def CutgSpeciesParser(infile, strict=True, constructor=CodonUsage):
"""Yields successive sequences from infile as CodonUsage objects.
If strict is True (default), raises RecordError when label or seq missing.
"""
if not strict: #easier to see logic without detailed error handling
for rec in CutgSpeciesFinder(infile):
try:
label, counts = rec
if not is_cutg_species_label(label):
continue
species, genes = species_label_splitter(label)
info = Info({'Species':species, 'NumGenes':int(genes)})
freqs = constructor(zip(codon_order, map(int, counts.split())),
Info=info)
yield freqs
except:
continue
else:
for rec in CutgSpeciesFinder(infile):
try:
label, counts = rec
except ValueError: #can't have got any counts
raise RecordError, "Found label without sequences: %s" % rec
if not is_cutg_species_label(label):
raise RecordError, "Found CUTG record without label: %s" % rec
species, genes = species_label_splitter(label)
info = Info({'Species':species, 'NumGenes':int(genes)})
try:
d = zip(codon_order, map(int, counts.split()))
freqs = constructor(d, Info=info)
except:
raise RecordError, "Unable to convert counts: %s" % counts
yield freqs
def InfoFromLabel(line):
"""Takes a CUTG codon description line and returns an Info object.
Raises RecordError if wrong number of fields etc.
"""
try:
raw_fields = line.split('\\')
result = Info(dict(zip(field_order, map(strip, raw_fields[1:]))))
#extra processing for first field
first = raw_fields[0]
if '#' in first:
locus, cds_num = map(strip, raw_fields[0].split('#'))
else:
locus, cds_num = first, '1'
result['Locus'] = locus[1:] #remove leading '>'
result['CdsNumber'] = cds_num
#additional processing for last field: mostly key="value" pairs
description = result['Description']
descrs = description.split('/')
for d in descrs:
if '=' in d: #assume key-value pair
key, val = map(strip, d.split('=', 1)) #might be '=' in value
#cut off leading and trailing " if present, but _not_ internal!
if val.startswith('"'):
val = val[1:]
if val.endswith('"'):
val = val[:-1]
if key == 'db_xref': #handle cross-refs specially
try:
key, val = val.split(':')
except ValueError: #missing actual reference?
continue #just skip the bad db records
try:
if result[key]:
result[key].append(val)
else:
result[key] = [val]
except (KeyError, TypeError): #didn't recognize database
result[key] = val
else:
#remember to convert the key to MixedCase naming convention
result[cfu(key)] = val
return result
except:
raise RecordError, "Failed to read label line:\n%s" % line
def CutgParser(infile, strict=True, constructor=CodonUsage):
"""Yields successive sequences from infile as CodonUsage objects.
If strict is True (default), raises RecordError when label or seq missing.
"""
if not strict: #not much error checking needed: following makes logic clear
for rec in CutgFinder(infile):
try:
label, counts = rec
if not is_cutg_label(label):
continue
info = InfoFromLabel(label)
freqs = constructor(zip(codon_order, map(int, counts.split())),
Info=info)
yield freqs
except:
continue
else: #need to do more detailed error checking
count = 0
for rec in CutgFinder(infile):
try:
label, counts = rec
except ValueError: #can't have got any counts
raise RecordError, "Found label without sequences: %s" % rec
if not is_cutg_label(label):
raise RecordError, "Found CUTG record without label: %s" % rec
info = InfoFromLabel(label)
try:
freqs = constructor(zip(codon_order, map(int, counts.split())),
Info=info)
except NotImplementedError:
raise RecordError, "Unable to convert counts: %s" % counts
yield freqs
|