/usr/share/pyshared/cogent/parse/sequence.py is in python-cogent 1.5.1-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 | #!/usr/bin/env python
"""Classes for reading multiple sequence alignment files in different formats."""
import xml.dom.minidom
from cogent.parse import fasta, phylip, paml, clustal, genbank
from cogent.parse import gbseq, tinyseq, macsim, gcg
from cogent.parse.record import FileFormatError
# Module metadata: authorship, copyright, licence, version and maintainer
# contact details for this file.
__author__ = "Cath Lawrence"
__copyright__ = "Copyright 2007-2011, The Cogent Project"
__credits__ = ["Cath Lawrence", "Gavin Huttley", "Peter Maxwell",
"Matthew Wakefield", "Rob Knight"]
__license__ = "GPL"
__version__ = "1.5.1"
__maintainer__ = "Gavin Huttley"
__email__ = "gavin.huttley@anu.edu.au"
__status__ = "Production"
# 256-character translation table for str.translate(): every code point maps
# to itself except the upper-case letters 'A'-'Z', which map to '?'.
# Applied by FromFileParser when dialign_recode is true.
# NOTE(review): the name reads as "lower case to wildcard", yet the table
# replaces UPPER-case letters -- confirm this is the intended direction.
_lc_to_wc = ''.join(
    ['?' if 'A' <= chr(x) <= 'Z' else chr(x) for x in range(256)])
def FromFilenameParser(filename, format=None, **kw):
    """Opens a sequence file and returns an iterator of its records.

    Arguments:
        - filename: name of the sequence alignment file
        - format: the multiple sequence file format; when not given it is
          derived from the filename by format_from_filename
        - kw: remaining keyword arguments, passed on to FromFileParser

    Returns a generator yielding whatever FromFileParser yields for the
    opened file. The file is opened immediately (so a missing file raises
    here, not on first iteration) and is closed once the generator is
    exhausted, closed, or fails part-way through.
    """
    format = format_from_filename(filename, format)
    # 'U' requests universal-newline handling (Python 2 file mode).
    f = open(filename, 'U')
    return _close_when_done(f, FromFileParser(f, format, **kw))


def _close_when_done(f, records):
    """Yields every item of records, then closes f even if iteration fails."""
    try:
        for record in records:
            yield record
    finally:
        # Runs on exhaustion, on generator.close() and on errors, so the
        # file handle does not outlive the parse.
        f.close()
def FromFileParser(f, format, dialign_recode=False, **kw):
    """Yields (name, seq) pairs parsed from an open sequence file.

    Arguments:
        - f: open file object to read from
        - format: format name; lower-cased and looked up in XML_PARSERS
          and PARSERS. The value 'xml' selects the XML parser from the
          document's own DOCTYPE name.
        - dialign_recode: if true, string sequences are first passed
          through the _lc_to_wc translation table
        - kw: remaining keyword arguments, passed to the selected parser

    String sequences are upper-cased before being yielded; sequences that
    are not strings are yielded unchanged.

    Raises FileFormatError if the format (or the XML doctype) has no
    registered parser, or if a generic 'xml' document has no DOCTYPE.
    """
    format = format.lower()
    # A format that names an XML dialect directly fixes the doctype up front.
    if format in XML_PARSERS:
        doctype = format
        format = 'xml'
    else:
        doctype = None
    if format == 'xml':
        source = dom = xml.dom.minidom.parse(f)
        if doctype is None:
            # minidom sets doctype to None when there is no DOCTYPE
            # declaration; report that as a format problem instead of
            # failing with AttributeError on `.name`.
            if dom.doctype is None:
                raise FileFormatError(
                    "XML document has no DOCTYPE, cannot select a parser")
            doctype = str(dom.doctype.name).lower()
        if doctype not in XML_PARSERS:
            raise FileFormatError("Unsupported XML doctype %s" % doctype)
        parser = XML_PARSERS[doctype]
    else:
        if format not in PARSERS:
            raise FileFormatError("Unsupported file format %s" % format)
        parser = PARSERS[format]
        source = f
    for (name, seq) in parser(source, **kw):
        # Only plain strings are normalised (basestring: Python 2 str/unicode).
        if isinstance(seq, basestring):
            if dialign_recode:
                seq = seq.translate(_lc_to_wc)
            if not seq.isupper():
                seq = seq.upper()
        yield (name, seq)
def format_from_filename(filename, format=None):
    """Detects format based on filename.

    Arguments:
        - filename: path or name of the sequence file
        - format: explicit format; returned unchanged when truthy

    Returns the text after the last '.' of the file's base name. Only the
    base name is inspected, so a '.' in a directory component is never
    mistaken for the start of the suffix. A base name without any '.' is
    returned whole.
    """
    if format:
        return format
    # Function-scope import keeps this block self-contained.
    import os.path
    basename = os.path.basename(filename)
    # rfind() gives -1 when there is no '.', so the slice starts at 0 and
    # the whole base name is returned.
    return basename[basename.rfind('.') + 1:]
# Lower-case format name (or file suffix) -> parser callable for the
# non-XML formats; FromFileParser looks formats up here.
PARSERS = {
    # PHYLIP and PAML
    'phylip': phylip.MinimalPhylipParser,
    'paml': paml.PamlParser,

    # FASTA and its common suffixes
    'fasta': fasta.MinimalFastaParser,
    'mfa': fasta.MinimalFastaParser,
    'fa': fasta.MinimalFastaParser,
    'faa': fasta.MinimalFastaParser,
    'fna': fasta.MinimalFastaParser,

    # Other parsers provided by the fasta module
    'xmfa': fasta.MinimalXmfaParser,
    'gde': fasta.MinimalGdeParser,

    # Clustal
    'aln': clustal.ClustalParser,
    'clustal': clustal.ClustalParser,

    # GenBank
    'gb': genbank.RichGenbankParser,
    'gbk': genbank.RichGenbankParser,
    'genbank': genbank.RichGenbankParser,

    # GCG MSF
    'msf': gcg.MsfParser,
}
# Lower-case XML doctype name -> parser callable. FromFileParser accepts
# these keys as format names and also matches them against the DOCTYPE of
# a generic 'xml' input.
XML_PARSERS = {
    'gbseq': gbseq.GbSeqXmlParser,
    'tseq': tinyseq.TinyseqParser,
    'macsim': macsim.MacsimParser,
}
|