/usr/share/pyshared/cogent/parse/greengenes.py is in python-cogent 1.5.1-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 | #!/usr/bin/env python
"""Parse the Greengenes formatted sequence data records
The script is intended to be used with the following input:
http://greengenes.lbl.gov/Download/Sequence_Data/Greengenes_format/greengenes16SrRNAgenes.txt.gz
"""
from cogent.parse.record_finder import DelimitedRecordFinder
from cogent.parse.record import DelimitedSplitter, GenericRecord
__author__ = "Daniel McDonald"
__copyright__ = "Copyright 2007-2011, The Cogent Project"
__credits__ = ["Daniel McDonald"]
__license__ = "GPL"
__version__ = "1.5.1"
__maintainer__ = "Daniel McDonald"
__email__ = "daniel.mcdonald@colorado.edu"
__status__ = "Prototype"
def make_ignore_f(start_line):
"""Make an ignore function that ignores bad gg lines"""
def ignore(line):
"""Return false if line is bad"""
return not line or ['',''] == line or [start_line,''] == line
return ignore
def DefaultDelimitedSplitter(delimiter):
"""Wraps delimited splitter to handle empty records"""
parser = DelimitedSplitter(delimiter=delimiter)
def f(line):
parsed = parser(line)
if len(parsed) == 1:
parsed.append('')
return parsed
return f
def MinimalGreengenesParser(lines,LineDelim="=",RecStart="BEGIN",RecEnd="END"):
"""Parses raw Greengeens 16S rRNA Gene records
lines : open records file
LineDelim : individual line delimiter, eg foo=bar
RecStart : start identifier for a record
RecEnd : end identifier for a record
"""
line_parser = DefaultDelimitedSplitter(delimiter=LineDelim)
# parse what the ending record looks like so it can match after being split
RecordDelim = line_parser(RecEnd)
# make sure to ignore the starting record
ignore = make_ignore_f(RecStart)
parser = DelimitedRecordFinder(RecordDelim, constructor=line_parser,
keep_delimiter=False, ignore=ignore)
for record in parser(lines):
yield GenericRecord(record)
all_ids = lambda x,y: True
specific_ids = lambda x,y: x in y
def SpecificGreengenesParser(lines, fields, ids=None, **kwargs):
"""Yield specific fields from successive Greengenes records
If ids are specified, only the records for the set of ids passed in will
be returned. Parser will silently ignore ids that are not present in the
set of ids as well as silently ignore ids in the set that are not present
in the records file.
ids : must either test True or be an iterable with prokMSA_ids
Returns tuples in 'fields' order
"""
parser = MinimalGreengenesParser(lines, **kwargs)
if ids:
ids = set(ids)
id_lookup = specific_ids
else:
id_lookup = all_ids
for record in parser:
if id_lookup(record['prokMSA_id'], ids):
yield tuple([record[field] for field in fields])
def main():
from optparse import make_option
from cogent.util.misc import parse_command_line_parameters
from sys import exit, stdout
script_info = {}
script_info['brief_description'] = "Parse raw Greengenes 16S records"
script_info['script_description'] = """Parse out specific fields from raw Greengenes 16S records. These records are rich but often only a subset of each record is required for downstream processing."""
script_info['script_usage'] = []
script_info['script_usage'].append(("""Example:""","""Greengenes taxonomy and raw sequences are needed:""","""python greengenes.py -i greengenes16SrRNAgenes.txt -o gg_seq_and_tax.txt -f prokMSA_id,greengenes_tax_string,aligned_seq"""))
script_info['script_usage'].append(("""Example:""","""Spitting out the available fields from Greengenes:""","""python greengenes.py -i greengenes16SrRNAgenes.txt --print-fields"""))
script_info['output_description'] = """The resulting output file will contain a header that is prefixed with a # and delimited by the specified delimiter (default is tab). All records will follow in the same order with the same delimiter. It is possible for some key/value pairs within a record to lack a value. In this case, the value placed will be ''"""
script_info['required_options']=[make_option('--input','-i',dest='input',\
help='Greengenes Records')]
script_info['optional_options']=[\
make_option('--output','-o',dest='output',help='Output file'),
make_option('--fields','-f',dest='fields',\
help='Greengenes fields to keep'),
make_option('--delim','-d',dest='delim',help='Output delimiter',\
default="\t"),
make_option('--list-of-ids','-l',dest='ids',default=None,\
help='File with a single column list of ids to retrieve'),
make_option('--print-fields','-p',dest='print_fields',\
help='Prints available fields from first Greengenes Record',\
action='store_true',default=False)]
script_info['version'] = __version__
option_parser, opts, args = parse_command_line_parameters(**script_info)
if opts.print_fields:
gg_parser = MinimalGreengenesParser(open(opts.input))
rec = gg_parser.next()
print '\n'.join(sorted(rec.keys()))
exit(0)
if not opts.fields:
print option_parser.usage()
print
print "Greengenes fields must be specified!"
exit(1)
if not opts.output:
output = stdout
else:
output = open(opts.output,'w')
fields = opts.fields.split(',')
output.write("#%s\n" % opts.delim.join(fields))
if opts.ids:
ids = set([l.strip() for l in open(opts.ids, 'U')])
gg_parser = SpecificGreengenesParser(open(opts.input), fields, ids)
for record in gg_parser:
output.write(opts.delim.join(record))
output.write('\n')
if opts.output:
output.close()
if __name__ == '__main__':
main()
|