/usr/share/pyshared/cogent/parse/phylip.py is in python-cogent 1.5.1-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | #!/usr/bin/env python
from cogent.parse.record import RecordError
from cogent.core.alignment import Alignment
# Module metadata: authorship, credits, licence, version and maintenance
# status of this parser module.
__author__ = "Micah Hamady"
__copyright__ = "Copyright 2007-2011, The Cogent Project"
__credits__ = ["Micah Hamady", "Peter Maxwell", "Gavin Huttley",
"Rob Knight"]
__license__ = "GPL"
__version__ = "1.5.1"
__maintainer__ = "Micah Hamady"
__email__ = "hamady@colorado.edu"
__status__ = "Prototype"
def is_blank(x):
    """Return True when x is empty or holds only whitespace."""
    return len(x.strip()) == 0
def _get_header_info(line):
"""
Get number of sequences and length of sequence
"""
header_parts = line.split()
num_seqs, length = map(int, header_parts[:2])
is_interleaved = len(header_parts) > 2
return num_seqs, length, is_interleaved
def _split_line(line, id_offset):
"""
First 10 chars must be blank or contain id info
"""
if not line or not line.strip():
return None, None
# extract id and sequence
curr_id = line[0:id_offset].strip()
curr_seq = line[id_offset:].strip().replace(" ", "")
return curr_id, curr_seq
def MinimalPhylipParser(data, id_map=None, interleaved=True):
    """Yields successive sequences from data as (label, seq) tuples.

    data: sequence of lines in phylip format (an open file, list, etc).
        The first line is treated as the header and parsed with
        _get_header_info.
    id_map: optional id mapping from external ids to phylip labels.
        Not implemented: the argument is accepted but never read.
    interleaved: placeholder only. It is overwritten by the flag parsed
        from the header line, so it is only consulted when data yields no
        lines at all (in which case nothing is yielded either way).

    If the header marks the data as interleaved, the whole input is cached
    in memory and the sequences are yielded at the end, after checking each
    joined sequence against the length given in the header. Otherwise each
    sequence is yielded as soon as the next label is seen (no length check
    is made on that path).

    Parsing stops silently, yielding nothing, when the header reports zero
    sequences or zero length.

    Raises RecordError if an interleaved sequence does not have the length
    stated in the header, or if non-interleaved data contains sequence text
    before any label.
    """
    id_offset = 10      # width of the label column on labelled lines
    curr_ct = -1        # -1 until the header is read, then data lines seen

    # State for non-interleaved data: the sequence currently being built.
    pending_label = None
    pending_parts = []

    # State for interleaved data, keyed by position within a block.
    block_labels = {}
    block_parts = {}

    for line in data:
        if curr_ct == -1:
            # get header info
            num_seqs, seq_len, interleaved = _get_header_info(line)
            if not num_seqs or not seq_len:
                return
            curr_ct = 0
            continue

        curr_id, curr_seq = _split_line(line, id_offset)

        # skip blank lines
        if not curr_id and not curr_seq:
            continue

        if not interleaved:
            if curr_id:
                # A new label closes the previous sequence.
                if pending_label is not None:
                    yield pending_label, ''.join(pending_parts)
                pending_label = curr_id
                pending_parts = [curr_seq]
            elif pending_label is None:
                # Continuation text with nothing to continue: report it as
                # a malformed record instead of failing on internal state.
                raise RecordError(
                    "Sequence data found before any sequence label")
            else:
                pending_parts.append(curr_seq)
        else:
            slot = curr_ct % num_seqs

            if (curr_ct + 1) % num_seqs == 0:
                # This line completes a block; lines after it are parsed
                # without a label column.
                id_offset = 0

            if slot not in block_labels:
                block_labels[slot] = curr_id
                block_parts[slot] = []

            block_parts[slot].append(curr_seq)
        curr_ct += 1

    if interleaved:
        # return joined sequences, in the order they appear within a block
        for slot in sorted(block_parts):
            join_seq = ''.join(block_parts[slot])

            if len(join_seq) != seq_len:
                raise RecordError(
                    "Length of sequence '%s' is not the same as in header "
                    "Found %d, Expected %d" % (
                        block_labels[slot], len(join_seq), seq_len))

            yield block_labels[slot], join_seq
    elif pending_label is not None:
        # return last seq if not interleaved
        yield pending_label, ''.join(pending_parts)
def get_align_for_phylip(data, id_map=None):
    """Convenience function building an Alignment object from phylip data.

    data: sequence of lines in phylip format (an open file, list, etc)
    id_map: optional id mapping from external ids to phylip labels; passed
        through unchanged to MinimalPhylipParser

    returns the Alignment constructed from the parsed (id, sequence) tuples
    """
    records = list(MinimalPhylipParser(data, id_map))
    return Alignment(records)
|