/usr/share/pyshared/cogent/parse/tree.py is in python-cogent 1.5.1-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 | #/usr/bin/env python
"""Parsers for tree formats.
Implementation Notes
The algorithm used here is fairly general: should possibly make the code
generalizable to tree strings that use alternative delimiters and symbols.
However, I can't think of any cases where alternatives are used, so this is
left to future work.
Should possibly build a dict of {label:TreeNode} while parsing to make it
convenient to fill in additional data later, e.g. to fill in sequences from
their numeric labels in Newick format. Alternatively, maybe TreeNode should
get a buildIndex() method that performs the equivalent task.
As of 12/27/03, should be capable of parsing the ClustalW .dnd files without
difficulty.
"""
from cogent.core.tree import PhyloNode
from cogent.parse.record import RecordError
from string import strip, maketrans
__author__ = "Rob Knight"
__copyright__ = "Copyright 2007-2011, The Cogent Project"
__credits__ = ["Rob Knight", "Catherine Lozupone", "Daniel McDonald"]
__license__ = "GPL"
__version__ = "1.5.1"
__maintainer__ = "Rob Knight"
__email__ = "rob@spot.colorado.edu"
__status__ = "Development"
_dnd_token_str = '(:),;'
_dnd_tokens = dict.fromkeys(_dnd_token_str)
_dnd_tokens_and_spaces = _dnd_token_str + ' \t\v\n'
remove_dnd_tokens = maketrans(_dnd_tokens_and_spaces, \
'-'*len(_dnd_tokens_and_spaces))
def safe_for_tree(s):
"""Makes string s safe for DndParser by removing significant chars."""
return s.translate(remove_dnd_tokens)
def bad_dnd_tokens(s, is_valid_name):
"""Returns list of bad dnd tokens from s, using is_valid_name for names.
Useful for finding trees with misformatted names that break parsing.
"""
for t in DndTokenizer(s):
if t in _dnd_tokens:
continue
#also OK if it's a number
try:
float(t)
continue
except: #wasn't a number -- further tests
pass
if is_valid_name(t):
continue
#if we got here, nothing worked, so yield the current token
yield t
def DndTokenizer(data):
"""Tokenizes data into a stream of punctuation, labels and lengths.
Note: data should all be a single sequence, e.g. a single string.
"""
in_quotes = False
saved = []
sa = saved.append
for d in data:
if d == "'":
in_quotes = not(in_quotes)
if d in _dnd_tokens and not in_quotes:
curr = ''.join(saved).strip()
if curr:
yield curr
yield d
saved = []
sa = saved.append
else:
sa(d)
def DndParser(lines, constructor=PhyloNode, unescape_name=False):
"""Returns tree from the Clustal .dnd file format, and anything equivalent.
Tree is made up of cogent.base.tree.PhyloNode objects, with branch lengths
(by default, although you can pass in an alternative constructor
explicitly).
"""
if isinstance(lines, str):
data = lines
else:
data = ''.join(lines)
#skip arb comment stuff if present: start at first paren
paren_index = data.find('(')
data = data[paren_index:]
left_count = data.count('(')
right_count = data.count(')')
if left_count != right_count:
raise RecordError, "Found %s left parens but %s right parens." % \
(left_count, right_count)
tokens = DndTokenizer(data)
curr_node = None
state = 'PreColon'
state1 = 'PreClosed'
last_token = None
for t in tokens:
if t == ':': #expecting branch length
state = 'PostColon'
#prevent state reset
last_token = t
continue
if t == ')' and (last_token == ',' or last_token == '('): # node without name
new_node = _new_child(curr_node, constructor)
new_node.Name = None
curr_node = new_node.Parent
state1 = 'PostClosed'
last_token = t
continue
if t == ')': #closing the current node
curr_node = curr_node.Parent
state1 = 'PostClosed'
last_token = t
continue
if t == '(': #opening a new node
curr_node = _new_child(curr_node, constructor)
elif t == ';': #end of data
last_token = t
break
# node without name
elif t == ',' and (last_token == ',' or last_token == '('):
new_node = _new_child(curr_node, constructor)
new_node.Name = None
curr_node = new_node.Parent
elif t == ',': #separator: next node adds to this node's parent
curr_node = curr_node.Parent
elif state == 'PreColon' and state1 == 'PreClosed': #data for the current node
new_node = _new_child(curr_node, constructor)
if unescape_name:
if t.startswith("'") and t.endswith("'"):
while t.startswith("'") and t.endswith("'"):
t = t[1:-1]
else:
if '_' in t:
t = t.replace('_', ' ')
new_node.Name = t
curr_node = new_node
elif state == 'PreColon' and state1 == 'PostClosed':
if unescape_name:
while t.startswith("'") and t.endswith("'"):
t = t[1:-1]
curr_node.Name = t
elif state == 'PostColon': #length data for the current node
curr_node.Length = float(t)
else: #can't think of a reason to get here
raise RecordError, "Incorrect PhyloNode state? %s" % t
state = 'PreColon' #get here for any non-colon token
state1 = 'PreClosed'
last_token = t
if curr_node is not None and curr_node.Parent is not None:
raise RecordError, "Didn't get back to root of tree."
if curr_node is None: #no data -- return empty node
return constructor()
return curr_node #this should be the root of the tree
def _new_child(old_node, constructor):
"""Returns new_node which has old_node as its parent."""
new_node = constructor()
new_node.Parent = old_node
if old_node is not None:
if id(new_node) not in map(id, old_node.Children):
old_node.Children.append(new_node)
return new_node
|