/usr/share/pyshared/cogent/parse/newick.py is in python-cogent 1.5.3-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
#!/usr/bin/env python
"""Newick format with all features as per the specs at:
http://evolution.genetics.washington.edu/phylip/newick_doc.html
http://evolution.genetics.washington.edu/phylip/newicktree.html
ie:
Unquoted label underscore munging
Quoted labels
Inner node labels
Lengths
[ ... ] Comments (discarded)
Unlabeled tips
also:
Double quotes can be used.
Spaces and quote marks are OK inside unquoted labels.
"""
from cogent.parse.record import FileFormatError
import re
EOT = None
__author__ = "Peter Maxwell"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Peter Maxwell", "Andrew Butterfield", "Gavin Huttley"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Peter Maxwell"
__email__ = "pm67nz@gmail.com"
__status__ = "Production"
class TreeParseError(FileFormatError):
    """Raised when text cannot be parsed as a Newick tree."""
    pass
class _Tokeniser(object):
    """Supplies an iterable stream of Newick tokens from 'text'

    By default this is very forgiving of non-standard unquoted labels.
    Two options can change how unquoted labels are interpreted:
      To prohibit internal spaces and quotes set strict_labels=True.
      To disable conversion of '_' to ' ' set underscore_unmunge=False.

    NOTE: underscore_unmunging is part of the Newick standard, although it
    is often inconvenient for other purposes.

    Attributes:
      text  - the complete string being tokenised.
      token - the raw piece currently being examined (None before
              tokenising starts, while a finished label is being handed
              out, and at end of text).
      posn  - (line, column) reached so far, both counted from 0, or None
              before tokenising starts.
    """

    # Splits on (and keeps) runs of blanks, newlines, doubled quotes and each
    # single structural character; whatever lies between is label text.
    _splitter = re.compile("""([\\t ]+|\\n|''|""|[]['"(),:;])""")

    def __init__(self, text, strict_labels=False, underscore_unmunge=True):
        self.text = text
        self.posn = None
        # Set here so error() is usable even before tokens() has run.
        self.token = None
        self.strict_unquoted_labels = strict_labels
        self.underscore_unmunge = underscore_unmunge

    def error(self, detail=""):
        """Build (but do not raise) a TreeParseError for the current position.

        The message names the current token, if any, the position reached
        and a sample of the line up to that position, followed by 'detail'.
        """
        if self.token:
            msg = 'Unexpected "%s" at ' % self.token
        else:
            msg = 'At '
        # posn is still None if nothing has been tokenised yet.
        (line, column) = self.posn or (0, 0)
        sample = self.text.split('\n')[line][:column]
        if column > 30:
            sample = "..." + sample[-20:]
        if line > 0:
            msg += 'line %s:%s "%s"' % (line+1, column, sample)
        else:
            msg += 'char %s "%s"' % (column, sample)
        return TreeParseError(msg + '. ' + detail)

    def _unmunge(self, label):
        """Apply the optional '_' -> ' ' conversion to an unquoted label."""
        if self.underscore_unmunge and '_' in label:
            return label.replace('_', ' ')
        return label

    def tokens(self):
        """Generate the Newick tokens of self.text, ending with EOT.

        Structural characters ( ) , : ; are yielded as themselves.  Labels,
        quoted or not, are yielded as plain strings; number text following
        ':' arrives the same way.  Comments in [ ] are dropped.  self.token
        and self.posn are kept up to date so that error() can describe where
        a problem was found.

        Raises TreeParseError for an unclosed comment or quoted label.
        """
        column = 0
        line = 0
        text = None                 # label being accumulated, if any
        closing_quote_token = None  # quote character that ends the label
        in_comment = False
        for token in self._splitter.split(self.text) + [EOT]:
            label_complete = False
            token_consumed = True
            self.token = token
            column += len(token or '')
            self.posn = (line, column)

            if token == "":
                # re.split() puts '' between adjacent delimiters.
                pass
            elif in_comment:
                if token is EOT:
                    raise self.error('Ended with unclosed comment')
                if token == ']':
                    in_comment = False
                elif token == '\n':
                    # Comments may span lines; keep the position in step.
                    line += 1
                    column = 0
            elif closing_quote_token:
                if token is EOT:
                    raise self.error('Text ended inside quoted label')
                if token == '\n':
                    raise self.error('Line ended inside quoted label')
                if token == closing_quote_token:
                    label_complete = True
                    closing_quote_token = None
                else:
                    if token == closing_quote_token*2:
                        # A doubled quote stands for one literal quote.
                        token = token[0]
                    text += token
            elif token is EOT or token in '\n[():,;':
                # Any of these ends an unquoted label in progress.
                if text:
                    text = self._unmunge(text.strip())
                    label_complete = True
                if token == '\n':
                    line += 1
                    # Columns restart from 0, exactly as on the first line.
                    column = 0
                elif token == '[':
                    in_comment = True
                else:
                    token_consumed = False
            elif text is not None:
                # Blanks and quote marks inside a lenient unquoted label.
                text += token
            elif token in ["''", '""']:
                # An empty quoted label.
                label_complete = True
                text = ""
            elif token in ["'", '"']:
                closing_quote_token = token
                text = ""
            elif token.strip():
                text = token
                label_complete = self.strict_unquoted_labels
                if label_complete:
                    # Strict labels end here and never reach the delimiter
                    # branch, so the underscore conversion happens now.
                    text = self._unmunge(text)

            if label_complete:
                self.token = None
                yield text
                text = None

            if not token_consumed:
                self.token = token
                yield token
def parse_string(text, constructor, **kw):
    """Parses a Newick-format string, using specified constructor for tree.

    Calls constructor(children, name, attributes) once per node, children
    first, and returns whatever it returns for the root node.  'children'
    is a list of already constructed nodes or None for a tip, 'name' is the
    label or None, and 'attributes' is a dict that holds 'length' (a float)
    when the node had one.

    Extra keyword arguments are passed on to the tokeniser.

    Note: underscore_unmunge, if True, replaces underscores with spaces in
    the data that's read in. This is part of the Newick format, but it is
    often useful to suppress this behavior.

    Raises TreeParseError if the text is not a well formed tree.
    """
    if "(" not in text and ";" not in text and text.strip():
        # otherwise "filename" is a valid (if small) tree
        raise TreeParseError('Not a Newick tree: "%s"' % text[:10])
    # Tokens allowed to close the node list currently being collected.
    sentinels = [';', EOT]
    stack = []
    nodes = []
    children = name = expected_attribute = None
    attributes = {}
    tokeniser = _Tokeniser(text, **kw)
    for token in tokeniser.tokens():
        if expected_attribute is not None:
            (attr_name, attr_cast) = expected_attribute
            if token is EOT:
                # The text stopped right after ':'; report it as a parse
                # error rather than letting the cast fail on None.
                raise tokeniser.error(
                    "Text ended where a %s was expected" % attr_name)
            try:
                attributes[attr_name] = attr_cast(token)
            except (ValueError, TypeError):
                raise tokeniser.error("Can't convert %s '%s'" %
                        (attr_name, token))
            expected_attribute = None
        elif token == '(':
            if children is not None:
                raise tokeniser.error(
                    "Two subtrees in one node, missing comma?")
            elif name or attributes:
                raise tokeniser.error(
                    "Subtree must be first element of the node.")
            # Park the enclosing level and start collecting the subtree.
            stack.append((nodes, sentinels, attributes))
            (nodes, sentinels, attributes) = ([], [')'], {})
        elif token == ':':
            if 'length' in attributes:
                raise tokeniser.error("Already have a length.")
            expected_attribute = ('length', float)
        elif token in [')', ';', ',', EOT]:
            # Each of these finishes the node described so far.
            nodes.append(constructor(children, name, attributes))
            children = name = expected_attribute = None
            attributes = {}
            if token in sentinels:
                if stack:
                    # The finished siblings become the children of the
                    # node that is resumed from the stack.
                    children = nodes
                    (nodes, sentinels, attributes) = stack.pop()
                else:
                    break
            elif token == ',' and ')' in sentinels:
                pass
            else:
                raise tokeniser.error("Was expecting to end with %s" %
                    ' or '.join([repr(s) for s in sentinels]))
        else:
            if name is not None:
                raise tokeniser.error("Already have a name '%s' for this node." % name)
            elif attributes:
                raise tokeniser.error("Name should come before length.")
            name = token
    # Internal sanity checks: the loop above only finishes with every
    # subtree closed and exactly one root node collected.
    assert not stack, stack
    assert len(nodes) == 1, len(nodes)
    return nodes[0]
|