/usr/share/pyshared/cogent/parse/aaindex.py is in python-cogent 1.5.1-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 | #!/usr/bin/env python
"""Parsers for the AAIndex file format.
AAIndex can be downloaded at: http://www.genome.ad.jp/dbget/aaindex.html
There are two main files: AAIndex1 contains linear measures (one number per
amino acid) of amino acid properties, while AAIndex2 contains pairwise measures
(one number per pair of amino acids, e.g. distance or similarity matrices).
"""
import re
from cogent.parse.record_finder import DelimitedRecordFinder
from string import rstrip
from cogent.maths.matrix.distance import DistanceMatrix
# Module metadata: authorship, copyright, license, release version and
# maintainer contact details for this parser module.
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2007-2011, The Cogent Project"
__credits__ = ["Greg Caporaso", "Rob Knight"]
__license__ = "GPL"
__version__ = "1.5.1"
__maintainer__ = "Greg Caporaso"
__email__ = "caporaso@colorado.edu"
__status__ = "Production"
class AAIndexParser(object):
    """Abstract base class for the AAIndex file parsers.

    This is the shared machinery for the parsers of the two AAIndex
    files. The only real difference between the files is that AAIndex1
    has one additional field, labeled in here as Correlating.

    Subclasses must provide ``_parse_record(lines)``, which receives the
    lines of a single record and returns a record object, or a false value
    when the record should be skipped.
    """

    def __init__(self):
        """Initialize the object; the parser itself keeps no state."""

    def __call__(self, infile):
        """Yield one record object for every usable record in infile.

        infile = file to parse as file object or list of lines

        This is a generator: records are produced lazily, in file order,
        and records for which ``_parse_record`` returns a false value are
        silently skipped.

        Usage:
            aa1p = AAIndex1Parser()
            aaindex1_records = list(aa1p(open('data/AAIndex1')))

            aa2p = AAIndex2Parser()
            aaindex2_records = list(aa2p(open('data/AAIndex2')))
        """
        # Break the input into records delimited by '//'. rstrip is handed
        # to the finder as its per-line constructor (presumably so trailing
        # whitespace/newlines are removed while leading whitespace, which
        # marks continuation lines, is kept -- confirm against
        # DelimitedRecordFinder).
        record_finder = DelimitedRecordFinder('//', constructor=rstrip)
        for record_lines in record_finder(infile):
            new_record = self._parse_record(record_lines)
            if new_record:
                yield new_record

    def _get_field(self, field_identifier, lines):
        """Return the first field starting with field_identifier as one line.

        field_identifier = prefix that marks the field (e.g. 'H', 'D', 'M')
        lines = the lines of a single record

        Only the first matching field in the record is used. Lines that
        follow it and start with a space or with the same identifier are
        treated as continuations and joined with single spaces. Returns ''
        when the record has no such field.

        Special cases:
          'I' -- the first (header) line is dropped; only the data lines
                 that follow are returned.
          'M' -- the marker 'BRK' is appended to the first line so callers
                 can separate the rows/cols description from the data.
        """
        # Concatenate multi-line data with line_split
        line_split = ' '
        for start, line in enumerate(lines):
            if not line.startswith(field_identifier):
                continue
            if field_identifier == 'I':
                # The first Data line only describes the column layout,
                # which is not of interest here.
                result = ''
            else:
                result = line
                if field_identifier == 'M':
                    result += 'BRK'
                # Get rid of the line identifier and the space after it
                result = result[2:]
            # Gather continuation lines, collapsing the whitespace at each
            # join so multi-line strings such as titles read naturally.
            for follow_up in lines[start + 1:]:
                if not follow_up.startswith((' ', field_identifier)):
                    break
                result = result.rstrip() + line_split + follow_up.lstrip()
            return result
        # The field does not occur in this record
        return ''
class AAIndex1Parser(AAIndexParser):
    """Parser that turns AAIndex1 records into AAIndex1Record objects."""

    def _parse_record(self, lines):
        """Parse a single record and return it as an AAIndex1Record object.

        lines = the lines of one AAIndex1 record

        Every field is looked up afresh from ``lines``, so a field missing
        from this record comes back empty rather than carrying over a value
        from a previously parsed record.
        """
        get_field = self._get_field
        record_id = get_field('H', lines)
        description = get_field('D', lines)
        litdb = get_field('R', lines)
        authors = get_field('A', lines)
        title = get_field('T', lines)
        citations = get_field('J', lines)
        comments = get_field('*', lines)
        correlating = self._parse_correlating(get_field('C', lines))
        data = self._parse_data(get_field('I', lines))
        return AAIndex1Record(record_id, description, litdb, authors,
                              title, citations, comments, correlating, data)

    def _parse_correlating(self, raw):
        """Parse the Correlating field into a dict of {id: value}.

        raw = the field as one whitespace-separated string of alternating
              identifiers and values

        Values are converted to float where possible and kept as strings
        otherwise. A trailing identifier without a value is dropped.
        """
        # str.split() splits on runs of whitespace and never yields empty
        # tokens. (The previous pattern '\s*' can match the empty string,
        # which makes re.split break between every character on
        # Python 3.7+.)
        tokens = raw.split()
        result = {}
        # Even positions hold the keys, odd positions the values
        for key, value in zip(tokens[0::2], tokens[1::2]):
            try:
                result[key] = float(value)
            except ValueError:
                result[key] = value
        return result

    def _parse_data(self, raw):
        """Parse the data field from the current record into a dict.

        raw = the data values as one whitespace-separated string

        Returns a dict keyed by one-letter amino acid code. Values are
        paired with the keys in the order 'ARNDCQEGHILKMFPSTWYV'; surplus
        values are ignored and missing ones leave their keys out.
        Entries that are not numbers (the string 'NA' is present sometimes
        instead of data) are stored unchanged as strings.
        """
        keys = 'ARNDCQEGHILKMFPSTWYV'
        values = []
        for token in raw.split():
            try:
                values.append(float(token))
            except ValueError:
                # Keep the invalid token as ONE entry so later values stay
                # aligned with their amino acids. (`values += token` would
                # add one entry per character, e.g. 'N' and 'A'.)
                values.append(token)
        return dict(zip(keys, values))
class AAIndex2Parser(AAIndexParser):
    """Parser that turns AAIndex2 records into AAIndex2Record objects."""

    def _parse_record(self, lines):
        """Parse a single record and return it as an AAIndex2Record object.

        lines = the lines of one AAIndex2 record

        Returns None when the record has no usable rows/cols description
        or when the matrix data runs out while being read (IndexError).
        """
        get_field = self._get_field
        record_id = get_field('H', lines)
        description = get_field('D', lines)
        litdb = get_field('R', lines)
        authors = get_field('A', lines)
        title = get_field('T', lines)
        citations = get_field('J', lines)
        comments = get_field('*', lines)
        # The 'M' field comes back as '<rows/cols description>BRK<data>'
        raw_data = get_field('M', lines)
        marker_at = raw_data.find('BRK')
        rowscols = self._parse_rowscols(raw_data[:marker_at])
        try:
            data = self._parse_data(raw_data[marker_at + 3:],
                                    rowscols[0], rowscols[1])
        except IndexError:
            # Missing rows/cols labels, or fewer values than matrix cells
            return None
        return AAIndex2Record(record_id, description, litdb, authors,
                              title, citations, comments, data)

    def _parse_data(self, raw, rows, cols):
        """Parse the matrix data of the current record into a dict of dicts.

        raw = the matrix values as one whitespace-separated string
        rows = string of row labels, one character per row
        cols = string of column labels, one character per column

        Returns {row_label: {col_label: value}}. Values are floats where
        possible and the original string otherwise. Two layouts are
        recognised by the number of values:
          * full matrix: len(rows) * len(cols) values;
          * lower triangular matrix: (len(cols) + 1) * len(rows) // 2
            values, in which case cells above the diagonal stay None.
        Returns None when the number of values fits neither layout.
        """
        # str.split() splits on runs of whitespace and never yields empty
        # tokens. (The previous pattern '\s*' can match the empty string,
        # which makes re.split break between every character on
        # Python 3.7+.)
        data = raw.split()
        n_rows = len(rows)
        n_cols = len(cols)
        if len(data) == n_rows * n_cols:
            lower_triangle_only = False
        # Floor division keeps this an integer comparison on Python 2 and 3
        elif len(data) == (n_cols + 1) * n_rows // 2:
            lower_triangle_only = True
        else:
            return None

        result = dict.fromkeys(rows)
        i = 0
        for r in rows:
            row_position = rows.find(r)
            new_row = dict.fromkeys(cols)
            for c in cols:
                # In a lower triangular matrix only cells on or below the
                # diagonal carry a value.
                if lower_triangle_only and cols.find(c) > row_position:
                    continue
                try:
                    new_row[c] = float(data[i])
                except ValueError:
                    new_row[c] = data[i]
                i += 1
            result[r] = new_row
        return result

    def _parse_rowscols(self, raw):
        """Return a list of the label strings: 0: rows info, 1: cols info.

        raw = the data description line of an AAIndex2 record, e.g.
              'rows = ARND, cols = ARND'

        The labels are returned in the order they appear in raw; the list
        is shorter than two elements when a description is missing.
        """
        # (?:rows|cols) is a real alternation. The previous '[rows|cols]'
        # was a character class that matched any single one of those
        # letters and only worked because both words end in 's'.
        pattern = r'(?:rows|cols)\s=\s([^ \t\n\r\f\v,]*)'
        return re.findall(pattern, raw)
class AAIndexRecord(object):
    """Abstract base class holding one record from an AAIndex file."""

    def __init__(self, id,
                 description, LITDB_entry_num,
                 authors, title,
                 citation, comments, data):
        """Store the fields of an individual AAIndex entry.

        All descriptive fields are converted with str(); data is stored
        exactly as given.
        """
        (self.ID,
         self.Description,
         self.LITDBEntryNum,
         self.Authors,
         self.Title,
         self.Citation,
         self.Comments) = [str(field) for field in (
             id, description, LITDB_entry_num,
             authors, title, citation, comments)]
        self.Data = data

    def _toSquareDistanceMatrix(self, include_stops=False):
        """Convert the AAIndex Data to a square distance matrix.

        Placeholder that returns None; each subclass must override it.
        The interface must be identical across subclasses: take self
        (plus include_stops) and return a new square matrix (for now).
        """
        return None

    def toDistanceMatrix(self, include_stops=False):
        """Build a DistanceMatrix object based on self.

        Returns None when the square matrix is missing or empty, which is
        how missing or invalid data is reported.
        """
        square = self._toSquareDistanceMatrix(include_stops=include_stops)
        if not square:
            return None
        return DistanceMatrix(data=square, info=self)
class AAIndex1Record(AAIndexRecord):
    """Stores records from AAIndex1, inherits from AAIndexRecord."""

    def __init__(self, id,
                 description, LITDB_entry_num,
                 authors, title,
                 citation, comments,
                 correlating, data):
        """Store the fields of an individual AAIndex1 entry.

        correlating = dict parsed from the record's Correlating field
        data = dict of values keyed by one-letter amino acid code
        The remaining arguments are handled by AAIndexRecord.__init__.
        """
        AAIndexRecord.__init__(self, id,
                               description, LITDB_entry_num,
                               authors, title,
                               citation, comments, data)
        self.Correlating = correlating

    def _toSquareDistanceMatrix(self, include_stops=False):
        """Return AAIndex1 data as a square distance matrix (dict of dicts).

        The distance between two amino acids is the absolute difference
        of their values. With include_stops a '*' row and column are
        added whose entries are all None.

        Returns None when any value cannot be turned into a float (it is
        not a number, so the data is invalid), for easy detection.
        """
        # Convert every value once up front instead of once per cell
        try:
            numeric = dict((aa, float(value))
                           for aa, value in self.Data.items()
                           if aa != '*')
        except ValueError:
            return None

        # list() gives a real list on Python 2 and 3, so the stop symbol
        # can be appended regardless of what dict.keys() returns.
        keys = list(self.Data)
        if include_stops:
            keys.append('*')

        result = {}
        for r in keys:
            new_row = {}
            for c in keys:
                if r == '*' or c == '*':
                    new_row[c] = None
                else:
                    new_row[c] = abs(numeric[r] - numeric[c])
            result[r] = new_row
        return result
class AAIndex2Record(AAIndexRecord):
    """Stores records from AAIndex2, inherits from AAIndexRecord."""

    def __init__(self, id,
                 description, LITDB_entry_num,
                 authors, title,
                 citation, comments, data):
        """Store the fields of an individual AAIndex2 entry.

        All arguments are handled by AAIndexRecord.__init__.
        """
        AAIndexRecord.__init__(self, id,
                               description, LITDB_entry_num,
                               authors, title,
                               citation, comments, data)

    def _toSquareDistanceMatrix(self, include_stops=False):
        """Return a copy of the data as a square matrix (dict of dicts).

        Note: This method is not currently functional,
        we are awaiting information on how to process data into
        a distance matrix

        With include_stops a '*' row and a '*' column are added whose
        entries are all None. Returns None when the record has no data.
        """
        # No matrix could be parsed for this record; report it the way
        # toDistanceMatrix expects instead of failing inside dict(None).
        if self.Data is None:
            return None

        # Copy the outer dict AND every row. A shallow dict(self.Data)
        # would share the row dicts, so adding the '*' column below would
        # silently alter self.Data.
        result = dict((label, dict(row))
                      for label, row in self.Data.items())

        if include_stops:
            # The stop row has one None entry per existing row label
            result['*'] = dict.fromkeys(result)
            # Every row, including the new stop row, gets a stop column
            for row in result.values():
                row['*'] = None
        # Right now we are only dealing with square matrices
        return result
def AAIndexLookup(records):
    """Return a dict mapping each record's ID to the record itself.

    records = iterable of objects with an ID attribute; when several
    records share an ID the last one wins.
    """
    return dict((record.ID, record) for record in records)
def AAIndex1FromFiles(file):
    """Parse AAIndex1 data and return a dict of AAIndex1 records by ID.

    file = the input handed to AAIndex1Parser (a file or list of lines)
    """
    parsed_records = AAIndex1Parser()(file)
    return AAIndexLookup(parsed_records)
def AAIndex2FromFiles(file):
    """Parse AAIndex2 data and return a dict of AAIndex2 records by ID.

    file = the input handed to AAIndex2Parser (a file or list of lines)
    """
    parsed_records = AAIndex2Parser()(file)
    return AAIndexLookup(parsed_records)
# AAIndex1 record WOEC730101 (Woese polar requirement), embedded so the
# Woese distance matrix can be built without an external data file.
# Continuation lines (the extra 'C' rows and the two data rows after the
# 'I' header) MUST start with whitespace: the parser's _get_field only
# treats lines beginning with a space as part of the preceding field.
Woese_data = """//
H WOEC730101
D Polar requirement (Woese, 1973)
R PMID:4588588
A Woese, C.R.
T Evolution of genetic code
J Naturwiss. 60, 447-459 (1973)
C GRAR740102 0.960 HOPT810101 0.886 HOPA770101 0.876
  LEVM760101 0.872 PRAM900101 0.871 ROSM880101 0.844
  WOLS870101 0.841 KUHL950101 0.837 OOBM770103 0.835
  VINM940101 0.834 PARJ860101 0.821 FUKS010102 0.820
  FAUJ880110 0.812 OOBM770101 0.804 ROSM880102 0.801
  NADH010102 -0.800 CIDH920105 -0.800 MEIH800103 -0.802
  ISOY800102 -0.803 EISD860103 -0.803 ROSG850102 -0.804
  TANS770103 -0.806 RADA880101 -0.812 BIOV880102 -0.819
  WIMW960101 -0.821 NISK860101 -0.822 PONP800103 -0.823
  CIDH920104 -0.823 RADA880108 -0.825 BIOV880101 -0.829
  PONP800108 -0.831 SWER830101 -0.832 EISD860101 -0.838
  MAXF760102 -0.842 DESM900102 -0.847 FAUJ830101 -0.880
I A/L R/K N/M D/F C/P Q/S E/T G/W H/Y I/V
    7.0 9.1 10.0 13.0 5.5 8.6 12.5 7.9 8.4 4.9
    4.9 10.1 5.3 5.0 6.6 7.5 6.6 5.3 5.7 5.6
//
"""
def getWoeseDistanceMatrix():
    """Return the Woese Polar Requirement Distance Matrix.

    The embedded Woese_data record is parsed, every parsed record is
    converted with toDistanceMatrix(), and the matrix stored under the
    ID 'WOEC730101' is returned.
    """
    records_by_id = AAIndex1FromFiles(Woese_data.split('\n'))
    matrices_by_id = dict(
        (record_id, records_by_id[record_id].toDistanceMatrix())
        for record_id in records_by_id)
    return matrices_by_id['WOEC730101']
|