/usr/share/pyshared/cogent/parse/aaindex.py is in python-cogent 1.5.1-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 | #!/usr/bin/env python
"""Parsers for the AAIndex file format.
AAIndex can be downloaded at: http://www.genome.ad.jp/dbget/aaindex.html
There are two main files: AAIndex1 contains linear measures (one number per
amino acid) of amino acid properties, while AAIndex2 contains pairwise measures
(one number per pair of amino acids, e.g. distance or similarity matrices).
"""
import re
from cogent.parse.record_finder import DelimitedRecordFinder
from string import rstrip
from cogent.maths.matrix.distance import DistanceMatrix
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2007-2011, The Cogent Project"
__credits__ = ["Greg Caporaso", "Rob Knight"]
__license__ = "GPL"
__version__ = "1.5.1"
__maintainer__ = "Greg Caporaso"
__email__ = "caporaso@colorado.edu"
__status__ = "Production"
class AAIndexParser(object):
    """ Abstract class for AAIndex file parsers
        This file is an abstract class for the parsers of the two AAIndex
        files.  The only real difference between the files is that AAIndex1
        has one additional field, labeled in here as Correlating.
    
    """
    def __init__(self):
        """ Initialize the object. """
        
    def __call__(self, infile):
        """ Parse AAIndex file into dict of AAIndex objects with ID as key
            infile = file to parse as file object or list of lines
            Usage:
                aa1p = AAIndex1Parser()
                aaIndex1Objects = aa1p('data/AAIndex1')
                aa2p = AAIndex2Parser()
                aaIndex2Objects = aa2p('data/AAIndex2')
        """
        
        result = {}
        # Break down the file into records delimited by '//' and then
        # parse each record into AAIndexRecord objects which will be stored
        # in a dict keyed by the records unique ID string
        AAIndexRecordFinder = DelimitedRecordFinder('//', constructor=rstrip)
        # parser is a generator of AAIndexRecords from file
        parser = AAIndexRecordFinder(infile)       
        for r in parser:
            new_record = self._parse_record(r)
            if new_record:
                yield new_record
    def _get_field(self, field_identifier, lines):
        """ Returns the field identified as a one line string
        """
        i = 0
        result = ''
        # Concatenate multi-line data with line_split
        line_split = ' '
        # Run through all lines in the current record
        while (i < len(lines)):
            # Check each line to see if it starts with the field
            # identifier we are looking for
            if (lines[i].startswith(field_identifier)):
                # If we find the line we are looking for, include it in
                # the result, unless it's a Data line.
                # Data entries are multi-line, and the first is information
                # that we are not interested in here.
                if (field_identifier != 'I'):
                    result += lines[i]
                    if field_identifier == 'M': result += 'BRK'
                    # Get rid of the line identifier and leading white space
                    result = result[2:]
                # Move to next line
                i += 1
                # and see if it's a continuation from the above line
                while (i < len(lines) and\
                     (lines[i].startswith(' ') or\
                     lines[i].startswith(field_identifier))):
                    # if continuation combine the lines while treating the
                    # spaces nicely, ie, multiple spaces -> one space
                    # this is mostly just important for the
                    # lines that are strings such as title
                    result = result.rstrip() + line_split + lines[i].lstrip()
                    i += 1
                break
            i += 1
        # return the field of interest   
        return result
        
class AAIndex1Parser(AAIndexParser):
    """ Parse AAIndex1 file & return it as dict of AAIndex1 objects"""
    def _parse_record(self, lines):
        """ Parse a single record and return it as a AAIndex1Record Object """
        # init all of the fields each time, this is so that
        # if fields are missing they don't get the value from the last
        # record
        id = None
        description = None
        LITDB = None
        authors = None
        title = None
        citations = None
        comments = None
        correlating = {}
        data = [None] * 20
        id = self._get_field('H', lines)
        description = self._get_field('D', lines)
        LITDB = self._get_field('R', lines)
        authors = self._get_field('A', lines)
        title = self._get_field('T', lines)
        citations = self._get_field('J', lines)
        comments = self._get_field('*', lines)
        correlating = self._parse_correlating(self._get_field('C', lines))
        data = self._parse_data(self._get_field('I', lines))
        return AAIndex1Record(id, description, LITDB, authors,\
                title, citations, comments, correlating, data)
                    
    def _parse_correlating(self, raw):
        """ Parse Correlating entries from the current record """
        keys = []
        values = []
        raw = raw.lstrip()
        # Split by white space
        data = re.split('\s*', raw)
        i=0
        while(i<len(data)):
            # If it's even it's a key
            if((i % 2) == 0):
                keys += [data[i]]
            # if it's not even it's a value
            else:
                # convert values to floats
                try:
                    values += [float(data[i])]
                except ValueError:
                    values += [data[i]]
            i += 1
        result = dict(zip(keys, values))
        return result
    def _parse_data(self, raw):
        """ Parse the data field from current record into a dict
        """
        # init for use in result
        keys = 'ARNDCQEGHILKMFPSTWYV'  
        values = []
        
        # get rid of leading white spaces, it makes../ the reg exp act weird
        raw = raw.lstrip()
        # split by any number/ types of white spaces
        data = re.split('\s*', raw)
        # convert the data to a float while checking for invlaid data,
        # specifically the string 'NA' is present sometimes instead of data
        for i in data:
            try:
                values += [float(i)]
            except ValueError:
                values += i
        result = dict(zip(keys, values))
        # return the dict
        return result
class AAIndex2Parser(AAIndexParser):
    """ Parse AAIndex2 file & return it as dict of AAIndex2 objects"""
    def _parse_record(self, lines):
        """ Parse a single record and return it as a AAIndex2Record Object """
        # Init attributes of each record each run through
       
        id = None
        description = None
        LITDB = None
        authors = None
        title = None
        citations = None
        comments = None
        rowscols = None
        data = []
        # Fill in the values
        id = self._get_field('H', lines)
        description = self._get_field('D', lines)
        LITDB = self._get_field('R', lines)
        authors = self._get_field('A', lines)
        title = self._get_field('T', lines)
        citations = self._get_field('J', lines)
        comments = self._get_field('*', lines)
        raw_data = self._get_field('M', lines)
        rowscols = self._parse_rowscols(raw_data[:raw_data.find('BRK')])
        try:
            data = self._parse_data(raw_data[raw_data.find('BRK')+3:],\
            rowscols[0], rowscols[1])
        except IndexError:
            return None
        return AAIndex2Record(id, description, LITDB, authors,\
            title, citations, comments, data)                       
    def _parse_data(self, raw, rows, cols):
        """ Parse the data field from current record into dict """
        # init result dict
        result = None
        # get rid of leading white spaces, it make the reg exp act weird
        raw = raw.lstrip()
        # split by any number/ types of white spaces
        data = re.split('\s*', raw)
        # If square matrix
        if len(data) == (len(rows)*len(cols)):
            result = dict.fromkeys(rows)
            i = 0
            for r in rows:
                new_row = dict.fromkeys(cols)
                for c in cols:
                    try:
                        new_row[c] = float(data[i])
                    except ValueError:
                        new_row[c] = data[i]
                    i+=1
                result[r] = new_row
        # else if LTM
        elif len(data) == (len(cols)+1) * len(rows)/2 :
            result = dict.fromkeys(rows)
            i = 0
            for r in rows:
                new_row = dict.fromkeys(cols)
                for c in cols:
                    if cols.find(c) <= rows.find(r):
                        try:
                            new_row[c] = float(data[i])
                        except ValueError:
                            new_row[c] = data[i]
                        i += 1
                result[r] = new_row                      
            
        return result
    def _parse_rowscols(self, raw):
        """ Returns two element list, 0: rows info, 1: cols info
        
            This parses the data out of the data description line
            for each record in AAIndex2 so we know what the data is that
            we are looking at.
        """
        p ='[rows|cols]\s=\s([^ \t\n\r\f\v,]*)'
        result = []
        result += re.findall(p, raw)
        return result
class AAIndexRecord(object):
    """ Abstract class, stores records from AAIndex files """
    def __init__(self, id,
                  description, LITDB_entry_num,
                  authors, title,
                  citation, comments, data):
        """ Stores data for individual AAIndex entires """
        self.ID = str(id)
        self.Description = str(description)
        self.LITDBEntryNum = str(LITDB_entry_num)
        self.Authors = str(authors)
        self.Title = str(title)
        self.Citation = str(citation)
        self.Comments = str(comments)
        self.Data = data
    def _toSquareDistanceMatrix(self, include_stops=False):
        """ Converts AAIndex Data to square distance matrix
            This abstract method must be overwritten for each subclass.
            The interface must be identical across subclasses, must
            take self and return new square matrix (for now).
        """
        pass
    def toDistanceMatrix(self, include_stops=False):
        """ Builds a DistanceMatrix object based on self """
        data = self._toSquareDistanceMatrix(include_stops=include_stops)
        # If there is missing or invalid data, data will be None
        # if that's the case return None for easy detection, otherwise
        # return a new DistanceMatrix object
        if data:
            return DistanceMatrix(data=data, info=self)
        return None
class AAIndex1Record(AAIndexRecord):
    """ Stores records from AAIndex1, inherits from AAIndexRecord """
    def __init__(self, id,
                  description, LITDB_entry_num,
                  authors, title,
                  citation, comments,
                  correlating, data):
        """ Stores data for individual AAIndex 1 entires """
        # Call init from super class
        AAIndexRecord.__init__(self, id,
                  description, LITDB_entry_num,
                  authors, title,
                  citation, comments, data)
        self.Correlating = correlating
    def _toSquareDistanceMatrix(self, include_stops=False):
        """ AAIndex1 data to square distance matrix
        """
        keys = self.Data.keys()
        if include_stops : keys += '*'
        # build result dict top layer, start empty
        result = {}
        for r in keys:
            new_row = {}
            for c in keys:
                if (r == '*' or c == '*'):
                    new_row[c] = None
                else:
                    # Build the ditance matrix by subtracting the
                    # value of each aminoacid and then taking the
                    # absolute value.  If the data can not be
                    # turned into a float, it's not a number, so the data
                    # is invalid. Return None for easy detection
                    try:
                        new_row[c] =\
                            abs(float(self.Data[r])
                             - float(self.Data[c]))
                    except ValueError:
                        return None
            result[r] = new_row
        return result
class AAIndex2Record(AAIndexRecord):
    """ Stores records from AAIndex2, inherits from AAIndexRecord  """
    def __init__(self, id,
                  description, LITDB_entry_num,
                  authors, title,
                  citation, comments, data):
        """ Stores data for individual AAIndex 2 entires """
        # Call init from super class
        AAIndexRecord.__init__(self, id,
                  description, LITDB_entry_num,
                  authors, title,
                  citation, comments, data)
    def _toSquareDistanceMatrix(self, include_stops=False):
        """ Returns data as a square matrix
            Note: This method is not currently functional,
            we are awaiting information on how to process data into
            a distance matrix
        """
        # create a new dict based on self.Data so we don't alter self.Data
        result = dict(self.Data)
        # Add in the new row of stop codon data
        if include_stops:
            stop_row = {}
            for i in result:
                stop_row.update({i:None})
            result.update({'*':stop_row})
            for i in result:
                result[i].update({'*':None})
        # Right now we are only dealing with square matrices
        return result
def AAIndexLookup(records):
    """ Build a dict of AAIndexObjects hashed by ID """    
    result = {}
    for r in records:
        result[r.ID] = r
    return result
        
def AAIndex1FromFiles(file):
    """ Taking a file or list of data return a dict of AAIndex1Objects """
    aap = AAIndex1Parser()
    return AAIndexLookup(aap(file))
def AAIndex2FromFiles(file):
    """ Taking a file or list of data return a dict of AAIndex2Objects """
    aap = AAIndex2Parser()
    return AAIndexLookup(aap(file))    
Woese_data = """//
H WOEC730101
D Polar requirement (Woese, 1973)
R PMID:4588588
A Woese, C.R.
T Evolution of genetic code
J Naturwiss. 60, 447-459 (1973)
C GRAR740102    0.960  HOPT810101    0.886  HOPA770101    0.876
  LEVM760101    0.872  PRAM900101    0.871  ROSM880101    0.844
  WOLS870101    0.841  KUHL950101    0.837  OOBM770103    0.835
  VINM940101    0.834  PARJ860101    0.821  FUKS010102    0.820
  FAUJ880110    0.812  OOBM770101    0.804  ROSM880102    0.801
  NADH010102   -0.800  CIDH920105   -0.800  MEIH800103   -0.802
  ISOY800102   -0.803  EISD860103   -0.803  ROSG850102   -0.804
  TANS770103   -0.806  RADA880101   -0.812  BIOV880102   -0.819
  WIMW960101   -0.821  NISK860101   -0.822  PONP800103   -0.823
  CIDH920104   -0.823  RADA880108   -0.825  BIOV880101   -0.829
  PONP800108   -0.831  SWER830101   -0.832  EISD860101   -0.838
  MAXF760102   -0.842  DESM900102   -0.847  FAUJ830101   -0.880
I    A/L     R/K     N/M     D/F     C/P     Q/S     E/T     G/W     H/Y     I/V
     7.0     9.1    10.0    13.0     5.5     8.6    12.5     7.9     8.4     4.9
     4.9    10.1     5.3     5.0     6.6     7.5     6.6     5.3     5.7     5.6
//
"""
def getWoeseDistanceMatrix():
    """ Return the Woese Polar Requirement Distance Matrix """
    aaindexObjects = AAIndex1FromFiles(Woese_data.split('\n'))
    distance_matrices = {}
    for m in aaindexObjects:
        distance_matrices[m] = aaindexObjects[m].toDistanceMatrix()
    return distance_matrices['WOEC730101']    
 |