This file is indexed.

/usr/share/pyshared/Bio/SeqIO/PirIO.py is in python-biopython 1.58-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# Copyright 2008-2009 by Peter Cock.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
#
# This module is for reading and writing PIR or NBRF format files as
# SeqRecord objects.  The code is based on Bio.SeqIO.FastaIO

"""Bio.SeqIO support for the "pir" (aka PIR or NBRF) file format.

You are expected to use this module via the Bio.SeqIO functions, or if
the file contains a sequence alignment, optionally via Bio.AlignIO instead.

This format was introduced for the Protein Information Resource (PIR), a
project of the National Biomedical Research Foundation (NBRF).  The PIR
database itself is now part of UniProt.

The file format is described online at:
http://www.ebi.ac.uk/help/pir_frame.html
http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html (currently down)

An example file in this format would be:

>P1;CRAB_ANAPL
ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
  MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR 
  SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH 
  GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ 
  SDVPERSIPI TREEKPAIAG AQRK*

>P1;CRAB_BOVIN
ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
  MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR 
  PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV 
  HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK 
  QASGPERTIP ITREEKPAVT AAPKK*

Or, an example of a multiple sequence alignment:

>P1;S27231
rhodopsin - northern leopard frog
MNGTEGPNFY IPMSNKTGVV RSPFDYPQYY LAEPWKYSVL AAYMFLLILL GLPINFMTLY
VTIQHKKLRT PLNYILLNLG VCNHFMVLCG FTITMYTSLH GYFVFGQTGC YFEGFFATLG
GEIALWSLVV LAIERYIVVC KPMSNFRFGE NHAMMGVAFT WIMALACAVP PLFGWSRYIP
EGMQCSCGVD YYTLKPEVNN ESFVIYMFVV HFLIPLIIIS FCYGRLVCTV KEAAAQQQES
ATTQKAEKEV TRMVIIMVIF FLICWVPYAY VAFYIFTHQG SEFGPIFMTV PAFFAKSSAI
YNPVIYIMLN KQFRNCMITT LCCGKNPFGD DDASSAATSK TEATSVSTSQ VSPA*

>P1;I51200
rhodopsin - African clawed frog
MNGTEGPNFY VPMSNKTGVV RSPFDYPQYY LAEPWQYSAL AAYMFLLILL GLPINFMTLF
VTIQHKKLRT PLNYILLNLV FANHFMVLCG FTVTMYTSMH GYFIFGPTGC YIEGFFATLG
GEVALWSLVV LAVERYIVVC KPMANFRFGE NHAIMGVAFT WIMALSCAAP PLFGWSRYIP
EGMQCSCGVD YYTLKPEVNN ESFVIYMFIV HFTIPLIVIF FCYGRLLCTV KEAAAQQQES
LTTQKAEKEV TRMVVIMVVF FLICWVPYAY VAFYIFTHQG SNFGPVFMTV PAFFAKSSAI
YNPVIYIVLN KQFRNCLITT LCCGKNPFGD EDGSSAATSK TEASSVSSSQ VSPA*

>P1;JN0120
rhodopsin - Japanese lamprey
MNGTEGDNFY VPFSNKTGLA RSPYEYPQYY LAEPWKYSAL AAYMFFLILV GFPVNFLTLF
VTVQHKKLRT PLNYILLNLA MANLFMVLFG FTVTMYTSMN GYFVFGPTMC SIEGFFATLG
GEVALWSLVV LAIERYIVIC KPMGNFRFGN THAIMGVAFT WIMALACAAP PLVGWSRYIP
EGMQCSCGPD YYTLNPNFNN ESYVVYMFVV HFLVPFVIIF FCYGRLLCTV KEAAAAQQES
ASTQKAEKEV TRMVVLMVIG FLVCWVPYAS VAFYIFTHQG SDFGATFMTL PAFFAKSSAL
YNPVIYILMN KQFRNCMITT LCCGKNPLGD DE-SGASTSKT EVSSVSTSPV SPA*


As with the FASTA format, each record starts with a line begining with ">"
character.  There is then a two letter sequence type (P1, F1, DL, DC, RL,
RC, or XX), a semi colon, and the identification code.  The second like is
free text description.  The remaining lines contain the sequence itself,
terminating in an asterisk.  Space separated blocks of ten letters as shown
above are typical.

Sequence codes and their meanings:

P1 - Protein (complete)
F1 - Protein (fragment)
D1 - DNA (e.g. EMBOSS seqret output)
DL - DNA (linear)
DC - DNA (circular)
RL - RNA (linear)
RC - RNA (circular)
N3 - tRNA
N1 - Other functional RNA
XX - Unknown
"""

from Bio.Alphabet import single_letter_alphabet, generic_protein, \
                         generic_dna, generic_rna
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

_pir_alphabets = {"P1" : generic_protein,
                  "F1" : generic_protein,
                  "D1" : generic_dna,
                  "DL" : generic_dna,
                  "DC" : generic_dna,
                  "RL" : generic_rna,
                  "RC" : generic_rna,
                  "N3" : generic_rna,
                  "XX" : single_letter_alphabet,
                  }

#This is a generator function!
def PirIterator(handle):
    """Generator function to iterate over Fasta records (as SeqRecord objects).

    handle - input file
    alphabet - optional alphabet
    title2ids - A function that, when given the title of the FASTA
    file (without the beginning >), will return the id, name and
    description (in that order) for the record as a tuple of strings.

    If this is not given, then the entire title line will be used
    as the description, and the first word as the id and name.

    Note that use of title2ids matches that of Bio.Fasta.SequenceParser
    but the defaults are slightly different.
    """
    #Skip any text before the first record (e.g. blank lines, comments)
    while True:
        line = handle.readline()
        if line == "":
            return #Premature end of file, or just empty?
        if line[0] == ">":
            break

    while True:
        if line[0] != ">":
            raise ValueError(\
                "Records in PIR files should start with '>' character")
        pir_type = line[1:3]
        if pir_type not in _pir_alphabets or line[3] != ";":
            raise ValueError(\
                "Records should start with '>XX;' "
                "where XX is a valid sequence type")
        identifier = line[4:].strip()
        description = handle.readline().strip()
        
            
        lines = []
        line = handle.readline()
        while True:
            if not line:
                break
            if line[0] == ">":
                break
            #Remove trailing whitespace, and any internal spaces
            lines.append(line.rstrip().replace(" ",""))
            line = handle.readline()
        seq = "".join(lines)
        if seq[-1] != "*":
            #Note the * terminator is present on nucleotide sequences too,
            #it is not a stop codon!
            raise ValueError(\
                "Sequences in PIR files should include a * terminator!")
            
        #Return the record and then continue...
        record = SeqRecord(Seq(seq[:-1], _pir_alphabets[pir_type]),
                           id = identifier, name = identifier,
                           description = description)
        record.annotations["PIR-type"] = pir_type
        yield record

        if not line : return #StopIteration
    assert False, "Should not reach this line"

if __name__ == "__main__":
    print "Running quick self test"

    import os
    
    for name in ["clustalw",  "DMA_nuc", "DMB_prot", "B_nuc", "Cw_prot"]:
        print name
        filename = "../../Tests/NBRF/%s.pir" % name
        if not os.path.isfile(filename):
            print "Missing %s" % filename
            continue

        records = list(PirIterator(open(filename)))
        count = 0
        for record in records:
            count += 1
            parts = record.description.split()
            if "bases," in parts:
                assert len(record) == int(parts[parts.index("bases,")-1])
        print "Could read %s (%i records)" % (name, count)