This file is indexed.

/usr/share/pyshared/cogent/parse/bpseq.py is in python-cogent 1.5.1-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
#!/usr/bin/env python
#bpseq.py
"""Provides parser for bpseq files downloaded from Gutell's comparative 
RNA website (CRW): http://www.rna.icmb.utexas.edu/

The file format is:

Filename: d.16.b.E.coli.bpseq
Organism: Escherichia coli
Accession Number: J01695
Citation and related information available at http://www.rna.icmb.utexas.edu
1 A 0
2 A 0
3 A 0
4 U 0
5 U 0
6 G 0
7 A 0
8 A 0
9 G 25
10 A 24
11 G 23
12 U 22
13 U 21
14 U 0

So, header of four lines (Filename, Organism, Accession Number, Citation)
Sequence, structure information in tuples of residue position, residue name,
residue partner. The residue partner is 0 if the base is unpaired.

Numbering is 1-based!
"""
from __future__ import division
from string import strip
from cogent.struct.rna2d import Vienna, Pairs
from cogent.struct.knots import opt_single_random
from cogent.core.info import Info
from cogent.core.sequence import RnaSequence

__author__ = "Sandra Smit"
__copyright__ = "Copyright 2007-2011, The Cogent Project"
__credits__ = ["Sandra Smit", "Gavin Huttley", "Rob Knight"]
__license__ = "GPL"
__version__ = "1.5.1"
__maintainer__ = "Sandra Smit"
__email__ = "sandra.smit@colorado.edu"
__status__ = "Production"

class BpseqParseError(Exception):
    """Exception raised when an error occurs during parsing a bpseq file"""
    pass

def parse_header(header_lines):
    """Return Info object from header information.
   
    header_lines -- list of lines or anything that behaves like it.

    Parses only the first three header lines with Filename, Organism, and
    Accession number. In general lines that contain a colon will be parsed.
    There's no error checking in here. If it fails to split on ':', the 
    information is simply not added to the dictionary. The expected format
    for header lines is "key: value". The citation lane is parsed differently. 
    """
    info = {}
    for line in header_lines:
        if line.startswith('Citation'):
            info['Citation'] = line.split()[-1].strip()
        elif ':' in line:
            try:
                field, value = map(strip,line.split(':',1))
                info[field] = value
            except ValueError:
                #no interesting header line
                continue
        else:
            continue
    return Info(info)

def construct_sequence(seq_dict):
    """Construct RnaSequence from dict of {pos:residue}.

    seq_dict -- dictionary of {position: residue}
    
    Checks whether the first residue is 0. Checks whether all residues
    between min and max index are present. No checking on validity of
    residue symbols. 
    """
    all_pos = seq_dict.keys()
    min_pos, max_pos = min(all_pos), max(all_pos)
    if min_pos != 0:
        raise BpseqParseError(\
            "Something went wrong with adjusting the numbering")
    # make sure all positions are in the dictionary
    for idx in range(min_pos, max_pos+1):
        if idx not in seq_dict:
            raise BpseqParseError(\
                "Description of residue with index %s is missing"%(idx))
    seq = [] 
    for idx in range(min_pos, max_pos+1):
        seq.append(seq_dict[idx])
    # Return as a simple string
    return ''.join(seq)

def parse_residues(residue_lines, num_base, unpaired_symbol):
    """Return RnaSequence and Pairs object from residue lines.

    residue_lines -- list of lines or anything that behaves like it. 
        Lines should contain:
        residue_position, residue_identiy, residue_partner.
    num_base -- int, basis of the residue numbering. In bpseq files from
        the CRW website, the numbering starts at 1.
    unpaired_symbol -- string, symbol in the 'partner' column that indicates
        that a base is unpaired. In bpseq files from the CRW website, the
        unpaired_symbol is '0'. This parameter should be a string to allow
        other symbols that can't be casted to an integer to indicate
        unpaired bases.
    
    Checks for double entries both in the sequence and the structure, and
    checks that the structre is valid in the sense that if (up,down) in there,
    that (down,up) is the same.
    """
    #create dictionary/list for sequence and structure
    seq_dict = {}
    pairs = Pairs()
    
    for line in residue_lines:
        try:
            pos, res, partner = line.strip().split()
            if partner == unpaired_symbol:
                # adjust pos, not partner
                pos = int(pos) - num_base
                partner = None
            else:
                # adjust pos and partner
                pos = int(pos) - num_base
                partner = int(partner) - num_base
            pairs.append((pos,partner))
            
            #fill seq_dict
            if pos in seq_dict:
                raise BpseqParseError(\
                    "Double entry for residue %s (%s in bpseq file)"\
                    %(str(pos), str(pos+1)))
            else:
                seq_dict[pos] = res
        
        except ValueError:
            raise BpseqParseError("Failed to parse line: %s"%(line))
    
    #check for conflicts, remove unpaired bases 
    if pairs.hasConflicts():
        raise BpseqParseError("Conflicts in the list of basepairs")
    pairs = pairs.directed()
    pairs.sort()
    
    # construct sequence from seq_dict
    seq = RnaSequence(construct_sequence(seq_dict))
    
    return seq, pairs

def MinimalBpseqParser(lines):
    """Separate header and content (residue lines).
    
    lines -- a list of lines or anything that behaves like that.

    The standard bpseq header (from the CRW website) is recognized. Also,
    lines that contain a colon are accepted as header lines.
    Header lines that aren't accepted as header, but that can be split into 
    three parts are residue lines (sequence and structure description). 
    Lines that don't fall into any of these categories are ignored.
    """
    result = {'HEADER':[], 'SEQ_STRUCT':[]}
    
    for line in lines:
        if line.startswith('Filename') or line.startswith('Organism') or\
            line.startswith('Accession') or line.startswith('Citation') or\
            ":" in line:
            result['HEADER'].append(line.strip())
        elif len(line.split()) == 3:
            result['SEQ_STRUCT'].append(line.strip())
        else:
            continue #unknown
    return result

def BpseqParser(lines, num_base=1, unpaired_symbol='0'):
    """Return RnaSequence and structure (Pairs object) specified in file.

    lines -- filestream of bpseq file. File should contain a single record.
    num_base -- int, basis of the residue numbering. In bpseq files from
        the CRW website, the numbering starts at 1.
    unpaired_symbol -- string, symbol in the 'partner' column that indicates
        that a base is unpaired. In bpseq files from the CRW website, the
        unpaired_symbol is '0'. This parameter should be a string to allow
        other symbols that can't be casted to an integer to indicate
        unpaired bases.

    Bpseq file looks like this:
    
    Filename: d.16.b.E.coli.bpseq
    Organism: Escherichia coli
    Accession Number: J01695
    Citation and related information available at http://www....
    1 A 0
    2 A 0
    3 A 0
    4 U 0
    5 U 0
    6 G 0
    7 A 0
    8 A 0
    9 G 25
    10 A 24
    11 G 23
    12 U 22
    13 U 21
        
    So, 4 header lines, followed by a list of residues.
    Position (indexed to 1), residue, partner position
    """
    # separate header and residue lines
    grouped_lines = MinimalBpseqParser(lines)
    # parse header and seq/struct separately
    header_info = parse_header(grouped_lines['HEADER'])
    seq, struct = parse_residues(grouped_lines['SEQ_STRUCT'],\
        num_base, unpaired_symbol)
    #add header info to the sequence as Info object
    seq.Info = header_info
    
    return seq, struct

# ============================================================================
# CONVENIENCE FUNCTIONS
# ============================================================================

def bpseq_specify_output(lines, num_base=1, unpaired_symbol='0',
    return_vienna=False, remove_pseudo=False,\
    pseudoknot_function=opt_single_random):
    """Return Vienna structure of Pairs object with or without pseudoknots
    
    lines -- filestream of bpseq file. File should contain a single record.
    num_base -- int, basis of the residue numbering. In bpseq files from
        the CRW website, the numbering starts at 1.
    unpaired_symbol -- string, symbol in the 'partner' column that indicates
        that a base is unpaired. In bpseq files from the CRW website, the
        unpaired_symbol is '0'. This parameter should be a string to allow
        other symbols that can't be casted to an integer to indicate
        unpaired bases.
    return_vienna -- boolean, if True, a ViennaStructure object is returned,
        if False, a Pairs object is returned. If return_vienna is True,
        pseudoknots need to be removed from the structure.
    remove_pseudo -- boolean, if True, pseudoknots will be removed from
        the structure.
    pseudoknot_function -- function that takes a Pairs object as input and
        returns a nested version of the structure. The function should return
        a single nested structure, not a list of structures. Default is 
        opt_single_random, which retuns a single nested structure (it picks
        one at random in case of multiple structures with the maximum number of 
        base pairs). This default is chosen to assure the code always returns
        something. In case the experiment needs to be reproducible,
        a random choice isn't the best one to make, and one should
        use a different pseudoknot removal function. See struct/knots.py
        for documentation.
    """
    seq, pairs = BpseqParser(lines, num_base, unpaired_symbol)

    if pairs.hasPseudoknots() and (remove_pseudo or return_vienna):
        pairs = pseudoknot_function(pairs)
    if return_vienna:
        v = pairs.toVienna(len(seq))
        return seq, v
    else:
        return seq, pairs


if __name__ == "__main__":
    from sys import argv
    seq, struct = BpseqParser(open(argv[1]))
    print seq
    print seq.Info
    print struct