/usr/share/pyshared/cogent/parse/comrna.py is in python-cogent 1.5.1-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 | #!/usr/bin/env python
#file comrna_parser.py
"""Parser for comRNA output format
To reduce number of structures that parser report set first to True, then parser
will only report structures from first block (Maximum stem similarity score block).
The function common can be used to have the most common occuring structure be
reported first, if all structure only occurs once the structure with the most pairs
will be reported as the most common.
"""
from cogent.util.transform import make_trans
from cogent.struct.rna2d import Pairs
from cogent.struct.knots import opt_single_random
from string import index
__author__ = "Shandy Wikman"
__copyright__ = "Copyright 2007-2011, The Cogent Project"
__contributors__ = ["Shandy Wikman"]
__license__ = "GPL"
__version__ = "1.5.1"
__maintainer__ = "Shandy Wikman"
__email__ = "ens01svn@cs.umu.se"
__status__ = "Development"
def comRNA_parser(lines=None,pseudo=True,first=False):
"""Parsed comRNA output.
pseudo - if True, report results with pseudoknots; if flag
is False pseudoknots will be removed.
"""
names = get_names(lines)
result = []
for block in minimalComrnaParser(lines):
for structure in blockParser(block):
for struct in structParser(structure):
for pairs,seq in pairsParser(struct,names):
result.append([seq,pairs])
if first:
break
if first:
break
if first:
break
if not pseudo:
tmp = []
for block in result:
tmp.append([block[0],opt_single_random(block[-1])])
result = tmp
return result
def common(structs):
"""
Will return a list of sequences and structures with the most common structure
first in the list. (rest or list unordered!)
Don't care which of the sequences for the "winning" sequence that is reported
since the are not ranked amongst them self.
"""
frequency = {}
v = 0
indx = 0
result = []
tmp_list = [] #lookup the seq for the structures,dont care which winner seq
key = []
for block in structs:
tmp_list.extend(block)
p = tuple(block[-1])
if frequency.__contains__(p): #everytime struct p appears count up by 1
frequency[p]+=1
else:
frequency[p]=1
nr = frequency[p]
if nr > v: #Which struct appears most times
v = nr
key = p
#if winning structure has frequency == 1 all structure apper only once
if frequency[key]==1:
longest = 0
for block in structs:
l = len(block[-1])
if l > longest: #pick longest sequence as the winner
key = tuple(block[-1])
winner = Pairs(key)
indx = tmp_list.index(winner)-1
result.append([tmp_list[indx],winner]) #adds the most common structure first
del frequency[key]
for i in frequency.keys(): #rest of structures added
i = Pairs(i)
indx = tmp_list.index(i)-1
result.append([tmp_list[indx],i])
return result
def get_names(lines):
"""
Retrieves the names of the sequences in the output.
"""
next = False
names = []
for line in lines:
if next:
if len(line) == 1:
break
else:
tmp = line.split()
names.append(tmp[1])
if line.startswith('Sequences loaded ...'):
next = True
return names
def minimalComrnaParser(lines):
"""
Parses the output file in to blocks depending on the S score
S score is the Maximum stem similarity score.
"""
block = []
first = True
record = False
for line in lines:
if line.startswith('=========================== S ='):
record = True
if not first:
yield block
block = []
first = False
if record:
block.append(line)
yield block
def blockParser(block):
"""
Parses every block of S scores in to blocks of structures
every S score block has 10 or less structures
"""
struct = []
first = True
record = False
for line in block:
if line.startswith('Structure #'):
record = True
if not first:
yield struct
struct = []
first = False
if record:
struct.append(line)
yield struct
def structParser(lines):
"""
Parses a structure block into a block containing the sequens and structures
lines.
"""
blc = 0 #blank line counter
bc = 0 #block counter
struct = []
record = False
for line in lines:
if len(line) == 1:
blc +=1
record = False
if blc == 2:
blc = 0
bc +=1
record = True
if record and bc < 3:
struct.append(line)
yield struct
def pairsParser(seqBlock,names):
"""
Takes a structure block and parse that into structures
"""
for name in names:
seq = []
sIndx = [] #start index, where in the line the sequence start
struct = [] #structure lines
record = False
for line in seqBlock:
if line.startswith(name+' '):
tmp = line.split()
#if seq length is shorter then 80 for one seq and longer
#for another seq the following block will be empty for the
#shorter sequence. this if statement protects against that
if len(tmp) == 4:
try:
seq.append(tmp[2])#[name,start nr,seq,end nr]
except:
print 'LINE',line
print 'BLOCK', seqBlock
sIndx.append(index(line,tmp[2]))
record = True
else:
continue
else:
if record:
record = False
struct.append(line)
###############################################################################
# Construction of the full sequence and structure and then mapping each letter
#in structure to a position
Fseq = '' #full sequence
Fstruct = '' #full structure
for i in range(len(seq)):
# slice out corresponding structure to sequence
#so you can get the same index for structure and sequence
tmpStruct = struct[i][sIndx[i]:(sIndx[i]+len(seq[i]))]
Fseq = ''.join([Fseq,seq[i]])
Fstruct = ''.join([Fstruct,tmpStruct])
#Applies a position to every letter in structure sequence
letterPos = zip(range(len(Fseq)),Fstruct)
###############################################################################
#Cunstruction of dictionary for where every letter in structure has a list of
#positions corresponding to that of that letter in respect to the sequence
alphabet = {}
for pos, letter in letterPos:
indices = []
#if the dict contains the letter you want to add to that list
if alphabet.__contains__(letter):
indices = alphabet[letter]
indices.append(pos)
alphabet[letter] = indices
#else you want to create a new list for that letter
elif not letter==' ':
indices.append(pos)
alphabet[letter] = indices
###############################################################################
#Each list in alphabet needs to be split in two,
#oL and cL (open and close list), to be able to fold the positions into pairs
pairs = []
for value in alphabet.values():
middle = len(value)/2
oL = value[:middle]
cL = value[middle:]
#pairs are created by making a tuple of the first in oL to
#the last in cl, second in oL to second last in cL and so on
pairs.extend(zip(oL,cL.__reversed__()))
yield Pairs(pairs),Fseq
|