/usr/share/pyshared/cogent/parse/gibbs.py is in python-cogent 1.5.1-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 | #!/usr/bin/env python
#file cogent/parse/gibbs.py
"""Parses Gibbs Sampler output file and creates MotifResults object."""
from cogent.parse.record_finder import LabeledRecordFinder
from cogent.motif.util import Location, ModuleInstance, Module, Motif,\
MotifResults
from cogent.core.moltype import DNA, RNA, PROTEIN
from numpy import exp
__author__ = "Jeremy Widmann"
__copyright__ = "Copyright 2007-2011, The Cogent Project"
__credits__ = ["Jeremy Widmann", "Micah Hamady", "Daniel McDonald"]
__license__ = "GPL"
__version__ = "1.5.1"
__maintainer__ = "Jeremy Widmann"
__email__ = "jeremy.widmann@colorado.edu"
__status__ = "Development"
def get_sequence_and_motif_blocks(lines):
"""Returns main block of data as a list.
"""
gibbs_map_maximization = \
LabeledRecordFinder(lambda x: 'MAP MAXIMIZATION RESULTS' in x)
seq_block, motif_block = list(gibbs_map_maximization(lines))
return seq_block, motif_block
def get_sequence_map(lines):
"""Returns dict mapping Gibbs sequence number to sequence ID.
- ex: sequence numbers mapping to gis:
{'1':'1091044',
'2':'11467494',
'3':'11499727'}
"""
sequence_map = {}
sequence_finder = \
LabeledRecordFinder(lambda x: x.startswith('Sequences to be Searched:'))
sequence_block = list(sequence_finder(lines))[-1]
for i in sequence_block[2:]:
if i.startswith('#'):
num,label = i.strip().split(' ',1)
num = num.strip()
label = label.strip()
sequence_map[num[1:]] = label
else:
break
return sequence_map
def get_motif_blocks(lines):
"""Returns list of motif blocks given main block as lines.
"""
gibbs_motif = LabeledRecordFinder(lambda x: 'MOTIF' in x)
return list(gibbs_motif(lines))[1:]
def get_motif_sequences(lines):
"""Returns list of tuples with motif sequence information given motif block.
- result is list of tuples :
[(seq_num, motif_start, motif_seq, motif_sig),]
"""
motif_list = []
motif_seq_finder = LabeledRecordFinder(lambda x: 'columns' in x)
motifs = list(motif_seq_finder(lines))[-1]
for m in motifs[2:]:
if ',' in m:
curr = m.strip().split()
motif_num = curr[1]
seq_num = curr[0].split(',')[0]
motif_start = int(curr[2])-1
#If motif does not start at beginning of sequence:
if motif_start > 0:
motif_seq = curr[4]
#Motif starts at beginning of sequence, no context before motif
else:
motif_seq = curr[3]
motif_sig = float(curr[-3])
motif_list.append((seq_num,motif_start,motif_seq,motif_sig,motif_num))
else:
break
return motif_list
def get_motif_p_value(lines):
"""Returns the motif p-value given motif block.
"""
motif_p_finder = LabeledRecordFinder(lambda x: x.startswith('Log Motif'))
motif_p_block = list(motif_p_finder(lines))[-1]
log_motif_portion = float(motif_p_block[0].split()[-1])
return exp(log_motif_portion)
def guess_alphabet(motif_list):
"""Returns alphabet given a motif_list from get_motif_sequences.
- temp hack...should really think of a better way to get alphabet,
but Gibbs sampler help tells nothing of how it guesses the alphabet
or what it does with degenerate characters.
- only allows for 2 degenerate nucleotide chars.
"""
alpha_dict = {}
for motif in motif_list:
for char in motif[2]:
if char not in alpha_dict:
alpha_dict[char]=0
alpha_dict[char]+=1
if len(alpha_dict) > 6:
alphabet=PROTEIN
elif 'T' in alpha_dict:
alphabet=DNA
else:
alphabet=RNA
return alphabet
def build_module_objects(motif_block, sequence_map, truncate_len=None):
"""Returns module object given a motif_block and sequence_map.
- motif_block is list of lines resulting from calling get_motif_blocks
- sequence_map is the mapping between Gibbs sequence numbering and
sequence id from fasta file.
"""
#Get motif id
motif_id = motif_block[0].strip().split()[-1]
#Get motif_list
motif_list = get_motif_sequences(motif_block)
#Get motif p-value
motif_p = get_motif_p_value(motif_block)
#Guess alphabet from motif sequences
alphabet = guess_alphabet(motif_list)
#Create Module object(s)
gibbs_module = {}
module_keys = ["1"]
for motif in motif_list:
seq_id = str(sequence_map[motif[0]])
if truncate_len:
seq_id = seq_id[:truncate_len]
start = motif[1]
seq = motif[2]
sig = motif[3]
motif_num = "1"
#Create Location object
location = Location(seq_id, start, start + len(seq))
#Create ModuleInstance
mod_instance = ModuleInstance(seq,location,sig)
cur_key = (seq_id,start)
gibbs_module[(seq_id,start)]=mod_instance
gibbs_mod = Module(gibbs_module,MolType=alphabet)
gibbs_mod.Pvalue = motif_p
gibbs_mod.ID = motif_id + module_keys[0]
yield gibbs_mod
def module_ids_to_int(modules):
"""Given a list of modules changes alpha ids to int ids.
"""
id_map = {}
for m in modules:
id_map[m.ID]=True
for i,mid in enumerate(sorted(id_map.keys())):
id_map[mid]=str(i)
for m in modules:
m.ID=id_map[m.ID]
def GibbsParser(lines, truncate_len=None, strict=True):
"""Returns a MotifResults object given a Gibbs Sampler results file.
- only works with results from command line version of Gibbs Sampler
"""
try:
#Get sequence and motif blocks
sequence_block, motif_block = get_sequence_and_motif_blocks(lines)
except Exception, e:
if strict:
raise e
else:
return None
#Create MotifResults object
gibbs_motif_results = MotifResults()
#Get sequence map
sequence_map = get_sequence_map(sequence_block)
#Get motif blocks
motif_blocks = get_motif_blocks(motif_block)
#Get modules
for module in motif_blocks:
if module[1] == 'No Motifs Detected':
print "No Modules detcted!!", module[0]
continue
for cur_smod in build_module_objects(module, sequence_map, truncate_len=truncate_len):
gibbs_motif_results.Modules.append(cur_smod)
module_ids_to_int(gibbs_motif_results.Modules)
for ct, module in enumerate(gibbs_motif_results.Modules):
gibbs_motif_results.Motifs.append(Motif(module))
gibbs_motif_results.Alphabet=module.Alphabet
return gibbs_motif_results
if __name__ == "__main__":
from sys import argv, exit
print "Running..."
if len(argv) != 2:
print "Usage: gibbs.py <gibbs out file>"
exit(1)
mr = GibbsParser(open(argv[1]), 24)
print mr
for module in mr.Modules:
print module.ID
print str(module)
|