This file is indexed.

/usr/share/pyshared/cogent/parse/gibbs.py is in python-cogent 1.5.1-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#!/usr/bin/env python
#file cogent/parse/gibbs.py
"""Parses Gibbs Sampler output file and creates MotifResults object."""

from cogent.parse.record_finder import LabeledRecordFinder
from cogent.motif.util import Location, ModuleInstance, Module, Motif,\
     MotifResults
from cogent.core.moltype import DNA, RNA, PROTEIN
from numpy import exp

__author__ = "Jeremy Widmann"
__copyright__ = "Copyright 2007-2011, The Cogent Project"
__credits__ = ["Jeremy Widmann", "Micah Hamady", "Daniel McDonald"]
__license__ = "GPL"
__version__ = "1.5.1"
__maintainer__ = "Jeremy Widmann"
__email__ = "jeremy.widmann@colorado.edu"
__status__ = "Development"

def get_sequence_and_motif_blocks(lines):
    """Returns main block of data as a list.
    """
    gibbs_map_maximization = \
        LabeledRecordFinder(lambda x: 'MAP MAXIMIZATION RESULTS' in x)
    seq_block, motif_block = list(gibbs_map_maximization(lines))
    return seq_block, motif_block

def get_sequence_map(lines):
    """Returns dict mapping Gibbs sequence number to sequence ID.
    
        - ex: sequence numbers mapping to gis:
            {'1':'1091044',
             '2':'11467494',
             '3':'11499727'}
    """
    sequence_map = {}
    sequence_finder = \
        LabeledRecordFinder(lambda x: x.startswith('Sequences to be Searched:'))
    sequence_block = list(sequence_finder(lines))[-1]
    for i in sequence_block[2:]:
        if i.startswith('#'):
            num,label = i.strip().split(' ',1)
            num = num.strip()
            label = label.strip()
            sequence_map[num[1:]] = label
        else:
            break
    return sequence_map
    

def get_motif_blocks(lines):
    """Returns list of motif blocks given main block as lines.
    """
    gibbs_motif = LabeledRecordFinder(lambda x: 'MOTIF' in x)
    return list(gibbs_motif(lines))[1:]
    
def get_motif_sequences(lines):
    """Returns list of tuples with motif sequence information given motif block.
    
        - result is list of tuples :
            [(seq_num, motif_start, motif_seq, motif_sig),]
    """
    motif_list = []
    motif_seq_finder = LabeledRecordFinder(lambda x: 'columns' in x)
    motifs = list(motif_seq_finder(lines))[-1]
    for m in motifs[2:]:
        if ',' in m:
            curr = m.strip().split()
            motif_num = curr[1]
            seq_num = curr[0].split(',')[0]
            motif_start = int(curr[2])-1
            #If motif does not start at beginning of sequence:
            if motif_start > 0:
                motif_seq = curr[4]
            #Motif starts at beginning of sequence, no context before motif
            else:
                motif_seq = curr[3]
            motif_sig = float(curr[-3])
            motif_list.append((seq_num,motif_start,motif_seq,motif_sig,motif_num))
        else:
            break
    return motif_list

def get_motif_p_value(lines):
    """Returns the motif p-value given motif block.
    """
    motif_p_finder = LabeledRecordFinder(lambda x: x.startswith('Log Motif'))
    motif_p_block = list(motif_p_finder(lines))[-1]
    log_motif_portion = float(motif_p_block[0].split()[-1])
    return exp(log_motif_portion)

def guess_alphabet(motif_list):
    """Returns alphabet given a motif_list from get_motif_sequences.
    
        - temp hack...should really think of a better way to get alphabet,
        but Gibbs sampler help tells nothing of how it guesses the alphabet
        or what it does with degenerate characters.
        - only allows for 2 degenerate nucleotide chars.
    """
    alpha_dict = {}
    for motif in motif_list:
        for char in motif[2]:
            if char not in alpha_dict:
                alpha_dict[char]=0
            alpha_dict[char]+=1
    if len(alpha_dict) > 6:
        alphabet=PROTEIN
    elif 'T' in alpha_dict:
        alphabet=DNA
    else:
        alphabet=RNA
    return alphabet
    

def build_module_objects(motif_block, sequence_map, truncate_len=None):
    """Returns module object given a motif_block and sequence_map.
    
        - motif_block is list of lines resulting from calling get_motif_blocks
        - sequence_map is the mapping between Gibbs sequence numbering and 
        sequence id from fasta file.
    """
    #Get motif id
    motif_id = motif_block[0].strip().split()[-1]

    #Get motif_list
    motif_list = get_motif_sequences(motif_block)
    #Get motif p-value
    motif_p = get_motif_p_value(motif_block)
    #Guess alphabet from motif sequences
    alphabet = guess_alphabet(motif_list)
    
    #Create Module object(s)
    gibbs_module = {}

    module_keys = ["1"]

    for motif in motif_list:

        seq_id = str(sequence_map[motif[0]])

        if truncate_len:
            seq_id = seq_id[:truncate_len]
        
        start = motif[1]
        seq = motif[2]
        sig = motif[3]
        motif_num = "1" 
        
        #Create Location object
        location = Location(seq_id, start, start + len(seq))
        #Create ModuleInstance
        mod_instance = ModuleInstance(seq,location,sig)
        cur_key = (seq_id,start)
        
        gibbs_module[(seq_id,start)]=mod_instance
    gibbs_mod = Module(gibbs_module,MolType=alphabet)
    gibbs_mod.Pvalue = motif_p
    gibbs_mod.ID = motif_id + module_keys[0]
    yield gibbs_mod

def module_ids_to_int(modules):
    """Given a list of modules changes alpha ids to int ids.
    """
    id_map = {}
    for m in modules:
        id_map[m.ID]=True
    for i,mid in enumerate(sorted(id_map.keys())):
        id_map[mid]=str(i)
    for m in modules:
        m.ID=id_map[m.ID]


def GibbsParser(lines, truncate_len=None, strict=True):
    """Returns a MotifResults object given a Gibbs Sampler results file.
    
        - only works with results from command line version of Gibbs Sampler
    """
    try:
        #Get sequence and motif blocks
        sequence_block, motif_block = get_sequence_and_motif_blocks(lines)
    except Exception, e:
        if strict:
            raise e
        else:
            return None
    #Create MotifResults object
    gibbs_motif_results = MotifResults()

    #Get sequence map
    sequence_map = get_sequence_map(sequence_block)
    #Get motif blocks
    motif_blocks = get_motif_blocks(motif_block)
    #Get modules
    for module in motif_blocks:
        if module[1] == 'No Motifs Detected':
            print "No Modules detcted!!", module[0]
            continue
        for cur_smod in build_module_objects(module, sequence_map, truncate_len=truncate_len):
            gibbs_motif_results.Modules.append(cur_smod)
    module_ids_to_int(gibbs_motif_results.Modules)
    
    for ct, module in enumerate(gibbs_motif_results.Modules):
        gibbs_motif_results.Motifs.append(Motif(module))
    gibbs_motif_results.Alphabet=module.Alphabet
    return gibbs_motif_results

if __name__ == "__main__":
    from sys import argv, exit
    print "Running..."

    if len(argv) != 2:
        print "Usage: gibbs.py <gibbs out file>"
        exit(1)
    mr = GibbsParser(open(argv[1]), 24)
    print mr
    for module in mr.Modules:
        print module.ID
        print str(module)