/usr/share/pyshared/cogent/seqsim/markov.py is in python-cogent 1.5.3-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

#!/usr/bin/env python
"""markov.py: various types of random and non-random generators.

Currently provides:

MarkovGenerator: reads in k-word frequencies from a text, and generates
random text based on those frequencies. Also calculates the average entropy
of the (k+1)th symbol (for sufficiently large k, this converges on the
entropy of the text).

NOTE: The text must be a list of strings (e.g. lines of text). If a single
string is passed to the constructor, wrap it in a list (i.e. ['your_string']);
otherwise errors will result when calculating kword frequencies.
"""
from __future__ import division
from operator import mul
from random import choice
from cogent.maths.stats.util import UnsafeFreqs as Freqs
from cogent.util.array import cartesian_product
from cogent.maths.stats.test import G_fit
from copy import deepcopy
from numpy import zeros, ravel, array, put, take
from numpy.random import random

__author__ = "Rob Knight"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Rob Knight", "Jesse Zaneveld", "Daniel McDonald"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Rob Knight"
__email__ = "rob@spot.colorado.edu"
__status__ = "Development"

class MarkovGenerator(object):
    """Holds k-word probabilities read from file, and can generate text."""
    def __init__(self, text=None, order=1, linebreaks=False, \
        calc_entropy=False, freqs=None, overlapping=True, \
        array_pseudocounts=0, delete_bad_suffixes=True):
        """Sets text and generates k-word frequencies."""
        self.Text = text
        self.Linebreaks = linebreaks
        self.Order = order
        self.Frequencies = freqs or {}
        self.RawCounts = {}
        self.FrequencyArray=None
        self.CountArray=None
        self.ExcludedCounts=None
        self.ArrayPseudocounts=array_pseudocounts
        self._calc_entropy = calc_entropy
        self.Entropy = None
        self.Prior = None
        self.Overlapping=overlapping
        if self.Text:
            self.calcFrequencies(delete_bad_suffixes)

    def calcFrequencies(self, delete_bad_suffixes=True):
        """For order k, gets the (k-1)-word frequencies plus what follows."""
        #reset text if possible -- but it might just be a string, so don't
        #complain if the reset fails.
        overlapping=self.Overlapping
        try:
            self.Text.reset()
        except AttributeError:
            try:
                self.Text.seek(0)
            except AttributeError:
                pass
        k = self.Order
        if k < 1:   #must be 0 or '-1': just need to count single bases
            self._first_order_frequency_calculation()
        else:   #need to figure out what comes after the first k bases
            all_freqs = {}
            for line in self.Text:
                if not self.Linebreaks:
                    line = line.strip()
                #skip the line if it's blank
                if (not line):
                    continue
                #otherwise, make a frequency distribution of symbols
                end = len(line) - k
                if overlapping:
                    rang=xrange(end)
                else:
                    rang=xrange(0,end,(k+1))
                for i in rang:
                    word, next = line[i:i+k], line[i+k]
                    curr = all_freqs.get(word, None)
                    if curr is None:
                        curr = Freqs({next:1})
                        all_freqs[word] = curr
                    else:
                        curr += next
            if self._calc_entropy:
                self.Entropy = self._entropy(all_freqs)
            self.Frequencies = all_freqs
            if delete_bad_suffixes:
                self.deleteBadSuffixes()
            self.RawCounts=deepcopy(all_freqs)
            #preserve non-normalized freqs
            for dist in self.Frequencies.values():
                dist.normalize()

    def wordToUniqueKey(self,word,conversion_dict={'a':0,'c':1,'t':2,'g':3}):
        """Maps a word to a unique integer array index.

        Since conversion_dict values are used as array indices later, they
        should range from 0 to n-1, where n is the alphabet size."""
        uniqueKey=0
        alpha_len = len(conversion_dict)
        for i in range(0,len(word)):
            uniqueKey += (conversion_dict[word[i]]*alpha_len**i)
        return uniqueKey
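
    # Worked example for wordToUniqueKey (illustrative): with the default
    # conversion_dict, 'ca' -> 1*4**0 + 0*4**1 = 1 and
    # 'gg' -> 3*4**0 + 3*4**1 = 15, the largest key for a 2-word.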

    def makeCountArray(self):
        """Generates a 1-column array whose indices are the unique keys for
        each kword + character and whose values are the raw counts of the
        occurrences of that key.

        This allows counts for many (k+1)-long strings to be found
        simultaneously with evaluateArrayProbability (which also normalizes
        the raw counts to frequencies)."""
        self.replaceDegenerateBases()
        counts=self.RawCounts
        self.CountArray=zeros((4**(self.Order+1)),'f')
        #Order 0 --> 4 spots ('a','c','t','g') 1 --> 16 etc
        #TODO: may generate problems if Order = -1
            
        if self.Order==0:
            for key in counts['']:
                put(self.CountArray,self.wordToUniqueKey(key),\
                        float(counts[''][key]))
        else:
            for kword in counts.keys():
                for key in counts[kword]:
                    index=self.wordToUniqueKey(kword+key)
                    put(self.CountArray,index,counts[kword][key])
        if self.ArrayPseudocounts:
            #add pseudocounts to every cell, so unobserved keys get frequency
            #pseudocounts/n rather than 0 (n = total number of counts).
            #When one pseudocount is added per cell this is Laplace's rule
            #(see 'Biological Sequence Analysis', Durbin et al., p. 115).
            self.CountArray = self.CountArray + float(self.ArrayPseudocounts)
        self.CountArray=ravel(self.CountArray)

    def updateFrequencyArray(self):
        """updates the frequency array by re-normalizing CountArray"""
        self.FrequencyArray=deepcopy(self.CountArray) #preserve raw counts
        total_counts=sum(self.FrequencyArray)
        self.FrequencyArray=self.FrequencyArray/total_counts
       
    def replaceDegenerateBases(self,normal_bases=['a','t','c','g']):
        """Replaces every character in self.Text that isn't a, t, c, or g
        with a randomly chosen normal base.

        When degenerate characters are rare this is useful because it avoids
        assigning artificially low conditional probabilities to all kwords
        containing them."""
        
        def normalize_character(base,bases=normal_bases):
            if base not in bases:
                base=choice(bases)
            return base
        
        text=self.Text
        for i in range(len(text)):
            text[i]=\
                    ''.join(map(normalize_character,text[i].lower()))
        self.Text=text

    def deleteBadSuffixes(self):
        """Deletes all suffixes that can't lead to prefixes.

        For example, with word size 3, if acg is present but cg* is not
        present, acg is not allowed.

        Need to repeat until no more suffixes are deleted.
        """
        f = self.Frequencies
        #loop until we make a pass where we don't delete anything
        deleted = True
        while deleted:
            deleted = False
            for k, v in f.items():
                suffix = k[1:]
                for last_char in v.keys():
                    #if we can't make suffix + last_char, can't select that char
                    if suffix + last_char not in f:
                        del v[last_char]
                        deleted=True
                if not v:   #if we deleted the last item, delete prefix
                    del f[k]
                    deleted = True

    def _entropy(self, frequencies):
        """Calculates average entropy of the (k+1)th character for k-words."""
        sum_ = 0.
        sum_entropy = 0.
        count = 0.
        for i in frequencies.values():
            curr_entropy = i.Uncertainty
            curr_sum = sum(i.values())
            sum_ += curr_sum
            sum_entropy += curr_sum * curr_entropy
            count += 1
        return sum_entropy/sum_
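
    # In symbols (illustrative restatement of _entropy): if n_w is the count
    # of symbols observed after word w and H_w is their Shannon entropy, the
    # return value is sum_w(n_w * H_w) / sum_w(n_w).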
        

    def _first_order_frequency_calculation(self):
        """Handles single-character calculations, which are independent.

        Specifically, don't need to take into account any other characters, and
        can just feed the whole thing into a single Freqs.
        """
        freqs = Freqs('')
        for line in self.Text:
            freqs += line
        #get rid of line breaks if necessary
        if not self.Linebreaks:
            for badkey in ['\r', '\n']:
                try:
                    del freqs[badkey]
                except KeyError:
                    pass    #don't care if there weren't any
        #if order is negative, equalize the frequencies
        if self.Order < 0:
            for key in freqs:
                freqs[key] = 1
        self.RawCounts= {'':deepcopy(freqs)}
        freqs.normalize()
        self.Frequencies = {'':freqs}

    def next(self, length=1, burn=0):
        """Generates random text of specified length with current freqs.
        burn specifies the number of iterations to throw away while the chain
        converges.
        """
        if self.Order < 1:
            return self._next_for_uncorrelated_model(length)
        freqs = self.Frequencies    #cache reference since it's frequently used
        #just pick one of the items at random, since calculating the weighted
        #frequencies is not possible without storing lots of extra info
        keys = freqs.keys()
        curr = choice(keys)
        result = []
        for i in range(burn +length):
            next = freqs[curr].choice(random())
            if i >= burn:
                result.append(next)
            curr = curr[1:] + next
        return ''.join(result)

    def _next_for_uncorrelated_model(self, length):
        """Special case for characters that don't depend on previous text."""
        return ''.join(self.Frequencies[''].randomSequence(length))

    def evaluateProbability(self,seq):
        """Evaluates the probability of generating a 
        user-specified sequence given the model."""
        conditional_prob=1
        order=self.Order
        for i in range(len(seq)-order):
            k=seq[i:i+order+1]
            try:
                conditional_prob *= self.Frequencies[k[:-1]][k[-1]]
            except KeyError:
                #if the key is missing, the true frequency lies somewhere
                #between 0 and 1/n; use 1/n to be conservative in the
                #exclusion of models
                if conditional_prob:
                    conditional_prob *= 1.0/(float(len(self.Text)))
                else:
                    conditional_prob = 1.0/(float(len(self.Text)))
        return conditional_prob
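
    # Illustrative example for evaluateProbability: with an order-1 model and
    # seq='acg', the result is Frequencies['a']['c'] * Frequencies['c']['g'],
    # i.e. the chain of conditionals P(c|a) * P(g|c).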
        
    def evaluateWordProbability(self,word):
        """Evaluates the conditional probability of the (k+1)th character of
        word given its first k characters (k = self.Order)."""
        k=word[:self.Order+1]
        try:
            conditional_prob= self.Frequencies[k[:-1]][k[-1]]
        except KeyError:
            conditional_prob = 1.0/(float(len(self.Text)))
        return conditional_prob
    
    def evaluateArrayProbability(self,id_array):
        """Takes an array of unique integer keys corresponding to (k+1)-long
        strings (as generated by self.wordToUniqueKey) and returns the
        product of their frequencies."""
        if self.FrequencyArray is None:
            if self.CountArray is None:
                self.makeCountArray()
            self.updateFrequencyArray()
        freqs=take(self.FrequencyArray,id_array)
        prob=reduce(mul,freqs)
        return float(prob)
 
    def evaluateInitiationFrequency(self,kword,\
            allowed_bases=['a','t','c','g']):
        """Calculates the initiation frequency for a k-long word, which
        equals its relative frequency among all observed kwords."""
        #TODO: add case where order is < 1
        if len(kword) != (self.Order):
            #kword length must equal the markov model order
            raise ValueError("kword length must equal the model order")
        if self.CountArray is None:
            self.makeCountArray()
        unique_keys=[]
        #add term for each possible letter
        for base in allowed_bases:
            unique_keys.append(self.wordToUniqueKey(kword+base))
        id_array=ravel(array(unique_keys))
        counts=take(self.CountArray,id_array)
        total_kword_counts=sum(counts)
        total_counts=sum(self.CountArray)
        prob=float(total_kword_counts)/float(total_counts)
        return prob
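
    # Illustrative example for evaluateInitiationFrequency: for an order-2
    # model, kword='ac' yields (count('aca') + count('acc') + count('act') +
    # count('acg')) / (total counts over all 3-words).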

    def excludeContribution(self,excluded_texts):
        """Excludes the contribution of a set of texts from the markov model.

        This can be useful, for example, to prevent self-contribution of the
        data in a gene to the model under which that gene is evaluated.

        A Markov model is made from the strings, converted to a CountArray,
        and that count array (stored as ExcludedCounts) is subtracted from
        the current CountArray; FrequencyArray is then updated.

        The data excluded with this function can be restored with
        restoreContribution.

        Only one list of texts can be excluded at any time. If a list of
        texts is already excluded when excludeContribution is called, that
        data will be restored before the new data is excluded."""
        
        if self.CountArray is None:
            self.makeCountArray()
        if self.ExcludedCounts:
            self.restoreContribution()
        #generate mm using same parameters as current model
        exclusion_model=MarkovGenerator(excluded_texts,order=self.Order,\
                overlapping=self.Overlapping)
        exclusion_model.makeCountArray()
        self.ExcludedCounts=exclusion_model.CountArray
        self.CountArray = self.CountArray - self.ExcludedCounts
        self.updateFrequencyArray()
        
    def restoreContribution(self):
        """Restores data excluded using excludeContribution, and 
        renormalizes FrequencyArray"""
        if self.ExcludedCounts:
            self.CountArray += (self.ExcludedCounts)
            self.ExcludedCounts=None
            self.updateFrequencyArray()
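
# A minimal usage sketch (illustrative values; the function name and toy
# strings below are assumptions, not part of the library): leave-one-out
# style scoring with the array-based API, excluding a query's own counts
# from the model before scoring it.
def _exclusion_scoring_example():
    model = MarkovGenerator(['acggtacggtacggta', 'ttacgtacgg'], order=1)
    query = 'acgg'
    model.excludeContribution([query])  #subtract query counts from the model
    ids = array([model.wordToUniqueKey(query[i:i+2])
                 for i in range(len(query)-1)])
    prob = model.evaluateArrayProbability(ids)  #product of 2-word frequencies
    model.restoreContribution()  #put the query counts back
    return prob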

        
    
def count_kwords(source, k, delimiter=''):
    """Makes dict of {word:count} for specified k."""
    result = {}
    #reset to beginning if possible
    if hasattr(source, 'seek'):
        source.seek(0)
    elif hasattr(source, 'reset'):
        source.reset()
    if isinstance(source, str):
        if delimiter:
            source = source.split(delimiter)
        else:
            source = [source]
    for s in source:
        for i in range(len(s) - k + 1):
            curr = s[i:i+k]
            if curr in result:
                result[curr] += 1
            else:
                result[curr] = 1
    return result
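
# Illustrative example: count_kwords('acgacg', 2) returns
# {'ac': 2, 'cg': 2, 'ga': 1} -- one overlapping window per position.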

def extract_prefix(kwords):
    """Converts dict of {w:count} to {w[:-1]:{w[-1]:count}}"""
    result = {}
    for w, count in kwords.items():
        prefix = w[:-1]
        suffix = w[-1]
        if prefix not in result:
            result[prefix] = {}
        curr = result[prefix]
        curr[suffix] = count
    return result
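
# Illustrative example: extract_prefix({'ac': 2, 'cg': 2, 'ga': 1}) returns
# {'a': {'c': 2}, 'c': {'g': 2}, 'g': {'a': 1}}.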

def _get_expected_counts(kwords, kminus1):
    """Gets expected counts from counts of 2 successive kword lengths."""
    result = []
    prefixes = extract_prefix(kminus1)
    for k in kwords:
        #mirror the expectation used in markov_order's general branch:
        #expected count(w) = count(w[:-1]) * count(w[1:]) / count(w[1:-1])
        middle = sum(prefixes[k[1:-1]].values())
        result.append(kminus1[k[:-1]] * kminus1[k[1:]] / middle)
    return result

def _pair_product(p, i, j):
    """Return product of counts of i and j from data."""
    try:
        return sum(p[i].values())*sum(p[j].values())
    except KeyError:
        return 0

def markov_order(word_counts, k, alpha):
    """Estimates Markov order of a source, using G test for fit.

    Uses following procedure:

    A source depends on the previous k letters more than the previous (k-1)
    letters iff Pr(a|w_k) != Pr(a|w_{k-1}) for all words of length k. If we 
    know Pr(a|w) for all symbols a and words of length k and k-1, we would
    expect count(a|w_i) to equal count(a|w_i[1:]) * count(w)/count(w[1:]).
    We can compare these expected frequencies to observed frequencies using
    the G test.
    
    word_counts: dict of {word: count} for words of length k+1
    k: the Markov order being tested
    alpha: the alphabet (currently unused in the calculation)
    """
    if k == 0:  #special case: test for unequal freqs
        obs = word_counts.values()
        total = sum(obs)    #will remain defined through loop
        exp = [total/len(word_counts)] * len(word_counts)
    elif k == 1: #special case: test for pair freqs
        prefix_counts = extract_prefix(word_counts)
        total = sum(word_counts.values())
        words = word_counts.keys()
        
        exp = [_pair_product(prefix_counts, w[0], w[1])/total for w in words]
        obs = word_counts.values()
    else:   # k >= 2: need to do general Markov chain
        #expect count(a_i.w.b_i) to be Pr(b_i|w)*count(a_i.w)
        cwb = {}    #count of word.b
        cw = {}     #count of word
        caw = {}    #count of a.word
        #build up counts of prefix, word, and suffix
        for word, count in word_counts.items():
            aw, w, wb = word[:-1], word[1:-1], word[1:]
            if not wb in cwb:
                cwb[wb] = 0
            cwb[wb] += count
            if not aw in caw:
                caw[aw] = 0
            caw[aw] += count
            if not w in cw:
                cw[w] = 0
            cw[w] += count

        obs = word_counts.values()
        exp = [cwb[w[1:]]/(cw[w[1:-1]])*caw[w[:-1]] for w in \
            word_counts.keys()]
    return G_fit(obs, exp)
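
# A minimal sketch (illustrative text and alphabet; the function name is an
# assumption): tests whether 2-word counts deviate from what single-symbol
# frequencies predict, i.e. whether the source remembers at least one
# preceding symbol.
def _markov_order_example(text='acggtacggtacggta'):
    words = count_kwords(text, 2)
    return markov_order(words, 1, 'acgt')  #(G, p) from the G test for fit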

def random_source(a, k, random_f=random):
    """Makes a random Markov source on alphabet a with memory k.
    
    Specifically, for all words k, pr(i|k) = rand().
    """
    result = dict.fromkeys(map(''.join, cartesian_product([a]*k)))
    for w in result:
        result[w] = Freqs(dict(zip(a, random_f(len(a)))))
    return result
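
# Illustrative example: random_source('ag', 2) returns a dict keyed by every
# 2-word over {'a', 'g'} ('aa', 'ag', 'ga', 'gg'), each mapped to a Freqs of
# randomly drawn weights for the next symbol.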

def markov_order_tests(a, max_order=5, text_len=10000, verbose=False):
    """Tests the Markov order inferrer using Markov chains of different orders.
    """
    result = []
    max_estimated_order = max_order + 2
    for real_order in range(max_order):
        print "Actual Markov order:", real_order
        s = random_source(a, real_order)
        m = MarkovGenerator(order=real_order, freqs=s)
        text = m.next(text_len)
        for word_length in range(1, max_estimated_order+1):
            words = count_kwords(text, word_length)
            g, prob = markov_order(words, word_length-1, a)
            if verbose:
                print "Inferred order: %s G=%s P=%s" % (word_length-1, g, prob)
            result.append([word_length-1, g, prob])
    return result
            
         
        

if __name__ == '__main__':
    """Makes text of specified # chars from training file.
    
    Note: these were tested from the command line and confirmed working by RK 
    on 8/6/07.
    """
    from sys import argv, exit
    if len(argv) == 2 and argv[1] == 'm':
        markov_order_tests('tcag', verbose=True)
    elif len(argv) == 3 and argv[1] == 'm':
        infilename = argv[2]
        max_estimated_order = 12
        text = open(infilename).read().split('\n')
        for word_length in range(1, max_estimated_order):
            words = count_kwords(text, word_length)
            g, prob = markov_order(words, word_length-1, 'ATGC')
            print "Inferred order: %s G=%s P=%s" % (word_length-1, g, prob)
            
    else:
        try:
            length = int(argv[1])
            max_order = int(argv[2])
            text = open(argv[3], 'U')
        except (IndexError, ValueError, IOError):
            print "Usage: python markov.py num_chars order training_file"
            print "...or python markov.py m training_file to check order"
            exit()
        for order in range(max_order + 1):
            m = MarkovGenerator(text, order, calc_entropy=True)
            print order,':', 'Entropy=', m.Entropy, m.next(length=length)