This file is indexed.

/usr/bin/score_conservation is in conservation-code 20110309.0-3.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
#! /usr/bin/python

################################################################################
# score_conservation.py - Copyright Tony Capra 2007 - Last Update: 03/09/11
#
# 03/09/11 - default window size = 3, as stated in help
# 06/21/09 - seq specific output now compatible with ConCavity
# 06/21/09 - numarray only included when vn_entropy is used
# 08/15/08 - added property_relative_entropy scoring method
# 08/15/08 - added equal sequence length check
# 01/07/08 - added z-score normalization option (-n)
# 01/07/08 - added seq. specific output option (-a)
# 11/30/07 - read_scoring_matrix now returns list rather than array.
# 11/30/07 - added window lambda command line option (-b)
# 07/05/07 - fixed gap penalty cutoff (<=) and error message
# 06/26/07 - fixed read_clustal_align bug
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
#
# -----------------------------------------------------------------------------
#
# Dependencies: 
# numarray (for von Neumann entropy method) - 
# http://sourceforge.net/project/showfiles.php?group_id=1369&release_id=223264
# 
# 
# See usage() for usage.
#
# This program supports the paper: Capra JA and Singh M. 
# Predicting functionally important residues from sequence
# conservation. Bioinformatics. 23(15): 1875-1882, 2007.
# Please cite this paper if you use this code.
#
# It contains code for each method of scoring conservation that is
# evaluated: sum of pairs, weighted sum of pairs, Shannon entropy,
# Shannon entropy with property groupings (Mirny and Shakhnovich 95,
# Valdar and Thornton 01), relative entropy with property groupings
# (Williamson 95), von Neumann entropy (Caffrey et al 04), relative
# entropy (Samudrala and Wang 06), and Jensen-Shannon divergence
# (Capra and Singh 07).
#
# The code distributed with Mayrose et al 04 is used for Rate4Site. As
# of today it can be obtained from:
# http://www.tau.ac.il/~itaymay/cp/rate4site.html
# 
# 
# All scoring functions follow the same prototype:
#
# def score(col, sim_matrix, bg_disr, seq_weights, gap_penalty=1):
#
# - col: the column to be scored.
# 
# - sim_matrix: the similarity (scoring) matrix to be used. Not all 
# methods will use this parameter. 
# 
# - bg_distr: a list containing an amino acid probability distribution. Not
# all methods use this parameter. The default is the blosum62 background, but
# other distributions can be given. 
#
# - seq_weights: an array of floats that is used to weight the contribution
# of each seqeuence. If the len(seq_weights) != len(col), then every sequence 
# gets a weight of one.
#
# - gap_penalty: a binary variable: 0 for no gap penalty and 1
# for gap penalty. The default is to use a penalty. The gap penalty used is
# the score times the fraction of non-gap positions in the column.
#
#
# For a window score of any of above methods use the window_score method to 
# transform the individual column scores. 
#
################################################################################

from __future__ import print_function
import math, sys, getopt
import re
# numarray imported below

# pseudocount added to every frequency so that log() never sees zero
PSEUDOCOUNT = .0000001

# the 20 standard amino acids plus the gap character; this ordering defines
# the index layout of every frequency-count list in this module
amino_acids = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', '-']
# characters accepted when parsing input alignments (IUPAC codes, '*', and gaps)
iupac_alphabet = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "U", "V", "W", "Y", "Z", "X", "*", "-"] 

# dictionary to map from amino acid to its row/column in a similarity matrix
aa_to_index = {}
for i, aa in enumerate(amino_acids):
    aa_to_index[aa] = i


def usage( __file = sys.stdout ):
    """Print the command-line usage/help message.

    __file: file-like object to write to. Defaults to sys.stdout (the -h
    path); the error paths below pass sys.stderr explicitly.
    """
    print( """\nUSAGE:\nscore_conservation [options] alignfile\n\t -alignfile must be in fasta, Stockholm or clustal format.\n\nOPTIONS:\n\t
    -a\treference sequence. Print scores in reference to a specific sequence (ignoring gaps). Default prints the entire column. [sequence name]\n\t
    -b\tlambda for window heuristic linear combination. Default=.5 [real in [0,1]]\n
    -d\tbackground distribution file, e.g., swissprot.distribution. Default=BLOSUM62 background [filename]\n\t
    -g\tgap cutoff. Do not score columns that contain more than gap cutoff fraction gaps. Default=.3 [real in [0, 1)]\n\t
    -h\thelp. Print this message.\n
    -l\tuse sequence weighting. Default=True [True|False]\n\t
    -m\tsimilarity matrix file, e.g., matrix/blosum62.bla or .qij. Default=identity matrix [filename]\n\t
    -n\tnormalize scores. Print the z-score (over the alignment) of each column raw score. Default=False\n\t
    -o\tname of output file. Default=output to screen [filename]\n\t
    -p\tuse gap penalty. Lower the score of columns that contain gaps. Default=True [True|False]\n\t
    -s\tconservation estimation method. \n\t\tOptions: shannon_entropy, property_entropy, property_relative_entropy, vn_entropy, relative_entropy, js_divergence, sum_of_pairs. Default=js_divergence\n\t
    -w\twindow size. Number of residues on either side included in the window. Default=3 [int]\n\t
    """, file=__file)
    
    

################################################################################
# Frequency Count and Gap Penalty
################################################################################

def weighted_freq_count_pseudocount(col, seq_weights, pc_amount):
    """ Return the weighted frequency count for a column--with pseudocount."""

    # fall back to uniform weights when the weight list does not match the column
    if len(seq_weights) != len(col):
        seq_weights = [1.] * len(col)

    # one bin per amino acid, in the order defined by amino_acids,
    # each primed with the pseudocount
    freq_counts = [pc_amount] * len(amino_acids)

    for bin_idx, aa in enumerate(amino_acids):
        for pos, residue in enumerate(col):
            if residue == aa:
                freq_counts[bin_idx] += seq_weights[pos]

    # normalize so the counts form a probability distribution
    total = sum(seq_weights) + len(amino_acids) * pc_amount
    return [count / total for count in freq_counts]


def weighted_gap_penalty(col, seq_weights):
    """ Calculate the simple gap penalty multiplier for the column. If the 
    sequences are weighted, the gaps, when penalized, are weighted 
    accordingly. """

    # fall back to uniform weights when the weight list does not match
    if len(seq_weights) != len(col):
        seq_weights = [1.] * len(col)

    # total weight sitting in gap positions
    gap_weight = sum(w for residue, w in zip(col, seq_weights) if residue == '-')

    return 1 - (gap_weight / sum(seq_weights))


def gap_percentage(col):
    """Return the percentage of gaps in col."""
    # count() does the scan the original loop did by hand
    return float(col.count('-')) / len(col)


################################################################################
# Shannon Entropy
################################################################################

def shannon_entropy(col, sim_matrix, bg_distr, seq_weights, gap_penalty=1):
    """Calculates the Shannon entropy of the column col. sim_matrix and
    bg_distr are ignored. If gap_penalty == 1, then gaps are penalized. The
    entropy will be between zero and one because of its base. See p.13 of
    Valdar 02 for details. The information score 1 - h is returned for the sake
    of consistency with other scores."""

    fc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT)

    # sum p*log(p); the log base below normalizes entropy to [0, 1]
    h = sum(p * math.log(p) for p in fc if p != 0)
    h /= math.log(min(len(fc), len(col)))

    inf_score = 1 + h  # h is negative; equivalent to 1 - (-h)

    if gap_penalty == 1:
        return inf_score * weighted_gap_penalty(col, seq_weights)
    return inf_score


################################################################################
# Property Entropy
################################################################################

def property_entropy(col, sim_matrix, bg_distr, seq_weights, gap_penalty=1):
    """Calculate the entropy of a column col relative to a partition of the 
    amino acids. Similar to Mirny '99. sim_matrix and bg_distr are ignored, but 
    could be used to define the sets. """

    # Mirny and Shakn. '99 partition of the amino acids
    property_partition = [['A','V','L','I','M','C'], ['F','W','Y','H'], ['S','T','N','Q'], ['K','R'], ['D', 'E'], ['G', 'P'], ['-']]

    # Williamson '95
    # property_partition = [['V','L', 'I','M'], ['F','W','Y'], ['S','T'], ['N','Q'], ['H','K','R'], ['D','E'], ['A','G'], ['P'], ['C'], ['-']]

    fc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT)

    # collapse per-amino-acid frequencies into per-property-class frequencies
    prop_fc = [sum(fc[aa_to_index[aa]] for aa in group) for group in property_partition]

    # entropy over the property classes, normalized by the log base below
    h = sum(p * math.log(p) for p in prop_fc if p != 0)
    h /= math.log(min(len(property_partition), len(col)))

    inf_score = 1 + h  # h is negative; equivalent to 1 - (-h)

    if gap_penalty == 1:
        return inf_score * weighted_gap_penalty(col, seq_weights)
    return inf_score


################################################################################
# Property Relative Entropy
################################################################################

def property_relative_entropy(col, sim_matrix, bg_distr, seq_weights, gap_penalty=1):
    """Calculate the relative entropy of a column col relative to a
    partition of the amino acids. Similar to Williamson '95. sim_matrix is
    ignored, but could be used to define the sets. See shannon_entropy()
    for more general info. """

    # Mirny and Shakn. '99
    #property_partition = [['A','V','L','I','M','C'], ['F','W','Y','H'], ['S','T','N','Q'], ['K','R'], ['D', 'E'], ['G', 'P'], ['-']]

    # Williamson '95 partition (gaps deliberately not a class here)
    property_partition = [['V','L', 'I','M'], ['F','W','Y'], ['S','T'], ['N','Q'], ['H','K','R'], ['D','E'], ['A','G'], ['P'], ['C']]

    # background frequency per property class; fall back to the
    # BLOSUM62-derived values when bg_distr is not class-shaped
    if len(bg_distr) == len(property_partition):
        prop_bg_freq = bg_distr
    else:
        prop_bg_freq = [0.248, 0.092, 0.114, 0.075, 0.132, 0.111, 0.161, 0.043, 0.024, 0.000] # from BL62

    fc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT)

    # collapse per-amino-acid frequencies into per-property-class frequencies
    prop_fc = [sum(fc[aa_to_index[aa]] for aa in group) for group in property_partition]

    # relative entropy (base 2) of the column classes vs. the background
    d = 0.
    for p, q in zip(prop_fc, prop_bg_freq):
        if p != 0 and q != 0:
            d += p * math.log(p / q, 2)

    if gap_penalty == 1:
        return d * weighted_gap_penalty(col, seq_weights)
    return d


################################################################################
# von Neumann Entropy
################################################################################

def vn_entropy(col, sim_matrix, bg_distr, seq_weights, gap_penalty=1):
    """ Calculate the von Neumann Entropy as described in Caffrey et al. 04.
    This code was adapted from the implementation found in the PFAAT project
    available on SourceForge. bg_distr is ignored.

    Fixed to use numpy directly: the numpy.numarray compatibility layer that
    supplied zeros(), Float32 and la.eigenvalues() was removed in numpy 1.9,
    so the old names no longer resolve."""
    import numpy

    # raw (unweighted) counts of the 20 amino acids; gaps excluded
    aa_counts = [0.] * 20
    for aa in col:
        if aa != '-':
            aa_counts[aa_to_index[aa]] += 1

    # indices of the amino acids actually present in the column
    dm_aas = [i for i, c in enumerate(aa_counts) if c != 0]
    dm_size = len(dm_aas)
    if dm_size == 0:
        return 0.0

    # density-matrix-like product of counts and pairwise similarities
    dm = numpy.zeros((dm_size, dm_size), dtype=numpy.float32)
    for i in range(dm_size):
        row_i = dm_aas[i]
        for j in range(dm_size):
            dm[i][j] = aa_counts[row_i] * sim_matrix[row_i][dm_aas[j]]

    ev = numpy.linalg.eigvals(dm).real

    # normalize the eigenvalue spectrum so it sums to one
    total = ev.sum()
    if total != 0:
        ev = ev / total

    # entropy of the spectrum, base 20 to normalize
    vne = 0.0
    for e in ev:
        if e > (10**-10):
            vne -= e * math.log(e) / math.log(20)

    if gap_penalty == 1:
        # NOTE: the original deliberately uses unit weights for the gap
        # penalty here (see the commented-out weighted version it replaced)
        return (1 - vne) * weighted_gap_penalty(col, [1.] * len(col))
    else:
        return 1 - vne


################################################################################
# Relative Entropy 
################################################################################

def relative_entropy(col, sim_matix, bg_distr, seq_weights, gap_penalty=1):
    """Calculate the relative entropy of the column distribution with a 
    background distribution specified in bg_distr. This is similar to the 
    approach proposed in Wang and Samudrala 06. sim_matrix is ignored."""

    distr = bg_distr[:]
    fc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT)

    # a 20-entry background has no gap slot: drop the gap count and renormalize
    if len(distr) == 20:
        trimmed = fc[:-1]
        total = sum(trimmed)
        fc = [x / total for x in trimmed]

    if len(fc) != len(distr):
        return -1

    # KL divergence of the column distribution from the background,
    # normalized by the log of the alphabet size
    d = sum(f * math.log(f / q) for f, q in zip(fc, distr) if q != 0.0)
    d /= math.log(len(fc))

    if gap_penalty == 1:
        return d * weighted_gap_penalty(col, seq_weights)
    return d



################################################################################
# Jensen-Shannon Divergence
################################################################################

def js_divergence(col, sim_matrix, bg_distr, seq_weights, gap_penalty=1):
    """ Return the Jensen-Shannon Divergence for the column with the background
    distribution bg_distr. sim_matrix is ignored. JSD is the default method."""

    distr = bg_distr[:]
    fc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT)

    # a 20-entry background has no gap slot: drop the gap count and renormalize
    if len(distr) == 20:
        trimmed = fc[:-1]
        total = sum(trimmed)
        fc = [x / total for x in trimmed]

    if len(fc) != len(distr):
        return -1

    # the mean distribution r of the column and background distributions
    r = [.5 * f + .5 * q for f, q in zip(fc, distr)]

    # JSD = (KL(fc || r) + KL(distr || r)) / 2, in bits
    d = 0.
    for f, q, m in zip(fc, distr, r):
        if m != 0.0:
            if f == 0.0:
                d += q * math.log(q / m, 2)
            elif q == 0.0:
                d += f * math.log(f / m, 2)
            else:
                d += f * math.log(f / m, 2) + q * math.log(q / m, 2)

    d /= 2

    if gap_penalty == 1:
        return d * weighted_gap_penalty(col, seq_weights)
    return d


################################################################################
# Mutation Weighted Pairwise Match
################################################################################

def sum_of_pairs(col, sim_matrix, bg_distr, seq_weights, gap_penalty=1):
    """ Sum the similarity matrix values for all pairs in the column. 
    This method is similar to those proposed in Valdar 02. bg_distr is ignored.

    Fixed: the accumulator no longer shadows the builtin sum()."""

    pair_sum = 0.
    max_sum = 0.

    # weighted similarity over all unordered non-gap pairs in the column
    for i in range(len(col)):
        for j in range(i):
            if col[i] != '-' and col[j] != '-':
                w = seq_weights[i] * seq_weights[j]
                max_sum += w
                pair_sum += w * sim_matrix[aa_to_index[col[i]]][aa_to_index[col[j]]]

    # normalize by the maximum attainable weighted pair count
    if max_sum != 0:
        pair_sum /= max_sum
    else:
        pair_sum = 0.

    if gap_penalty == 1:
        return pair_sum * weighted_gap_penalty(col, seq_weights)
    else:
        return pair_sum



################################################################################
# Window Score
################################################################################

def window_score(scores, window_len, lam=.5):
    """ This function takes a list of scores and a length and transforms them 
    so that each position is a weighted average of the surrounding positions. 
    Positions with scores less than zero are not changed and are ignored in the 
    calculation. Here window_len is interpreted to mean window_len residues on 
    either side of the current residue.

    Fixed: the accumulator no longer shadows the builtin sum()."""

    w_scores = scores[:]

    for i in range(window_len, len(scores) - window_len):
        # negative scores mark skipped (e.g. too-gappy) columns; leave them be
        if scores[i] < 0:
            continue

        total = 0.
        num_terms = 0.
        for j in range(i - window_len, i + window_len + 1):
            if i != j and scores[j] >= 0:
                num_terms += 1
                total += scores[j]

        # blend the neighborhood average with the column's own score
        if num_terms > 0:
            w_scores[i] = (1 - lam) * (total / num_terms) + lam * scores[i]

    return w_scores


def calc_z_scores(scores, score_cutoff):
    """Calculates the z-scores for a set of scores. Scores below
    score_cutoff are not included."""

    # only scores above the cutoff participate in the statistics
    kept = [s for s in scores if s > score_cutoff]

    average = 0.
    if kept:
        average = sum(kept) / len(kept)

    # population standard deviation of the kept scores
    std_dev = 0.
    if kept:
        std_dev = math.sqrt(sum((s - average) ** 2 for s in kept) / len(kept))

    # below-cutoff (or zero-variance) positions are flagged with -1000
    z_scores = []
    for s in scores:
        if s > score_cutoff and std_dev != 0:
            z_scores.append((s - average) / std_dev)
        else:
            z_scores.append(-1000.0)

    return z_scores


################################################################################
################################################################################
################################################################################
#  END CONSERVATION SCORES
################################################################################
################################################################################
################################################################################


def read_scoring_matrix(sm_file):
    """ Read in a scoring matrix from a file, e.g., blosum80.bla, and return it
    as a list of lists of floats.

    Falls back to a plain-list 20x20 identity matrix if the file cannot be
    read (the numarray identity() the original fallback used no longer
    exists, and the Py2 `except IOError, e` syntax is invalid in Python 3).
    """
    first_line = 1
    list_sm = []  # hold the matrix in list form

    try:
        with open(sm_file, 'r') as matrix_file:
            for line in matrix_file:
                if line[0] != '#' and first_line:
                    first_line = 0
                    # legacy path: populate amino_acids from the header row if
                    # it were ever empty; it is always pre-populated above
                    # (also fixed: str.lower() instead of the never-imported
                    # string.lower)
                    if len(amino_acids) == 0:
                        for c in line.split():
                            aa_to_index[c.lower()] = len(amino_acids)
                            amino_acids.append(c.lower())

                elif line[0] != '#' and first_line == 0:
                    if len(line) > 1:
                        list_sm.append(line.split())

    except IOError:
        print( "Could not load similarity matrix: %s. Using identity matrix..." % sm_file, file=sys.stderr )
        return [[1. if i == j else 0. for j in range(20)] for i in range(20)]

    # if matrix is stored in lower tri form, copy to upper
    if len(list_sm[0]) < 20:
        for i in range(0, 19):
            for j in range(i + 1, 20):
                list_sm[i].append(list_sm[j][i])

    # convert all entries from strings to floats
    for i in range(len(list_sm)):
        for j in range(len(list_sm[i])):
            list_sm[i][j] = float(list_sm[i][j])

    return list_sm

def calculate_sequence_weights(msa):
    """ Calculate the sequence weights using the Henikoff '94 method
    for the given msa. """

    num_seqs = len(msa)
    num_cols = len(msa[0])
    seq_weights = [0.] * num_seqs

    for c in range(num_cols):
        # residue counts for this column (gaps ignored)
        freq_counts = [0] * len(amino_acids)
        for s in range(num_seqs):
            if msa[s][c] != '-':
                freq_counts[aa_to_index[msa[s][c]]] += 1

        # number of distinct residue types observed in the column
        num_observed_types = sum(1 for count in freq_counts if count > 0)

        # each sequence's share of this column: 1 / (count(residue) * types)
        for s in range(num_seqs):
            denom = freq_counts[aa_to_index[msa[s][c]]] * num_observed_types
            if denom > 0:
                seq_weights[s] += 1. / denom

    # average the per-column contributions over the alignment length
    return [w / num_cols for w in seq_weights]


def load_sequence_weights(fname):
    """Read in a sequence weight file f and create sequence weight list. 
    The weights are in the same order as the sequences each on a new line.

    Returns [] when the file cannot be read; the caller then computes
    weights itself. (Fixed: Py3-compatible except syntax; file is closed
    via a with-block.)"""
    seq_weights = []

    try:
        with open(fname) as f:
            for line in f:
                parts = line.split()
                # expect "<name> <weight>" lines; '#' lines are comments
                if line[0] != '#' and len(parts) == 2:
                    seq_weights.append(float(parts[1]))
    except IOError:
        # missing weight file is normal; caller falls back to Henikoff weights
        pass

    return seq_weights

def get_column(col_num, alignment):
    """Return the col_num column of alignment as a list."""
    # sequences too short to reach col_num contribute nothing
    return [seq[col_num] for seq in alignment if col_num < len(seq)]

def get_distribution_from_file(fname):
    """ Read an amino acid distribution from a file. The probabilities should
    be on a single line separated by whitespace in alphabetical order as in 
    amino_acids above. # is the comment character.

    Returns [] (caller falls back to the BLOSUM62 background) on read error
    or when the values do not sum to ~1. (Fixed: Py3-compatible except
    syntax; map() wrapped in list() so the repeated sum() below does not
    exhaust a one-shot iterator under Python 3; file closed via with.)"""

    distribution = []
    try:
        with open(fname) as f:
            for line in f:
                if line[0] == '#': continue
                # split() ignores the trailing newline, so no slicing needed
                distribution = list(map(float, line.split()))

    except IOError as e:
        print( e, "Using default (BLOSUM62) background.", file=sys.stderr )
        return []

    # use a range to be flexible about round off
    if .997 > sum(distribution) or sum(distribution) > 1.003:
        print( "Distribution does not sum to 1. Using default (BLOSUM62) background.", file=sys.stderr )
        print( sum(distribution), file=sys.stderr )
        return []

    return distribution


def _clean_fasta_sequence(seq):
    """Uppercase seq, turn characters outside iupac_alphabet into gaps, and
    map the ambiguity codes B->D, Z->Q and X->'-' as the parser always did."""
    seq = seq.upper()
    for aa in seq:
        if aa not in iupac_alphabet:
            seq = seq.replace(aa, '-')
    return seq.replace('B', 'D').replace('Z', 'Q').replace('X', '-')


def read_fasta_alignment(filename):
    """ Read in the alignment stored in the FASTA file, filename. Return two
    lists: the identifiers and sequences.

    Fixed: the file is closed via a with-block, and only the newline is
    stripped from each line -- the old line[:-1] silently dropped the last
    residue of a file without a trailing newline."""

    names = []
    alignment = []
    cur_seq = ''

    with open(filename) as f:
        for line in f:
            line = line.rstrip('\n')
            if len(line) == 0: continue

            if line[0] == ';': continue
            if line[0] == '>':
                names.append(line[1:].replace('\r', ''))

                # a new header flushes the sequence collected so far
                if cur_seq != '':
                    alignment.append(_clean_fasta_sequence(cur_seq))
                    cur_seq = ''
            elif line[0] in iupac_alphabet:
                cur_seq += line.replace('\r', '')

    # add the last sequence
    alignment.append(_clean_fasta_sequence(cur_seq))

    return names, alignment
	
def read_clustal_alignment(filename):
    """ Read in the alignment stored in the CLUSTAL or Stockholm file, filename. Return
    two lists: the names and sequences.

    Fixed: the file is closed via a with-block, and only the newline is
    stripped from each line -- the old line[:-1] silently dropped the last
    character of a file without a trailing newline."""

    names = []
    alignment = []

    # Stockholm markup lines start with '#='
    re_stock_markup = re.compile('^#=')

    with open(filename) as f:
        for line in f:
            line = line.rstrip('\n')
            if len(line) == 0: continue
            # clustal conservation lines contain '*'
            if '*' in line: continue

            if line[0:7] == 'CLUSTAL': continue
            if line[0:11] == '# STOCKHOLM': continue
            if line[0:2] == '//': continue

            if re_stock_markup.match(line): continue

            t = line.split()

            # "<name> <residues>" rows; sequences may span multiple rows
            if len(t) == 2 and t[1][0] in iupac_alphabet:
                ali = t[1].upper().replace('B', 'D').replace('Z', 'Q').replace('X', '-').replace('\r', '').replace('.', '-')
                if t[0] not in names:
                    names.append(t[0])
                    alignment.append(ali)
                else:
                    alignment[names.index(t[0])] += ali

    return names, alignment




################################################################################
# Begin execution
################################################################################

# BLOSUM62 background distribution: 20 amino acid probabilities in the order
# of amino_acids above (no gap entry)
blosum_background_distr = [0.078, 0.051, 0.041, 0.052, 0.024, 0.034, 0.059, 0.083, 0.025, 0.062, 0.092, 0.056, 0.024, 0.044, 0.043, 0.059, 0.055, 0.014, 0.034, 0.072]

# set defaults -- each may be overridden by a command line option; see usage()
window_size = 3 # 0 = no window
win_lam = .5 # for window method linear combination
outfile_name = "" # "" = print results to stdout (-o)
s_matrix_file = "/usr/share/conservation-code/matrix/blosum62.bla" # -m
bg_distribution = blosum_background_distr[:] # -d
scoring_function = js_divergence # -s
use_seq_weights = True # -l
background_name = 'blosum62' # reported in the output header
gap_cutoff = .3 # -g: columns with a higher gap fraction score -1000
use_gap_penalty = 1 # -p
seq_specific_output = 0 # name of sequence if True (-a)
normalize_scores = False # -n: report z-scores instead of raw scores

# parse options and args -- see usage()
# (fixed: consistent 4-space indentation -- the original mixed tabs and
# spaces, which is a TabError under Python 3)
if len(sys.argv) < 2:
    usage(__file=sys.stderr)
    sys.exit(2)

try:
    opts, args = getopt.getopt(sys.argv[1:], "hl:d:m:o:s:w:g:p:b:a:n")
except getopt.GetoptError:
    usage(__file=sys.stderr)
    sys.exit(1)

if len(args) < 1:
    usage(__file=sys.stderr)
    sys.exit(1)

for opt, arg in opts:
    if opt == "-h":
        usage()
        sys.exit()
    if opt == "-o":
        outfile_name = arg
    elif opt == "-l":
        # weighting is on by default; any value containing 'false' disables it
        if 'false' in arg.lower():
            use_seq_weights = False
    elif opt == "-p":
        if 'false' in arg.lower():
            use_gap_penalty = 0
    elif opt == "-m":
        s_matrix_file = arg
    elif opt == "-d":
        d = get_distribution_from_file(arg)
        if d != []:
            bg_distribution = d
            background_name = arg
    elif opt == "-w":
        try:
            window_size = int(arg)
        except ValueError:
            print( "ERROR: Window size must be an integer. Using window_size 3...", file=sys.stderr )
            window_size = 3
    elif opt == "-b":
        try:
            win_lam = float(arg)
            if not (0. <= win_lam <= 1.): raise ValueError
        except ValueError:
            print( "ERROR: Window lambda must be a real in [0,1]. Using lambda = .5...", file=sys.stderr )
            win_lam = .5
    elif opt == "-g":
        try:
            gap_cutoff = float(arg)
            if not (0. <= gap_cutoff < 1.): raise ValueError
        except ValueError:
            print( "ERROR: Gap cutoff must be a real in [0,1). Using a gap cutoff of .3...", file=sys.stderr )
            gap_cutoff = .3
    elif opt == '-a':
        seq_specific_output = arg
    elif opt == '-n':
        normalize_scores = True
    elif opt == '-s':
        if arg == 'shannon_entropy': scoring_function = shannon_entropy
        elif arg == 'property_entropy': scoring_function = property_entropy
        elif arg == 'property_relative_entropy': scoring_function = property_relative_entropy
        elif arg == 'vn_entropy':
            scoring_function = vn_entropy
            # legacy lazy import for vn_entropy; numpy.numarray was removed
            # in numpy 1.9, so tolerate its absence instead of crashing here
            try:
                from numpy.numarray import *
                import numpy.numarray.linear_algebra as la
            except ImportError:
                print( "WARNING: numpy.numarray unavailable; vn_entropy requires numpy.", file=sys.stderr )
        elif arg == 'relative_entropy': scoring_function = relative_entropy
        elif arg == 'js_divergence': scoring_function = js_divergence
        elif arg == 'sum_of_pairs': scoring_function = sum_of_pairs
        else: print( "%s is not a valid scoring method. Using %s.\n" % (arg, scoring_function.__name__), file=sys.stderr )


align_file = args[0]
align_suffix = align_file.split('.')[-1]

s_matrix = read_scoring_matrix(s_matrix_file)

names = []
alignment = []
seq_weights = []

# try clustal/stockholm first and fall back to fasta if no names were parsed
# (fixed: Py3-compatible "except ... as e" syntax and consistent indentation)
try:
    names, alignment = read_clustal_alignment(align_file)
    if names == []:
        names, alignment = read_fasta_alignment(align_file)
except IOError as e:
    print( e, "Could not find %s. Exiting..." % align_file, file=sys.stderr )
    sys.exit(1)


if len(alignment) != len(names) or alignment == []:
    print( "Unable to parse alignment.\n", file=sys.stderr )
    sys.exit(1)

# every sequence in the alignment must have the same length
seq_len = len(alignment[0])
for i, seq in enumerate(alignment):
    if len(seq) != seq_len:
        print( "ERROR: Sequences of different lengths: %s (%d) != %s (%d).\n" % (names[0], seq_len, names[i], len(seq)), file=sys.stderr )
        sys.exit(1)


# sequence weights: load a sibling .weights file if present, otherwise
# compute Henikoff '94 weights; fall back to unit weights on any mismatch
if use_seq_weights:
    seq_weights = load_sequence_weights(align_file.replace('.%s' % align_suffix, '.weights'))
    if seq_weights == []:
        seq_weights = calculate_sequence_weights(alignment)

if len(seq_weights) != len(alignment): seq_weights = [1.] * len(alignment)

# handle print of output relative to specific sequence
ref_seq_num = None
if seq_specific_output and seq_specific_output not in names:
    print( "Sequence %s not found in alignment. Using default output format...\n" % seq_specific_output, file=sys.stderr )
    seq_specific_output = 0
elif seq_specific_output in names:
    ref_seq_num = names.index(seq_specific_output)

# calculate scores column by column; too-gappy columns are flagged with -1000
scores = []
for i in range(len(alignment[0])):
    col = get_column(i, alignment)

    if len(col) == len(alignment):
        if gap_percentage(col) <= gap_cutoff:
            scores.append(scoring_function(col, s_matrix, bg_distribution, seq_weights, use_gap_penalty))
        else:
            scores.append(-1000.)

if window_size > 0:
    scores = window_score(scores, window_size, win_lam)

if normalize_scores:
    scores = calc_z_scores(scores, -999)


# print to file/stdout
# (fixed: Py3-compatible "except ... as e" syntax, consistent indentation,
# and the output file is now closed at the end so it is flushed to disk)
try:
    if outfile_name != "":
        outfile = open(outfile_name, 'w')
        outfile.write("# %s -- %s - window_size: %d - window lambda: %.2f - background: %s - seq. weighting: %s - gap penalty: %d - normalized: %s\n" % (align_file, scoring_function.__name__, window_size, win_lam, background_name, use_seq_weights, use_gap_penalty, normalize_scores))
        if seq_specific_output:
            outfile.write("# reference sequence: %s\n" % seq_specific_output)
            outfile.write("# align_column_number\tamino acid\tscore\n")
        else:
            outfile.write("# align_column_number\tscore\tcolumn\n")
    else:
        print( "# %s -- %s - window_size: %d - background: %s - seq. weighting: %s - gap penalty: %d - normalized: %s" % (align_file, scoring_function.__name__, window_size, background_name, use_seq_weights, use_gap_penalty, normalize_scores) )
        if seq_specific_output:
            print( "# reference sequence: %s" % seq_specific_output )
            print( "# align_column_number\tamino acid\tscore\n" )
        else:
            print( "# align_column_number\tscore\tcolumn\n" )

except IOError as e:
    print( "Could not open %s for output. Printing results to standard out..." % outfile_name, file=sys.stderr )
    outfile_name = ""

for i, score in enumerate(scores):
    if seq_specific_output:
        # report only positions where the reference sequence has a residue
        cur_aa = get_column(i, alignment)[ref_seq_num]
        if cur_aa == '-': continue
        if outfile_name == "":
            print( "%d\t%s\t%.5f" % (i, cur_aa, score) )
        else:
            outfile.write("%d\t%s\t%5f\n" % (i, cur_aa, score))
    else:
        if outfile_name == "":
            print( "%d\t%.5f\t%s" % (i, score, "".join(get_column(i, alignment))) )
        else:
            outfile.write("%d\t%5f\t%s\n" % (i, score, "".join(get_column(i, alignment))))

# make sure file output is flushed to disk
if outfile_name != "":
    outfile.close()