/usr/bin/score_conservation is in conservation-code 20110309.0-3.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 | #! /usr/bin/python
################################################################################
# score_conservation.py - Copyright Tony Capra 2007 - Last Update: 03/09/11
#
# 03/09/11 - default window size = 3, as stated in help
# 06/21/09 - seq specific output now compatible with ConCavity
# 06/21/09 - numarray only included when vn_entropy is used
# 08/15/08 - added property_relative_entropy scoring method
# 08/15/08 - added equal sequence length check
# 01/07/08 - added z-score normalization option (-n)
# 01/07/08 - added seq. specific output option (-a)
# 11/30/07 - read_scoring_matrix now returns list rather than array.
# 11/30/07 - added window lambda command line option (-b)
# 07/05/07 - fixed gap penalty cutoff (<=) and error message
# 06/26/07 - fixed read_clustal_align bug
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
# -----------------------------------------------------------------------------
#
# Dependencies:
# numarray (for von Neumann entropy method) -
# http://sourceforge.net/project/showfiles.php?group_id=1369&release_id=223264
#
#
# See usage() for usage.
#
# This program supports the paper: Capra JA and Singh M.
# Predicting functionally important residues from sequence
# conservation. Bioinformatics. 23(15): 1875-1882, 2007.
# Please cite this paper if you use this code.
#
# It contains code for each method of scoring conservation that is
# evaluated: sum of pairs, weighted sum of pairs, Shannon entropy,
# Shannon entropy with property groupings (Mirny and Shakhnovich 95,
# Valdar and Thornton 01), relative entropy with property groupings
# (Williamson 95), von Neumann entropy (Caffrey et al 04), relative
# entropy (Samudrala and Wang 06), and Jensen-Shannon divergence
# (Capra and Singh 07).
#
# The code distributed with Mayrose et al 04 is used for Rate4Site. As
# of today it can be obtained from:
# http://www.tau.ac.il/~itaymay/cp/rate4site.html
#
#
# All scoring functions follow the same prototype:
#
# def score(col, sim_matrix, bg_disr, seq_weights, gap_penalty=1):
#
# - col: the column to be scored.
#
# - sim_matrix: the similarity (scoring) matrix to be used. Not all
# methods will use this parameter.
#
# - bg_distr: a list containing an amino acid probability distribution. Not
# all methods use this parameter. The default is the blosum62 background, but
# other distributions can be given.
#
# - seq_weights: an array of floats that is used to weight the contribution
# of each seqeuence. If the len(seq_weights) != len(col), then every sequence
# gets a weight of one.
#
# - gap_penalty: a binary variable: 0 for no gap penalty and 1
# for gap penalty. The default is to use a penalty. The gap penalty used is
# the score times the fraction of non-gap positions in the column.
#
#
# For a window score of any of above methods use the window_score method to
# transform the individual column scores.
#
################################################################################
from __future__ import print_function
import math, sys, getopt
import re
# numarray imported below
PSEUDOCOUNT = .0000001
amino_acids = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', '-']
iupac_alphabet = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "U", "V", "W", "Y", "Z", "X", "*", "-"]
# dictionary to map from amino acid to its row/column in a similarity matrix
aa_to_index = {}
for i, aa in enumerate(amino_acids):
aa_to_index[aa] = i
def usage( __file = sys.stdout ):
print( """\nUSAGE:\nscore_conservation [options] alignfile\n\t -alignfile must be in fasta, Stockholm or clustal format.\n\nOPTIONS:\n\t
-a\treference sequence. Print scores in reference to a specific sequence (ignoring gaps). Default prints the entire column. [sequence name]\n\t
-b\tlambda for window heuristic linear combination. Default=.5 [real in [0,1]]\n
-d\tbackground distribution file, e.g., swissprot.distribution. Default=BLOSUM62 background [filename]\n\t
-g\tgap cutoff. Do not score columns that contain more than gap cutoff fraction gaps. Default=.3 [real in [0, 1)]\n\t
-h\thelp. Print this message.\n
-l\tuse sequence weighting. Default=True [True|False]\n\t
-m\tsimilarity matrix file, e.g., matrix/blosum62.bla or .qij. Default=identity matrix [filename]\n\t
-n\tnormalize scores. Print the z-score (over the alignment) of each column raw score. Default=False\n\t
-o\tname of output file. Default=output to screen [filename]\n\t
-p\tuse gap penalty. Lower the score of columns that contain gaps. Default=True [True|False]\n\t
-s\tconservation estimation method. \n\t\tOptions: shannon_entropy, property_entropy, property_relative_entropy, vn_entropy, relative_entropy, js_divergence, sum_of_pairs. Default=js_divergence\n\t
-w\twindow size. Number of residues on either side included in the window. Default=3 [int]\n\t
""", file=__file)
################################################################################
# Frequency Count and Gap Penalty
################################################################################
def weighted_freq_count_pseudocount(col, seq_weights, pc_amount):
""" Return the weighted frequency count for a column--with pseudocount."""
# if the weights do not match, use equal weight
if len(seq_weights) != len(col):
seq_weights = [1.] * len(col)
aa_num = 0
freq_counts = len(amino_acids)*[pc_amount] # in order defined by amino_acids
for aa in amino_acids:
for j in range(len(col)):
if col[j] == aa:
freq_counts[aa_num] += 1 * seq_weights[j]
aa_num += 1
freqsum = (sum(seq_weights) + len(amino_acids) * pc_amount)
for j in range(len(freq_counts)):
freq_counts[j] = freq_counts[j] / freqsum
return freq_counts
def weighted_gap_penalty(col, seq_weights):
""" Calculate the simple gap penalty multiplier for the column. If the
sequences are weighted, the gaps, when penalized, are weighted
accordingly. """
# if the weights do not match, use equal weight
if len(seq_weights) != len(col):
seq_weights = [1.] * len(col)
gap_sum = 0.
for i in range(len(col)):
if col[i] == '-':
gap_sum += seq_weights[i]
return 1 - (gap_sum / sum(seq_weights))
def gap_percentage(col):
"""Return the percentage of gaps in col."""
num_gaps = 0.
for aa in col:
if aa == '-': num_gaps += 1
return num_gaps / len(col)
################################################################################
# Shannon Entropy
################################################################################
def shannon_entropy(col, sim_matrix, bg_distr, seq_weights, gap_penalty=1):
"""Calculates the Shannon entropy of the column col. sim_matrix and
bg_distr are ignored. If gap_penalty == 1, then gaps are penalized. The
entropy will be between zero and one because of its base. See p.13 of
Valdar 02 for details. The information score 1 - h is returned for the sake
of consistency with other scores."""
fc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT)
h = 0.
for i in range(len(fc)):
if fc[i] != 0:
h += fc[i] * math.log(fc[i])
# h /= math.log(len(fc))
h /= math.log(min(len(fc), len(col)))
inf_score = 1 - (-1 * h)
if gap_penalty == 1:
return inf_score * weighted_gap_penalty(col, seq_weights)
else:
return inf_score
################################################################################
# Property Entropy
################################################################################
def property_entropy(col, sim_matrix, bg_distr, seq_weights, gap_penalty=1):
"""Calculate the entropy of a column col relative to a partition of the
amino acids. Similar to Mirny '99. sim_matrix and bg_distr are ignored, but
could be used to define the sets. """
# Mirny and Shakn. '99
property_partition = [['A','V','L','I','M','C'], ['F','W','Y','H'], ['S','T','N','Q'], ['K','R'], ['D', 'E'], ['G', 'P'], ['-']]
# Williamson '95
# property_partition = [['V','L', 'I','M'], ['F','W','Y'], ['S','T'], ['N','Q'], ['H','K','R'], ['D','E'], ['A','G'], ['P'], ['C'], ['-']]
fc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT)
# sum the aa frequencies to get the property frequencies
prop_fc = [0.] * len(property_partition)
for p in range(len(property_partition)):
for aa in property_partition[p]:
prop_fc[p] += fc[aa_to_index[aa]]
h = 0.
for i in range(len(prop_fc)):
if prop_fc[i] != 0:
h += prop_fc[i] * math.log(prop_fc[i])
h /= math.log(min(len(property_partition), len(col)))
inf_score = 1 - (-1 * h)
if gap_penalty == 1:
return inf_score * weighted_gap_penalty(col, seq_weights)
else:
return inf_score
################################################################################
# Property Relative Entropy
################################################################################
def property_relative_entropy(col, sim_matrix, bg_distr, seq_weights, gap_penalty=1):
"""Calculate the relative entropy of a column col relative to a
partition of the amino acids. Similar to Williamson '95. sim_matrix is
ignored, but could be used to define the sets. See shannon_entropy()
for more general info. """
# Mirny and Shakn. '99
#property_partition = [['A','V','L','I','M','C'], ['F','W','Y','H'], ['S','T','N','Q'], ['K','R'], ['D', 'E'], ['G', 'P'], ['-']]
# Williamson '95
property_partition = [['V','L', 'I','M'], ['F','W','Y'], ['S','T'], ['N','Q'], ['H','K','R'], ['D','E'], ['A','G'], ['P'], ['C']]
prop_bg_freq = []
if len(bg_distr) == len(property_partition):
prop_bg_freq = bg_distr
else:
prop_bg_freq = [0.248, 0.092, 0.114, 0.075, 0.132, 0.111, 0.161, 0.043, 0.024, 0.000] # from BL62
#fc = weighted_freq_count_ignore_gaps(col, seq_weights)
fc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT)
# sum the aa frequencies to get the property frequencies
prop_fc = [0.] * len(property_partition)
for p in range(len(property_partition)):
for aa in property_partition[p]:
prop_fc[p] += fc[aa_to_index[aa]]
d = 0.
for i in range(len(prop_fc)):
if prop_fc[i] != 0 and prop_bg_freq[i] != 0:
d += prop_fc[i] * math.log(prop_fc[i] / prop_bg_freq[i], 2)
if gap_penalty == 1:
return d * weighted_gap_penalty(col, seq_weights)
else:
return d
################################################################################
# von Neumann Entropy
################################################################################
def vn_entropy(col, sim_matrix, bg_distr, seq_weights, gap_penalty=1):
""" Calculate the von Neuman Entropy as described in Caffrey et al. 04.
This code was adapted from the implementation found in the PFAAT project
available on SourceForge. bg_distr is ignored."""
aa_counts = [0.] * 20
for aa in col:
if aa != '-': aa_counts[aa_to_index[aa]] += 1
dm_size = 0
dm_aas = []
for i in range(len(aa_counts)):
if aa_counts[i] != 0:
dm_aas.append(i)
dm_size += 1
if dm_size == 0: return 0.0
row_i = 0
col_i = 0
dm = zeros((dm_size, dm_size), Float32)
for i in range(dm_size):
row_i = dm_aas[i]
for j in range(dm_size):
col_i = dm_aas[j]
dm[i][j] = aa_counts[row_i] * sim_matrix[row_i][col_i]
ev = la.eigenvalues(dm).real
temp = 0.
for e in ev:
temp += e
if temp != 0:
for i in range(len(ev)):
ev[i] = ev[i] / temp
vne = 0.0
for e in ev:
if e > (10**-10):
vne -= e * math.log(e) / math.log(20)
if gap_penalty == 1:
#return (1-vne) * weighted_gap_penalty(col, seq_weights)
return (1-vne) * weighted_gap_penalty(col, [1.] * len(col))
else:
return 1 - vne
################################################################################
# Relative Entropy
################################################################################
def relative_entropy(col, sim_matix, bg_distr, seq_weights, gap_penalty=1):
"""Calculate the relative entropy of the column distribution with a
background distribution specified in bg_distr. This is similar to the
approach proposed in Wang and Samudrala 06. sim_matrix is ignored."""
distr = bg_distr[:]
fc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT)
# remove gap count
if len(distr) == 20:
new_fc = fc[:-1]
s = sum(new_fc)
for i in range(len(new_fc)):
new_fc[i] = new_fc[i] / s
fc = new_fc
if len(fc) != len(distr): return -1
d = 0.
for i in range(len(fc)):
if distr[i] != 0.0:
d += fc[i] * math.log(fc[i]/distr[i])
d /= math.log(len(fc))
if gap_penalty == 1:
return d * weighted_gap_penalty(col, seq_weights)
else:
return d
################################################################################
# Jensen-Shannon Divergence
################################################################################
def js_divergence(col, sim_matrix, bg_distr, seq_weights, gap_penalty=1):
""" Return the Jensen-Shannon Divergence for the column with the background
distribution bg_distr. sim_matrix is ignored. JSD is the default method."""
distr = bg_distr[:]
fc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT)
# if background distrubtion lacks a gap count, remove fc gap count
if len(distr) == 20:
new_fc = fc[:-1]
s = sum(new_fc)
for i in range(len(new_fc)):
new_fc[i] = new_fc[i] / s
fc = new_fc
if len(fc) != len(distr): return -1
# make r distriubtion
r = []
for i in range(len(fc)):
r.append(.5 * fc[i] + .5 * distr[i])
d = 0.
for i in range(len(fc)):
if r[i] != 0.0:
if fc[i] == 0.0:
d += distr[i] * math.log(distr[i]/r[i], 2)
elif distr[i] == 0.0:
d += fc[i] * math.log(fc[i]/r[i], 2)
else:
d += fc[i] * math.log(fc[i]/r[i], 2) + distr[i] * math.log(distr[i]/r[i], 2)
# d /= 2 * math.log(len(fc))
d /= 2
if gap_penalty == 1:
return d * weighted_gap_penalty(col, seq_weights)
else:
return d
################################################################################
# Mutation Weighted Pairwise Match
################################################################################
def sum_of_pairs(col, sim_matrix, bg_distr, seq_weights, gap_penalty=1):
""" Sum the similarity matrix values for all pairs in the column.
This method is similar to those proposed in Valdar 02. bg_distr is ignored."""
sum = 0.
max_sum = 0.
for i in range(len(col)):
for j in range(i):
if col[i] != '-' and col[j] != '-':
max_sum += seq_weights[i] * seq_weights[j]
sum += seq_weights[i] * seq_weights[j] * sim_matrix[aa_to_index[col[i]]][aa_to_index[col[j]]]
if max_sum != 0:
sum /= max_sum
else:
sum = 0.
if gap_penalty == 1:
return sum * weighted_gap_penalty(col, seq_weights)
else:
return sum
################################################################################
# Window Score
################################################################################
def window_score(scores, window_len, lam=.5):
""" This function takes a list of scores and a length and transforms them
so that each position is a weighted average of the surrounding positions.
Positions with scores less than zero are not changed and are ignored in the
calculation. Here window_len is interpreted to mean window_len residues on
either side of the current residue. """
w_scores = scores[:]
for i in range(window_len, len(scores) - window_len):
if scores[i] < 0:
continue
sum = 0.
num_terms = 0.
for j in range(i - window_len, i + window_len + 1):
if i != j and scores[j] >= 0:
num_terms += 1
sum += scores[j]
if num_terms > 0:
w_scores[i] = (1 - lam) * (sum / num_terms) + lam * scores[i]
return w_scores
def calc_z_scores(scores, score_cutoff):
"""Calculates the z-scores for a set of scores. Scores below
score_cutoff are not included."""
average = 0.
std_dev = 0.
z_scores = []
num_scores = 0
for s in scores:
if s > score_cutoff:
average += s
num_scores += 1
if num_scores != 0:
average /= num_scores
for s in scores:
if s > score_cutoff:
std_dev += ((s - average)**2) / num_scores
std_dev = math.sqrt(std_dev)
for s in scores:
if s > score_cutoff and std_dev != 0:
z_scores.append((s-average)/std_dev)
else:
z_scores.append(-1000.0)
return z_scores
################################################################################
################################################################################
################################################################################
# END CONSERVATION SCORES
################################################################################
################################################################################
################################################################################
def read_scoring_matrix(sm_file):
""" Read in a scoring matrix from a file, e.g., blosum80.bla, and return it
as an array. """
aa_index = 0
first_line = 1
row = []
list_sm = [] # hold the matrix in list form
try:
matrix_file = open(sm_file, 'r')
for line in matrix_file:
if line[0] != '#' and first_line:
first_line = 0
if len(amino_acids) == 0:
for c in line.split():
aa_to_index[string.lower(c)] = aa_index
amino_acids.append(string.lower(c))
aa_index += 1
elif line[0] != '#' and first_line == 0:
if len(line) > 1:
row = line.split()
list_sm.append(row)
except IOError, e:
print( "Could not load similarity matrix: %s. Using identity matrix..." % sm_file, file=sys.stderr )
return identity(20)
# if matrix is stored in lower tri form, copy to upper
if len(list_sm[0]) < 20:
for i in range(0,19):
for j in range(i+1, 20):
list_sm[i].append(list_sm[j][i])
for i in range(len(list_sm)):
for j in range(len(list_sm[i])):
list_sm[i][j] = float(list_sm[i][j])
return list_sm
#sim_matrix = array(list_sm, type=Float32)
#return sim_matrix
def calculate_sequence_weights(msa):
""" Calculate the sequence weights using the Henikoff '94 method
for the given msa. """
seq_weights = [0.] * len(msa)
for i in range(len(msa[0])):
freq_counts = [0] * len(amino_acids)
col = []
for j in range(len(msa)):
if msa[j][i] != '-': # ignore gaps
freq_counts[aa_to_index[msa[j][i]]] += 1
num_observed_types = 0
for j in range(len(freq_counts)):
if freq_counts[j] > 0: num_observed_types +=1
for j in range(len(msa)):
d = freq_counts[aa_to_index[msa[j][i]]] * num_observed_types
if d > 0:
seq_weights[j] += 1. / d
for w in range(len(seq_weights)):
seq_weights[w] /= len(msa[0])
return seq_weights
def load_sequence_weights(fname):
"""Read in a sequence weight file f and create sequence weight list.
The weights are in the same order as the sequences each on a new line. """
seq_weights = []
try:
f = open(fname)
for line in f:
l = line.split()
if line[0] != '#' and len(l) == 2:
seq_weights.append(float(l[1]))
except IOError, e:
pass
#print "No sequence weights. Can't find: ", fname
return seq_weights
def get_column(col_num, alignment):
"""Return the col_num column of alignment as a list."""
col = []
for seq in alignment:
if col_num < len(seq): col.append(seq[col_num])
return col
def get_distribution_from_file(fname):
""" Read an amino acid distribution from a file. The probabilities should
be on a single line separated by whitespace in alphabetical order as in
amino_acids above. # is the comment character."""
distribution = []
try:
f = open(fname)
for line in f:
if line[0] == '#': continue
line = line[:-1]
distribution = line.split()
distribution = map(float, distribution)
except IOError, e:
print( e, "Using default (BLOSUM62) background.", file=sys.stderr )
return []
# use a range to be flexible about round off
if .997 > sum(distribution) or sum(distribution) > 1.003:
print( "Distribution does not sum to 1. Using default (BLOSUM62) background.", file=sys.stderr )
print( sum(distribution), file=sys.stderr )
return []
return distribution
def read_fasta_alignment(filename):
""" Read in the alignment stored in the FASTA file, filename. Return two
lists: the identifiers and sequences. """
f = open(filename)
names = []
alignment = []
cur_seq = ''
for line in f:
line = line[:-1]
if len(line) == 0: continue
if line[0] == ';': continue
if line[0] == '>':
names.append(line[1:].replace('\r', ''))
if cur_seq != '':
cur_seq = cur_seq.upper()
for i, aa in enumerate(cur_seq):
if aa not in iupac_alphabet:
cur_seq = cur_seq.replace(aa, '-')
alignment.append(cur_seq.replace('B', 'D').replace('Z', 'Q').replace('X', '-'))
cur_seq = ''
elif line[0] in iupac_alphabet:
cur_seq += line.replace('\r', '')
# add the last sequence
cur_seq = cur_seq.upper()
for i, aa in enumerate(cur_seq):
if aa not in iupac_alphabet:
cur_seq = cur_seq.replace(aa, '-')
alignment.append(cur_seq.replace('B', 'D').replace('Z', 'Q').replace('X', '-'))
return names, alignment
def read_clustal_alignment(filename):
""" Read in the alignment stored in the CLUSTAL or Stockholm file, filename. Return
two lists: the names and sequences. """
names = []
alignment = []
re_stock_markup = re.compile('^#=')
f = open(filename)
for line in f:
line = line[:-1]
if len(line) == 0: continue
if '*' in line: continue
if line[0:7] == 'CLUSTAL': continue
if line[0:11] == '# STOCKHOLM': continue
if line[0:2] == '//': continue
if re_stock_markup.match(line): continue
t = line.split()
if len(t) == 2 and t[1][0] in iupac_alphabet:
ali = t[1].upper().replace('B', 'D').replace('Z', 'Q').replace('X', '-').replace('\r', '').replace('.', '-')
if t[0] not in names:
names.append(t[0])
alignment.append(ali)
else:
alignment[names.index(t[0])] += ali
return names, alignment
################################################################################
# Begin execution
################################################################################
# BLOSUM62 background distribution
blosum_background_distr = [0.078, 0.051, 0.041, 0.052, 0.024, 0.034, 0.059, 0.083, 0.025, 0.062, 0.092, 0.056, 0.024, 0.044, 0.043, 0.059, 0.055, 0.014, 0.034, 0.072]
# set defaults
window_size = 3 # 0 = no window
win_lam = .5 # for window method linear combination
outfile_name = ""
s_matrix_file = "/usr/share/conservation-code/matrix/blosum62.bla"
bg_distribution = blosum_background_distr[:]
scoring_function = js_divergence
use_seq_weights = True
background_name = 'blosum62'
gap_cutoff = .3
use_gap_penalty = 1
seq_specific_output = 0 # name of sequence if True
normalize_scores = False
# parse options and args -- see usage()
if len(sys.argv) < 2:
usage(__file=sys.stderr)
sys.exit(2)
try:
opts, args = getopt.getopt(sys.argv[1:], "hl:d:m:o:s:w:g:p:b:a:n")
except getopt.GetoptError:
usage(__file=sys.stderr)
sys.exit(1)
if len(args) < 1:
usage(__file=sys.stderr)
sys.exit(1)
for opt, arg in opts:
if opt == "-h":
usage()
sys.exit()
if opt == "-o":
outfile_name = arg
elif opt == "-l":
if 'false' in arg.lower():
use_seq_weights = False
elif opt == "-p":
if 'false' in arg.lower():
use_gap_penalty = 0
elif opt == "-m":
s_matrix_file = arg
elif opt == "-d":
d = get_distribution_from_file(arg)
if d != []:
bg_distribution = d
background_name = arg
elif opt == "-w":
try:
window_size = int(arg)
except ValueError:
print( "ERROR: Window size must be an integer. Using window_size 3...", file=sys.stderr )
window_size = 3
elif opt == "-b":
try:
win_lam = float(arg)
if not (0. <= win_lam <= 1.): raise ValueError
except ValueError:
print( "ERROR: Window lambda must be a real in [0,1]. Using lambda = .5...", file=sys.stderr )
win_lam = .5
elif opt == "-g":
try:
gap_cutoff = float(arg)
if not (0. <= gap_cutoff < 1.): raise ValueError
except ValueError:
print( "ERROR: Gap cutoff must be a real in [0,1). Using a gap cutoff of .3...", file=sys.stderr )
gap_cutoff = .3
elif opt == '-a':
seq_specific_output = arg
elif opt == '-n':
normalize_scores = True
elif opt == '-s':
if arg == 'shannon_entropy': scoring_function = shannon_entropy
elif arg == 'property_entropy': scoring_function = property_entropy
elif arg == 'property_relative_entropy': scoring_function = property_relative_entropy
elif arg == 'vn_entropy': scoring_function = vn_entropy; from numpy.numarray import *; import numpy.numarray.linear_algebra as la
elif arg == 'relative_entropy': scoring_function = relative_entropy
elif arg == 'js_divergence': scoring_function = js_divergence
elif arg == 'sum_of_pairs': scoring_function = sum_of_pairs
else: print( "%s is not a valid scoring method. Using %s.\n" % (arg, scoring_function.__name__), file=sys.stderr )
align_file = args[0]
align_suffix = align_file.split('.')[-1]
s_matrix = read_scoring_matrix(s_matrix_file)
names = []
alignment = []
seq_weights = []
try:
names, alignment = read_clustal_alignment(align_file)
if names == []:
names, alignment = read_fasta_alignment(align_file)
except IOError, e:
print( e, "Could not find %s. Exiting..." % align_file, file=sys.stderr )
sys.exit(1)
if len(alignment) != len(names) or alignment == []:
print( "Unable to parse alignment.\n", file=sys.stderr )
sys.exit(1)
seq_len = len(alignment[0])
for i, seq in enumerate(alignment):
if len(seq) != seq_len:
print( "ERROR: Sequences of different lengths: %s (%d) != %s (%d).\n" % (names[0], seq_len, names[i], len(seq)), file=sys.stderr )
sys.exit(1)
if use_seq_weights:
seq_weights = load_sequence_weights(align_file.replace('.%s' % align_suffix, '.weights'))
if seq_weights == []:
seq_weights = calculate_sequence_weights(alignment)
if len(seq_weights) != len(alignment): seq_weights = [1.] * len(alignment)
# handle print of output relative to specific sequence
ref_seq_num = None
if seq_specific_output and seq_specific_output not in names:
print( "Sequence %s not found in alignment. Using default output format...\n" % seq_specific_output, file=sys.stderr )
seq_specific_output = 0
elif seq_specific_output in names:
ref_seq_num = names.index(seq_specific_output)
# calculate scores
scores = []
for i in range(len(alignment[0])):
col = get_column(i, alignment)
if len(col) == len(alignment):
if gap_percentage(col) <= gap_cutoff:
scores.append(scoring_function(col, s_matrix, bg_distribution, seq_weights, use_gap_penalty))
else:
scores.append(-1000.)
if window_size > 0:
scores = window_score(scores, window_size, win_lam)
if normalize_scores:
scores = calc_z_scores(scores, -999)
# print to file/stdout
try:
if outfile_name != "":
outfile = open(outfile_name, 'w')
outfile.write("# %s -- %s - window_size: %d - window lambda: %.2f - background: %s - seq. weighting: %s - gap penalty: %d - normalized: %s\n" % (align_file, scoring_function.__name__, window_size, win_lam, background_name, use_seq_weights, use_gap_penalty, normalize_scores))
if seq_specific_output:
outfile.write("# reference sequence: %s\n" % seq_specific_output)
outfile.write("# align_column_number\tamino acid\tscore\n")
else:
outfile.write("# align_column_number\tscore\tcolumn\n")
else:
print( "# %s -- %s - window_size: %d - background: %s - seq. weighting: %s - gap penalty: %d - normalized: %s" % (align_file, scoring_function.__name__, window_size, background_name, use_seq_weights, use_gap_penalty, normalize_scores) )
if seq_specific_output:
print( "# reference sequence: %s" % seq_specific_output )
print( "# align_column_number\tamino acid\tscore\n" )
else:
print( "# align_column_number\tscore\tcolumn\n" )
except IOError, e:
print( "Could not open %s for output. Printing results to standard out..." % outfile_name, file=sys.stderr )
outfile_name = ""
for i, score in enumerate(scores):
if seq_specific_output:
cur_aa = get_column(i, alignment)[ref_seq_num]
if cur_aa == '-': continue
if outfile_name == "":
print( "%d\t%s\t%.5f" % (i, cur_aa, score) )
else:
outfile.write("%d\t%s\t%5f\n" % (i, cur_aa, score))
else:
if outfile_name == "":
print( "%d\t%.5f\t%s" % (i, score, "".join(get_column(i, alignment))) )
else:
outfile.write("%d\t%5f\t%s\n" % (i, score, "".join(get_column(i, alignment))))
|