This file is indexed.

/usr/share/pyshared/cogent/cluster/UPGMA.py is in python-cogent 1.5.1-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#usr/bin/env python
"""Functions to cluster using UPGMA

upgma takes an dictionary of pair tuples mapped to distances as input. 

UPGMA_cluster takes an array and a list of PhyloNode objects corresponding 
to the array as input. Can also generate this type of input from a Dict2D using
inputs_from_dict2D function.

Both return a PhyloNode object of the UPGMA cluster
"""

from numpy import array, ravel, argmin, take, sum, average, ma, diag
from cogent.core.tree import PhyloNode
from cogent.util.dict2d import Dict2D

__author__ = "Catherine Lozupone"
__copyright__ = "Copyright 2007-2011, The Cogent Project"
__credits__ = ["Catherine Lozuopone", "Rob Knight", "Peter Maxwell"]
__license__ = "GPL"
__version__ = "1.5.1"
__maintainer__ = "Catherine Lozupone"
__email__ = "lozupone@colorado.edu"
__status__ = "Production"

import numpy
numerictypes = numpy.core.numerictypes.sctype2char
Float = numerictypes(float)
BIG_NUM = 1e305

def upgma(pairwise_distances):
    """Uses the UPGMA algorithm to cluster sequences

    pairwise_distances: a dictionary with pair tuples mapped to a distance
    returns a PhyloNode object of the UPGMA cluster
    """
    items_in_matrix = []
    for i in pairwise_distances:
        if i[0] not in items_in_matrix:
            items_in_matrix.append(i[0])
        if i[1] not in items_in_matrix:
            items_in_matrix.append(i[1])
    dict2d_input = [(i[0], i[1], pairwise_distances[i]) for i in \
            pairwise_distances]
    dict2d_input.extend([(i[1], i[0], pairwise_distances[i]) for i in \
            pairwise_distances])
    dict2d_input = Dict2D(dict2d_input, RowOrder=items_in_matrix, \
            ColOrder=items_in_matrix, Pad=True, Default=BIG_NUM)
    matrix_a, node_order = inputs_from_dict2D(dict2d_input)
    tree = UPGMA_cluster(matrix_a, node_order, BIG_NUM)
    index = 0
    for node in tree.traverse():
        if not node.Parent:
            node.Name = 'root'
        elif not node.Name:
            node.Name = 'edge.' + str(index)
            index += 1
    return tree

def find_smallest_index(matrix):
    """returns the index of the smallest element in a numpy array
    
    for UPGMA clustering elements on the diagonal should first be
    substituted with a very large number so that they are always 
    larger than the rest if the values in the array."""
    #get the shape of the array as a tuple (e.g. (3,3))
    shape = matrix.shape
    #turn into a 1 by x array and get the index of the lowest number
    matrix1D = ravel(matrix)
    lowest_index = argmin(matrix1D)
    #convert the lowest_index derived from matrix1D to one for the original
    #square matrix and return
    row_len = shape[0]
    return divmod(lowest_index, row_len)
   
def condense_matrix(matrix, smallest_index, large_value):
    """converges the rows and columns indicated by smallest_index
    
    Smallest index is returned from find_smallest_index.
    For both the rows and columns, the values for the two indices are
    averaged. The resulting vector replaces the first index in the array
    and the second index is replaced by an array with large numbers so that
    it is never chosen again with find_smallest_index.
    """
    first_index, second_index = smallest_index
    #get the rows and make a new vector that has their average
    rows = take(matrix, smallest_index, 0)
    new_vector = average(rows, 0)
    #replace info in the row and column for first index with new_vector
    matrix[first_index] = new_vector
    matrix[:, first_index] = new_vector
    #replace the info in the row and column for the second index with 
    #high numbers so that it is ignored
    matrix[second_index] = large_value
    matrix[:, second_index] = large_value
    return matrix

def condense_node_order(matrix, smallest_index, node_order):
    """condenses two nodes in node_order based on smallest_index info
    
    This function is used to create a tree while condensing a matrix
    with the condense_matrix function. The smallest_index is retrieved
    with find_smallest_index. The first index is replaced with a node object
    that combines the two nodes corresponding to the indices in node order.
    The second index in smallest_index is replaced with None.
    Also sets the branch length of the nodes to 1/2 of the distance between
    the nodes in the matrix"""
    index1, index2 = smallest_index
    node1 = node_order[index1]
    node2 = node_order[index2]
    #get the distance between the nodes and assign 1/2 the distance to the
    #Length property of each node
    distance = matrix[index1, index2]
    nodes = [node1,node2]
    d = distance/2.0
    for n in nodes:
        if n.Children:
            n.Length = d - n.Children[0].TipLength
        else:
            n.Length = d
        n.TipLength = d
    #combine the two nodes into a new PhyloNode object
    new_node = PhyloNode()
    new_node.Children.append(node1)
    new_node.Children.append(node2)
    node1.Parent = new_node
    node2.Parent = new_node
    #replace the object at index1 with the combined node
    node_order[index1] = new_node
    #replace the object at index2 with None
    node_order[index2] = None
    return node_order

def UPGMA_cluster(matrix, node_order, large_number):
    """cluster with UPGMA
    
    matrix is a numpy array.
    node_order is a list of PhyloNode objects corresponding to the matrix.
    large_number will be assigned to the matrix during the process and
    should be much larger than any value already in the matrix.
    
    WARNING: Changes matrix in-place.
    WARNING: Expects matrix to already have diagonals assigned to large_number
             before this function is called.
    """
    num_entries = len(node_order)
    tree = None
    for i in range(num_entries - 1):
        smallest_index = find_smallest_index(matrix)
        index1, index2 = smallest_index
        #if smallest_index is on the diagonal set the diagonal to large_number
        if index1 == index2:
            matrix[diag([True]*len(matrix))] = large_number
            smallest_index = find_smallest_index(matrix)
        row_order = condense_node_order(matrix, smallest_index, \
                node_order)
        matrix = condense_matrix(matrix, smallest_index, large_number)
        tree = node_order[smallest_index[0]]
    return tree

def inputs_from_dict2D(dict2d_matrix):
    """makes inputs for UPGMA_cluster from a Dict2D object
    
    Dict2D object is a distance matrix with labeled Rows. The diagonal
    elements should have a very large positive number assigned (e.g.1e305).
    
    The returned array is a numpy array with the distances.
    PhyloNode_order is a list of PhyloNode objects with the Data property
    assigned from the Dict2D Row order.
    """
    matrix_lists = list(dict2d_matrix.Rows)
    matrix = array(matrix_lists, Float)
    row_order = dict2d_matrix.RowOrder
    PhyloNode_order = []
    for i in row_order:
        PhyloNode_order.append(PhyloNode(Name=i))
    return matrix, PhyloNode_order