This file is indexed.

/usr/lib/python3/dist-packages/astroML/clustering/mst_clustering.py is in python3-astroml 0.3-6.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

"""
Minimum Spanning Tree Clustering
"""
import numpy as np

from scipy import sparse
from sklearn.neighbors import kneighbors_graph

try:
    from scipy.sparse.csgraph import \
        minimum_spanning_tree, connected_components
except ImportError:
    raise ImportError("scipy v0.11 or greater required "
                      "for minimum spanning tree")


class HierarchicalClustering(object):
    """Hierarchical Clustering via Approximate Euclidean Minimum Spanning Tree

    Parameters
    ----------
    n_neighbors : int
        number of neighbors of each point used for approximate Euclidean
        minimum spanning tree (MST) algorithm.  See Notes below.
    edge_cutoff : float
        specify a fraction of edges to keep when selecting clusters.
        edge_cutoff should be between 0 and 1.
    min_cluster_size : int, optional
        specify a minimum number of points per cluster.  If not specified,
        all clusters will be kept.

    Attributes
    ----------
    X_train_ : ndarray
        the training data
    full_tree_ : sparse graph
        the full approximate Euclidean MST spanning the data
    cluster_graph_ : sparse graph
        the final (truncated) graph showing clusters
    n_components_ : int
        the number of clusters found.
    labels_ : ndarray of int
        the cluster labels for each training point.  Labels range from -1
        to n_components_ - 1: points labeled -1 are in the background (i.e.
        their clusters were smaller than min_cluster_size)

    Notes
    -----
    This routine uses an approximate Euclidean minimum spanning tree (MST)
    to perform hierarchical clustering.  A true Euclidean minimum spanning
    tree naively costs O[N^3].  Graph traversal algorithms only help so much,
    because all N^2 edges must be used as candidates.  In this approximate
    algorithm, we use k < N edges from each point, so that the cost is only
    O[Nk log(Nk)]. For k = N, the approximation is exact; in practice for
    well-behaved data sets, the result is exact for k << N.
    """
    def __init__(self, n_neighbors=20,
                 edge_cutoff=0.9,
                 min_cluster_size=1):
        self.n_neighbors = n_neighbors
        self.edge_cutoff = edge_cutoff
        self.min_cluster_size = min_cluster_size

    def fit(self, X):
        """Fit the clustering model

        Parameters
        ----------
        X : array_like
            the data to be clustered: shape = [n_samples, n_features]
        """
        X = np.asarray(X, dtype=float)

        self.X_train_ = X

        # generate a sparse graph using the k nearest neighbors of each point
        G = kneighbors_graph(X, n_neighbors=self.n_neighbors, mode='distance')

        # Compute the minimum spanning tree of this graph
        self.full_tree_ = minimum_spanning_tree(G, overwrite=True)

        # Find the cluster labels
        self.n_components_, self.labels_, self.cluster_graph_ =\
            self.compute_clusters()

        return self

    def compute_clusters(self, edge_cutoff=None, min_cluster_size=None):
        """Compute the clusters given a trained tree

        After fit() is called, this method may be called to obtain a
        clustering result with a new edge_cutoff and min_cluster_size.

        Parameters
        ----------
        edge_cutoff : float, optional
            specify a fraction of edges to keep when selecting clusters.
            edge_cutoff should be between 0 and 1.  If not specified,
            self.edge_cutoff will be used.
        min_cluster_size : int, optional
            specify a minimum number of points per cluster.  If not specified,
            self.min_cluster_size will be used.

        Returns
        -------
        n_components : int
            the number of clusters found
        labels : ndarray
            the labels of each point.  Labels range from -1 to
            n_components_ - 1: points labeled -1 are in the background
            (i.e. their clusters were smaller than min_cluster_size)
        T_trunc : sparse matrix
            the truncated minimum spanning tree
        """
        if edge_cutoff is None:
            edge_cutoff = self.edge_cutoff

        if min_cluster_size is None:
            min_cluster_size = self.min_cluster_size

        if not hasattr(self, 'full_tree_'):
            raise ValueError("must call fit() before calling "
                             "compute_clusters()")

        T_trunc = self.full_tree_.copy()

        # cut-off edges at the percentile given by edge_cutoff
        cutoff = np.percentile(T_trunc.data, 100 * edge_cutoff)
        T_trunc.data[T_trunc.data > cutoff] = 0
        T_trunc.eliminate_zeros()

        # find connected components
        n_components, labels = connected_components(T_trunc, directed=False)
        counts = np.bincount(labels)

        # for all components with fewer than min_cluster_size points,
        # set the label to background (-1), and re-label the clusters
        i_bg = np.where(counts < min_cluster_size)[0]

        for i in i_bg:
            labels[labels == i] = -1

        if len(i_bg) > 0:
            _, labels = np.unique(labels, return_inverse=True)
            labels -= 1
            n_components = labels.max() + 1

        # zero-out rows and columns corresponding to background points,
        # eliminating any link in T_trunc that touches a non-cluster point
        I = sparse.eye(len(labels), len(labels))
        I.data[0, labels < 0] = 0
        T_trunc = I * T_trunc * I

        return n_components, labels, T_trunc
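

# A minimal usage sketch (an illustrative addition, not part of the original
# astroML module).  It assumes a small synthetic 2-D dataset and arbitrary
# parameter values, and shows that after a single call to fit() the clusters
# can be recomputed with a stricter edge_cutoff, because the full MST is
# cached in ``full_tree_``.
def _example_hierarchical_clustering(random_state=0):
    """Hypothetical helper demonstrating HierarchicalClustering usage."""
    rng = np.random.RandomState(random_state)
    X = rng.normal(size=(200, 2))

    model = HierarchicalClustering(n_neighbors=10, edge_cutoff=0.9,
                                   min_cluster_size=5)
    model.fit(X)

    # re-cluster without re-fitting: only the edge threshold changes
    n_components, labels, T_trunc = model.compute_clusters(edge_cutoff=0.8)
    return n_components, labels, T_trunc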


def get_graph_segments(X, G):
    """Get graph segments for plotting a 2D graph

    Parameters
    ----------
    X : array_like
        the data, of shape [n_samples, 2]
    G : array_like or sparse graph
        the [n_samples, n_samples] matrix encoding the graph of connections
        on X

    Returns
    -------
    x_coords, y_coords : ndarrays
        the x and y coordinates for plotting the graph.  They are of size
        [2, n_links], and can be visualized using
        ``plt.plot(x_coords, y_coords, '-k')``
    """
    X = np.asarray(X)
    if (X.ndim != 2) or (X.shape[1] != 2):
        raise ValueError('shape of X should be (n_samples, 2)')

    n_samples = X.shape[0]

    # COO format exposes each edge as a (row, col) pair of point indices
    G = sparse.coo_matrix(G)
    A = X[G.row].T
    B = X[G.col].T

    x_coords = np.vstack([A[0], B[0]])
    y_coords = np.vstack([A[1], B[1]])

    return x_coords, y_coords
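

# A minimal end-to-end sketch (an illustrative addition, not part of the
# original module).  It fits the clustering model on random 2-D points,
# converts the resulting cluster graph into plottable segments with
# get_graph_segments(), and prints a summary; the segments could then be
# drawn with ``plt.plot(x_coords, y_coords, '-k')``.  The data set and
# parameter values below are arbitrary.
if __name__ == "__main__":
    rng = np.random.RandomState(42)
    X_demo = rng.normal(size=(300, 2))

    model = HierarchicalClustering(n_neighbors=10, edge_cutoff=0.9,
                                   min_cluster_size=10)
    model.fit(X_demo)

    # each column of x_coords / y_coords is one retained MST edge
    x_coords, y_coords = get_graph_segments(X_demo, model.cluster_graph_)
    print("clusters found:", model.n_components_)
    print("segment array shape:", x_coords.shape)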