/usr/include/mlpack/methods/kmeans/refined_start_impl.hpp is in libmlpack-dev 1.0.10-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | /**
* @file refined_start_impl.hpp
* @author Ryan Curtin
*
* An implementation of Bradley and Fayyad's "Refining Initial Points for
* K-Means clustering". This class is meant to provide better initial points
* for the k-means algorithm.
*
* This file is part of MLPACK 1.0.10.
*
* MLPACK is free software: you can redistribute it and/or modify it under the
* terms of the GNU Lesser General Public License as published by the Free
* Software Foundation, either version 3 of the License, or (at your option) any
* later version.
*
* MLPACK is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details (LICENSE.txt).
*
* You should have received a copy of the GNU General Public License along with
* MLPACK. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __MLPACK_METHODS_KMEANS_REFINED_START_IMPL_HPP
#define __MLPACK_METHODS_KMEANS_REFINED_START_IMPL_HPP
// In case it hasn't been included yet.
#include "refined_start.hpp"
namespace mlpack {
namespace kmeans {
//! Partition the given dataset according to Bradley and Fayyad's algorithm.
template<typename MatType>
void RefinedStart::Cluster(const MatType& data,
const size_t clusters,
arma::Col<size_t>& assignments) const
{
math::RandomSeed(std::time(NULL));
// This will hold the sampled datasets.
const size_t numPoints = size_t(percentage * data.n_cols);
MatType sampledData(data.n_rows, numPoints);
// vector<bool> is packed so each bool is 1 bit.
std::vector<bool> pointsUsed(data.n_cols, false);
arma::mat sampledCentroids(data.n_rows, samplings * clusters);
// We will use these objects repeatedly for clustering.
arma::Col<size_t> sampledAssignments;
arma::mat centroids;
KMeans<> kmeans;
for (size_t i = 0; i < samplings; ++i)
{
// First, assemble the sampled dataset.
size_t curSample = 0;
while (curSample < numPoints)
{
// Pick a random point in [0, numPoints).
size_t sample = (size_t) math::RandInt(data.n_cols);
if (!pointsUsed[sample])
{
// This point isn't used yet. So we'll put it in our sample.
pointsUsed[sample] = true;
sampledData.col(curSample) = data.col(sample);
++curSample;
}
}
// Now, using the sampled dataset, run k-means. In the case of an empty
// cluster, we re-initialize that cluster as the point furthest away from
// the cluster with maximum variance. This is not *exactly* what the paper
// implements, but it is quite similar, and we'll call it "good enough".
kmeans.Cluster(sampledData, clusters, sampledAssignments, centroids);
// Store the sampled centroids.
sampledCentroids.cols(i * clusters, (i + 1) * clusters - 1) = centroids;
pointsUsed.assign(data.n_cols, false);
}
// Now, we run k-means on the sampled centroids to get our final clusters.
kmeans.Cluster(sampledCentroids, clusters, sampledAssignments, centroids);
// Turn the final centroids into assignments.
assignments.set_size(data.n_cols);
for (size_t i = 0; i < data.n_cols; ++i)
{
// Find the closest centroid to this point.
double minDistance = std::numeric_limits<double>::infinity();
size_t closestCluster = clusters;
for (size_t j = 0; j < clusters; ++j)
{
const double distance = kmeans.Metric().Evaluate(data.col(i),
centroids.col(j));
if (distance < minDistance)
{
minDistance = distance;
closestCluster = j;
}
}
// Assign the point to its closest cluster.
assignments[i] = closestCluster;
}
}
}; // namespace kmeans
}; // namespace mlpack
#endif
|