/usr/share/octave/packages/statistics-1.2.3/kmeans.m is in octave-statistics 1.2.3-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
## Copyright (C) 2011 Soren Hauberg <soren@hauberg.org>
## Copyright (C) 2012 Daniel Ward <dwa012@gmail.com>
##
## This program is free software; you can redistribute it and/or modify it under
## the terms of the GNU General Public License as published by the Free Software
## Foundation; either version 3 of the License, or (at your option) any later
## version.
##
## This program is distributed in the hope that it will be useful, but WITHOUT
## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
## details.
##
## You should have received a copy of the GNU General Public License along with
## this program; if not, see <http://www.gnu.org/licenses/>.
## -*- texinfo -*-
## @deftypefn {Function File} {[@var{idx}, @var{centers}, @var{sumd}, @var{dist}] =} kmeans (@var{data}, @var{k}, @var{param1}, @var{value1}, @dots{})
## K-means clustering.
##
## Partition the rows of the real matrix @var{data} (one sample per row) into
## @var{k} clusters by alternating between assigning every sample to its
## nearest centroid (squared Euclidean distance) and recomputing each centroid
## as the mean of its members.  Iteration stops once the decrease of the total
## cost between two passes is no larger than 0.001.
##
## The following name/value options are recognised (names are
## case-insensitive):
##
## @table @asis
## @item @code{"emptyaction"}
## What to do when a cluster loses all of its members: @code{"error"}
## (default) raises an error, @code{"singleton"} moves the sample selected by
## the internal farthest-sample helper into the empty cluster.
##
## @item @code{"start"}
## How the initial centroids are chosen: @code{"sample"} (default) picks
## @var{k} distinct rows of @var{data} at random; a real numeric matrix with
## @var{k} rows and as many columns as @var{data} is used directly as the
## initial centroids.
## @end table
##
## Outputs: @var{idx} is the column vector of cluster numbers assigned to each
## sample, @var{centers} holds one centroid per row, @var{sumd} is the scalar
## sum over all samples of the squared distance to the assigned centroid, and
## @var{dist} contains the squared distance of every sample (rows) to every
## centroid (columns) as computed at the start of the final iteration, i.e.
## before the last centroid update.
##
## @seealso{linkage}
## @end deftypefn
function [classes, centers, sumd, D] = kmeans (data, k, varargin)

  if (nargin < 2)
    print_usage ();
  endif

  ## Input checking, validate the matrix and k *before* they are used to
  ## size any array
  if (! isnumeric (data) || ! ismatrix (data) || ! isreal (data))
    error ("kmeans: first input argument must be a DxN real data matrix");
  elseif (! isscalar (k))
    error ("kmeans: second input argument must be a scalar");
  endif

  ## number of samples and number of coordinates per sample
  nRows = rows (data);
  nCols = columns (data);

  ## k clusters need k distinct samples to start from
  if (! isreal (k) || k < 1 || k != fix (k) || k > nRows)
    error ("kmeans: second input argument must be a positive integer no greater than the number of samples");
  endif

  ## defaults for options
  emptyaction = "error";
  start = "sample";

  ## parseparams splits the extra arguments at the first string: everything
  ## before it lands in REG, the name/value list in PROP
  [reg, prop] = parseparams (varargin);

  if (! isempty (reg))
    error ("kmeans: unexpected argument before the first option name");
  elseif (mod (numel (prop), 2) != 0)
    error ("kmeans: options must be given as name/value pairs");
  endif

  ## walk over every name/value pair instead of assuming that
  ## "emptyaction" is always present
  for p = 1:2:numel (prop)
    name = prop{p};
    value = prop{p+1};
    if (! ischar (name))
      error ("kmeans: option names must be strings");
    endif
    switch (lower (name))
      case "emptyaction"
        if (! ischar (value))
          error ("kmeans: unsupported empty cluster action parameter");
        endif
        emptyaction = lower (value);
      case "start"
        start = value;
      otherwise
        ## unknown options used to be silently skipped; keep accepting
        ## them but tell the caller they have no effect
        warning ("kmeans: ignoring unsupported option '%s'", name);
    endswitch
  endfor

  ## check the 'emptyaction' property
  if (! any (strcmp (emptyaction, {"error", "singleton"})))
    error ("kmeans: unsupported empty cluster action parameter");
  endif

  ## check the 'start' property and build the initial centroids
  if (ischar (start))
    switch (lower (start))
      case "sample"
        idx = randperm (nRows) (1:k);
        centers = data (idx, :);
      otherwise
        error ("kmeans: unsupported initial clustering parameter");
    endswitch
  elseif (isnumeric (start) && isreal (start)
          && isequal (size (start), [k, nCols]))
    ## keep the centroids in the class of the data so that later
    ## assignments of the cluster means are not truncated
    centers = cast (start, class (data));
  else
    error ("kmeans: unsupported initial clustering parameter");
  endif

  ## used to hold the distances from each sample to each class
  D = zeros (nRows, k);
  ## used for convergence of the centroids
  err = 1;
  ## initial sum of distances
  sumd = Inf;

  ## Run the algorithm
  while (err > .001)
    ## Compute distances
    for i = 1:k
      D(:, i) = sumsq (data - repmat (centers(i, :), nRows, 1), 2);
    endfor

    ## Classify
    [tmp, classes] = min (D, [], 2);

    ## Calculate new centroids
    for i = 1:k
      ## Get binary vector indicating membership in cluster i
      membership = (classes == i);

      ## Check for empty clusters
      if (! any (membership))
        switch (emptyaction)
          ## if 'singleton', then find the point that is the
          ## farthest and add it to the empty cluster
          case "singleton"
            idx = maxCostSampleIndex (data, centers(i, :));
            classes(idx) = i;
            ## assign a logical so the mask stays logical
            membership(idx) = true;
          ## if 'error' then throw the error
          otherwise
            error ("kmeans: empty cluster created");
        endswitch
      endif

      ## update the centroids
      members = data(membership, :);
      centers(i, :) = sum (members, 1) / size (members, 1);
    endfor

    ## calculate the difference in the sum of distances and remember the
    ## current sum (the cost is evaluated only once per iteration)
    newsumd = objCost (data, classes, centers);
    err = sumd - newsumd;
    sumd = newsumd;
  endwhile

endfunction
## Total clustering cost: for every row of DATA take the squared Euclidean
## distance to the centroid (row of CENTERS) selected by the matching entry
## of CLASSES, and add these values up in row order.
function obj = objCost (data, classes, centers)
  obj = 0;
  nSamples = rows (data);
  row = 0;
  while (row < nSamples)
    row += 1;
    ## offset of this sample from the centroid it is assigned to
    residual = data(row, :) - centers(classes(row), :);
    obj += sumsq (residual);
  endwhile
endfunction
## Return the row index of the sample in DATA with the largest squared
## Euclidean distance to the single centroid CENTERS (a row vector with as
## many columns as DATA).  When several samples are equally far away the
## first of them is returned.
function idx = maxCostSampleIndex (data, centers)
  ## squared distance of every sample to the centroid, one value per row
  cost = sumsq (data - repmat (centers, rows (data), 1), 2);
  ## the position of the maximum is the result; the maximum itself is unused
  [tmp, idx] = max (cost);
endfunction
%!demo
%! ## Build a synthetic two-cluster data set: 100 points shifted by +1
%! ## followed by 100 points shifted by -1
%! upperCloud = randn (100, 2) + 1;
%! lowerCloud = randn (100, 2) - 1;
%! data = vertcat (upperCloud, lowerCloud);
%!
%! ## Cluster the samples into two groups
%! [idx, centers] = kmeans (data, 2);
%!
%! ## Draw each cluster with its own marker, then overlay the centroids
%! inFirst = (idx == 1);
%! inSecond = (idx == 2);
%! figure ();
%! plot (data(inFirst, 1), data(inFirst, 2), 'ro');
%! hold ('on');
%! plot (data(inSecond, 1), data(inSecond, 2), 'bs');
%! plot (centers(:, 1), centers(:, 2), 'kv', 'markersize', 10);
%! hold ('off');
|