This file is indexed.

/usr/include/shogun/statistics/HSIC.h is in libshogun-dev 3.2.0-7.3build4.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2012-2013 Heiko Strathmann
 */

#ifndef __HSIC_H_
#define __HSIC_H_

#include <shogun/statistics/KernelIndependenceTestStatistic.h>

namespace shogun
{

template<class T> class SGMatrix;


/** @brief This class implements the Hilbert-Schmidt Independence Criterion
 * (HSIC) based independence test as described in [1].
 *
 * Given samples \f$Z=\{(x_i,y_i)\}_{i=1}^m\f$ from the joint
 * distribution \f$\textbf{P}_{xy}\f$, does the joint distribution
 * factorize as \f$\textbf{P}_{xy}=\textbf{P}_x\textbf{P}_y\f$?
 *
 * The HSIC is a kernel based independence criterion, which is based on the
 * Hilbert-Schmidt norm of the cross-covariance operator in a reproducing
 * kernel Hilbert space (RKHS). For characteristic kernels, its population
 * expression is zero if and only if the two underlying distributions are
 * independent.
 *
 * This class computes the biased empirical estimate
 * \f[
 * \text{HSIC}_b(Z)=\frac{1}{m^2}\text{trace}(\textbf{KHLH})
 * \f]
 * where \f$\textbf{H}=\textbf{I}-\frac{1}{m}\textbf{11}^T\f$ is a centering
 * matrix and \f$\textbf{K}, \textbf{L}\f$ are kernel matrices of both sets
 * of samples.
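 *
 * Equivalently (using that \f$\textbf{H}\f$ is symmetric and idempotent and
 * that the trace is cyclic), this can be written in terms of the centered
 * kernel matrices \f$\tilde{\textbf{K}}=\textbf{HKH}\f$ and
 * \f$\tilde{\textbf{L}}=\textbf{HLH}\f$ as
 * \f[
 * \text{HSIC}_b(Z)=\frac{1}{m^2}\text{trace}(\tilde{\textbf{K}}\tilde{\textbf{L}}).
 * \f]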
 *
 * Note that computing the statistic returns m*HSIC_b; the same holds for the
 * null distribution samples.
 *
 * Along with the statistic comes a method to compute a p-value based on
 * different approximations of the null distribution. Bootstrapping is also
 * possible. If unsure which one to use, bootstrapping with 250 iterations is
 * always correct (but slow).
 *
 * To choose, use set_null_approximation_method() and choose from
 *
 * HSIC_GAMMA: a very fast, but not consistent, test based on moment matching
 * of a Gamma distribution, as described in [1].
 *
 * BOOTSTRAPPING: permutes the available samples to sample from the null
 * distribution. Bootstrapping is done on precomputed kernel matrices, since
 * these have to be stored anyway when the statistic is computed.
 *
 * A very basic method for kernel selection when using CGaussianKernel is to
 * set the kernel width to the median distance between the underlying data
 * points (the median heuristic); see the usage sketch below. More advanced
 * methods will follow in the near future. However, the median heuristic
 * works well in many cases. See [1].
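 *
 * A minimal usage sketch. Note that it is a sketch only: it assumes Shogun's
 * CDenseFeatures and CGaussianKernel classes, placeholder data matrices
 * data_p and data_q, and a hand-picked kernel width of 2.0, none of which
 * are part of this header; consult the shipped examples for a complete
 * program.
 * @code
 * // one sample per column of the (hypothetical) SGMatrix<float64_t> data
 * CDenseFeatures<float64_t>* feat_p=new CDenseFeatures<float64_t>(data_p);
 * CDenseFeatures<float64_t>* feat_q=new CDenseFeatures<float64_t>(data_q);
 *
 * // Gaussian kernels; the width 2.0 would normally come from the median heuristic
 * CGaussianKernel* kernel_p=new CGaussianKernel(10, 2.0);
 * CGaussianKernel* kernel_q=new CGaussianKernel(10, 2.0);
 *
 * // independence test with bootstrapped null distribution
 * CHSIC* hsic=new CHSIC(kernel_p, kernel_q, feat_p, feat_q);
 * hsic->set_null_approximation_method(BOOTSTRAPPING);
 *
 * float64_t statistic=hsic->compute_statistic();
 * float64_t p_value=hsic->compute_p_value(statistic);
 *
 * SG_UNREF(hsic);
 * @endcode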
 *
 * [1]: Gretton, A., Fukumizu, K., Teo, C., & Song, L. (2008).
 * A kernel statistical test of independence.
 * Advances in Neural Information Processing Systems, 1-8.
 *
 */
class CHSIC : public CKernelIndependenceTestStatistic
{
public:
	/** Constructor */
	CHSIC();

	/** Constructor
	 *
	 * @param kernel_p kernel to use on samples from p
	 * @param kernel_q kernel to use on samples from q
	 * @param p_and_q feature data, assumed to contain samples from both
	 * p and q, appended: first all samples from p, then, from index m on,
	 * all samples from q
	 * @param m index of first sample of q
	 */
	CHSIC(CKernel* kernel_p, CKernel* kernel_q, CFeatures* p_and_q,
			index_t m);

	/** Constructor.
	 * This is a convenience constructor which copies both feature objects
	 * into one and then calls the other constructor. Needs twice the memory
	 * for a short time.
	 *
	 * @param kernel_p kernel to use on samples from p
	 * @param kernel_q kernel to use on samples from q
	 * @param p samples from distribution p, will be copied and NOT
	 * SG_REF'ed
	 * @param q samples from distribution q, will be copied and NOT
	 * SG_REF'ed
	 */
	CHSIC(CKernel* kernel_p, CKernel* kernel_q, CFeatures* p, CFeatures* q);

	virtual ~CHSIC();

	/** Computes the HSIC statistic (see class description) for underlying
	 * kernels and data. Note that the result is multiplied by the number of
	 * used samples, i.e. it is m*HSIC_b, a biased estimate.
	 *
	 * Note that since kernel matrices have to be stored, this has quadratic
	 * space costs.
	 *
	 * @return m*HSIC_b (biased estimate)
	 */
	virtual float64_t compute_statistic();

	/** Computes a p-value based on the current method for approximating the
	 * null distribution. The p-value is the position of the given statistic
	 * in the null distribution, i.e. the statistic is the (1-p) quantile of
	 * the null distribution.
	 *
	 * @param statistic statistic value to compute the p-value for
	 * @return p-value of the given statistic under the null distribution
	 */
	virtual float64_t compute_p_value(float64_t statistic);

	/** Computes a threshold based on the current method for approximating the
	 * null distribution. The threshold is the value that a statistic has
	 * to exceed in order to reject the null hypothesis.
	 *
	 * @param alpha test level at which to reject the null hypothesis
	 * @return threshold a statistic has to exceed in order to reject the
	 * null hypothesis
	 */
	virtual float64_t compute_threshold(float64_t alpha);

	/** @return name of this class */
	virtual const char* get_name() const
	{
		return "HSIC";
	}

	/** returns the statistic type of this test statistic */
	virtual EStatisticType get_statistic_type() const
	{
		return S_HSIC;
	}

	/** Approximates the null-distribution by a two parameter gamma
	 * distribution. Returns parameters.
	 *
	 * NOTE: the gamma distribution is fitted to m*HSIC_b. Since
	 * compute_statistic() returns that biased estimate, values from
	 * compute_statistic() can safely be used here.
	 * However, the attached features have to be the SAME size as those the
	 * statistic was computed on. If compute_threshold() or compute_p_value()
	 * are used, this is ensured automatically. Note that m times the null
	 * distribution is fitted, which is fine since the statistic is also
	 * m*HSIC_b.
	 *
	 * Has quadratic computational costs in terms of samples.
	 *
	 * Called by compute_p_value() if the null approximation method is set to
	 * HSIC_GAMMA.
	 *
	 * @return vector with two parameters for gamma distribution. To use:
	 * call gamma_cdf(statistic, a, b).
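	 *
	 * A hedged sketch of that call (it assumes the gamma_cdf routine is
	 * available via CStatistics with this signature, which is not declared
	 * in this header; verify against the installed Shogun version):
	 * @code
	 * SGVector<float64_t> ab=hsic->fit_null_gamma();
	 * float64_t p_value=1.0-CStatistics::gamma_cdf(statistic, ab[0], ab[1]);
	 * @endcode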
	 */
	SGVector<float64_t> fit_null_gamma();

	/** Merges both sets of samples and computes the test statistic
	 * m_bootstrap_iteration times. This version precomputes the kernel matrix
	 * once by hand, then performs bootstrapping on it. The matrix has
	 * to be stored anyway when the statistic is computed.
	 *
	 * @return vector of all statistics
	 */
	virtual SGVector<float64_t> bootstrap_null();

protected:
	/** @return kernel matrix on samples from p. Distinguishes CustomKernels */
	SGMatrix<float64_t> get_kernel_matrix_K();

	/** @return kernel matrix on samples from q. Distinguishes CustomKernels */
	SGMatrix<float64_t> get_kernel_matrix_L();

private:
	void init();

};

}

#endif /* __HSIC_H_ */