This file is indexed.

/usr/include/shark/Data/DataDistribution.h is in libshark-dev 3.0.1+ds1-2ubuntu1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
//===========================================================================
/*!
 * 
 *
 * \brief       Learning problems given by analytic distributions.
 * 
 * 
 * 
 *
 * \author      T. Glasmachers
 * \date        2006-2013
 *
 *
 * \par Copyright 1995-2015 Shark Development Team
 * 
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 * 
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published 
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_DATA_DATADISTRIBUTION_H
#define SHARK_DATA_DATADISTRIBUTION_H

#include <shark/Data/Dataset.h>
#include <shark/Rng/GlobalRng.h>
#include <shark/Statistics/Distributions/MultiVariateNormalDistribution.h>
#include <utility>

namespace shark {


///
/// \brief A DataDistribution defines an unsupervised learning problem.
///
/// \par
/// The unsupervised learning problem is defined by an explicit
/// distribution (in contrast to a finite dataset). The only
/// method we need is to draw a sample from the distribution.
///
template <class InputType>
class DataDistribution
{
public:
	/// \brief Virtual destructor.
	virtual ~DataDistribution() { }

	/// \brief Draws a single sample from the distribution.
	///
	/// @param input receives the generated sample
	virtual void draw(InputType& input) const = 0;

	/// \brief Interface for std::generate: returns one freshly drawn sample by value.
	InputType operator() () {
		InputType ret;
		draw(ret);
		return ret;
	}
	
	/// \brief Generates a data set with samples from the distribution.
	///
	/// The samples are spread over ceil(size / maximumBatchSize) batches whose
	/// sizes differ by at most one. A size of 0 yields a dataset without batches.
	///
	/// @param size the number of samples in the dataset
	/// @param maximumBatchSize the maximum size of a batch, must be greater than 0
	UnlabeledData<InputType> generateDataset(std::size_t size,std::size_t maximumBatchSize) const {
		// ceil(size / maximumBatchSize), computed without forming
		// size + maximumBatchSize - 1, which wraps around for very large
		// maximumBatchSize and would then report zero batches
		std::size_t const batches = size / maximumBatchSize + (size % maximumBatchSize != 0 ? 1 : 0);
		UnlabeledData<InputType> dataset(batches);
		// nothing to draw; returning here also avoids dividing by zero batches below
		if (batches == 0) return dataset;

		std::size_t const optimalBatchSize = size / batches;
		std::size_t const remainder = size - batches * optimalBatchSize;
		InputType input;

		// now create and fill the batches, taking the remainder into account:
		// the first `remainder` batches receive one extra element
		for (std::size_t i=0; i<batches; ++i)
		{
			std::size_t const batchsize = (i<remainder) ? optimalBatchSize + 1 : optimalBatchSize;
			typename UnlabeledData<InputType>::batch_reference b = dataset.batch(i);
			// the first sample of the batch is drawn up front because it is
			// handed to createBatch together with the batch size
			draw(input);
			b = Batch<InputType>::createBatch(input, batchsize);
			for (std::size_t j=0; j<batchsize; j++)
			{
				if (j != 0) draw(input);
				shark::get(b, j) = input;
			}
		}
		return dataset;
	}
	
	/// \brief Generates a data set with samples from the distribution.
	///
	/// Uses Data<InputType>::DefaultBatchSize as the maximum batch size.
	///
	/// @param size the number of samples in the dataset
	UnlabeledData<InputType> generateDataset(std::size_t size) const {
		return generateDataset(size,Data<InputType>::DefaultBatchSize );
	}
};


///
/// \brief A LabeledDataDistribution defines a supervised learning problem.
///
/// \par
/// The supervised learning problem is defined by an explicit
/// distribution (in contrast to a finite dataset). The only
/// method we need is to draw a sample from the distribution.
///
template <class InputType, class LabelType>
class LabeledDataDistribution
{
public:
	/// \brief Virtual destructor.
	virtual ~LabeledDataDistribution() { }

	/// \brief Generates a single pair of input and label.
	/// @param input the generated input
	/// @param label the generated label
	virtual void draw(InputType& input, LabelType& label) const = 0;

	/// \brief Interface for std::generate: returns one freshly drawn (input, label) pair by value.
	std::pair<InputType,LabelType> operator() () {
		std::pair<InputType,LabelType> ret;
		draw(ret.first,ret.second);
		return ret;
	}
	
	/// \brief Generates a dataset with samples from the distribution.
	///
	/// The samples are spread over ceil(size / maximumBatchSize) batches whose
	/// sizes differ by at most one. A size of 0 yields a dataset without batches.
	///
	/// @param size the number of samples in the dataset
	/// @param maximumBatchSize the maximum size of a batch, must be greater than 0
	LabeledData<InputType, LabelType> generateDataset(std::size_t size,std::size_t maximumBatchSize) const
	{
		// first determine the number of batches: ceil(size / maximumBatchSize),
		// computed without forming size + maximumBatchSize - 1, which wraps
		// around for very large maximumBatchSize and would then report zero batches
		std::size_t const batches = size / maximumBatchSize + (size % maximumBatchSize != 0 ? 1 : 0);
		LabeledData<InputType, LabelType> dataset(batches);
		// nothing to draw; returning here also avoids dividing by zero batches below
		if (batches == 0) return dataset;

		std::size_t const optimalBatchSize = size / batches;
		std::size_t const remainder = size - batches * optimalBatchSize;
		InputType input;
		LabelType label;

		// now create and fill the batches, taking the remainder into account:
		// the first `remainder` batches receive one extra element
		for (std::size_t i=0; i<batches; ++i)
		{
			std::size_t const batchsize = (i<remainder) ? optimalBatchSize + 1 : optimalBatchSize;
			typename LabeledData<InputType, LabelType>::batch_reference b = dataset.batch(i);
			// the first sample of the batch is drawn up front because it is
			// handed to createBatch together with the batch size; the pair is
			// built only after the draw so that no uninitialised label is read
			draw(input, label);
			DataPair<InputType, LabelType> pair(input, label);
			b = Batch<DataPair<InputType, LabelType> >::createBatch(pair, batchsize);
			for (std::size_t j=0; j<batchsize; j++)
			{
				if (j != 0) draw(input, label);
				shark::get(b, j).input = input;
				shark::get(b, j).label = label;
			}
		}
		return dataset;
	}
	
	/// \brief Generates a data set with samples from the distribution.
	///
	/// Uses LabeledData<InputType, LabelType>::DefaultBatchSize as the maximum batch size.
	///
	/// @param size the number of samples in the dataset
	LabeledData<InputType, LabelType> generateDataset(std::size_t size) const {
		return generateDataset(size,LabeledData<InputType, LabelType>::DefaultBatchSize );
	}
};


///
/// \brief "chess board" problem for binary classification
///
class Chessboard : public LabeledDataDistribution<RealVector, unsigned int>
{
public:
	/// \brief Sets up the chess board problem.
	///
	/// @param size upper bound of the interval [0, size] each of the two coordinates is drawn from
	/// @param noiselevel a label is flipped whenever a Rng::uni(0,1) draw falls below this value
	Chessboard(unsigned int size = 4, double noiselevel = 0.0)
	: m_size(size)
	, m_noiselevel(noiselevel)
	{ }


	/// \brief Draws a two-dimensional point and labels it by the parity of its cell.
	///
	/// @param input resized to 2 and filled with the drawn coordinates
	/// @param label parity (0 or 1) of the summed integer parts of both coordinates, possibly flipped by noise
	void draw(RealVector& input, unsigned int& label)const{
		input.resize(2);
		unsigned int cellIndexSum = 0;
		for (unsigned int axis = 0; axis != 2; ++axis)
		{
			double const coordinate = Rng::uni(0.0, static_cast<double>(m_size));
			input(axis) = coordinate;
			// the integer part identifies the cell along this axis
			cellIndexSum += static_cast<int>(floor(coordinate));
		}
		// neighbouring cells differ by one in the index sum, so the parity alternates
		label = cellIndexSum % 2;
		if (Rng::uni(0.0, 1.0) < m_noiselevel)
			label = 1 - label;
	}

protected:
	unsigned int m_size;   ///< extent of the board along each axis
	double m_noiselevel;   ///< threshold for flipping a label
};


///
/// \brief Noisy sinc function: y = sin(x) / x + noise
///
class Wave : public LabeledDataDistribution<RealVector, RealVector>
{
public:
	/// \brief Sets up the noisy sinc problem.
	///
	/// @param stddev forwarded as second argument to Rng::gauss when drawing the label noise
	/// @param range inputs are drawn via Rng::uni(-range, range)
	Wave(double stddev = 0.1, double range = 5.0)
	: m_stddev(stddev)
	, m_range(range)
	{ }


	/// \brief Draws a one-dimensional input x and the label sin(x)/x plus noise.
	///
	/// @param input resized to 1, holds x
	/// @param label resized to 1, holds the noisy function value
	void draw(RealVector& input, RealVector& label)const{
		input.resize(1);
		label.resize(1);
		double const x = Rng::uni(-m_range, m_range);
		input(0) = x;
		// sin(x)/x is undefined at x == 0; the sinc function is continued
		// there by its limit value 1
		double const clean = (x != 0) ? sin(x) / x : 1.0;
		// NOTE(review): m_stddev is passed in the same argument position in
		// which PamiToy passes a variance - confirm which of the two
		// Rng::gauss expects.
		label(0) = clean + Rng::gauss(0.0, m_stddev);
	}

protected:
	double m_stddev;   ///< noise parameter handed to Rng::gauss
	double m_range;    ///< half-width of the input interval
};



/// "Pami Toy" problem for binary classification, as used in the article "T. Glasmachers
/// and C. Igel. Maximum Likelihood Model Selection for 1-Norm Soft Margin SVMs with Multiple
/// Parameters. IEEE Transactions on Pattern Analysis and Machine Intelligence, 2010."
/// In summary, the first M dimensions are correlated to the labels, the last N dimensions
/// are not.
class PamiToy : public LabeledDataDistribution<RealVector, unsigned int>
{
public:
	/// \brief Sets up the problem.
	///
	/// @param size_useful number of leading dimensions that are shifted depending on the label
	/// @param size_noise number of trailing dimensions that carry noise only
	/// @param noise_position first argument handed to Rng::gauss for every dimension
	/// @param noise_variance second argument handed to Rng::gauss for every dimension
	PamiToy(unsigned int size_useful = 5, unsigned int size_noise = 5, double noise_position = 0.0, double noise_variance = 1.0 )
	: m_size( size_useful+size_noise )
	, m_sizeUseful( size_useful )
	, m_sizeNoise( size_noise )
	, m_noisePos( noise_position )
	, m_noiseVar( noise_variance )
	{ }

	/// \brief Draws a label and a matching input vector.
	///
	/// @param input resized to the total dimension; the first m_sizeUseful entries are noise shifted by (label - 0.5), the rest is noise only
	/// @param label drawn via Rng::discrete(0,1)
	void draw(RealVector& input, unsigned int& label)const{
		input.resize( m_size );
		// the label is fixed first because it determines the shift of the informative features
		label = static_cast<unsigned int>( Rng::discrete(0,1) );
		double const shift = label - 0.5;
		for ( unsigned int dim = 0; dim != m_size; ++dim ) {
			double const noise = Rng::gauss( m_noisePos, m_noiseVar );
			if ( dim < m_sizeUseful )
				input(dim) = shift + noise; // informative feature
			else
				input(dim) = noise;         // uninformative feature
		}
	}

protected:
	unsigned int m_size;         ///< total input dimension (useful + noise)
	unsigned int m_sizeUseful;   ///< number of informative dimensions
	unsigned int m_sizeNoise;    ///< number of uninformative dimensions
	double m_noisePos;           ///< first argument of the Rng::gauss calls
	double m_noiseVar;           ///< second argument of the Rng::gauss calls
};

/// This class randomly fills a (hyper-)square with data points. Points which 
/// happen to be within a (hyper-)circle centered in the square of a certain
/// radius get a positive class label. Noise on the labels can be added.
class CircleInSquare : public LabeledDataDistribution<RealVector, unsigned int>
{
public:
	/// \brief Sets up the problem on the box [-1,1]^dimensions with a circle of radius 0.5 around the origin.
	///
	/// @param dimensions dimension of the generated inputs
	/// @param noiselevel a label is flipped whenever a Rng::uni(0,1) draw falls below this value
	/// @param class_prob_equal if true, the class is chosen by a coin toss first and a matching point is found by rejection sampling
	CircleInSquare( unsigned int dimensions = 2, double noiselevel = 0.0, bool class_prob_equal = false )
	: m_dimensions( dimensions ),
	  m_noiselevel( noiselevel ),
	  m_lowerLimit( -1 ),
	  m_upperLimit( 1 ),
	  m_centerpoint( 0 ),
	  m_inner_radius2( 0.5*0.5 ),
	  m_outer_radius2( 0.5*0.5 ),
	  m_equal_class_prob( class_prob_equal )
	{ }
	
	/// \brief Allows for arbitrary box limits and radii.
	///
	/// @param lower_limit lower bound of every coordinate, must be smaller than upper_limit
	/// @param upper_limit upper bound of every coordinate
	/// @param inner_radius points closer to the center than this are positive, must not exceed outer_radius
	/// @param outer_radius points between inner_radius and outer_radius are rejected; the diameter must fit into the box
	void setLimits( double lower_limit, double upper_limit, double inner_radius, double outer_radius )
	{
		RANGE_CHECK( lower_limit < upper_limit );
		RANGE_CHECK( inner_radius <= outer_radius );
		RANGE_CHECK( 2*outer_radius <= upper_limit-lower_limit );
		m_lowerLimit = lower_limit;
		m_upperLimit = upper_limit;
		// the circle is centered in the box, i.e. at the midpoint of the
		// interval (half the width alone is only correct if lower_limit == 0)
		m_centerpoint = (lower_limit+upper_limit)/2.0;
		m_inner_radius2 = inner_radius*inner_radius;
		m_outer_radius2 = outer_radius*outer_radius;
	}
	
	/// \brief Draws a point from the box together with its (possibly noisy) label.
	///
	/// @param input resized to the configured dimension and filled with the accepted point
	/// @param label 1 for points inside the inner radius, 0 otherwise, possibly flipped by noise
	void draw(RealVector& input, unsigned int& label)const
	{
		input.resize( m_dimensions );
		double dist;
		
		if ( m_equal_class_prob ) { //each class has equal probability - this implementation is brute-force rejection sampling
			bool this_label = Rng::coinToss();
			label = ( this_label ? 1 : 0 );
			if ( Rng::uni(0.0, 1.0) < m_noiselevel )
				label = 1 - label;
			if ( this_label ) {
				// positive class: retry until the point is not outside the inner radius
				do {
					dist = sampleUniformPoint( input );
				} while( dist > m_inner_radius2 );
			}
			else {
				// negative class: retry until the point is not inside the outer radius
				do {
					dist = sampleUniformPoint( input );
				} while( dist < m_outer_radius2 );
			}
		}
		else { //equal probability to be anywhere in the cube
			do {
				dist = sampleUniformPoint( input );
				label = ( dist < m_inner_radius2 ? 1 : 0 );
				if ( Rng::uni(0.0, 1.0) < m_noiselevel )
					label = 1 - label;
			// points strictly between both radii are rejected
			} while( dist > m_inner_radius2 && dist < m_outer_radius2 );
		}
	}

protected:
	unsigned int m_dimensions;
	double m_noiselevel;
	double m_lowerLimit;
	double m_upperLimit;
	double m_centerpoint;
	double m_inner_radius2;
	double m_outer_radius2;
	bool m_equal_class_prob; ///<if true, the probability to belong to either class is equal. if false, it is uniform over the cube.

private:
	/// \brief Fills input with one point drawn coordinate-wise via Rng::uni from the box.
	///
	/// @param input already sized vector receiving the coordinates
	/// @return squared Euclidean distance of the point to the center
	double sampleUniformPoint( RealVector& input ) const
	{
		double dist2 = 0.0;
		for ( unsigned int i=0; i<m_dimensions; i++ ) {
			double const v = Rng::uni( m_lowerLimit, m_upperLimit );
			input(i) = v;
			dist2 += (v-m_centerpoint)*(v-m_centerpoint);
		}
		return dist2;
	}
};

/// This class randomly fills a 4x4 square in the 2D-plane with data points.
/// Points in the lower left diagonal half (x+y < 4) are positive, points in the
/// upper right diagonal half are negative. But additionally, all points
/// in a circle centered at (3,1) in the lower right quadrant are positive, effectively
/// bulging the decision boundary into the upper right half. Noise on the labels can be added.
class DiagonalWithCircle : public LabeledDataDistribution<RealVector, unsigned int>
{
public:
	/// \brief Sets up the problem.
	///
	/// @param radius radius of the circle around (3,1) whose interior is always labeled 1
	/// @param noise a label is flipped whenever a Rng::uni(0,1) draw falls below this value
	DiagonalWithCircle( double radius = 1.0, double noise = 0.0 )
	: m_radius2( radius*radius )
	, m_noiselevel( noise )
	{ }
	
	/// \brief Draws a point from the square and its (possibly noisy) label.
	///
	/// @param input resized to 2 and filled with (x, y)
	/// @param label 1 if x+y < 4 or the point lies inside the circle, 0 otherwise, possibly flipped by noise
	void draw(RealVector& input, unsigned int& label)const
	{
		input.resize( 2 );
		double const x = Rng::uni( 0, 4 ); //zero is left
		double const y = Rng::uni( 0, 4 ); //zero is bottom
		input(0) = x;
		input(1) = y;

		// position w.r.t. the diagonal x+y = 4
		bool const belowDiagonal = x+y < 4;
		// squared distance to the circle center (3,1)
		double const dx = 3-x;
		double const dy = 1-y;
		bool const insideCircle = dx*dx + dy*dy < m_radius2;

		// the circle overrides the diagonal rule, even above the diagonal
		label = ( belowDiagonal || insideCircle ) ? 1 : 0;

		// add noise
		if ( Rng::uni(0.0, 1.0) < m_noiselevel )
			label = 1 - label;
	}

protected:
	double m_radius2;      ///< squared radius of the circle
	double m_noiselevel;   ///< threshold for flipping a label
};


/// \brief Generates a set of normally distributed points
class NormalDistributedPoints:public DataDistribution<RealVector>
{
public:
	/// \brief Generates a simple distribution with a zero offset and a covariance
	/// matrix that is initialised with 0 and whose diagonal is set to 1.
	///
	/// @param dim dimension of the generated points
	NormalDistributedPoints(std::size_t dim)
	: m_offset(dim,0)
	{
		RealMatrix unitCovariance(dim,dim,0);
		diag(unitCovariance) = blas::repeat(1.0,dim);
		m_dist.setCovarianceMatrix(unitCovariance);
	}

	/// \brief Generates a distribution with the given covariance matrix, shifted by an offset.
	///
	/// @param covariance covariance matrix handed to the underlying distribution
	/// @param offset vector added to every drawn sample; its size must equal covariance.size1()
	NormalDistributedPoints(RealMatrix const& covariance, RealVector const& offset)
	: m_dist(covariance)
	, m_offset(offset)
	{
		SIZE_CHECK(offset.size() == covariance.size1());
	}

	/// \brief Draws one sample of the underlying distribution and adds the offset.
	///
	/// @param input resized to the size of the offset and overwritten with the shifted sample
	void draw(RealVector& input) const{
		input.resize(m_offset.size());
		// start from the offset ...
		noalias(input) = m_offset;
		// ... and add the first component of what the distribution returns
		noalias(input) += m_dist().first;
	}
private:
	MultiVariateNormalDistributionCholesky m_dist;   ///< source of the unshifted samples
	RealVector m_offset;                             ///< shift added to every sample
};

/// \brief Given a set of images, draws a set of image patches of a given size
class ImagePatches:public DataDistribution<RealVector>{
public:
	/// \brief Stores the images and the geometry of images and patches.
	///
	/// @param images the images to draw patches from; each is indexed as imageWidth values per row
	/// @param imageWidth width of every image
	/// @param imageHeight height of every image
	/// @param patchWidth width of the drawn patches
	/// @param patchHeight height of the drawn patches
	ImagePatches(
		Data<RealVector> images,
		std::size_t imageWidth, std::size_t imageHeight,
		std::size_t patchWidth, std::size_t patchHeight
	)
	: m_images(images)
	, m_imageWidth(imageWidth)
	, m_imageHeight(imageHeight)
	, m_patchWidth(patchWidth)
	, m_patchHeight(patchHeight)
	, m_numImages(m_images.numberOfElements())
	{ }

	/// \brief Draws a random patch from a randomly chosen image.
	///
	/// NOTE(review): nothing here verifies that at least one image is stored
	/// or that the patch fits into the image; the subtractions below are
	/// carried out on std::size_t - confirm that callers guarantee both.
	///
	/// @param input resized to patchWidth * patchHeight and filled row by row with the patch
	void draw(RealVector& input) const{
		// choose the image
		std::size_t const imageIndex = Rng::discrete(0,m_numImages-1);
		Data<RealVector>::const_element_reference image = m_images.element(imageIndex);
		// choose the upper left corner of the patch inside the image
		std::size_t const left = Rng::discrete(0,m_imageWidth-m_patchWidth);
		std::size_t const top = Rng::discrete(0,m_imageHeight-m_patchHeight);

		// copy the patch; `target` runs contiguously through the output
		input.resize(m_patchWidth * m_patchHeight);
		std::size_t target = 0;
		for (std::size_t row = 0; row != m_patchHeight; ++row){
			// index of the first patch pixel of this row inside the image
			std::size_t const source = (top + row) * m_imageWidth + left;
			for (std::size_t col = 0; col != m_patchWidth; ++col, ++target){
				input(target) = image(source + col);
			}
		}
	}
private:
	Data<RealVector> m_images;     ///< the images patches are taken from
	std::size_t m_imageWidth;      ///< width of every image
	std::size_t m_imageHeight;     ///< height of every image
	std::size_t m_patchWidth;      ///< width of a patch
	std::size_t m_patchHeight;     ///< height of a patch
	std::size_t m_numImages;       ///< number of stored images
};

}
#endif