This file is indexed.

/usr/include/shark/Data/Dataset.h is in libshark-dev 3.0.1+ds1-2ubuntu1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

   1
   2
   3
   4
   5
   6
   7
   8
   9
  10
  11
  12
  13
  14
  15
  16
  17
  18
  19
  20
  21
  22
  23
  24
  25
  26
  27
  28
  29
  30
  31
  32
  33
  34
  35
  36
  37
  38
  39
  40
  41
  42
  43
  44
  45
  46
  47
  48
  49
  50
  51
  52
  53
  54
  55
  56
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91
  92
  93
  94
  95
  96
  97
  98
  99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
//===========================================================================
/*!
 * 
 *
 * \brief       Data for (un-)supervised learning.
 * 
 * 
 * \par
 * This file provides containers for data used by the models, loss
 * functions, and learning algorithms (trainers). The reason for
 * dedicated containers of this type is that data often need to be
 * split into subsets, such as training and test data, or folds in
 * cross-validation. The containers in this file provide memory
 * efficient mechanisms for managing and providing such subsets.
 * 
 * 
 * 
 *
 * \author      O. Krause, T. Glasmachers
 * \date        2010-2014
 *
 *
 * \par Copyright 1995-2015 Shark Development Team
 * 
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 * 
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published 
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================

#ifndef SHARK_DATA_DATASET_H
#define SHARK_DATA_DATASET_H

#include <boost/foreach.hpp>
#include <boost/range/iterator_range.hpp>
#include <boost/range/algorithm/sort.hpp>

#include <shark/Core/Exception.h>
#include <shark/Core/OpenMP.h>
#include <shark/Core/utility/functional.h>
#include <shark/Rng/GlobalRng.h>
#include "Impl/Dataset.inl"

namespace shark {


///
/// \brief Data container.
///
/// The Data class is Shark's container for machine learning data.
/// This container (and its sub-classes) is used for input data,
/// labels, and model outputs.
///
/// \par
/// The Data container organizes the data it holds in batches.
/// This means, that it tries to find a good data representation for a whole
/// set of, for example 100 data points, at the same time. If the type of data it stores
/// is for example RealVector, the batches of this type are RealMatrices. This is good because most often
/// operations on the whole matrix are faster than operations on the separate vectors.
/// Nearly all operations of the set have to be interpreted in terms of the batch. Therefore the iterator interface will
/// give access to the batches but not to single elements. For this separate element_iterators and const_element_iterators
/// can be used.
///\par
/// There are a lot of these typedefs. The typical typedefs for containers like batch_type or iterator are chosen
/// as types for the batch interface. For accessing single elements, a different set of typedefs is in place. Thus instead of iterator
/// you must write element_iterator and instead of batch_type write element_type. Usually you should not use element_type except when
/// you want to actually copy the data. Instead use element_reference or const_element_reference. Note that these are proxy objects and not
/// actual references to element_type!
/// A short example for these typedefs:
///\code
///typedef Data<RealVector> Set;
/// Set data;
/// for(Set::element_iterator pos=data.elemBegin();pos!= data.elemEnd();++pos){
///     std::cout<<*pos<<" ";
///     Set::element_reference ref=*pos;
///     ref*=2;
///     std::cout<<*pos<<std::endl;
///}
///\endcode
///When you write C++11 code, this is of course much simpler:
///\code
/// Data<RealVector> data;
/// for(auto pos=data.elemBegin();pos!= data.elemEnd();++pos){
///     std::cout<<*pos<<" ";
///     auto ref=*pos;
///     ref*=2;
///     std::cout<<*pos<<std::endl;
///}
///\endcode
/// \par
/// Element wise accessing of elements is usually slower than accessing the batches. If possible, use direct batch access, or
/// at least use the iterator interface to iterate over all elements. Random access to single elements is linear time, so use it wisely.
/// Of course, when you want to use batches, you need to know the actual batch type. This depends on the actual type of the input.
/// here are the rules:
/// if the input is an arithmetic type like int or double, the result will be a vector of this
/// (i.e. double->RealVector or Int->IntVector).
/// For vectors the results are matrices as mentioned above. If the vector is sparse, so is the matrix.
/// And for everything else the batch type is just a std::vector of the type, so no optimization can be applied.
/// \par
/// When constructing the container the batchSize can be set. If it is not set by the user the default batchSize is chosen. A BatchSize of 0
/// corresponds to putting all data into a single batch. Beware that not only the data needs storage but also
/// the various models during computation. So the actual amount of space to compute a batch can greatly exceed the batch size.
///
/// An additional feature of the Data class is that it can be used to create lazy subsets. So the batches of a dataset
/// can be shared between various instances of the data class without additional memory overhead.
///
///
///\warning Be aware --especially for derived containers like LabeledData-- that the set does not enforce structural consistency.
/// When you change the structure of the data part for example by directly changing the size of the batches, the size of the labels is not
/// enforced to change accordingly. Also when creating subsets of a set changing the parent will change it's siblings and conversely. The programmer
/// needs to ensure structural integrity!
/// For example this is dangerous:
/// \code
/// void function(Data<unsigned int>& data){
///      Data<unsigned int> newData(...);
///      data=newData;
/// }
/// \endcode
/// When data was originally a labeledData object, and newData has a different batch structure than data, this will lead to structural inconsistencies.
/// When function is rewritten such that newData has the same structure as data, this code is perfectly fine. The best way to get around this problem is
/// by rewriting the code as:
/// \code
/// Data<unsigned int> function(){
///      Data<unsigned int> newData(...);
///      return newData;
/// }
/// \endcode
///\todo expand docu
template <class Type>
class Data : public ISerializable
{
protected:
	typedef detail::SharedContainer<Type> Container;
	typedef Data<Type> self_type;

	Container m_data;		///< data
public:
	/// \brief Defines the default batch size of the Container.
	///
	/// Zero means: unlimited
	BOOST_STATIC_CONSTANT(std::size_t, DefaultBatchSize = 256);

	typedef typename Container::BatchType batch_type;
	typedef batch_type& batch_reference;
	typedef batch_type const& const_batch_reference;

	typedef Type element_type;
	typedef typename Batch<element_type>::reference element_reference;
	typedef typename Batch<element_type>::const_reference const_element_reference;

	typedef std::vector<std::size_t> IndexSet;

	template <class T> friend bool operator == (const Data<T>& op1, const Data<T>& op2);
	template <class InputT, class LabelT> friend class LabeledData;


	// RANGES
	typedef boost::iterator_range<typename Container::element_iterator> element_range;
	typedef boost::iterator_range<typename Container::const_element_iterator> const_element_range;
	typedef boost::iterator_range<typename Container::iterator> batch_range;
	typedef boost::iterator_range<typename Container::const_iterator> const_batch_range;


	///\brief Returns the range of elements.
	///
	///It is compatible to boost::range and STL and can be used whenever an algorithm requires
	///element access via begin()/end() in which case data.elements() provides the correct interface
	const_element_range elements()const{
		return const_element_range(m_data.elemBegin(),m_data.elemEnd());
	}
	///\brief Returns therange of elements.
	///
	///It is compatible to boost::range and STL and can be used whenever an algorithm requires
	///element access via begin()/end() in which case data.elements() provides the correct interface
	element_range elements(){
		return element_range(m_data.elemBegin(),m_data.elemEnd());
	}

	///\brief Returns the range of batches.
	///
	///It is compatible to boost::range and STL and can be used whenever an algorithm requires
	///element access via begin()/end() in which case data.elements() provides the correct interface
	const_batch_range batches()const{
		return const_batch_range(m_data.begin(),m_data.end());
	}
	///\brief Returns the range of batches.
	///
	///It is compatible to boost::range and STL and can be used whenever an algorithm requires
	///element access via begin()/end() in which case data.elements() provides the correct interface
	batch_range batches(){
		return batch_range(m_data.begin(),m_data.end());
	}

	///\brief Returns the number of batches of the set.
	std::size_t numberOfBatches() const{
		return m_data.size();
	}
	///\brief Returns the total number of elements.
	std::size_t numberOfElements() const{
		return m_data.numberOfElements();
	}

	///\brief Check whether the set is empty.
	bool empty() const{
		return m_data.empty();
	}

	// ELEMENT ACCESS
	element_reference element(std::size_t i){
		return *(m_data.elemBegin()+i);
	}
	const_element_reference element(std::size_t i) const{
		return *(m_data.elemBegin()+i);
	}

	// BATCH ACCESS
	batch_reference batch(std::size_t i){
		return *(m_data.begin()+i);
	}
	const_batch_reference batch(std::size_t i) const{
		return *(m_data.begin()+i);
	}

	// CONSTRUCTORS

	///\brief Constructor which constructs an empty set
	Data(){ }

	///\brief Construct a dataset with empty batches.
	explicit Data(std::size_t numBatches) : m_data( numBatches )
	{ }

	///\brief Construct a dataset with different batch sizes as a copy of another dataset
	explicit Data(Data const& container, std::vector<std::size_t> batchSizes)
	: m_data( container.m_data, batchSizes, true )
	{ }

	///\brief Construction with size and a single element
	///
	/// Optionally the desired batch Size can be set
	///
	///@param size the new size of the container
	///@param element the blueprint element from which to create the Container
	///@param batchSize the size of the batches. if this is 0, the size is unlimited
	explicit Data(std::size_t size, element_type const& element, std::size_t batchSize = DefaultBatchSize)
	: m_data(size,element,batchSize)
	{ }

	// MISC

	void read(InArchive& archive){
		archive >> m_data;
	}

	void write(OutArchive& archive) const{
		archive << m_data;
	}
	///\brief This method makes the vector independent of all siblings and parents.
	virtual void makeIndependent(){
		m_data.makeIndependent();
	}


	// METHODS TO ALTER BATCH STRUCTURE

	void splitBatch(std::size_t batch, std::size_t elementIndex){
		m_data.splitBatch(m_data.begin()+batch,elementIndex);
	}

	///\brief Splits the container into two independent parts. The front part remains in the container, the back part is returned.
	///
	///Order of elements remain unchanged. The SharedVector is not allowed to be shared for
	///this to work.
	self_type splice(std::size_t batch){
		self_type right;
		right.m_data=m_data.splice(m_data.begin()+batch);
		return right;
	}

	/// \brief Appends the contents of another data object to the end
	///
	/// The batches are not copied but now referenced from both datasets. Thus changing the appended
	/// dataset might change this one as well.
	void append(self_type const& other){
		m_data.append(other.m_data);
	}
	
	void push_back(const_batch_reference batch){
		m_data.push_back(batch);
	}

	///\brief Reorders the batch structure in the container to that indicated by the batchSizes vector
	///
	///After the operation the container will contain batchSizes.size() batchs with the i-th batch having size batchSize[i].
	///However the sum of all batch sizes must be equal to the current number of elements
	template<class Range>
	void repartition(Range const& batchSizes){
		m_data.repartition(batchSizes);
	}
	
	/// \brief Creates a vector with the batch sizes of every batch.
	///
	/// This method can be used together with repartition to ensure
	/// that two datasets have the same batch structure.
	std::vector<std::size_t> getPartitioning()const{
		return m_data.getPartitioning();
	}

	// SUBSETS
	///\brief Fill in the subset defined by the list of indices.
	void indexedSubset(IndexSet const& indices, self_type& subset) const{
		subset.m_data=Container(m_data,indices);
	}

	///\brief Fill in the subset defined by the list of indices as well as its complement.
	void indexedSubset(IndexSet const& indices, self_type& subset, self_type& complement) const{
		IndexSet comp;
		detail::complement(indices,m_data.size(),comp);
		subset.m_data=Container(m_data,indices);
		complement.m_data=Container(m_data,comp);
	}

	friend void swap(Data& a, Data& b){
		swap(a.m_data,b.m_data);
	}
};

/**
 * \ingroup shark_globals
 * @{
 */

/// Outstream of elements.
template<class T>
std::ostream &operator << (std::ostream &stream, const Data<T>& d) {
	typedef typename Data<T>::const_element_reference reference;
	typename Data<T>::const_element_range elements = d.elements();
	BOOST_FOREACH(reference elem,elements)
		stream << elem << "\n";
	return stream;
}
/** @} */

/// \brief Data set for unsupervised learning.
///
/// The UnlabeledData class is basically a standard Data container
/// with the special interpretation of its data point being
/// "inputs" to a learning algorithm.
template <class InputT>
class UnlabeledData : public Data<InputT>
{
public:
	typedef InputT element_type;
	typedef Data<element_type> base_type;
	typedef UnlabeledData<element_type> self_type;
	typedef element_type InputType;
	typedef detail::SharedContainer<InputT> InputContainer;

protected:
	using base_type::m_data;
public:

	///\brief Constructor.
	UnlabeledData()
	{ }

	///\brief Construction from data.
	UnlabeledData(Data<InputT> const& points)
	: base_type(points)
	{ }

	///\brief Construction with size and a single element
	///
	/// Optionally the desired batch Size can be set
	///
	///@param size the new size of the container
	///@param element the blueprint element from which to create the Container
	///@param batchSize the size of the batches. if this is 0, the size is unlimited
	UnlabeledData(std::size_t size, element_type const& element, std::size_t batchSize = base_type::DefaultBatchSize)
	: base_type(size,element,batchSize)
	{ }

	///\brief Create an empty set with just the correct number of batches.
	///
	/// The user must initialize the dataset after that by himself.
	UnlabeledData(std::size_t numBatches)
	: base_type(numBatches)
	{ }

	///\brief Construct a dataset with different batch sizes. it is a copy of the other dataset
	UnlabeledData(UnlabeledData const& container, std::vector<std::size_t> batchSizes)
		:base_type(container,batchSizes){}

	/// \brief we allow assignment from Data.
	self_type operator=(Data<InputT> const& data){
		static_cast<Data<InputT>& >(*this) = data;
		return *this;
	}

	///\brief Access to the base_type class as "inputs".
	///
	/// Added for consistency with the LabeledData::labels() method.
	self_type& inputs(){
		return *this;
	}

	///\brief Access to the base_type class as "inputs".
	///
	/// Added for consistency with the LabeledData::labels() method.
	self_type const& inputs() const{
		return *this;
	}

	///\brief Splits the container in two independent parts. The left part remains in the container, the right is stored as return type
	///
	///Order of elements remain unchanged. The SharedVector is not allowed to be shared for
	///this to work.
	self_type splice(std::size_t batch){
		self_type right;
		right.m_data=m_data.splice(m_data.begin()+batch);
		return right;
	}

	///\brief shuffles all elements in the entire dataset (that is, also across the batches)
	virtual void shuffle(){
		DiscreteUniform<Rng::rng_type> uni(Rng::globalRng);
		shark::shuffle(this->elements().begin(),this->elements().end(), uni);
	}
};

///
/// \brief Data set for supervised learning.
///
/// The LabeledData class extends UnlabeledData for the
/// representation of inputs. In addition it holds and
/// provides access to the corresponding labels.
///
/// LabeledData tries to mimic the underlying data as pairs of input and label data.
/// this means that when accessing a batch by calling batch(i) or choosing one of the iterators
/// one access the input batch by batch(i).input and the labels by batch(i).label
///
///this also holds true for single element access using operator(). Be aware, that direct access to an element is
///a linear time operation. So it is not advisable to iterate over the elements, but instead iterate over the batches.
template <class InputT, class LabelT>
class LabeledData : public ISerializable
{
protected:
	typedef LabeledData<InputT, LabelT> self_type;
public:
	typedef InputT InputType;
	typedef LabelT LabelType;
	typedef UnlabeledData<InputT> InputContainer;
	typedef Data<LabelT> LabelContainer;
	typedef typename InputContainer::IndexSet IndexSet;

	BOOST_STATIC_CONSTANT(std::size_t, DefaultBatchSize = InputContainer::DefaultBatchSize);

	// TYPEDEFS fOR PAIRS
	typedef DataBatchPair<
		typename Batch<InputType>::type,
		typename Batch<LabelType>::type
	> batch_type;

	typedef DataPair<
		InputType,
		LabelType
	> element_type;

	// TYPEDEFS FOR  RANGES
	typedef typename PairRangeType<
		element_type,
		typename InputContainer::element_range,
		typename LabelContainer::element_range
	>::type element_range;
	typedef typename PairRangeType<
		element_type,
		typename InputContainer::const_element_range,
		typename LabelContainer::const_element_range
	>::type const_element_range;
	typedef typename PairRangeType<
		batch_type,
		typename InputContainer::batch_range,
		typename LabelContainer::batch_range
	>::type batch_range;
	typedef typename PairRangeType<
		batch_type,
		typename InputContainer::const_batch_range,
		typename LabelContainer::const_batch_range
	>::type const_batch_range;

	// TYPEDEFS FOR REFERENCES
	typedef typename boost::range_reference<batch_range>::type batch_reference;
	typedef typename boost::range_reference<const_batch_range>::type const_batch_reference;
	typedef typename boost::range_reference<element_range>::type element_reference;
	typedef typename boost::range_reference<const_element_range>::type const_element_reference;

	///\brief Returns the range of elements.
	///
	///It is compatible to boost::range and STL and can be used whenever an algorithm requires
	///element access via begin()/end() in which case data.elements() provides the correct interface
	const_element_range elements()const{
		return zipPairRange<element_type>(m_data.elements(),m_label.elements());
	}
	///\brief Returns therange of elements.
	///
	///It is compatible to boost::range and STL and can be used whenever an algorithm requires
	///element access via begin()/end() in which case data.elements() provides the correct interface
	element_range elements(){
		return zipPairRange<element_type>(m_data.elements(),m_label.elements());
	}

	///\brief Returns the range of batches.
	///
	///It is compatible to boost::range and STL and can be used whenever an algorithm requires
	///element access via begin()/end() in which case data.elements() provides the correct interface
	const_batch_range batches()const{
		return zipPairRange<batch_type>(m_data.batches(),m_label.batches());
	}
	///\brief Returns the range of batches.
	///
	///It is compatible to boost::range and STL and can be used whenever an algorithm requires
	///element access via begin()/end() in which case data.elements() provides the correct interface
	batch_range batches(){
		return zipPairRange<batch_type>(m_data.batches(),m_label.batches());
	}

	///\brief Returns the number of batches of the set.
	std::size_t numberOfBatches() const{
		return m_data.numberOfBatches();
	}
	///\brief Returns the total number of elements.
	std::size_t numberOfElements() const{
		return m_data.numberOfElements();
	}

	///\brief Check whether the set is empty.
	bool empty() const{
		return m_data.empty();
	}

	///\brief Access to inputs as a separate container.
	InputContainer const& inputs() const{
		return m_data;
	}
	///\brief Access to inputs as a separate container.
	InputContainer& inputs(){
		return m_data;
	}

	///\brief Access to labels as a separate container.
	LabelContainer const& labels() const{
		return m_label;
	}
	///\brief Access to labels as a separate container.
	LabelContainer& labels(){
		return m_label;
	}

	// CONSTRUCTORS

	///\brief Empty data set.
	LabeledData()
	{}

	///\brief Create an empty set with just the correct number of batches.
	///
	/// The user must initialize the dataset after that by himself.
	LabeledData(std::size_t numBatches)
	: m_data(numBatches),m_label(numBatches)
	{}

	///
	/// Optionally the desired batch Size can be set
	///
	///@param size the new size of the container
	///@param element the blueprint element from which to create the Container
	///@param batchSize the size of the batches. if this is 0, the size is unlimited
	LabeledData(std::size_t size, element_type const& element, std::size_t batchSize = DefaultBatchSize)
	: m_data(size,element.input,batchSize),
	  m_label(size,element.label,batchSize)
	{}

	///\brief Construction from data.
	///
	/// Beware that when calling this constructor the organization of batches must be equal in both
	/// containers. This Constructor will not split the data!
	LabeledData(Data<InputType> const& inputs, Data<LabelType> const& labels)
	: m_data(inputs), m_label(labels)
	{
		SHARK_CHECK(inputs.numberOfElements() == labels.numberOfElements(), "[LabeledData::LabeledData] number of inputs and number of labels must agree");
#ifndef DNDEBUG
		for(std::size_t i  = 0; i != inputs.numberOfBatches(); ++i){
			SIZE_CHECK(shark::size(inputs.batch(i))==shark::size(labels.batch(i)));
		}
#endif
	}
	// ELEMENT ACCESS
	element_reference element(std::size_t i){
		return element_reference(m_data.element(i),m_label.element(i));
	}
	const_element_reference element(std::size_t i) const{
		return const_element_reference(m_data.element(i),m_label.element(i));
	}

	// BATCH ACCESS
	batch_reference batch(std::size_t i){
		return batch_reference(m_data.batch(i),m_label.batch(i));
	}
	const_batch_reference batch(std::size_t i) const{
		return const_batch_reference(m_data.batch(i),m_label.batch(i));
	}

	// MISC

	/// from ISerializable
	void read(InArchive& archive){
		archive & m_data;
		archive & m_label;
	}

	/// from ISerializable
	void write(OutArchive& archive) const{
		archive & m_data;
		archive & m_label;
	}

	///\brief This method makes the vector independent of all siblings and parents.
	virtual void makeIndependent(){
		m_label.makeIndependent();
		m_data.makeIndependent();
	}

	///\brief shuffles all elements in the entire dataset (that is, also across the batches)
	virtual void shuffle(){
		DiscreteUniform<Rng::rng_type> uni(Rng::globalRng);
		shark::shuffle(this->elements().begin(),this->elements().end(), uni);
	}

	void splitBatch(std::size_t batch, std::size_t elementIndex){
		m_data.splitBatch(batch,elementIndex);
		m_label.splitBatch(batch,elementIndex);
	}

	///\brief Splits the container into two independent parts. The left part remains in the container, the right is stored as return type
	///
	///Order of elements remain unchanged. The SharedVector is not allowed to be shared for
	///this to work.
	self_type splice(std::size_t batch){
		return self_type(m_data.splice(batch),m_label.splice(batch));
	}

	/// \brief Appends the contents of another data object to the end
	///
	/// The batches are not copied but now referenced from both datasets. Thus changing the appended
	/// dataset might change this one as well.
	void append(self_type const& other){
		m_data.append(other.m_data);
		m_label.append(other.m_label);
	}
	
	void push_back(
		typename Batch<InputType>::type const& inputs, 
		typename Batch<LabelType>::type const& labels
	){
		m_data.push_back(inputs);
		m_label.push_back(labels);
	}
	
	void push_back(
		const_batch_reference batch
	){
		push_back(batch.inputs,batch.labels);
	}


	///\brief Reorders the batch structure in the container to that indicated by the batchSizes vector
	///
	///After the operation the container will contain batchSizes.size() batches with the i-th batch having size batchSize[i].
	///However the sum of all batch sizes must be equal to the current number of elements
	template<class Range>
	void repartition(Range const& batchSizes){
		m_data.repartition(batchSizes);
		m_label.repartition(batchSizes);
	}
	
	/// \brief Creates a vector with the batch sizes of every batch.
	///
	/// This method can be used together with repartition to ensure
	/// that two datasets have the same batch structure.
	std::vector<std::size_t> getPartitioning()const{
		return m_data.getPartitioning();
	}

	friend void swap(LabeledData& a, LabeledData& b){
		swap(a.m_data,b.m_data);
		swap(a.m_label,b.m_label);
	}


	// SUBSETS

	///\brief Fill in the subset defined by the list of indices.
	void indexedSubset(IndexSet const& indices, self_type& subset) const{
		m_data.indexedSubset(indices,subset.m_data);
		m_label.indexedSubset(indices,subset.m_label);
	}

	///\brief Fill in the subset defined by the list of indices as well as its complement.
	void indexedSubset(IndexSet const& indices, self_type& subset, self_type& complement)const{
		IndexSet comp;
		detail::complement(indices,m_data.numberOfBatches(),comp);
		m_data.indexedSubset(indices,subset.m_data);
		m_label.indexedSubset(indices,subset.m_label);
		m_data.indexedSubset(comp,complement.m_data);
		m_label.indexedSubset(comp,complement.m_label);
	}
protected:
	InputContainer m_data;               /// point data
	LabelContainer m_label;		/// label data
};

/// specialized template for classification with unsigned int labels
typedef LabeledData<RealVector, unsigned int> ClassificationDataset;

/// specialized template for regression with RealVector labels
typedef LabeledData<RealVector, RealVector> RegressionDataset;

/// specialized template for classification with unsigned int labels and sparse data
typedef LabeledData<CompressedRealVector, unsigned int> CompressedClassificationDataset;

template<class Functor, class T>
struct TransformedData{
	typedef Data<typename detail::TransformedDataElement<Functor,T>::type > type;
};


/**
 * \addtogroup shark_globals
 * @{
 */

/// \brief creates a data object from a range of elements
template<class Range>
Data<typename boost::range_value<Range>::type>
createDataFromRange(Range const& inputs, std::size_t maximumBatchSize = 0){
	typedef typename boost::range_value<Range const>::type Input;
	typedef typename boost::range_iterator<Range const>::type Iterator;

	if (maximumBatchSize == 0)
		maximumBatchSize = Data<Input>::DefaultBatchSize;

	std::size_t numPoints = shark::size(inputs);
	//first determine the optimal number of batches as well as batch size
	std::size_t batches = numPoints / maximumBatchSize;
	if(numPoints > batches*maximumBatchSize)
		++batches;
	std::size_t optimalBatchSize=numPoints/batches;
	std::size_t remainder = numPoints-batches*optimalBatchSize;
	Data<Input> data(batches);

	//now create the batches taking the remainder into account
	Iterator start= boost::begin(inputs);
	for(std::size_t i = 0; i != batches; ++i){
		std::size_t size = (i<remainder)?optimalBatchSize+1:optimalBatchSize;
		Iterator end = start+size;
		data.batch(i) = createBatch<Input>(
			boost::make_iterator_range(start,end)
		);
		start = end;
	}

	return data;
}

/// \brief creates a data object from a range of elements
template<class Range>
UnlabeledData<typename boost::range_value<Range>::type>
createUnlabeledDataFromRange(Range const& inputs, std::size_t maximumBatchSize = 0){
	return createDataFromRange(inputs,maximumBatchSize);
}
/// \brief creates a labeled data object from two ranges, representing inputs and labels
template<class Range1, class Range2>
LabeledData<
	typename boost::range_value<Range1>::type,
	typename boost::range_value<Range2>::type
> createLabeledDataFromRange(Range1 const& inputs, Range2 const& labels, std::size_t batchSize = 0){
	SHARK_CHECK(boost::size(inputs) == boost::size(labels),
	"[createDataFromRange] number of inputs and number of labels must agree");
	typedef typename boost::range_value<Range1>::type Input;
	typedef typename boost::range_value<Range2>::type Label;

	if (batchSize == 0)
		batchSize = LabeledData<Input,Label>::DefaultBatchSize;

	return LabeledData<Input,Label>(
		createDataFromRange(inputs,batchSize),
		createDataFromRange(labels,batchSize)
	);
}

///brief  Outstream of elements for labeled data.
template<class T, class U>
std::ostream &operator << (std::ostream &stream, const LabeledData<T, U>& d) {
	typedef typename LabeledData<T, U>::const_element_reference reference;
	typename LabeledData<T, U>::const_element_range elements = d.elements();
	BOOST_FOREACH(reference elem,elements)
		stream << elem.input << " [" << elem.label <<"]"<< "\n";
	return stream;
}


// FUNCTIONS FOR DIMENSIONALITY


///\brief Return the number of classes of a set of class labels with unsigned int label encoding
inline unsigned int numberOfClasses(Data<unsigned int> const& labels){
	unsigned int classes = 0;
	for(std::size_t i = 0; i != labels.numberOfBatches(); ++i){
		classes = std::max(classes,*std::max_element(labels.batch(i).begin(),labels.batch(i).end()));
	}
	return classes+1;
}

///\brief Returns the number of members of each class in the dataset.
inline std::vector<std::size_t> classSizes(Data<unsigned int> const& labels){
	std::vector<std::size_t> classCounts(numberOfClasses(labels),0u);
	for(std::size_t i = 0; i != labels.numberOfBatches(); ++i){
		std::size_t batchSize = size(labels.batch(i));
		for(std::size_t j = 0; j != batchSize; ++j){
			classCounts[labels.batch(i)(j)]++;
		}
	}
	return classCounts;
}

///\brief  Return the dimensionality of a  dataset.
template <class InputType>
std::size_t dataDimension(Data<InputType> const& dataset){
	SHARK_ASSERT(dataset.numberOfElements() > 0);
	return dataset.element(0).size();
}

///\brief  Return the input dimensionality of a labeled dataset.
template <class InputType, class LabelType>
std::size_t inputDimension(LabeledData<InputType, LabelType> const& dataset){
	return dataDimension(dataset.inputs());
}

///\brief  Return the label/output dimensionality of a labeled dataset.
template <class InputType, class LabelType>
std::size_t labelDimension(LabeledData<InputType, LabelType> const& dataset){
	return dataDimension(dataset.labels());
}
///\brief Return the number of classes (highest label value +1) of a classification dataset with unsigned int label encoding
template <class InputType>
std::size_t numberOfClasses(LabeledData<InputType, unsigned int> const& dataset){
	return numberOfClasses(dataset.labels());
}
///\brief Returns the number of members of each class in the dataset.
template<class InputType, class LabelType>
inline std::vector<std::size_t> classSizes(LabeledData<InputType, LabelType> const& dataset){
	return classSizes(dataset.labels());
}


//subsetting
template<class DatasetT>
DatasetT indexedSubset(
	DatasetT const& dataset,
	typename DatasetT::IndexSet const& indices
){
	DatasetT subset;
	dataset.indexedSubset(indices,subset);
	return subset;
}
///\brief  Fill in the subset of batches [start,...,size+start[.
template<class DatasetT>
DatasetT rangeSubset(DatasetT const& dataset, std::size_t start, std::size_t end){
	typename DatasetT::IndexSet indices;
	detail::range(end-start, start, indices);
	return indexedSubset(dataset,indices);
}
///\brief  Fill in the subset of batches [0,...,size[.
template<class DatasetT>
DatasetT rangeSubset(DatasetT const& dataset, std::size_t size){
	return rangeSubset(dataset,size,0);
}

// TRANSFORMATION
///\brief Transforms a dataset using a Functor f and returns the transformed result.
///
/// this version is used, when the Functor supports only element-by-element transformations
template<class T,class Functor>
typename boost::lazy_disable_if<
	CanBeCalled<Functor,typename Data<T>::batch_type>,
	TransformedData<Functor,T>
>::type
transform(Data<T> const& data, Functor f){
	typedef typename detail::TransformedDataElement<Functor,T>::type ResultType;
	int batches = (int) data.numberOfBatches();
	Data<ResultType> result(batches);
	SHARK_PARALLEL_FOR(int i = 0; i < batches; ++i)
		result.batch(i)= createBatch<T>(boost::adaptors::transform(data.batch(i), f));
	return result;
}

///\brief Transforms a dataset using a Functor f and returns the transformed result.
///
/// this version is used, when the Functor supports batch-by-batch transformations
template<class T,class Functor>
typename boost::lazy_enable_if<
	CanBeCalled<Functor,typename Data<T>::batch_type>,
	TransformedData<Functor,T>
>::type
transform(Data<T> const& data, Functor const& f){
	typedef typename detail::TransformedDataElement<Functor,T>::type ResultType;
	int batches = (int) data.numberOfBatches();
	Data<ResultType> result(batches);
	SHARK_PARALLEL_FOR(int i = 0; i < batches; ++i)
		result.batch(i)= f(data.batch(i));
	return result;
}

///\brief Transforms the inputs of a dataset and return the transformed result.
template<class I,class L, class Functor>
LabeledData<typename detail::TransformedDataElement<Functor,I >::type, L >
transformInputs(LabeledData<I,L> const& data, Functor const& f){
	typedef LabeledData<typename detail::TransformedDataElement<Functor,I>::type,L > DatasetType;
	return DatasetType(transform(data.inputs(),f),data.labels());
}
///\brief Transforms the labels of a dataset and returns the transformed result.
template<class I,class L, class Functor>
LabeledData<I,typename detail::TransformedDataElement<Functor,L >::type >
transformLabels(LabeledData<I,L> const& data, Functor const& f){
	typedef LabeledData<I,typename detail::TransformedDataElement<Functor,L>::type > DatasetType;
	return DatasetType(data.inputs(),transform(data.labels(),f));
}

///\brief Creates a copy o a dataset selecting only a certain set of features.
template<class FeatureSet>
Data<RealVector> selectFeatures(Data<RealVector> const& data,FeatureSet const& features){
	return transform(data,detail::SelectFeatures<FeatureSet>(features));
}

template<class T, class FeatureSet>
LabeledData<RealVector,T> selectInputFeatures(LabeledData<RealVector,T> const& data,FeatureSet const& features){
	return transformInputs(data, detail::SelectFeatures<FeatureSet>(features));
}



/// \brief Removes the last part of a given dataset and returns a new split containing the removed elements
///
/// For this operation, the dataset is not allowed to be shared.
/// \brief data The dataset which should be splited
/// \brief index the first element to be split
/// \returns the  set which contains the splitd element (right part of the given set)
template<class DatasetT>
DatasetT splitAtElement(DatasetT& data, std::size_t elementIndex){
	SIZE_CHECK(elementIndex<=data.numberOfElements());

	std::size_t batchPos = 0;
	std::size_t batchStart = 0;
	while(batchStart + boost::size(data.batch(batchPos)) < elementIndex){
		batchStart += boost::size(data.batch(batchPos));
		++batchPos;
	};
	std::size_t splitPoint = elementIndex-batchStart;
	if(splitPoint != 0){
		data.splitBatch(batchPos,splitPoint);
		++batchPos;
	}

	return data.splice(batchPos);
}


///\brief reorders the dataset such, that points are grouped by labels
///
/// The elements are not only reordered but the batches are also resized such, that every batch
/// only contains elements of one class. This method must be used in order to use binarySubproblem.
template<class I>
void repartitionByClass(LabeledData<I,unsigned int>& data,std::size_t batchSize = LabeledData<I,unsigned int>::DefaultBatchSize){
	std::vector<std::size_t > classCounts = classSizes(data);
	std::vector<std::size_t > partitioning;//new, optimal partitioning of the data according to the batch sizes
	std::vector<std::size_t > classStart;//at which batch the elements of the class are starting
	detail::batchPartitioning(classCounts, classStart, partitioning, batchSize);

	data.repartition(partitioning);

	// Now place examples into the batches reserved for their class...

	// The following line does the job in principle but it crashes with clang on the mac:
//	boost::sort(data.elements());//todo we are lying here, use bidirectional iterator sort.

	// The following fixes the issue. As an aside it is even linear time:
	std::vector<std::size_t> bat = classStart;           // batch index until which the class is already filled in
	std::vector<std::size_t> idx(classStart.size(), 0);  // index within the batch until which the class is already filled in
	unsigned int c = 0;                                  // current class in whose batch space we operate
	typedef typename Batch<I>::type InputBatchType;
	typedef typename Batch<unsigned int>::type LabelBatchType;
	for (std::size_t b=0; b<data.numberOfBatches(); b++)
	{
		// update class range index
		std::size_t e = 0;
		while (c + 1 < classStart.size() && b == classStart[c + 1])
		{
			c++;
			b = bat[c];
			e = idx[c];
		}
		if (b == data.numberOfBatches()) break;

		InputBatchType& bi1 = data.inputs().batch(b);
		LabelBatchType& bl1 = data.labels().batch(b);
		while (true)
		{
			unsigned int l = shark::get(bl1, e);
			if (l == c)   // leave element in place
			{
				e++;
				idx[c] = e;
				if (e == boost::size(bl1))
				{
					e = 0;
					idx[c] = 0;
					bat[c]++;
					break;
				}
			}
			else   // swap elements
			{
				InputBatchType& bi2 = data.inputs().batch(bat[l]);
				LabelBatchType& bl2 = data.labels().batch(bat[l]);
				swap(shark::get(bi1, e), shark::get(bi2, idx[l]));
				shark::get(bl1, e) = shark::get(bl2, idx[l]);
				shark::get(bl2, idx[l]) = l;
				idx[l]++;
				if (idx[l] == boost::size(bl2))
				{
					idx[l] = 0;
					bat[l]++;
				}
			}
		}
	}
}

template<class I>
LabeledData<I,unsigned int> binarySubProblem(
	LabeledData<I,unsigned int>const& data,
	unsigned int zeroClass,
	unsigned int oneClass
){
	std::vector<std::size_t> indexSet;
	std::size_t smaller = std::min(zeroClass,oneClass);
	std::size_t bigger = std::max(zeroClass,oneClass);
	std::size_t numBatches = data.numberOfBatches();

	//find first class
	std::size_t start= 0;
	for(;start != numBatches && get(data.batch(start),0).label != smaller;++start);
	SHARK_CHECK(start != numBatches, "[shark::binarySubProblem] class does not exist");

	//copy batch indices of first class
	for(;start != numBatches && get(data.batch(start),0).label == smaller; ++start)
		indexSet.push_back(start);

	//find second class
	for(;start != numBatches && get(data.batch(start),0).label != bigger;++start);
	SHARK_CHECK(start != numBatches, "[shark::binarySubProblem] class does not exist");

	//copy batch indices of second class
	for(;start != numBatches && get(data.batch(start),0).label == bigger; ++start)
		indexSet.push_back(start);

	return transformLabels(indexedSubset(data,indexSet), detail::TransformOneVersusRestLabels(oneClass));
}

/// \brief Construct a binary (two-class) one-versus-rest problem from a multi-class problem.
///
/// \par
/// The function returns a new LabeledData object. The input part
/// coincides with the multi-class data, but the label part is replaced
/// with binary labels 0 and 1. All instances of the given class
/// (parameter oneClass) get a label of one, all others are assigned a
/// label of zero.
template<class I>
LabeledData<I,unsigned int> oneVersusRestProblem(
	LabeledData<I,unsigned int>const& data,
	unsigned int oneClass)
{
	return transformLabels(data, detail::TransformOneVersusRestLabels(oneClass));
}

///
///\brief Transformation function multiplying the elements in a dataset by a scalar or component-wise by values stores in a vector
///
class Multiply {
public:
	///@param factor All components of all vectors in the dataset are multiplied by this number
	Multiply(double factor) : m_factor(factor), m_scalar(true) {}
	///@param factor For all elements in the dataset, the i-th component is multiplied with the i-th component of this vector
	Multiply(const RealVector factor) : m_factor(0), m_factorv(factor), m_scalar(false) {}

	typedef RealVector result_type;

	RealVector operator()(RealVector input) const {
		if(m_scalar) {
			for(std::size_t i = 0; i != input.size(); ++i) input(i) *= m_factor;
			return input;
		} else {
			SIZE_CHECK(m_factorv.size() == input.size());
			for(std::size_t i = 0; i != input.size(); ++i) input(i) *= m_factorv(i);
			return input;
		}
	}
private:
	double m_factor;
	RealVector m_factorv;
	bool m_scalar;
};

///
///\brief Transformation function dividing the elements in a dataset by a scalar or component-wise by values stores in a vector
///
class Divide {
public:
	///@param factor All components of all vectors in the dataset are divided by this number
	Divide(double factor) : m_factor(factor), m_scalar(true) {}
	///@param factor For all elements in the dataset, the i-th component is divided by the i-th component of this vector
	Divide(const RealVector factor) : m_factor(0), m_factorv(factor), m_scalar(false) {}

	typedef RealVector result_type;

	RealVector operator()(RealVector input) const {
		if(m_scalar) {
			for(std::size_t i = 0; i != input.size(); ++i) input(i) /= m_factor;
			return input;
		} else {
			SIZE_CHECK(m_factorv.size() == input.size());
			for(std::size_t i = 0; i != input.size(); ++i) input(i) /= m_factorv(i);
			return input;
		}
	}
private:
	double m_factor;
	RealVector m_factorv;
	bool m_scalar;
};


///
///\brief Transformation function adding a vector or a scalar to the elements in a dataset
///
class Shift {
public:
	///@param offset Scalar added to all components of all vectors in the dataset
	Shift(double offset) : m_offset(offset), m_scalar(true) {}
	///@param offset Vector added to vectors in the dataset
	Shift(const RealVector offset) : m_offsetv(offset), m_scalar(false) {}

	typedef RealVector result_type;

	RealVector operator()(RealVector input) const {
		if(m_scalar) {
			for(std::size_t i = 0; i != input.size(); ++i)
				input(i) += m_offset;
		} else {
			SIZE_CHECK(m_offsetv.size() == input.size());
			for(std::size_t i = 0; i != input.size(); ++i)
				input(i) += m_offsetv(i);
		}
		return input;

	}
private:
	double m_offset;
	RealVector m_offsetv;
	bool m_scalar;
};

///
///\brief Transformation function truncating elements in a dataset
///
class Truncate {
public:
	///@param minValue All elements below this value are cut to the minimum value
	///@param maxValue All elements above this value are cut to the maximum value
	Truncate(double minValue,double maxValue) : m_min(minValue), m_max(maxValue){}
	///@param minv Lower bound for element-wise truncation
	///@param maxv Upper bound for element-wise truncation
	Truncate(const RealVector minv, const RealVector maxv) : m_min(1), m_max(-1), m_minv(minv), m_maxv(maxv) { SIZE_CHECK(m_minv.size() == m_maxv.size()); }

	typedef RealVector result_type;

	RealVector operator()(RealVector input) const {
		if(m_min < m_max) {
			for(std::size_t i = 0; i != input.size(); ++i){
				input(i) = std::max(m_min, std::min(m_max, input(i)));
			}
		} else {
			SIZE_CHECK(m_minv.size() == input.size());
			for(std::size_t i = 0; i != input.size(); ++i){
				input(i) = std::max(m_minv(i), std::min(m_maxv(i), input(i)));
			}
		}
		return input;
	}
private:
	double m_min;
	double m_max;
	RealVector m_minv;
	RealVector m_maxv;
};

///
///\brief Transformation function first truncating and then rescaling elements in a dataset
///
class TruncateAndRescale {
public:
	///@param minCutValue All elements below this value are cut to the minimum value
	///@param maxCutValue All elements above this value are cut to the maximum value
	///@param minValue The imterval [minCutValue, maxCutValue] is mapped to [minValue, maxValue]
	///@param maxValue The imterval [minCutValue, maxCutValue] is mapped to [minValue, maxValue]
	TruncateAndRescale(double minCutValue, double maxCutValue, double minValue = 0., double maxValue = 1.) : m_minCut(minCutValue), m_maxCut(maxCutValue), m_range(maxValue - minValue), m_min(minValue), m_scalar(true) {}
	///@param minv Lower bound for element-wise truncation
	///@param maxv Upper bound for element-wise truncation
	///@param minValue The imterval [minv, maxv is mapped to [minValue, maxValue]
	///@param maxValue The imterval [minv, maxv] is mapped to [minValue, maxValue]
	TruncateAndRescale(const RealVector minv, const RealVector maxv, double minValue = 0., double maxValue = 1.) : m_minCutv(minv), m_maxCutv(maxv), m_range(maxValue - minValue), m_min(minValue), m_scalar(false) { SIZE_CHECK(m_minCutv.size() == m_maxCutv.size()); }

	typedef RealVector result_type;

	RealVector operator()(RealVector input) const {
		if(m_scalar) {
			for(std::size_t i = 0; i != input.size(); ++i){
				input(i) = (std::max(m_minCut, std::min(m_maxCut, input(i))) - m_minCut)  / (m_maxCut - m_minCut) * m_range + m_min;
			}
		} else {
			SIZE_CHECK(m_minCutv.size() == input.size());
			for(std::size_t i = 0; i != input.size(); ++i){
				input(i) = (std::max(m_minCutv(i), std::min(m_maxCutv(i), input(i))) - m_minCutv(i))  / (m_maxCutv(i) - m_minCutv(i)) * m_range + m_min;
			}
		}
		return input;
	}
private:
	double m_minCut;
	double m_maxCut;
	RealVector m_minCutv;
	RealVector m_maxCutv;
	double m_range; // maximum - minimum
	double m_min;
	bool m_scalar;
};


template <typename RowType> RowType getColumn(const Data<RowType> &data, std::size_t columnID) {
	SHARK_ASSERT(dataDimension(data) > columnID);
	RowType column(data.numberOfElements());
	std::size_t rowCounter = 0;
	BOOST_FOREACH(typename Data<RowType>::const_element_reference row, data.elements()){
		column(rowCounter) = row(columnID);
		rowCounter++;
	}
	return column;
}

template <typename RowType> void setColumn(Data<RowType> &data, std::size_t columnID, RowType newColumn) {
	SHARK_ASSERT(dataDimension(data) > columnID);
	SHARK_ASSERT(data.numberOfElements() == newColumn.size());
	std::size_t rowCounter = 0;
	BOOST_FOREACH(typename Data<RowType>::element_reference row, data.elements()){
		row(columnID) = newColumn(rowCounter);
		rowCounter++;
	}
}

/** @*/
}

#endif