This file is indexed.

/usr/include/tesseract/trainingsampleset.h is in libtesseract-dev 3.02.01-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
// Copyright 2010 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_TRAINING_TRAININGSAMPLESET_H__
#define TESSERACT_TRAINING_TRAININGSAMPLESET_H__

#include "bitvector.h"
#include "genericvector.h"
#include "indexmapbidi.h"
#include "matrix.h"
#include "shapetable.h"
#include "trainingsample.h"

class UNICHARSET;
template <typename T> class UnicityTable;

namespace tesseract {

struct FontInfo;
class IntFeatureMap;
class IntFeatureSpace;
class TrainingSample;
class UnicharAndFonts;

// Collection of TrainingSample used for training or testing a classifier.
// Provides several useful methods to operate on the collection as a whole,
// including outlier detection and deletion, providing access by font and
// class, finding the canonical sample, finding the "cloud" features (OR of
// all features in all samples), replication of samples, caching of distance
// metrics.
class TrainingSampleSet {
 public:
  explicit TrainingSampleSet(const UnicityTable<FontInfo>& fontinfo_table);
  ~TrainingSampleSet();

  // Writes to the given file. Returns false in case of error.
  bool Serialize(FILE* fp) const;
  // Reads from the given file. Returns false in case of error.
  // If swap is true, assumes a big/little-endian swap is needed.
  bool DeSerialize(bool swap, FILE* fp);

  // Accessors
  int num_samples() const {
    return samples_.size();
  }
  int num_raw_samples() const {
    return num_raw_samples_;
  }
  int NumFonts() const {
    return font_id_map_.SparseSize();
  }
  const UNICHARSET& unicharset() const {
    return unicharset_;
  }
  int charsetsize() const {
    return unicharset_size_;
  }

  // Loads an initial unicharset, or sets one up if the file cannot be read.
  void LoadUnicharset(const char* filename);

  // Adds a character sample to this sample set.
  // If the unichar is not already in the local unicharset, it is added.
  // Returns the unichar_id of the added sample, from the local unicharset.
  int AddSample(const char* unichar, TrainingSample* sample);
  // Adds a character sample to this sample set with the given unichar_id,
  // which must correspond to the local unicharset (in this).
  void AddSample(int unichar_id, TrainingSample* sample);

  // Returns the number of samples for the given font,class pair.
  // If randomize is true, returns the number of samples accessible
  // with randomizing on. (Increases the number of samples if small.)
  // OrganizeByFontAndClass must have been already called.
  int NumClassSamples(int font_id, int class_id, bool randomize) const;

  // Gets a sample by its index.
  const TrainingSample* GetSample(int index) const;

  // Gets a sample by its font, class, index.
  // OrganizeByFontAndClass must have been already called.
  const TrainingSample* GetSample(int font_id, int class_id, int index) const;

  // Get a sample by its font, class, index. Does not randomize.
  // OrganizeByFontAndClass must have been already called.
  TrainingSample* MutableSample(int font_id, int class_id, int index);

  // Returns a string debug representation of the given sample:
  // font, unichar_str, bounding box, page.
  STRING SampleToString(const TrainingSample& sample) const;

  // Gets the combined set of features used by all the samples of the given
  // font/class combination.
  const BitVector& GetCloudFeatures(int font_id, int class_id) const;
  // Gets the indexed features of the canonical sample of the given
  // font/class combination.
  const GenericVector<int>& GetCanonicalFeatures(int font_id,
                                                 int class_id) const;

  // Returns the distance between the given UniCharAndFonts pair.
  // If matched_fonts, only matching fonts, are considered, unless that yields
  // the empty set.
  // OrganizeByFontAndClass must have been already called.
  float UnicharDistance(const UnicharAndFonts& uf1, const UnicharAndFonts& uf2,
                        bool matched_fonts, const IntFeatureMap& feature_map);

  // Returns the distance between the given pair of font/class pairs.
  // Finds in cache or computes and caches.
  // OrganizeByFontAndClass must have been already called.
  float ClusterDistance(int font_id1, int class_id1,
                        int font_id2, int class_id2,
                        const IntFeatureMap& feature_map);

  // Computes the distance between the given pair of font/class pairs.
  float ComputeClusterDistance(int font_id1, int class_id1,
                               int font_id2, int class_id2,
                               const IntFeatureMap& feature_map) const;

  // Returns the number of canonical features of font/class 2 for which
  // neither the feature nor any of its near neighbors occurs in the cloud
  // of font/class 1. Each such feature is a reliable separation between
  // the classes, ASSUMING that the canonical sample is sufficiently
  // representative that every sample has a feature near that particular
  // feature. To check that this is so on the fly would be prohibitively
  // expensive, but it might be possible to pre-qualify the canonical features
  // to include only those for which this assumption is true.
  // ComputeCanonicalFeatures and ComputeCloudFeatures must have been called
  // first, or the results will be nonsense.
  int ReliablySeparable(int font_id1, int class_id1,
                        int font_id2, int class_id2,
                        const IntFeatureMap& feature_map,
                        bool thorough) const;


  // Returns the total index of the requested sample.
  // OrganizeByFontAndClass must have been already called.
  int GlobalSampleIndex(int font_id, int class_id, int index) const;

  // Gets the canonical sample for the given font, class pair.
  // ComputeCanonicalSamples must have been called first.
  const TrainingSample* GetCanonicalSample(int font_id, int class_id) const;
  // Gets the max distance for the given canonical sample.
  // ComputeCanonicalSamples must have been called first.
  float GetCanonicalDist(int font_id, int class_id) const;

  // Returns a mutable pointer to the sample with the given index.
  TrainingSample* mutable_sample(int index) {
    return samples_[index];
  }
  // Gets ownership of the sample with the given index, removing it from this.
  TrainingSample* extract_sample(int index) {
    TrainingSample* sample = samples_[index];
    samples_[index] = NULL;
    return sample;
  }

  // Generates indexed features for all samples with the supplied feature_space.
  void IndexFeatures(const IntFeatureSpace& feature_space);

  // Delete outlier samples with few features that are shared with others.
  // IndexFeatures must have been called already.
  void DeleteOutliers(const IntFeatureSpace& feature_space, bool debug);

  // Marks the given sample for deletion.
  // Deletion is actually completed by DeleteDeadSamples.
  void KillSample(TrainingSample* sample);

  // Deletes all samples with a negative sample index marked by KillSample.
  // Must be called before OrganizeByFontAndClass, and OrganizeByFontAndClass
  // must be called after as the samples have been renumbered.
  void DeleteDeadSamples();

  // Callback function returns true if the given sample is to be deleted, due
  // to having a negative classid.
  bool DeleteableSample(const TrainingSample* sample);

  // Construct an array to access the samples by font,class pair.
  void OrganizeByFontAndClass();

  // Constructs the font_id_map_ which maps real font_ids (sparse) to a compact
  // index for the font_class_array_.
  void SetupFontIdMap();

  // Finds the sample for each font, class pair that has least maximum
  // distance to all the other samples of the same font, class.
  // OrganizeByFontAndClass must have been already called.
  void ComputeCanonicalSamples(const IntFeatureMap& map, bool debug);

  // Replicates the samples to a minimum frequency defined by
  // 2 * kSampleRandomSize, or for larger counts duplicates all samples.
  // After replication, the replicated samples are perturbed slightly, but
  // in a predictable and repeatable way.
  // Use after OrganizeByFontAndClass().
  void ReplicateAndRandomizeSamples();

  // Caches the indexed features of the canonical samples.
  // ComputeCanonicalSamples must have been already called.
  void ComputeCanonicalFeatures();
  // Computes the combined set of features used by all the samples of each
  // font/class combination. Use after ReplicateAndRandomizeSamples.
  void ComputeCloudFeatures(int feature_space_size);

  // Adds all fonts of the given class to the shape.
  void AddAllFontsForClass(int class_id, Shape* shape) const;

  // Display the samples with the given indexed feature that also match
  // the given shape.
  void DisplaySamplesWithFeature(int f_index, const Shape& shape,
                                 const IntFeatureSpace& feature_space,
                                 ScrollView::Color color,
                                 ScrollView* window) const;

 private:
  // Struct to store a triplet of unichar, font, distance in the distance cache.
  struct FontClassDistance {
    int unichar_id;
    int font_id;  // Real font id.
    float distance;
  };
  // Simple struct to store information related to each font/class combination.
  struct FontClassInfo {
    FontClassInfo();

    // Writes to the given file. Returns false in case of error.
    bool Serialize(FILE* fp) const;
    // Reads from the given file. Returns false in case of error.
    // If swap is true, assumes a big/little-endian swap is needed.
    bool DeSerialize(bool swap, FILE* fp);

    // Number of raw samples.
    inT32 num_raw_samples;
    // Index of the canonical sample.
    inT32 canonical_sample;
    // Max distance of the canonical sample from any other.
    float canonical_dist;
    // Sample indices for the samples, including replicated.
    GenericVector<inT32> samples;

    // Non-serialized cache data.
    // Indexed features of the canonical sample.
    GenericVector<int> canonical_features;
    // The mapped features of all the samples.
    BitVector cloud_features;

    // Caches for ClusterDistance.
    // Caches for other fonts but matching this unichar. -1 indicates not set.
    // Indexed by compact font index from font_id_map_.
    GenericVector<float> font_distance_cache;
    // Caches for other unichars but matching this font. -1 indicates not set.
    GenericVector<float> unichar_distance_cache;
    // Cache for the rest (non matching font and unichar.)
    // A cache of distances computed by ReliablySeparable.
    GenericVector<FontClassDistance> distance_cache;
  };

  PointerVector<TrainingSample> samples_;
  // Number of samples before replication/randomization.
  int num_raw_samples_;
  // Character set we are training for.
  UNICHARSET unicharset_;
  // Character set size to which the 2-d arrays below refer.
  int unicharset_size_;
  // Map to allow the font_class_array_ below to be compact.
  // The sparse space is the real font_id, used in samples_ .
  // The compact space is an index to font_class_array_
  IndexMapBiDi font_id_map_;
  // A 2-d array of FontClassInfo holding information related to each
  // (font_id, class_id) pair.
  GENERIC_2D_ARRAY<FontClassInfo>* font_class_array_;

  // Reference to the fontinfo_table_ in MasterTrainer. Provides names
  // for font_ids in the samples. Not serialized!
  const UnicityTable<FontInfo>& fontinfo_table_;
};

}  // namespace tesseract.


#endif  // TRAININGSAMPLESETSET_H_