// /usr/include/tesseract/trainingsampleset.h is in libtesseract-dev 3.02.01-2.
// This file is owned by root:root, with mode 0o644.
// The actual contents of the file can be viewed below.
//
// Copyright 2010 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_TRAINING_TRAININGSAMPLESET_H__
#define TESSERACT_TRAINING_TRAININGSAMPLESET_H__
#include "bitvector.h"
#include "genericvector.h"
#include "indexmapbidi.h"
#include "matrix.h"
#include "shapetable.h"
#include "trainingsample.h"
// Global-namespace forward declarations.
// NOTE(review): TrainingSampleSet below holds a UNICHARSET by value
// (unicharset_), which requires the complete type; presumably one of the
// headers included above supplies the definition -- confirm.
class UNICHARSET;
template <typename T> class UnicityTable;
namespace tesseract {
// Forward declarations of tesseract types that this header names only via
// pointers, references or template arguments.
struct FontInfo;
class IntFeatureMap;
class IntFeatureSpace;
class TrainingSample;
class UnicharAndFonts;
// Collection of TrainingSample used for training or testing a classifier.
// Provides several useful methods to operate on the collection as a whole,
// including outlier detection and deletion, providing access by font and
// class, finding the canonical sample, finding the "cloud" features (OR of
// all features in all samples), replication of samples, caching of distance
// metrics.
class TrainingSampleSet {
 public:
  explicit TrainingSampleSet(const UnicityTable<FontInfo>& fontinfo_table);
  ~TrainingSampleSet();

  // Serializes to fp. A false return signals an error.
  bool Serialize(FILE* fp) const;
  // Deserializes from fp. A false return signals an error.
  // Pass swap = true when a big/little-endian swap is required.
  bool DeSerialize(bool swap, FILE* fp);

  // ---- Simple accessors ----
  int num_samples() const { return samples_.size(); }
  int num_raw_samples() const { return num_raw_samples_; }
  int NumFonts() const { return font_id_map_.SparseSize(); }
  const UNICHARSET& unicharset() const { return unicharset_; }
  int charsetsize() const { return unicharset_size_; }

  // Loads the starting unicharset from filename; if the file cannot be read
  // a unicharset is set up instead.
  void LoadUnicharset(const char* filename);

  // Adds a character sample. A unichar that is not yet present in the local
  // unicharset gets added to it. The return value is the unichar_id of the
  // added sample in the local unicharset.
  int AddSample(const char* unichar, TrainingSample* sample);
  // Adds a character sample under the given unichar_id, which has to be an
  // id of the local unicharset (the one in this).
  void AddSample(int unichar_id, TrainingSample* sample);

  // Number of samples for the (font, class) pair. With randomize set, this
  // is the number of samples accessible with randomizing on (which raises
  // the count when it is small).
  // Requires a prior call to OrganizeByFontAndClass.
  int NumClassSamples(int font_id, int class_id, bool randomize) const;

  // Looks a sample up by its index.
  const TrainingSample* GetSample(int index) const;
  // Looks a sample up by font, class and index.
  // Requires a prior call to OrganizeByFontAndClass.
  const TrainingSample* GetSample(int font_id, int class_id, int index) const;
  // Looks a sample up by font, class and index, without randomizing.
  // Requires a prior call to OrganizeByFontAndClass.
  TrainingSample* MutableSample(int font_id, int class_id, int index);

  // Builds a debug string for the sample:
  // font, unichar_str, bounding box, page.
  STRING SampleToString(const TrainingSample& sample) const;

  // Combined set of features used by all samples of the font/class
  // combination.
  const BitVector& GetCloudFeatures(int font_id, int class_id) const;
  // Indexed features of the canonical sample of the font/class combination.
  const GenericVector<int>& GetCanonicalFeatures(int font_id,
                                                 int class_id) const;

  // Distance between the given pair of UnicharAndFonts.
  // With matched_fonts set, only matching fonts are considered, unless that
  // would yield the empty set.
  // Requires a prior call to OrganizeByFontAndClass.
  float UnicharDistance(const UnicharAndFonts& uf1,
                        const UnicharAndFonts& uf2,
                        bool matched_fonts,
                        const IntFeatureMap& feature_map);
  // Distance between two font/class pairs: taken from the cache if present,
  // otherwise computed and then cached.
  // Requires a prior call to OrganizeByFontAndClass.
  float ClusterDistance(int font_id1, int class_id1,
                        int font_id2, int class_id2,
                        const IntFeatureMap& feature_map);
  // Computes the distance between two font/class pairs.
  float ComputeClusterDistance(int font_id1, int class_id1,
                               int font_id2, int class_id2,
                               const IntFeatureMap& feature_map) const;

  // Counts the canonical features of font/class 2 for which neither the
  // feature itself nor any near neighbor of it appears in the cloud of
  // font/class 1. Every such feature reliably separates the two classes,
  // ASSUMING the canonical sample is representative enough that each sample
  // has a feature near that particular feature. Verifying that on the fly
  // would be prohibitively expensive, though the canonical features could
  // possibly be pre-qualified to keep only those for which the assumption
  // holds.
  // ComputeCanonicalFeatures and ComputeCloudFeatures must have run first,
  // otherwise the results are nonsense.
  int ReliablySeparable(int font_id1, int class_id1,
                        int font_id2, int class_id2,
                        const IntFeatureMap& feature_map,
                        bool thorough) const;

  // Total index of the requested sample.
  // Requires a prior call to OrganizeByFontAndClass.
  int GlobalSampleIndex(int font_id, int class_id, int index) const;

  // Canonical sample of the (font, class) pair.
  // Requires a prior call to ComputeCanonicalSamples.
  const TrainingSample* GetCanonicalSample(int font_id, int class_id) const;
  // Max distance for the canonical sample of the (font, class) pair.
  // Requires a prior call to ComputeCanonicalSamples.
  float GetCanonicalDist(int font_id, int class_id) const;

  // Mutable pointer to the sample stored at index.
  TrainingSample* mutable_sample(int index) { return samples_[index]; }
  // Hands ownership of the sample at index over to the caller. The slot in
  // samples_ is overwritten with NULL.
  TrainingSample* extract_sample(int index) {
    TrainingSample* const released = samples_[index];
    samples_[index] = NULL;
    return released;
  }

  // Generates indexed features for every sample using feature_space.
  void IndexFeatures(const IntFeatureSpace& feature_space);
  // Deletes outliers: samples with few features shared with others.
  // Requires a prior call to IndexFeatures.
  void DeleteOutliers(const IntFeatureSpace& feature_space, bool debug);
  // Marks sample for deletion; DeleteDeadSamples completes the deletion.
  void KillSample(TrainingSample* sample);
  // Deletes every sample that KillSample marked with a negative sample
  // index. Call this before OrganizeByFontAndClass, and call
  // OrganizeByFontAndClass afterwards because the samples are renumbered.
  void DeleteDeadSamples();
  // Callback: true if sample should be deleted because its classid is
  // negative.
  bool DeleteableSample(const TrainingSample* sample);

  // Builds the array that gives access to the samples by font,class pair.
  void OrganizeByFontAndClass();
  // Builds font_id_map_, mapping real (sparse) font_ids to a compact index
  // into font_class_array_.
  void SetupFontIdMap();
  // For each font, class pair, finds the sample whose maximum distance to
  // all other samples of the same font, class is least.
  // Requires a prior call to OrganizeByFontAndClass.
  void ComputeCanonicalSamples(const IntFeatureMap& map, bool debug);
  // Replicates samples up to a minimum frequency of 2 * kSampleRandomSize,
  // or duplicates all samples for larger counts. The replicas are then
  // perturbed slightly, in a predictable and repeatable way.
  // Use after OrganizeByFontAndClass().
  void ReplicateAndRandomizeSamples();
  // Caches the indexed features of the canonical samples.
  // Requires a prior call to ComputeCanonicalSamples.
  void ComputeCanonicalFeatures();
  // Computes, per font/class combination, the combined set of features used
  // by all of its samples. Use after ReplicateAndRandomizeSamples.
  void ComputeCloudFeatures(int feature_space_size);

  // Adds every font of class_id to shape.
  void AddAllFontsForClass(int class_id, Shape* shape) const;
  // Displays the samples that have the indexed feature f_index and also
  // match shape.
  void DisplaySamplesWithFeature(int f_index, const Shape& shape,
                                 const IntFeatureSpace& feature_space,
                                 ScrollView::Color color,
                                 ScrollView* window) const;

 private:
  // Distance-cache entry: a (unichar, font, distance) triplet.
  struct FontClassDistance {
    int unichar_id;
    int font_id;  // Real font id.
    float distance;
  };

  // Per font/class combination bookkeeping.
  struct FontClassInfo {
    FontClassInfo();
    // Serializes to fp. A false return signals an error.
    bool Serialize(FILE* fp) const;
    // Deserializes from fp. A false return signals an error.
    // Pass swap = true when a big/little-endian swap is required.
    bool DeSerialize(bool swap, FILE* fp);

    // Count of raw samples.
    inT32 num_raw_samples;
    // Index of the canonical sample.
    inT32 canonical_sample;
    // Max distance from the canonical sample to any other.
    float canonical_dist;
    // Indices of the samples, replicated ones included.
    GenericVector<inT32> samples;

    // ---- Cache data that is not serialized ----
    // Indexed features of the canonical sample.
    GenericVector<int> canonical_features;
    // Mapped features of all the samples.
    BitVector cloud_features;

    // ---- Caches for ClusterDistance ----
    // Other fonts, same unichar. -1 means not set.
    // Indexed by the compact font index from font_id_map_.
    GenericVector<float> font_distance_cache;
    // Other unichars, same font. -1 means not set.
    GenericVector<float> unichar_distance_cache;
    // Everything else (neither font nor unichar matches).
    // A cache of distances computed by ReliablySeparable.
    GenericVector<FontClassDistance> distance_cache;
  };

  PointerVector<TrainingSample> samples_;
  // Sample count before replication/randomization.
  int num_raw_samples_;
  // The character set being trained for.
  UNICHARSET unicharset_;
  // Character set size that the 2-d arrays below refer to.
  int unicharset_size_;
  // Lets font_class_array_ below be compact.
  // Sparse space: the real font_id, as used in samples_.
  // Compact space: an index into font_class_array_.
  IndexMapBiDi font_id_map_;
  // 2-d array of FontClassInfo, one entry per (font_id, class_id) pair.
  // NOTE(review): this is a raw pointer in a class with a user-declared
  // destructor and no disabled copy operations -- confirm that instances
  // are never copied.
  GENERIC_2D_ARRAY<FontClassInfo>* font_class_array_;
  // Reference to the fontinfo_table_ in MasterTrainer, which supplies names
  // for the font_ids in the samples. Not serialized!
  const UnicityTable<FontInfo>& fontinfo_table_;
};
} // namespace tesseract.
#endif  // TESSERACT_TRAINING_TRAININGSAMPLESET_H__