/usr/include/tesseract/mastertrainer.h is in libtesseract-dev 3.02.01-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 | // Copyright 2010 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: mastertrainer.h
// Description: Trainer to build the MasterClassifier.
// Author: Ray Smith
// Created: Wed Nov 03 18:07:01 PDT 2010
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_TRAINING_MASTERTRAINER_H__
#define TESSERACT_TRAINING_MASTERTRAINER_H__
/**----------------------------------------------------------------------------
Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "classify.h"
#include "cluster.h"
#include "intfx.h"
#include "elst.h"
#include "featdefs.h"
#include "fontinfo.h"
#include "indexmapbidi.h"
#include "intfeaturespace.h"
#include "intfeaturemap.h"
#include "intmatcher.h"
#include "params.h"
#include "shapetable.h"
#include "trainingsample.h"
#include "trainingsampleset.h"
#include "unicharset.h"
namespace tesseract {
class ShapeClassifier;
// Simple struct to hold the distance between two shapes during clustering.
struct ShapeDist {
ShapeDist() : shape1(0), shape2(0), distance(0.0f) {}
ShapeDist(int s1, int s2, float dist)
: shape1(s1), shape2(s2), distance(dist) {}
// Sort operator to sort in ascending order of distance.
bool operator<(const ShapeDist& other) const {
return distance < other.distance;
}
int shape1;
int shape2;
float distance;
};
// Class to encapsulate training processes that use the TrainingSampleSet.
// Initially supports shape clustering and mftrainining.
// Other important features of the MasterTrainer are conditioning the data
// by outlier elimination, replication with perturbation, and serialization.
class MasterTrainer {
public:
MasterTrainer(NormalizationMode norm_mode, bool shape_analysis,
bool replicate_samples, int debug_level);
~MasterTrainer();
// Writes to the given file. Returns false in case of error.
bool Serialize(FILE* fp) const;
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerialize(bool swap, FILE* fp);
// Loads an initial unicharset, or sets one up if the file cannot be read.
void LoadUnicharset(const char* filename);
// Sets the feature space definition.
void SetFeatureSpace(const IntFeatureSpace& fs) {
feature_space_ = fs;
feature_map_.Init(fs);
}
// Reads the samples and their features from the given file,
// adding them to the trainer with the font_id from the content of the file.
// If verification, then these are verification samples, not training.
void ReadTrainingSamples(FILE *fp,
const FEATURE_DEFS_STRUCT& feature_defs,
bool verification);
// Adds the given single sample to the trainer, setting the classid
// appropriately from the given unichar_str.
void AddSample(bool verification, const char* unichar_str,
TrainingSample* sample);
// Loads all pages from the given tif filename and append to page_images_.
// Must be called after ReadTrainingSamples, as the current number of images
// is used as an offset for page numbers in the samples.
void LoadPageImages(const char* filename);
// Cleans up the samples after initial load from the tr files, and prior to
// saving the MasterTrainer:
// Remaps fragmented chars if running shape anaylsis.
// Sets up the samples appropriately for class/fontwise access.
// Deletes outlier samples.
void PostLoadCleanup();
// Gets the samples ready for training. Use after both
// ReadTrainingSamples+PostLoadCleanup or DeSerialize.
// Re-indexes the features and computes canonical and cloud features.
void PreTrainingSetup();
// Sets up the master_shapes_ table, which tells which fonts should stay
// together until they get to a leaf node classifier.
void SetupMasterShapes();
// Adds the junk_samples_ to the main samples_ set. Junk samples are initially
// fragments and n-grams (all incorrectly segmented characters).
// Various training functions may result in incorrectly segmented characters
// being added to the unicharset of the main samples, perhaps because they
// form a "radical" decomposition of some (Indic) grapheme, or because they
// just look the same as a real character (like rn/m)
// This function moves all the junk samples, to the main samples_ set, but
// desirable junk, being any sample for which the unichar already exists in
// the samples_ unicharset gets the unichar-ids re-indexed to match, but
// anything else gets re-marked as unichar_id 0 (space character) to identify
// it as junk to the error counter.
void IncludeJunk();
// Replicates the samples and perturbs them if the enable_replication_ flag
// is set. MUST be used after the last call to OrganizeByFontAndClass on
// the training samples, ie after IncludeJunk if it is going to be used, as
// OrganizeByFontAndClass will eat the replicated samples into the regular
// samples.
void ReplicateAndRandomizeSamplesIfRequired();
// Loads the basic font properties file into fontinfo_table_.
// Returns false on failure.
bool LoadFontInfo(const char* filename);
// Loads the xheight font properties file into xheights_.
// Returns false on failure.
bool LoadXHeights(const char* filename);
// Reads spacing stats from filename and adds them to fontinfo_table.
// Returns false on failure.
bool AddSpacingInfo(const char *filename);
// Returns the font id corresponding to the given font name.
// Returns -1 if the font cannot be found.
int GetFontInfoId(const char* font_name);
// Returns the font_id of the closest matching font name to the given
// filename. It is assumed that a substring of the filename will match
// one of the fonts. If more than one is matched, the longest is returned.
int GetBestMatchingFontInfoId(const char* filename);
// Sets up a flat shapetable with one shape per class/font combination.
void SetupFlatShapeTable(ShapeTable* shape_table);
// Sets up a Clusterer for mftraining on a single shape_id.
// Call FreeClusterer on the return value after use.
CLUSTERER* SetupForClustering(const ShapeTable& shape_table,
const FEATURE_DEFS_STRUCT& feature_defs,
int shape_id, int* num_samples);
// Writes the given float_classes (produced by SetupForFloat2Int) as inttemp
// to the given inttemp_file, and the corresponding pffmtable.
// The unicharset is the original encoding of graphemes, and shape_set should
// match the size of the shape_table, and may possibly be totally fake.
void WriteInttempAndPFFMTable(const UNICHARSET& unicharset,
const UNICHARSET& shape_set,
const ShapeTable& shape_table,
CLASS_STRUCT* float_classes,
const char* inttemp_file,
const char* pffmtable_file);
const UNICHARSET& unicharset() const {
return samples_.unicharset();
}
TrainingSampleSet* GetSamples() {
return &samples_;
}
const ShapeTable& master_shapes() const {
return master_shapes_;
}
// Generates debug output relating to the canonical distance between the
// two given UTF8 grapheme strings.
void DebugCanonical(const char* unichar_str1, const char* unichar_str2);
// Debugging for cloud/canonical features.
// Displays a Features window containing:
// If unichar_str2 is in the unicharset, and canonical_font is non-negative,
// displays the canonical features of the char/font combination in red.
// If unichar_str1 is in the unicharset, and cloud_font is non-negative,
// displays the cloud feature of the char/font combination in green.
// The canonical features are drawn first to show which ones have no
// matches in the cloud features.
// Until the features window is destroyed, each click in the features window
// will display the samples that have that feature in a separate window.
void DisplaySamples(const char* unichar_str1, int cloud_font,
const char* unichar_str2, int canonical_font);
// Tests the given test_classifier on the internal samples.
// See TestClassifier for details.
void TestClassifierOnSamples(int report_level,
bool replicate_samples,
ShapeClassifier* test_classifier,
STRING* report_string);
// Tests the given test_classifier on the given samples
// report_levels:
// 0 = no output.
// 1 = bottom-line error rate.
// 2 = bottom-line error rate + time.
// 3 = font-level error rate + time.
// 4 = list of all errors + short classifier debug output on 16 errors.
// 5 = list of all errors + short classifier debug output on 25 errors.
// If replicate_samples is true, then the test is run on an extended test
// sample including replicated and systematically perturbed samples.
// If report_string is non-NULL, a summary of the results for each font
// is appended to the report_string.
double TestClassifier(int report_level,
bool replicate_samples,
TrainingSampleSet* samples,
ShapeClassifier* test_classifier,
STRING* report_string);
// Returns the average (in some sense) distance between the two given
// shapes, which may contain multiple fonts and/or unichars.
// This function is public to facilitate testing.
float ShapeDistance(const ShapeTable& shapes, int s1, int s2);
private:
// Replaces samples that are always fragmented with the corresponding
// fragment samples.
void ReplaceFragmentedSamples();
// Runs a hierarchical agglomerative clustering to merge shapes in the given
// shape_table, while satisfying the given constraints:
// * End with at least min_shapes left in shape_table,
// * No shape shall have more than max_shape_unichars in it,
// * Don't merge shapes where the distance between them exceeds max_dist.
void ClusterShapes(int min_shapes, int max_shape_unichars,
float max_dist, ShapeTable* shape_table);
private:
NormalizationMode norm_mode_;
// Character set we are training for.
UNICHARSET unicharset_;
// Original feature space. Subspace mapping is contained in feature_map_.
IntFeatureSpace feature_space_;
TrainingSampleSet samples_;
TrainingSampleSet junk_samples_;
TrainingSampleSet verify_samples_;
// Master shape table defines what fonts stay together until the leaves.
ShapeTable master_shapes_;
// Flat shape table has each unichar/font id pair in a separate shape.
ShapeTable flat_shapes_;
// Font metrics gathered from multiple files.
UnicityTable<FontInfo> fontinfo_table_;
// Array of xheights indexed by font ids in fontinfo_table_;
GenericVector<int> xheights_;
// Non-serialized data initialized by other means or used temporarily
// during loading of training samples.
// Number of different class labels in unicharset_.
int charsetsize_;
// Flag to indicate that we are running shape analysis and need fragments
// fixing.
bool enable_shape_anaylsis_;
// Flag to indicate that sample replication is required.
bool enable_replication_;
// Flag to indicate that junk should be included in samples_.
bool include_junk_;
// Array of classids of fragments that replace the correctly segmented chars.
int* fragments_;
// Classid of previous correctly segmented sample that was added.
int prev_unichar_id_;
// Debug output control.
int debug_level_;
// Feature map used to construct reduced feature spaces for compact
// classifiers.
IntFeatureMap feature_map_;
// Vector of Pix pointers used for classifiers that need the image.
// Indexed by page_num_ in the samples.
// These images are owned by the trainer and need to be pixDestroyed.
GenericVector<Pix*> page_images_;
};
} // namespace tesseract.
#endif
|