This file is indexed.

/usr/include/tesseract/errorcounter.h is in libtesseract-dev 3.02.01-6.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
// Copyright 2011 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_
#define THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_

#include "genericvector.h"
#include "matrix.h"

struct Pix;
template <typename T> class UnicityTable;

namespace tesseract {

struct FontInfo;
class SampleIterator;
class ShapeClassifier;
class ShapeRating;
class ShapeTable;
class TrainingSample;

// Enumeration of the different types of error count.
// Error counts work as follows:
//
// Ground truth is a valid unichar-id / font-id pair:
//        Number of classifier answers?
//          0                       >0
//     CT_REJECT     BOTH unichar-id and font-id match top shape?
//     __________             yes!              no
//                   CT_SHAPE_TOP_CORRECT  CT_SHAPE_TOP_ERR
//                           |            Font attributes match?
//                           |               yes!        no
//                           |                 |     CT_FONT_ATTR_ERROR
//                           |         Top unichar-id matches?
//                           |         yes!          no
//       Top shape-id has multiple unichars?    CT_UNICHAR_TOP1_ERR
//               yes!            no           2nd shape unichar id matches?
//        CT_OK_MULTI_UNICHAR   ________        yes!              no
//        ___________________                  _____  CT_UNICHAR_TOP2_ERR
//                                                    Any unichar-id matches?
//                                                    yes!        no
//                                                   ______ CT_UNICHAR_TOPN_ERR
//                                                           _________________
// Note that multiple counts may be activated for a single sample!
//
// Ground truth is for a fragment/n-gram that is NOT in the unicharset.
// This is called junk and is expected to be rejected:
//        Number of classifier answers?
//          0                       >0
//     CT_REJECTED_JUNK     CT_ACCEPTED_JUNK
//
// Also, CT_NUM_RESULTS stores the mean number of results, and CT_RANK stores
// the mean rank of the correct result, counting from 0, and with an error
// receiving the number of answers as the correct rank.
//
// Keep in sync with the ReportString function.
enum CountTypes {
  CT_SHAPE_TOP_CORRECT,  // Top shape id is actually correct.
  CT_SHAPE_TOP_ERR,      // Top shape id is not correct.
  CT_FONT_ATTR_ERR,      // Font attributes incorrect, ignoring unichar.
  CT_UNICHAR_TOP1_ERR,   // Top shape does not contain correct unichar id.
  CT_UNICHAR_TOP2_ERR,   // Top 2 shapes don't contain correct unichar id.
  CT_UNICHAR_TOPN_ERR,   // No output shape contains correct unichar id.
  CT_OK_MULTI_UNICHAR,   // Top shape id has correct unichar id, and others.
  CT_REJECT,             // Classifier hates this.
  CT_NUM_RESULTS,        // Number of answers produced.
  CT_RANK,               // Rank of correct answer.
  CT_REJECTED_JUNK,      // Junk that was correctly rejected.
  CT_ACCEPTED_JUNK,      // Junk that was incorrectly classified otherwise.

  CT_SIZE                // Number of types for array sizing.
};

// Class to encapsulate all the functionality and sub-structures required
// to count errors for an isolated character classifier (ShapeClassifier).
class ErrorCounter {
 public:
  // Computes and returns the unweighted boosting_mode error rate of the given
  // classifier. Can be used for testing, or inside an iterative training
  // system, including one that uses boosting.
  // report_levels:
  // 0 = no output.
  // 1 = bottom-line error rate.
  // 2 = bottom-line error rate + time.
  // 3 = font-level error rate + time.
  // 4 = list of all errors + short classifier debug output on 16 errors.
  // 5 = list of all errors + short classifier debug output on 25 errors.
  // * The boosting_mode determines which error type is used for computing the
  //   scaled_error output, and setting the is_error flag in the samples.
  // * The fontinfo_table is used to get string font names for the debug
  //   output, and also to count font attributes errors.
  // * The page_images vector may contain a Pix* (which may be NULL) for each
  //   page index assigned to the samples.
  // * The it provides encapsulated iteration over some sample set.
  // * The outputs unichar_error, scaled_error and totals_report are all
  //   optional.
  // * If not NULL, unichar error gets the top1 unichar error rate.
  // * Scaled_error gets the error chosen by boosting_mode weighted by the
  //   weights on the samples.
  // * Fonts_report gets a string summarizing the error rates for each font in
  //   both human-readable form and as a tab-separated list of error counts.
  //   The human-readable form is all before the first tab.
  // * The return value is the un-weighted version of the scaled_error.
  static double ComputeErrorRate(ShapeClassifier* classifier,
                                 int report_level, CountTypes boosting_mode,
                                 const UnicityTable<FontInfo>& fontinfo_table,
                                 const GenericVector<Pix*>& page_images,
                                 SampleIterator* it,
                                 double* unichar_error,
                                 double* scaled_error,
                                 STRING* fonts_report);

 private:
  // Simple struct to hold an array of counts.
  struct Counts {
    Counts();
    // Adds other into this for computing totals.
    void operator+=(const Counts& other);

    int n[CT_SIZE];
  };

  // Constructor is private. Only anticipated use of ErrorCounter is via
  // the static ComputeErrorRate.
  ErrorCounter(int charsetsize, int shapesize, int fontsize);
  ~ErrorCounter();

  // Accumulates the errors from the classifier results on a single sample.
  // Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.
  // boosting_mode selects the type of error to be used for boosting and the
  // is_error_ member of sample is set according to whether the required type
  // of error occurred. The font_table provides access to font properties
  // for error counting and shape_table is used to understand the relationship
  // between unichar_ids and shape_ids in the results
  bool AccumulateErrors(bool debug, CountTypes boosting_mode,
                        const UnicityTable<FontInfo>& font_table,
                        const ShapeTable& shape_table,
                        const GenericVector<ShapeRating>& results,
                        TrainingSample* sample);

  // Accumulates counts for junk. Counts only whether the junk was correctly
  // rejected or not.
  void AccumulateJunk(const ShapeTable& shape_table,
                      const GenericVector<ShapeRating>& results,
                      TrainingSample* sample);

  // Creates a report of the error rate. The report_level controls the detail
  // that is reported to stderr via tprintf:
  // 0   -> no output.
  // >=1 -> bottom-line error rate.
  // >=3 -> font-level error rate.
  // boosting_mode determines the return value. It selects which (un-weighted)
  // error rate to return.
  // The fontinfo_table from MasterTrainer provides the names of fonts.
  // The it determines the current subset of the training samples.
  // If not NULL, the top-choice unichar error rate is saved in unichar_error.
  // If not NULL, the report string is saved in fonts_report.
  // (Ignoring report_level).
  double ReportErrors(int report_level, CountTypes boosting_mode,
                      const UnicityTable<FontInfo>& fontinfo_table,
                      const SampleIterator& it,
                      double* unichar_error,
                      STRING* fonts_report);

  // Sets the report string to a combined human and machine-readable report
  // string of the error rates.
  // Returns false if there is no data, leaving report unchanged.
  static bool ReportString(const Counts& counts, STRING* report);

  // Computes the error rates and returns in rates which is an array of size
  // CT_SIZE. Returns false if there is no data, leaving rates unchanged.
  static bool ComputeRates(const Counts& counts, double rates[CT_SIZE]);


  // Total scaled error used by boosting algorithms.
  double scaled_error_;
  // Vector indexed by font_id from the samples of error accumulators.
  GenericVector<Counts> font_counts_;
  // Counts of the results that map each unichar_id (from samples) to an
  // incorrect shape_id.
  GENERIC_2D_ARRAY<int> unichar_counts_;
};

}  // namespace tesseract.

#endif /* THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_ */