This file is indexed.

/usr/include/tesseract/tablefind.h is in libtesseract-dev 3.02.01-6.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
///////////////////////////////////////////////////////////////////////
// File:        tablefind.h
// Description: Helper classes to find tables from ColPartitions.
// Author:      Faisal Shafait (faisal.shafait@dfki.de)
// Created:     Tue Jan 06 11:13:01 PST 2009
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_TEXTORD_TABLEFIND_H__
#define TESSERACT_TEXTORD_TABLEFIND_H__

#include "colpartitiongrid.h"
#include "elst.h"
#include "rect.h"

namespace tesseract {

// Classification assigned to a column segment (see ColSegment::type()).
enum ColSegType {
  COL_UNKNOWN = 0,  // First enumerator; numbering starts at zero.
  COL_TEXT,
  COL_TABLE,
  COL_MIXED,
  COL_COUNT  // Keep last: its value equals the number of types listed above.
};

class ColPartitionSet;

// ColSegment holds rectangular blocks that represent segmentation of a page
// into regions containing single column text/table.
class ColSegment;
ELISTIZEH(ColSegment)
CLISTIZEH(ColSegment)

class ColSegment : public ELIST_LINK {
 public:
  ColSegment();
  ~ColSegment();

  // Read-only view of the rectangle covered by this segment.
  const TBOX& bounding_box() const { return bounding_box_; }

  // Edge mutators: each one forwards to the same-named setter on
  // bounding_box_.
  void set_top(int y) { bounding_box_.set_top(y); }

  void set_bottom(int y) { bounding_box_.set_bottom(y); }

  void set_left(int x) { bounding_box_.set_left(x); }

  void set_right(int x) { bounding_box_.set_right(x); }

  // Replaces the whole rectangle with a copy of other.
  void set_bounding_box(const TBOX& other) { bounding_box_ = other; }

  // Number of table colpartitions covered by bounding_box_ (getter, then
  // setter).
  int get_num_table_cells() const { return num_table_cells_; }

  void set_num_table_cells(int n) { num_table_cells_ = n; }

  // Number of text colpartitions covered by bounding_box_ (getter, then
  // setter).
  int get_num_text_cells() const { return num_text_cells_; }

  void set_num_text_cells(int n) { num_text_cells_ = n; }

  // Current classification of this segment.
  ColSegType type() const { return type_; }

  // Sets the type of the block based on the ratio of table to text
  // colpartitions covered by it. Defined out of line.
  void set_type();

  // Supplies the color BBGrid uses when drawing this rectangle.
  ScrollView::Color BoxColor() const;

  // Inserts a rectangle into bounding_box_. Defined out of line.
  void InsertBox(const TBOX& other);

 private:
  TBOX bounding_box_;    // Rectangle covered by this segment.
  int num_table_cells_;  // Count of table colpartitions in bounding_box_.
  int num_text_cells_;   // Count of text colpartitions in bounding_box_.
  ColSegType type_;      // Value returned by type().
};

// Grid and grid-search instantiations specialised for ColSegment. Both use
// the ColSegment_CLIST / ColSegment_C_IT types as their list and iterator
// arguments.
typedef BBGrid<ColSegment, ColSegment_CLIST, ColSegment_C_IT> ColSegmentGrid;
typedef GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
    ColSegmentGridSearch;

// TableFinder is a utility class to find a set of tables given a set of
// ColPartitions and Columns. The TableFinder will mark candidate ColPartitions
// based on research in "Table Detection in Heterogeneous Documents".
// Usage flow is as follows:
//   TableFinder finder;
//   finder.InsertCleanPartitions(/* grid info */);
//   finder.LocateTables(/* ColPartitions and Columns */);
//   finder.Update TODO(nbeato)
// NOTE(review): no Update member is declared in this class; the last step
// above is an unfinished note from the original author.
class TableFinder {
 public:
  // Constructor is simple initializations
  TableFinder();
  ~TableFinder();

  // Set the resolution of the connected components in ppi.
  void set_resolution(int resolution) {
    resolution_ = resolution;
  }
  // Change the reading order. Initially it is left to right.
  void set_left_to_right_language(bool order);

  // Initialize the finder from a grid size and the bottom-left / top-right
  // corners.
  void Init(int grid_size, const ICOORD& bottom_left, const ICOORD& top_right);

  // Copy cleaned partitions from ColumnFinder's part_grid_ to this
  // clean_part_grid_ and insert dot-like noise into period_grid_.
  // It resizes the grids in this object to the dimensions of grid.
  void InsertCleanPartitions(ColPartitionGrid* grid, TO_BLOCK* block);

  // High level function to perform table detection.
  // Finds tables and updates the grid object with new partitions for the
  // tables. The columns and width callbacks are used to merge tables.
  // The reskew argument is only used to write the tables to the out.png
  // if that feature is enabled.
  void LocateTables(ColPartitionGrid* grid,
                    ColPartitionSet** columns,
                    WidthCallback* width_cb,
                    const FCOORD& reskew);

 protected:
  // Access for the grid dimensions.
  // The results will not be correct until InsertCleanPartitions
  // has been called. The values are taken from the grid passed as an argument
  // to that function.
  int gridsize() const;
  int gridwidth() const;
  int gridheight() const;
  const ICOORD& bleft() const;
  const ICOORD& tright() const;

  // Makes a window for debugging, see BBGrid
  ScrollView* MakeWindow(int x, int y, const char* window_name);

  //////// Functions to insert objects from the grid into the table finder.
  //////// In all cases, ownership is transferred to the table finder.
  // Inserts text into the table finder.
  void InsertTextPartition(ColPartition* part);
  void InsertFragmentedTextPartition(ColPartition* part);
  void InsertLeaderPartition(ColPartition* part);
  void InsertRulingPartition(ColPartition* part);
  void InsertImagePartition(ColPartition* part);
  void SplitAndInsertFragmentedTextPartition(ColPartition* part);
  bool AllowTextPartition(const ColPartition& part) const;
  bool AllowBlob(const BLOBNBOX& blob) const;

  //////// Functions that manipulate ColPartitions in the part_grid_ /////
  //////// to find tables.
  ////////

  // Utility function to move segments to col_seg_grid.
  // Note: Move includes ownership,
  // so segments will be owned by col_seg_grid
  void MoveColSegmentsToGrid(ColSegment_LIST* segments,
                             ColSegmentGrid* col_seg_grid);

  //////// Set up code to run during table detection to correctly
  //////// initialize variables on column partitions that are used later.
  ////////

  // Initialize the grid and partitions
  void InitializePartitions(ColPartitionSet** all_columns);

  // Set left, right and top, bottom spacings of each colpartition.
  // Left/right spacings are w.r.t. the column boundaries.
  // Top/bottom spacings are w.r.t. previous and next colpartitions.
  static void SetPartitionSpacings(ColPartitionGrid* grid,
                                   ColPartitionSet** all_columns);

  // Set spacing and closest neighbors above and below a given colpartition.
  void SetVerticalSpacing(ColPartition* part);

  // Set global spacing estimates. This function is dependent on the
  // partition spacings. So make sure SetPartitionSpacings is called
  // on the same grid before this.
  void SetGlobalSpacings(ColPartitionGrid* grid);
  // Setter for the global median xheight. The xheight is the height
  // of a lowercase 'x' character on the page. This can be viewed as the
  // average height of a lowercase letter in a textline. As a result
  // it is used to make assumptions about spacing between words and
  // table cells.
  void set_global_median_xheight(int xheight);
  // Setter for the global median blob width. The width is useful
  // when deciding if a partition is noise.
  void set_global_median_blob_width(int width);
  // Setter for the global median leading (spelled "ledding" in the
  // identifiers of this class). The leading is the distance between
  // two adjacent text lines. This value can be used to get a rough estimate
  // for the amount of space between two lines of text. As a result, it
  // is used to calculate appropriate spacing between adjacent rows of text.
  void set_global_median_ledding(int ledding);

  // Updates the nearest neighbors for each ColPartition in clean_part_grid_.
  // The neighbors are most likely SingletonPartner calls after the neighbors
  // are assigned. This is here until it is decided to remove the
  // nearest_neighbor code in ColPartition
  void FindNeighbors();

  //////// Functions to mark candidate column partitions as tables.
  //////// Tables are marked as described in
  ////////   Table Detection in Heterogeneous Documents (2010, Shafait & Smith)
  ////////

  // High level function to mark partitions as table rows/cells.
  // When this function is done, the column partitions in clean_part_grid_
  // should mostly be marked as tables.
  void MarkTablePartitions();
  // Marks partitions given a local view of a single partition
  void MarkPartitionsUsingLocalInformation();
  /////// Heuristics for local marking
  // Check if the partition has at least one large gap between words or no
  // significant gap at all
  // TODO(nbeato): Make const, prevented because blobnbox array access
  bool HasWideOrNoInterWordGap(ColPartition* part) const;
  // Checks if a partition is adjacent to leaders on the page
  bool HasLeaderAdjacent(const ColPartition& part);
  // Filter individual text partitions marked as table partitions
  // consisting of paragraph endings, small section headings, and
  // headers and footers.
  void FilterFalseAlarms();
  void FilterParagraphEndings();
  void FilterHeaderAndFooter();
  // Mark all ColPartitions as table cells that have a table cell above
  // and below them
  void SmoothTablePartitionRuns();

  //////// Functions to create bounding boxes (ColSegment) objects for
  //////// the columns on the page. The columns are not necessarily
  //////// vertical lines, meaning if tab stops strongly suggest that
  //////// a column changes horizontal position, as in the case below,
  //////// the ColSegment objects will respect that after processing.
  ////////
  ////////     _____________
  //////// Ex. |     |      |
  ////////     |_____|______|  5 boxes: 2 on this line
  ////////     |   |    |   |           3 on this line
  ////////     |___|____|___|
  ////////

  // Get Column segments from best_columns_
  void GetColumnBlocks(ColPartitionSet** columns,
                       ColSegment_LIST *col_segments);

  // Group Column segments into consecutive single column regions.
  void GroupColumnBlocks(ColSegment_LIST *current_segments,
                        ColSegment_LIST *col_segments);

  // Check if two boxes are consecutive within the same column
  bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2);

  // Set the ratio of candidate table partitions in each column
  void SetColumnsType(ColSegment_LIST* col_segments);

  // Merge Column Blocks that were split due to the presence of a table
  void GridMergeColumnBlocks();

  //////// Functions to turn marked ColPartitions into candidate tables
  //////// using a modified T-Recs++ algorithm described in
  ////////   Applying The T-Recs Table Recognition System
  ////////   To The Business Letter Domain (2001, Kieninger & Dengel)
  ////////

  // Merge partition cells into table columns.
  // Differs from paper by just looking at marked table partitions
  // instead of similarity metric.
  // Modified section 4.1 of paper.
  void GetTableColumns(ColSegment_LIST *table_columns);

  // Finds regions within a column that potentially contain a table.
  // I.e., the table columns from GetTableColumns are turned into boxes
  // that span the entire page column (using ColumnBlocks found in
  // earlier functions) in the x direction and the min/max extent of
  // overlapping table columns in the y direction.
  // Section 4.2 of paper.
  void GetTableRegions(ColSegment_LIST *table_columns,
                       ColSegment_LIST *table_regions);


  //////// Functions to "patch up" found tables
  ////////

  // Merge table regions corresponding to tables spanning multiple columns
  void GridMergeTableRegions();
  bool BelongToOneTable(const TBOX &box1, const TBOX &box2);

  // Adjust table boundaries by building a tight bounding box around all
  // ColPartitions contained in it.
  void AdjustTableBoundaries();

  // Grows a table to include partitions that are partially covered
  // by the table. This includes lines and text. It does not include
  // noise or images.
  // On entry, result_box is the minimum size of the result. The results of the
  // function will union the actual result with result_box.
  void GrowTableBox(const TBOX& table_box, TBOX* result_box);
  // Grow a table by increasing the size of the box to include
  // partitions with significant overlap with the table.
  void GrowTableToIncludePartials(const TBOX& table_box,
                                  const TBOX& search_range,
                                  TBOX* result_box);
  // Grow a table by expanding to the extents of significantly
  // overlapping lines.
  void GrowTableToIncludeLines(const TBOX& table_box, const TBOX& search_range,
                               TBOX* result_box);
  // Checks whether the horizontal line belongs to the table by looking at the
  // side spacing of extra ColPartitions that will be included in the table
  // due to expansion
  bool HLineBelongsToTable(const ColPartition& part, const TBOX& table_box);

  // Look for isolated column headers above the given table box and
  // include them in the table
  void IncludeLeftOutColumnHeaders(TBOX* table_box);

  // Remove false alarms consisting of a single column
  void DeleteSingleColumnTables();

  // Return true if at least one gap larger than the global x-height
  // exists in the horizontal projection
  bool GapInXProjection(int* xprojection, int length);

  //////// Recognize the tables.
  ////////
  // This function will run the table recognizer and try to find better
  // bounding boxes. The structures of the tables never leave this function
  // right now. It just tries to prune and merge tables based on info it
  // has available.
  void RecognizeTables();

  //////// Debugging functions. Render different structures to GUI
  //////// for visual debugging / intuition.
  ////////

  // Displays Colpartitions marked as table row. Overlays them on top of
  // part_grid_.
  void DisplayColSegments(ScrollView* win, ColSegment_LIST *cols,
                          ScrollView::Color color);

  // Displays the colpartitions using a new coloring on an existing window.
  // Note: This method is only for debug purpose during development and
  // would not be part of checked in code
  void DisplayColPartitions(ScrollView* win, ColPartitionGrid* grid,
                            ScrollView::Color text_color,
                            ScrollView::Color table_color);
  void DisplayColPartitions(ScrollView* win, ColPartitionGrid* grid,
                            ScrollView::Color default_color);
  void DisplayColPartitionConnections(ScrollView* win,
                                      ColPartitionGrid* grid,
                                      ScrollView::Color default_color);
  void DisplayColSegmentGrid(ScrollView* win, ColSegmentGrid* grid,
                             ScrollView::Color color);

  // Write ColPartitions and Tables to a PIX image.
  // Note: This method is only for debug purpose during development and
  // would not be part of checked in code
  void WriteToPix(const FCOORD& reskew);

  // Merge all colpartitions in table regions to make them a single
  // colpartition and revert types of isolated table cells not
  // assigned to any table to their original types.
  void MakeTableBlocks(ColPartitionGrid* grid,
                       ColPartitionSet** columns,
                       WidthCallback* width_cb);

  /////////////////////////////////////////////////
  // Useful objects used during table find process.
  /////////////////////////////////////////////////
  // Resolution of the connected components in ppi.
  int resolution_;
  // Estimate of median x-height over the page
  int global_median_xheight_;
  // Estimate of the median blob width on the page
  int global_median_blob_width_;
  // Estimate of median leading on the page
  int global_median_ledding_;
  // Grid to hold cleaned colpartitions after removing all
  // colpartitions that consist of only noise blobs, and removing
  // noise blobs from remaining colpartitions.
  ColPartitionGrid clean_part_grid_;
  // Grid contains the leaders and ruling lines.
  ColPartitionGrid leader_and_ruling_grid_;
  // Grid contains the broken down column partitions. It can be thought
  // of as a "word" grid. However, it usually doesn't break apart text lines.
  // It does break apart table data (most of the time).
  ColPartitionGrid fragmented_text_grid_;
  // Grid of page column blocks
  ColSegmentGrid col_seg_grid_;
  // Grid of detected tables
  ColSegmentGrid table_grid_;
  // The reading order of text. Defaults to true, for languages such as English.
  bool left_to_right_language_;
};

}  // namespace tesseract.

#endif  // TESSERACT_TEXTORD_TABLEFIND_H__