1 ///////////////////////////////////////////////////////////////////////
2 // File:        tablefind.cpp
3 // Description: Helper classes to find tables from ColPartitions.
4 // Author:      Faisal Shafait (faisal.shafait@dfki.de)
5 //
6 // (C) Copyright 2009, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18 
19 #ifdef HAVE_CONFIG_H
20 #  include "config_auto.h"
21 #endif
22 
23 #include <algorithm>
24 #include <cmath>
25 #include <utility>
26 #include "tablefind.h"
27 
28 #include <allheaders.h>
29 
30 #include "colpartitionset.h"
31 #include "tablerecog.h"
32 
33 namespace tesseract {
34 
35 // These numbers are used to calculate the global median stats.
36 // They just set an upper bound on the stats objects.
37 // Maximum vertical spacing between neighbor partitions.
38 const int kMaxVerticalSpacing = 500;
39 // Maximum width of a blob in a partition.
40 const int kMaxBlobWidth = 500;
41 
42 // Minimum whitespace size to split a partition (measured as a multiple
43 // of a partition's median width).
44 const double kSplitPartitionSize = 2.0;
45 // To insert text, the partition must satisfy these size constraints
46 // in AllowTextPartition(). The idea is to filter noise partitions
47 // determined by the size compared to the global medians.
48 // TODO(nbeato): Need to find good numbers again.
49 const double kAllowTextHeight = 0.5;
50 const double kAllowTextWidth = 0.6;
51 const double kAllowTextArea = 0.8;
52 // The same thing applies to blobs (to filter noise).
53 // TODO(nbeato): These numbers are a shot in the dark...
54 // height and width are 0.5 * gridsize() in colfind.cpp
55 // area is a rough guess for the size of a period.
56 const double kAllowBlobHeight = 0.3;
57 const double kAllowBlobWidth = 0.4;
58 const double kAllowBlobArea = 0.05;
59 
60 // Minimum number of components in a text partition. A partition having fewer
61 // components than that is more likely a data partition and is a candidate
62 // table cell.
63 const int kMinBoxesInTextPartition = 10;
64 
65 // Maximum number of components that a data partition can have
66 const int kMaxBoxesInDataPartition = 20;
67 
68 // Maximum allowed gap in a text partitions as a multiple of its median size.
69 const double kMaxGapInTextPartition = 4.0;
70 
71 // Minimum value that the maximum gap in a text partition should have as a
72 // factor of its median size.
73 const double kMinMaxGapInTextPartition = 0.5;
74 
75 // The amount of overlap that is "normal" for adjacent blobs in a text
76 // partition. This is used to calculate gap between overlapping blobs.
77 const double kMaxBlobOverlapFactor = 4.0;
78 
79 // Maximum x-height a table partition can have as a multiple of global
80 // median x-height
81 const double kMaxTableCellXheight = 2.0;
82 
83 // Maximum line spacing between a table column header and column contents
84 // for merging the two (as a multiple of the partition's median_height).
85 const int kMaxColumnHeaderDistance = 4;
86 
87 // Minimum ratio of num_table_partitions to num_text_partitions in a column
88 // block to be called it a table column
89 const double kTableColumnThreshold = 3.0;
90 
91 // Search for horizontal ruling lines within the vertical margin as a
92 // multiple of grid size
93 // const int kRulingVerticalMargin = 3;
94 
95 // Minimum overlap that a colpartition must have with a table region
96 // to become part of that table
97 const double kMinOverlapWithTable = 0.6;
98 
99 // Maximum side space (distance from column boundary) that a typical
100 // text-line in flowing text should have as a multiple of its x-height
101 // (Median size).
102 const int kSideSpaceMargin = 10;
103 
104 // Fraction of the peak of x-projection of a table region to set the
105 // threshold for the x-projection histogram
106 const double kSmallTableProjectionThreshold = 0.35;
107 const double kLargeTableProjectionThreshold = 0.45;
108 // Minimum number of rows required to look for more rows in the projection.
109 const int kLargeTableRowCount = 6;
110 
111 // Minimum number of rows in a table
112 const int kMinRowsInTable = 3;
113 
114 // The amount of padding (multiplied by global_median_xheight_ during use)
115 // that is vertically added to the search adjacent leader search during
116 // ColPartition marking.
117 const int kAdjacentLeaderSearchPadding = 2;
118 
119 // Used when filtering false positives. When finding the last line
120 // of a paragraph (typically left-aligned), the previous line should have
121 // its center to the right of the last line by this scaled amount.
122 const double kParagraphEndingPreviousLineRatio = 1.3;
123 
124 // The maximum amount of whitespace allowed left of a paragraph ending.
125 // Do not filter a ColPartition with more than this space left of it.
126 const double kMaxParagraphEndingLeftSpaceMultiple = 3.0;
127 
128 // Used when filtering false positives. The last line of a paragraph
129 // should be preceded by a line that is predominantly text. This is the
130 // ratio of text to whitespace (to the right of the text) that is required
131 // for the previous line to be a text.
132 const double kMinParagraphEndingTextToWhitespaceRatio = 3.0;
133 
134 // When counting table columns, this is the required gap between two columns
135 // (it is multiplied by global_median_xheight_).
136 const double kMaxXProjectionGapFactor = 2.0;
137 
138 // Used for similarity in partitions using stroke width. Values copied
139 // from ColFind.cpp in Ray's CL.
140 const double kStrokeWidthFractionalTolerance = 0.25;
141 const double kStrokeWidthConstantTolerance = 2.0;
142 
143 #ifndef GRAPHICS_DISABLED
144 static BOOL_VAR(textord_show_tables, false, "Show table regions (ScrollView)");
145 static BOOL_VAR(textord_tablefind_show_mark, false,
146                 "Debug table marking steps in detail (ScrollView)");
147 static BOOL_VAR(textord_tablefind_show_stats, false,
148                 "Show page stats used in table finding (ScrollView)");
149 #endif
150 static BOOL_VAR(textord_tablefind_recognize_tables, false,
151                 "Enables the table recognizer for table layout and filtering.");
152 
153 // Templated helper function used to create destructor callbacks for the
154 // BBGrid::ClearGridData() method.
155 template <typename T>
DeleteObject(T * object)156 void DeleteObject(T *object) {
157   delete object;
158 }
159 
TableFinder()160 TableFinder::TableFinder()
161     : resolution_(0),
162       global_median_xheight_(0),
163       global_median_blob_width_(0),
164       global_median_ledding_(0),
165       left_to_right_language_(true) {}
166 
~TableFinder()167 TableFinder::~TableFinder() {
168   // ColPartitions and ColSegments created by this class for storage in grids
169   // need to be deleted explicitly.
170   clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
171   leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
172   fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
173   col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
174   table_grid_.ClearGridData(&DeleteObject<ColSegment>);
175 }
176 
set_left_to_right_language(bool order)177 void TableFinder::set_left_to_right_language(bool order) {
178   left_to_right_language_ = order;
179 }
180 
Init(int grid_size,const ICOORD & bottom_left,const ICOORD & top_right)181 void TableFinder::Init(int grid_size, const ICOORD &bottom_left,
182                        const ICOORD &top_right) {
183   // Initialize clean partitions list and grid
184   clean_part_grid_.Init(grid_size, bottom_left, top_right);
185   leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
186   fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
187   col_seg_grid_.Init(grid_size, bottom_left, top_right);
188   table_grid_.Init(grid_size, bottom_left, top_right);
189 }
190 
191 // Copy cleaned partitions from part_grid_ to clean_part_grid_ and
192 // insert leaders and rulers into the leader_and_ruling_grid_
InsertCleanPartitions(ColPartitionGrid * grid,TO_BLOCK * block)193 void TableFinder::InsertCleanPartitions(ColPartitionGrid *grid,
194                                         TO_BLOCK *block) {
195   // Calculate stats. This lets us filter partitions in AllowTextPartition()
196   // and filter blobs in AllowBlob().
197   SetGlobalSpacings(grid);
198 
199   // Iterate the ColPartitions in the grid.
200   ColPartitionGridSearch gsearch(grid);
201   gsearch.SetUniqueMode(true);
202   gsearch.StartFullSearch();
203   ColPartition *part = nullptr;
204   while ((part = gsearch.NextFullSearch()) != nullptr) {
205     // Reject partitions with nothing useful inside of them.
206     if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0) {
207       continue;
208     }
209     ColPartition *clean_part = part->ShallowCopy();
210     ColPartition *leader_part = nullptr;
211     if (part->IsLineType()) {
212       InsertRulingPartition(clean_part);
213       continue;
214     }
215     // Insert all non-text partitions to clean_parts
216     if (!part->IsTextType()) {
217       InsertImagePartition(clean_part);
218       continue;
219     }
220     // Insert text colpartitions after removing noisy components from them
221     // The leaders are split into a separate grid.
222     BLOBNBOX_CLIST *part_boxes = part->boxes();
223     BLOBNBOX_C_IT pit(part_boxes);
224     for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
225       BLOBNBOX *pblob = pit.data();
226       // Bad blobs... happens in UNLV set.
227       // news.3G1, page 17 (around x=6)
228       if (!AllowBlob(*pblob)) {
229         continue;
230       }
231       if (pblob->flow() == BTFT_LEADER) {
232         if (leader_part == nullptr) {
233           leader_part = part->ShallowCopy();
234           leader_part->set_flow(BTFT_LEADER);
235         }
236         leader_part->AddBox(pblob);
237       } else if (pblob->region_type() != BRT_NOISE) {
238         clean_part->AddBox(pblob);
239       }
240     }
241     clean_part->ComputeLimits();
242     ColPartition *fragmented = clean_part->CopyButDontOwnBlobs();
243     InsertTextPartition(clean_part);
244     SplitAndInsertFragmentedTextPartition(fragmented);
245     if (leader_part != nullptr) {
246       // TODO(nbeato): Note that ComputeLimits does not update the column
247       // information. So the leader may appear to span more columns than it
248       // really does later on when IsInSameColumnAs gets called to test
249       // for adjacent leaders.
250       leader_part->ComputeLimits();
251       InsertLeaderPartition(leader_part);
252     }
253   }
254 
255   // Make the partition partners better for upper and lower neighbors.
256   clean_part_grid_.FindPartitionPartners();
257   clean_part_grid_.RefinePartitionPartners(false);
258 }
259 
260 // High level function to perform table detection
LocateTables(ColPartitionGrid * grid,ColPartitionSet ** all_columns,WidthCallback width_cb,const FCOORD & reskew)261 void TableFinder::LocateTables(ColPartitionGrid *grid,
262                                ColPartitionSet **all_columns,
263                                WidthCallback width_cb, const FCOORD &reskew) {
264   // initialize spacing, neighbors, and columns
265   InitializePartitions(all_columns);
266 
267 #ifndef GRAPHICS_DISABLED
268   if (textord_show_tables) {
269     ScrollView *table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
270     DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
271     DisplayColPartitions(table_win, &leader_and_ruling_grid_,
272                          ScrollView::AQUAMARINE);
273     DisplayColPartitionConnections(table_win, &clean_part_grid_,
274                                    ScrollView::ORANGE);
275 
276     table_win = MakeWindow(100, 300, "Fragmented Text");
277     DisplayColPartitions(table_win, &fragmented_text_grid_, ScrollView::BLUE);
278   }
279 #endif // !GRAPHICS_DISABLED
280 
281   // mark, filter, and smooth candidate table partitions
282   MarkTablePartitions();
283 
284   // Make single-column blocks from good_columns_ partitions. col_segments are
285   // moved to a grid later which takes the ownership
286   ColSegment_LIST column_blocks;
287   GetColumnBlocks(all_columns, &column_blocks);
288   // Set the ratio of candidate table partitions in each column
289   SetColumnsType(&column_blocks);
290 
291   // Move column segments to col_seg_grid_
292   MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
293 
294   // Detect split in column layout that might have occurred due to the
295   // presence of a table. In such a case, merge the corresponding columns.
296   GridMergeColumnBlocks();
297 
298   // Group horizontally overlapping table partitions into table columns.
299   // table_columns created here get deleted at the end of this method.
300   ColSegment_LIST table_columns;
301   GetTableColumns(&table_columns);
302 
303   // Within each column, mark the range table regions occupy based on the
304   // table columns detected. table_regions are moved to a grid later which
305   // takes the ownership
306   ColSegment_LIST table_regions;
307   GetTableRegions(&table_columns, &table_regions);
308 
309 #ifndef GRAPHICS_DISABLED
310   if (textord_tablefind_show_mark) {
311     ScrollView *table_win = MakeWindow(1200, 300, "Table Columns and Regions");
312     DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
313     DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
314   }
315 #endif // !GRAPHICS_DISABLED
316 
317   // Merge table regions across columns for tables spanning multiple
318   // columns
319   MoveColSegmentsToGrid(&table_regions, &table_grid_);
320   GridMergeTableRegions();
321 
322   // Adjust table boundaries by including nearby horizontal lines and left
323   // out column headers
324   AdjustTableBoundaries();
325   GridMergeTableRegions();
326 
327   if (textord_tablefind_recognize_tables) {
328     // Remove false alarms consisting of a single column
329     DeleteSingleColumnTables();
330 
331 #ifndef GRAPHICS_DISABLED
332     if (textord_show_tables) {
333       ScrollView *table_win = MakeWindow(1200, 300, "Detected Table Locations");
334       DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
335       DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
336       table_grid_.DisplayBoxes(table_win);
337     }
338 #endif // !GRAPHICS_DISABLED
339 
340     // Find table grid structure and reject tables that are malformed.
341     RecognizeTables();
342     GridMergeTableRegions();
343     RecognizeTables();
344 
345 #ifndef GRAPHICS_DISABLED
346     if (textord_show_tables) {
347       ScrollView *table_win = MakeWindow(1400, 600, "Recognized Tables");
348       DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE,
349                            ScrollView::BLUE);
350       table_grid_.DisplayBoxes(table_win);
351     }
352 #endif // !GRAPHICS_DISABLED
353   } else {
354     // Remove false alarms consisting of a single column
355     // TODO(nbeato): verify this is a NOP after structured table rejection.
356     // Right now it isn't. If the recognize function is doing what it is
357     // supposed to do, this function is obsolete.
358     DeleteSingleColumnTables();
359 
360 #ifndef GRAPHICS_DISABLED
361     if (textord_show_tables) {
362       ScrollView *table_win = MakeWindow(1500, 300, "Detected Tables");
363       DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE,
364                            ScrollView::BLUE);
365       table_grid_.DisplayBoxes(table_win);
366     }
367 #endif // !GRAPHICS_DISABLED
368   }
369 
370   // Merge all colpartitions in table regions to make them a single
371   // colpartition and revert types of isolated table cells not
372   // assigned to any table to their original types.
373   MakeTableBlocks(grid, all_columns, width_cb);
374 }
375 // All grids have the same dimensions. The clean_part_grid_ sizes are set from
376 // the part_grid_ that is passed to InsertCleanPartitions, which was the same as
377 // the grid that is the base of ColumnFinder. Just return the clean_part_grid_
378 // dimensions instead of duplicated memory.
gridsize() const379 int TableFinder::gridsize() const {
380   return clean_part_grid_.gridsize();
381 }
gridwidth() const382 int TableFinder::gridwidth() const {
383   return clean_part_grid_.gridwidth();
384 }
gridheight() const385 int TableFinder::gridheight() const {
386   return clean_part_grid_.gridheight();
387 }
bleft() const388 const ICOORD &TableFinder::bleft() const {
389   return clean_part_grid_.bleft();
390 }
tright() const391 const ICOORD &TableFinder::tright() const {
392   return clean_part_grid_.tright();
393 }
394 
InsertTextPartition(ColPartition * part)395 void TableFinder::InsertTextPartition(ColPartition *part) {
396   ASSERT_HOST(part != nullptr);
397   if (AllowTextPartition(*part)) {
398     clean_part_grid_.InsertBBox(true, true, part);
399   } else {
400     delete part;
401   }
402 }
InsertFragmentedTextPartition(ColPartition * part)403 void TableFinder::InsertFragmentedTextPartition(ColPartition *part) {
404   ASSERT_HOST(part != nullptr);
405   if (AllowTextPartition(*part)) {
406     fragmented_text_grid_.InsertBBox(true, true, part);
407   } else {
408     delete part;
409   }
410 }
InsertLeaderPartition(ColPartition * part)411 void TableFinder::InsertLeaderPartition(ColPartition *part) {
412   ASSERT_HOST(part != nullptr);
413   if (!part->IsEmpty() && part->bounding_box().area() > 0) {
414     leader_and_ruling_grid_.InsertBBox(true, true, part);
415   } else {
416     delete part;
417   }
418 }
InsertRulingPartition(ColPartition * part)419 void TableFinder::InsertRulingPartition(ColPartition *part) {
420   leader_and_ruling_grid_.InsertBBox(true, true, part);
421 }
InsertImagePartition(ColPartition * part)422 void TableFinder::InsertImagePartition(ColPartition *part) {
423   // NOTE: If images are placed into a different grid in the future,
424   // the function SetPartitionSpacings needs to be updated. It should
425   // be the only thing that cares about image partitions.
426   clean_part_grid_.InsertBBox(true, true, part);
427 }
428 
429 // Splits a partition into its "words". The splits happen
430 // at locations with wide inter-blob spacing. This is useful
431 // because it allows the table recognize to "cut through" the
432 // text lines on the page. The assumption is that a table
433 // will have several lines with similar overlapping whitespace
434 // whereas text will not have this type of property.
435 // Note: The code Assumes that blobs are sorted by the left side x!
436 // This will not work (as well) if the blobs are sorted by center/right.
SplitAndInsertFragmentedTextPartition(ColPartition * part)437 void TableFinder::SplitAndInsertFragmentedTextPartition(ColPartition *part) {
438   ASSERT_HOST(part != nullptr);
439   // Bye bye empty partitions!
440   if (part->boxes()->empty()) {
441     delete part;
442     return;
443   }
444 
445   // The AllowBlob function prevents this.
446   ASSERT_HOST(part->median_width() > 0);
447   const double kThreshold = part->median_width() * kSplitPartitionSize;
448 
449   ColPartition *right_part = part;
450   bool found_split = true;
451   while (found_split) {
452     found_split = false;
453     BLOBNBOX_C_IT box_it(right_part->boxes());
454     // Blobs are sorted left side first. If blobs overlap,
455     // the previous blob may have a "more right" right side.
456     // Account for this by always keeping the largest "right"
457     // so far.
458     int previous_right = INT32_MIN;
459 
460     // Look for the next split in the partition.
461     for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
462       const TBOX &box = box_it.data()->bounding_box();
463       if (previous_right != INT32_MIN &&
464           box.left() - previous_right > kThreshold) {
465         // We have a split position. Split the partition in two pieces.
466         // Insert the left piece in the grid and keep processing the right.
467         int mid_x = (box.left() + previous_right) / 2;
468         ColPartition *left_part = right_part;
469         right_part = left_part->SplitAt(mid_x);
470 
471         InsertFragmentedTextPartition(left_part);
472         found_split = true;
473         break;
474       }
475 
476       // The right side of the previous blobs.
477       previous_right = std::max(previous_right, static_cast<int>(box.right()));
478     }
479   }
480   // When a split is not found, the right part is minimized
481   // as much as possible, so process it.
482   InsertFragmentedTextPartition(right_part);
483 }
484 
485 // Some simple criteria to filter out now. We want to make sure the
486 // average blob size in the partition is consistent with the
487 // global page stats.
488 // The area metric will almost always pass for multi-blob partitions.
489 // It is useful when filtering out noise caused by an isolated blob.
AllowTextPartition(const ColPartition & part) const490 bool TableFinder::AllowTextPartition(const ColPartition &part) const {
491   const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
492   const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
493   const int median_area = global_median_xheight_ * global_median_blob_width_;
494   const double kAreaPerBlobRequired = median_area * kAllowTextArea;
495   // Keep comparisons strictly greater to disallow 0!
496   return part.median_height() > kHeightRequired &&
497          part.median_width() > kWidthRequired &&
498          part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
499 }
500 
501 // Same as above, applied to blobs. Keep in mind that
502 // leaders, commas, and periods are important in tables.
AllowBlob(const BLOBNBOX & blob) const503 bool TableFinder::AllowBlob(const BLOBNBOX &blob) const {
504   const TBOX &box = blob.bounding_box();
505   const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
506   const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
507   const int median_area = global_median_xheight_ * global_median_blob_width_;
508   const double kAreaRequired = median_area * kAllowBlobArea;
509   // Keep comparisons strictly greater to disallow 0!
510   return box.height() > kHeightRequired && box.width() > kWidthRequired &&
511          box.area() > kAreaRequired;
512 }
513 
514 // TODO(nbeato): The grid that makes the window doesn't seem to matter.
515 // The only downside is that window messages will be caught by
516 // clean_part_grid_ instead of a useful object. This is a temporary solution
517 // for the debug windows created by the TableFinder.
518 #ifndef GRAPHICS_DISABLED
MakeWindow(int x,int y,const char * window_name)519 ScrollView *TableFinder::MakeWindow(int x, int y, const char *window_name) {
520   return clean_part_grid_.MakeWindow(x, y, window_name);
521 }
522 #endif
523 
524 // Make single-column blocks from good_columns_ partitions.
GetColumnBlocks(ColPartitionSet ** all_columns,ColSegment_LIST * column_blocks)525 void TableFinder::GetColumnBlocks(ColPartitionSet **all_columns,
526                                   ColSegment_LIST *column_blocks) {
527   for (int i = 0; i < gridheight(); ++i) {
528     ColPartitionSet *columns = all_columns[i];
529     if (columns != nullptr) {
530       ColSegment_LIST new_blocks;
531       // Get boxes from the current vertical position on the grid
532       columns->GetColumnBoxes(i * gridsize(), (i + 1) * gridsize(),
533                               &new_blocks);
534       // Merge the new_blocks boxes into column_blocks if they are well-aligned
535       GroupColumnBlocks(&new_blocks, column_blocks);
536     }
537   }
538 }
539 
540 // Merge column segments into the current list if they are well aligned.
GroupColumnBlocks(ColSegment_LIST * new_blocks,ColSegment_LIST * column_blocks)541 void TableFinder::GroupColumnBlocks(ColSegment_LIST *new_blocks,
542                                     ColSegment_LIST *column_blocks) {
543   ColSegment_IT src_it(new_blocks);
544   ColSegment_IT dest_it(column_blocks);
545   // iterate through the source list
546   for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
547     ColSegment *src_seg = src_it.data();
548     const TBOX &src_box = src_seg->bounding_box();
549     bool match_found = false;
550     // iterate through the destination list to find a matching column block
551     for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
552       ColSegment *dest_seg = dest_it.data();
553       TBOX dest_box = dest_seg->bounding_box();
554       if (ConsecutiveBoxes(src_box, dest_box)) {
555         // If matching block is found, insert the current block into it
556         // and delete the source block.
557         dest_seg->InsertBox(src_box);
558         match_found = true;
559         delete src_it.extract();
560         break;
561       }
562     }
563     // If no match is found, just append the source block to column_blocks
564     if (!match_found) {
565       dest_it.add_after_then_move(src_it.extract());
566     }
567   }
568 }
569 
570 // are the two boxes immediate neighbors along the vertical direction
ConsecutiveBoxes(const TBOX & b1,const TBOX & b2)571 bool TableFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) {
572   int x_margin = 20;
573   int y_margin = 5;
574   return (abs(b1.left() - b2.left()) < x_margin) &&
575          (abs(b1.right() - b2.right()) < x_margin) &&
576          (abs(b1.top() - b2.bottom()) < y_margin ||
577           abs(b2.top() - b1.bottom()) < y_margin);
578 }
579 
580 // Set up info for clean_part_grid_ partitions to be valid during detection
581 // code.
InitializePartitions(ColPartitionSet ** all_columns)582 void TableFinder::InitializePartitions(ColPartitionSet **all_columns) {
583   FindNeighbors();
584   SetPartitionSpacings(&clean_part_grid_, all_columns);
585   SetGlobalSpacings(&clean_part_grid_);
586 }
587 
588 // Set left, right and top, bottom spacings of each colpartition.
SetPartitionSpacings(ColPartitionGrid * grid,ColPartitionSet ** all_columns)589 void TableFinder::SetPartitionSpacings(ColPartitionGrid *grid,
590                                        ColPartitionSet **all_columns) {
591   // Iterate the ColPartitions in the grid.
592   ColPartitionGridSearch gsearch(grid);
593   gsearch.StartFullSearch();
594   ColPartition *part = nullptr;
595   while ((part = gsearch.NextFullSearch()) != nullptr) {
596     ColPartitionSet *columns = all_columns[gsearch.GridY()];
597     TBOX box = part->bounding_box();
598     int y = part->MidY();
599     ColPartition *left_column = columns->ColumnContaining(box.left(), y);
600     ColPartition *right_column = columns->ColumnContaining(box.right(), y);
601     // set distance from left column as space to the left
602     if (left_column) {
603       int left_space = std::max(0, box.left() - left_column->LeftAtY(y));
604       part->set_space_to_left(left_space);
605     }
606     // set distance from right column as space to the right
607     if (right_column) {
608       int right_space = std::max(0, right_column->RightAtY(y) - box.right());
609       part->set_space_to_right(right_space);
610     }
611 
612     // Look for images that may be closer.
613     // NOTE: used to be part_grid_, might cause issues now
614     ColPartitionGridSearch hsearch(grid);
615     hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
616     ColPartition *neighbor = nullptr;
617     while ((neighbor = hsearch.NextSideSearch(true)) != nullptr) {
618       if (neighbor->type() == PT_PULLOUT_IMAGE ||
619           neighbor->type() == PT_FLOWING_IMAGE ||
620           neighbor->type() == PT_HEADING_IMAGE) {
621         int right = neighbor->bounding_box().right();
622         if (right < box.left()) {
623           int space = std::min(box.left() - right, part->space_to_left());
624           part->set_space_to_left(space);
625         }
626       }
627     }
628     hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
629     neighbor = nullptr;
630     while ((neighbor = hsearch.NextSideSearch(false)) != nullptr) {
631       if (neighbor->type() == PT_PULLOUT_IMAGE ||
632           neighbor->type() == PT_FLOWING_IMAGE ||
633           neighbor->type() == PT_HEADING_IMAGE) {
634         int left = neighbor->bounding_box().left();
635         if (left > box.right()) {
636           int space = std::min(left - box.right(), part->space_to_right());
637           part->set_space_to_right(space);
638         }
639       }
640     }
641 
642     ColPartition *upper_part = part->SingletonPartner(true);
643     if (upper_part) {
644       int space =
645           std::max(0, static_cast<int>(upper_part->bounding_box().bottom() -
646                                        part->bounding_box().bottom()));
647       part->set_space_above(space);
648     } else {
649       // TODO(nbeato): What constitutes a good value?
650       // 0 is the default value when not set, explicitly noting it needs to
651       // be something else.
652       part->set_space_above(INT32_MAX);
653     }
654 
655     ColPartition *lower_part = part->SingletonPartner(false);
656     if (lower_part) {
657       int space =
658           std::max(0, static_cast<int>(part->bounding_box().bottom() -
659                                        lower_part->bounding_box().bottom()));
660       part->set_space_below(space);
661     } else {
662       // TODO(nbeato): What constitutes a good value?
663       // 0 is the default value when not set, explicitly noting it needs to
664       // be something else.
665       part->set_space_below(INT32_MAX);
666     }
667   }
668 }
669 
670 // Set spacing and closest neighbors above and below a given colpartition.
SetVerticalSpacing(ColPartition * part)671 void TableFinder::SetVerticalSpacing(ColPartition *part) {
672   TBOX box = part->bounding_box();
673   int top_range =
674       std::min(box.top() + kMaxVerticalSpacing, static_cast<int>(tright().y()));
675   int bottom_range = std::max(box.bottom() - kMaxVerticalSpacing,
676                               static_cast<int>(bleft().y()));
677   box.set_top(top_range);
678   box.set_bottom(bottom_range);
679 
680   TBOX part_box = part->bounding_box();
681   // Start a rect search
682   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rectsearch(
683       &clean_part_grid_);
684   rectsearch.StartRectSearch(box);
685   ColPartition *neighbor;
686   int min_space_above = kMaxVerticalSpacing;
687   int min_space_below = kMaxVerticalSpacing;
688   ColPartition *above_neighbor = nullptr;
689   ColPartition *below_neighbor = nullptr;
690   while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
691     if (neighbor == part) {
692       continue;
693     }
694     TBOX neighbor_box = neighbor->bounding_box();
695     if (neighbor_box.major_x_overlap(part_box)) {
696       int gap = abs(part->median_bottom() - neighbor->median_bottom());
697       // If neighbor is below current partition
698       if (neighbor_box.top() < part_box.bottom() && gap < min_space_below) {
699         min_space_below = gap;
700         below_neighbor = neighbor;
701       } // If neighbor is above current partition
702       else if (part_box.top() < neighbor_box.bottom() &&
703                gap < min_space_above) {
704         min_space_above = gap;
705         above_neighbor = neighbor;
706       }
707     }
708   }
709   part->set_space_above(min_space_above);
710   part->set_space_below(min_space_below);
711   part->set_nearest_neighbor_above(above_neighbor);
712   part->set_nearest_neighbor_below(below_neighbor);
713 }
714 
715 // Set global spacing and x-height estimates
SetGlobalSpacings(ColPartitionGrid * grid)716 void TableFinder::SetGlobalSpacings(ColPartitionGrid *grid) {
717   STATS xheight_stats(0, kMaxVerticalSpacing + 1);
718   STATS width_stats(0, kMaxBlobWidth + 1);
719   STATS ledding_stats(0, kMaxVerticalSpacing + 1);
720   // Iterate the ColPartitions in the grid.
721   ColPartitionGridSearch gsearch(grid);
722   gsearch.SetUniqueMode(true);
723   gsearch.StartFullSearch();
724   ColPartition *part = nullptr;
725   while ((part = gsearch.NextFullSearch()) != nullptr) {
726     // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
727     // ComputeLimits needs to get called somewhere outside of TableFinder
728     // to make sure the partitions are properly initialized.
729     // When this is called, SmoothPartitionPartners dies in an assert after
730     // table find runs. Alternative solution.
731     // part->ComputeLimits();
732     if (part->IsTextType()) {
733       // xheight_stats.add(part->median_height(), part->boxes_count());
734       // width_stats.add(part->median_width(), part->boxes_count());
735 
736       // This loop can be removed when above issues are fixed.
737       // Replace it with the 2 lines commented out above.
738       BLOBNBOX_C_IT it(part->boxes());
739       for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
740         xheight_stats.add(it.data()->bounding_box().height(), 1);
741         width_stats.add(it.data()->bounding_box().width(), 1);
742       }
743 
744       ledding_stats.add(part->space_above(), 1);
745       ledding_stats.add(part->space_below(), 1);
746     }
747   }
748   // Set estimates based on median of statistics obtained
749   set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
750   set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
751   set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
752 #ifndef GRAPHICS_DISABLED
753   if (textord_tablefind_show_stats) {
754     const char *kWindowName = "X-height (R), X-width (G), and ledding (B)";
755     ScrollView *stats_win = MakeWindow(500, 10, kWindowName);
756     xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
757     width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
758     ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
759   }
760 #endif // !GRAPHICS_DISABLED
761 }
762 
set_global_median_xheight(int xheight)763 void TableFinder::set_global_median_xheight(int xheight) {
764   global_median_xheight_ = xheight;
765 }
set_global_median_blob_width(int width)766 void TableFinder::set_global_median_blob_width(int width) {
767   global_median_blob_width_ = width;
768 }
set_global_median_ledding(int ledding)769 void TableFinder::set_global_median_ledding(int ledding) {
770   global_median_ledding_ = ledding;
771 }
772 
FindNeighbors()773 void TableFinder::FindNeighbors() {
774   ColPartitionGridSearch gsearch(&clean_part_grid_);
775   gsearch.StartFullSearch();
776   ColPartition *part = nullptr;
777   while ((part = gsearch.NextFullSearch()) != nullptr) {
778     // TODO(nbeato): Rename this function, meaning is different now.
779     // IT is finding nearest neighbors its own way
780     // SetVerticalSpacing(part);
781 
782     ColPartition *upper = part->SingletonPartner(true);
783     if (upper) {
784       part->set_nearest_neighbor_above(upper);
785     }
786 
787     ColPartition *lower = part->SingletonPartner(false);
788     if (lower) {
789       part->set_nearest_neighbor_below(lower);
790     }
791   }
792 }
793 
794 // High level interface. Input is an unmarked ColPartitionGrid
795 // (namely, clean_part_grid_). Partitions are identified using local
796 // information and filter/smoothed. The function exit should contain
797 // a good sampling of the table partitions.
MarkTablePartitions()798 void TableFinder::MarkTablePartitions() {
799   MarkPartitionsUsingLocalInformation();
800 #ifndef GRAPHICS_DISABLED
801   if (textord_tablefind_show_mark) {
802     ScrollView *table_win = MakeWindow(300, 300, "Initial Table Partitions");
803     DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
804     DisplayColPartitions(table_win, &leader_and_ruling_grid_,
805                          ScrollView::AQUAMARINE);
806   }
807 #endif
808   FilterFalseAlarms();
809 #ifndef GRAPHICS_DISABLED
810   if (textord_tablefind_show_mark) {
811     ScrollView *table_win = MakeWindow(600, 300, "Filtered Table Partitions");
812     DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
813     DisplayColPartitions(table_win, &leader_and_ruling_grid_,
814                          ScrollView::AQUAMARINE);
815   }
816 #endif
817   SmoothTablePartitionRuns();
818 #ifndef GRAPHICS_DISABLED
819   if (textord_tablefind_show_mark) {
820     ScrollView *table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
821     DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
822     DisplayColPartitions(table_win, &leader_and_ruling_grid_,
823                          ScrollView::AQUAMARINE);
824   }
825 #endif
826   FilterFalseAlarms();
827 #ifndef GRAPHICS_DISABLED
828   if (textord_tablefind_show_mark || textord_show_tables) {
829     ScrollView *table_win = MakeWindow(900, 300, "Final Table Partitions");
830     DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
831     DisplayColPartitions(table_win, &leader_and_ruling_grid_,
832                          ScrollView::AQUAMARINE);
833   }
834 #endif
835 }
836 
837 // These types of partitions are marked as table partitions:
838 //  1- Partitions that have at lease one large gap between words
839 //  2- Partitions that consist of only one word (no significant gap
840 //     between components)
841 //  3- Partitions that vertically overlap with other partitions within the
842 //     same column.
843 //  4- Partitions with leaders before/after them.
MarkPartitionsUsingLocalInformation()844 void TableFinder::MarkPartitionsUsingLocalInformation() {
845   // Iterate the ColPartitions in the grid.
846   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(
847       &clean_part_grid_);
848   gsearch.StartFullSearch();
849   ColPartition *part = nullptr;
850   while ((part = gsearch.NextFullSearch()) != nullptr) {
851     if (!part->IsTextType()) { // Only consider text partitions
852       continue;
853     }
854     // Only consider partitions in dominant font size or smaller
855     if (part->median_height() > kMaxTableCellXheight * global_median_xheight_) {
856       continue;
857     }
858     // Mark partitions with a large gap, or no significant gap as
859     // table partitions.
860     // Comments: It produces several false alarms at:
861     //  - last line of a paragraph (fixed)
862     //  - single word section headings
863     //  - page headers and footers
864     //  - numbered equations
865     //  - line drawing regions
866     // TODO(faisal): detect and fix above-mentioned cases
867     if (HasWideOrNoInterWordGap(part) || HasLeaderAdjacent(*part)) {
868       part->set_table_type();
869     }
870   }
871 }
872 
873 // Check if the partition has at least one large gap between words or no
874 // significant gap at all
HasWideOrNoInterWordGap(ColPartition * part) const875 bool TableFinder::HasWideOrNoInterWordGap(ColPartition *part) const {
876   // Should only get text partitions.
877   ASSERT_HOST(part->IsTextType());
878   // Blob access
879   BLOBNBOX_CLIST *part_boxes = part->boxes();
880   BLOBNBOX_C_IT it(part_boxes);
881   // Check if this is a relatively small partition (such as a single word)
882   if (part->bounding_box().width() <
883           kMinBoxesInTextPartition * part->median_height() &&
884       part_boxes->length() < kMinBoxesInTextPartition) {
885     return true;
886   }
887 
888   // Variables used to compute inter-blob spacing.
889   int current_x0 = -1;
890   int current_x1 = -1;
891   int previous_x1 = -1;
892   // Stores the maximum gap detected.
893   int largest_partition_gap_found = -1;
894   // Text partition gap limits. If this is text (and not a table),
895   // there should be at least one gap larger than min_gap and no gap
896   // larger than max_gap.
897   const double max_gap = kMaxGapInTextPartition * part->median_height();
898   const double min_gap = kMinMaxGapInTextPartition * part->median_height();
899 
900   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
901     BLOBNBOX *blob = it.data();
902     current_x0 = blob->bounding_box().left();
903     current_x1 = blob->bounding_box().right();
904     if (previous_x1 != -1) {
905       int gap = current_x0 - previous_x1;
906 
907       // TODO(nbeato): Boxes may overlap? Huh?
908       // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
909       // on the top right of the page are filtered out with this line.
910       // Note 2: Iterating over blobs in a partition, so we are looking for
911       // spacing between the words.
912       if (gap < 0) {
913         // More likely case, the blobs slightly overlap. This can happen
914         // with diacritics (accents) or broken alphabet symbols (characters).
915         // Merge boxes together by taking max of right sides.
916         if (-gap < part->median_height() * kMaxBlobOverlapFactor) {
917           previous_x1 = std::max(previous_x1, current_x1);
918           continue;
919         }
920         // Extreme case, blobs overlap significantly in the same partition...
921         // This should not happen often (if at all), but it does.
922         // TODO(nbeato): investigate cases when this happens.
923         else {
924           // The behavior before was to completely ignore this case.
925         }
926       }
927 
928       // If a large enough gap is found, mark it as a table cell (return true)
929       if (gap > max_gap) {
930         return true;
931       }
932       if (gap > largest_partition_gap_found) {
933         largest_partition_gap_found = gap;
934       }
935     }
936     previous_x1 = current_x1;
937   }
938   // Since no large gap was found, return false if the partition is too
939   // long to be a data cell
940   if (part->bounding_box().width() >
941           kMaxBoxesInDataPartition * part->median_height() ||
942       part_boxes->length() > kMaxBoxesInDataPartition) {
943     return false;
944   }
945 
946   // A partition may be a single blob. In this case, it's an isolated symbol
947   // or non-text (such as a ruling or image).
948   // Detect these as table partitions? Shouldn't this be case by case?
949   // The behavior before was to ignore this, making max_partition_gap < 0
950   // and implicitly return true. Just making it explicit.
951   if (largest_partition_gap_found == -1) {
952     return true;
953   }
954 
955   // return true if the maximum gap found is smaller than the minimum allowed
956   // max_gap in a text partition. This indicates that there is no significant
957   // space in the partition, hence it is likely a single word.
958   return largest_partition_gap_found < min_gap;
959 }
960 
961 // A criteria for possible tables is that a table may have leaders
962 // between data cells. An aggressive solution to find such tables is to
963 // explicitly mark partitions that have adjacent leaders.
964 // Note that this includes overlapping leaders. However, it does not
965 // include leaders in different columns on the page.
966 // Possible false-positive will include lists, such as a table of contents.
967 // As these arise, the aggressive nature of this search may need to be
968 // trimmed down.
HasLeaderAdjacent(const ColPartition & part)969 bool TableFinder::HasLeaderAdjacent(const ColPartition &part) {
970   if (part.flow() == BTFT_LEADER) {
971     return true;
972   }
973   // Search range is left and right bounded by an offset of the
974   // median xheight. This offset is to allow some tolerance to the
975   // the leaders on the page in the event that the alignment is still
976   // a bit off.
977   const TBOX &box = part.bounding_box();
978   const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_;
979   const int top = box.top() + search_size;
980   const int bottom = box.bottom() - search_size;
981   ColPartitionGridSearch hsearch(&leader_and_ruling_grid_);
982   for (int direction = 0; direction < 2; ++direction) {
983     bool right_to_left = (direction == 0);
984     int x = right_to_left ? box.right() : box.left();
985     hsearch.StartSideSearch(x, bottom, top);
986     ColPartition *leader = nullptr;
987     while ((leader = hsearch.NextSideSearch(right_to_left)) != nullptr) {
988       // The leader could be a horizontal ruling in the grid.
989       // Make sure it is actually a leader.
990       if (leader->flow() != BTFT_LEADER) {
991         continue;
992       }
993       // This should not happen, they are in different grids.
994       ASSERT_HOST(&part != leader);
995       // Make sure the leader shares a page column with the partition,
996       // otherwise we are spreading across columns.
997       if (!part.IsInSameColumnAs(*leader)) {
998         break;
999       }
1000       // There should be a significant vertical overlap
1001       if (!leader->VSignificantCoreOverlap(part)) {
1002         continue;
1003       }
1004       // Leader passed all tests, so it is adjacent.
1005       return true;
1006     }
1007   }
1008   // No leaders are adjacent to the given partition.
1009   return false;
1010 }
1011 
1012 // Filter individual text partitions marked as table partitions
1013 // consisting of paragraph endings, small section headings, and
1014 // headers and footers.
FilterFalseAlarms()1015 void TableFinder::FilterFalseAlarms() {
1016   FilterParagraphEndings();
1017   FilterHeaderAndFooter();
1018   // TODO(nbeato): Fully justified text as non-table?
1019 }
1020 
FilterParagraphEndings()1021 void TableFinder::FilterParagraphEndings() {
1022   // Detect last line of paragraph
1023   // Iterate the ColPartitions in the grid.
1024   ColPartitionGridSearch gsearch(&clean_part_grid_);
1025   gsearch.StartFullSearch();
1026   ColPartition *part = nullptr;
1027   while ((part = gsearch.NextFullSearch()) != nullptr) {
1028     if (part->type() != PT_TABLE) {
1029       continue; // Consider only table partitions
1030     }
1031 
1032     // Paragraph ending should have flowing text above it.
1033     ColPartition *upper_part = part->nearest_neighbor_above();
1034     if (!upper_part) {
1035       continue;
1036     }
1037     if (upper_part->type() != PT_FLOWING_TEXT) {
1038       continue;
1039     }
1040     if (upper_part->bounding_box().width() < 2 * part->bounding_box().width()) {
1041       continue;
1042     }
1043     // Check if its the last line of a paragraph.
1044     // In most cases, a paragraph ending should be left-aligned to text line
1045     // above it. Sometimes, it could be a 2 line paragraph, in which case
1046     // the line above it is indented.
1047     // To account for that, check if the partition center is to
1048     // the left of the one above it.
1049     int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;
1050     int upper_mid = (upper_part->bounding_box().left() +
1051                      upper_part->bounding_box().right()) /
1052                     2;
1053     int current_spacing = 0; // spacing of the current line to margin
1054     int upper_spacing = 0;   // spacing of the previous line to the margin
1055     if (left_to_right_language_) {
1056       // Left to right languages, use mid - left to figure out the distance
1057       // the middle is from the left margin.
1058       int left = std::min(part->bounding_box().left(),
1059                           upper_part->bounding_box().left());
1060       current_spacing = mid - left;
1061       upper_spacing = upper_mid - left;
1062     } else {
1063       // Right to left languages, use right - mid to figure out the distance
1064       // the middle is from the right margin.
1065       int right = std::max(part->bounding_box().right(),
1066                            upper_part->bounding_box().right());
1067       current_spacing = right - mid;
1068       upper_spacing = right - upper_mid;
1069     }
1070     if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing) {
1071       continue;
1072     }
1073 
1074     // Paragraphs should have similar fonts.
1075     if (!part->MatchingSizes(*upper_part) ||
1076         !part->MatchingStrokeWidth(*upper_part, kStrokeWidthFractionalTolerance,
1077                                    kStrokeWidthConstantTolerance)) {
1078       continue;
1079     }
1080 
1081     // The last line of a paragraph should be left aligned.
1082     // TODO(nbeato): This would be untrue if the text was right aligned.
1083     // How often is that?
1084     if (part->space_to_left() >
1085         kMaxParagraphEndingLeftSpaceMultiple * part->median_height()) {
1086       continue;
1087     }
1088     // The line above it should be right aligned (assuming justified format).
1089     // Since we can't assume justified text, we compare whitespace to text.
1090     // The above line should have majority spanning text (or the current
1091     // line could have fit on the previous line). So compare
1092     // whitespace to text.
1093     if (upper_part->bounding_box().width() <
1094         kMinParagraphEndingTextToWhitespaceRatio *
1095             upper_part->space_to_right()) {
1096       continue;
1097     }
1098 
1099     // Ledding above the line should be less than ledding below
1100     if (part->space_above() >= part->space_below() ||
1101         part->space_above() > 2 * global_median_ledding_) {
1102       continue;
1103     }
1104 
1105     // If all checks failed, it is probably text.
1106     part->clear_table_type();
1107   }
1108 }
1109 
FilterHeaderAndFooter()1110 void TableFinder::FilterHeaderAndFooter() {
1111   // Consider top-most text colpartition as header and bottom most as footer
1112   ColPartition *header = nullptr;
1113   ColPartition *footer = nullptr;
1114   int max_top = INT32_MIN;
1115   int min_bottom = INT32_MAX;
1116   ColPartitionGridSearch gsearch(&clean_part_grid_);
1117   gsearch.StartFullSearch();
1118   ColPartition *part = nullptr;
1119   while ((part = gsearch.NextFullSearch()) != nullptr) {
1120     if (!part->IsTextType()) {
1121       continue; // Consider only text partitions
1122     }
1123     int top = part->bounding_box().top();
1124     int bottom = part->bounding_box().bottom();
1125     if (top > max_top) {
1126       max_top = top;
1127       header = part;
1128     }
1129     if (bottom < min_bottom) {
1130       min_bottom = bottom;
1131       footer = part;
1132     }
1133   }
1134   if (header) {
1135     header->clear_table_type();
1136   }
1137   if (footer) {
1138     footer->clear_table_type();
1139   }
1140 }
1141 
1142 // Mark all ColPartitions as table cells that have a table cell above
1143 // and below them
1144 // TODO(faisal): This is too aggressive at the moment. The method needs to
1145 // consider spacing and alignment as well. Detection of false alarm table cells
1146 // should also be done as part of it.
SmoothTablePartitionRuns()1147 void TableFinder::SmoothTablePartitionRuns() {
1148   // Iterate the ColPartitions in the grid.
1149   ColPartitionGridSearch gsearch(&clean_part_grid_);
1150   gsearch.StartFullSearch();
1151   ColPartition *part = nullptr;
1152   while ((part = gsearch.NextFullSearch()) != nullptr) {
1153     if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN) {
1154       continue; // Consider only text partitions
1155     }
1156     ColPartition *upper_part = part->nearest_neighbor_above();
1157     ColPartition *lower_part = part->nearest_neighbor_below();
1158     if (!upper_part || !lower_part) {
1159       continue;
1160     }
1161     if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE) {
1162       part->set_table_type();
1163     }
1164   }
1165 
1166   // Pass 2, do the opposite. If both the upper and lower neighbors
1167   // exist and are not tables, this probably shouldn't be a table.
1168   gsearch.StartFullSearch();
1169   part = nullptr;
1170   while ((part = gsearch.NextFullSearch()) != nullptr) {
1171     if (part->type() != PT_TABLE) {
1172       continue; // Consider only text partitions
1173     }
1174     ColPartition *upper_part = part->nearest_neighbor_above();
1175     ColPartition *lower_part = part->nearest_neighbor_below();
1176 
1177     // table can't be by itself
1178     if ((upper_part && upper_part->type() != PT_TABLE) &&
1179         (lower_part && lower_part->type() != PT_TABLE)) {
1180       part->clear_table_type();
1181     }
1182   }
1183 }
1184 
1185 // Set the type of a column segment based on the ratio of table to text cells
SetColumnsType(ColSegment_LIST * column_blocks)1186 void TableFinder::SetColumnsType(ColSegment_LIST *column_blocks) {
1187   ColSegment_IT it(column_blocks);
1188   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1189     ColSegment *seg = it.data();
1190     TBOX box = seg->bounding_box();
1191     int num_table_cells = 0;
1192     int num_text_cells = 0;
1193     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rsearch(
1194         &clean_part_grid_);
1195     rsearch.SetUniqueMode(true);
1196     rsearch.StartRectSearch(box);
1197     ColPartition *part = nullptr;
1198     while ((part = rsearch.NextRectSearch()) != nullptr) {
1199       if (part->type() == PT_TABLE) {
1200         num_table_cells++;
1201       } else if (part->type() == PT_FLOWING_TEXT) {
1202         num_text_cells++;
1203       }
1204     }
1205     // If a column block has no text or table partition in it, it is not needed
1206     // for table detection.
1207     if (!num_table_cells && !num_text_cells) {
1208       delete it.extract();
1209     } else {
1210       seg->set_num_table_cells(num_table_cells);
1211       seg->set_num_text_cells(num_text_cells);
1212       // set column type based on the ratio of table to text cells
1213       seg->set_type();
1214     }
1215   }
1216 }
1217 
1218 // Move column blocks to grid
MoveColSegmentsToGrid(ColSegment_LIST * segments,ColSegmentGrid * col_seg_grid)1219 void TableFinder::MoveColSegmentsToGrid(ColSegment_LIST *segments,
1220                                         ColSegmentGrid *col_seg_grid) {
1221   ColSegment_IT it(segments);
1222   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1223     ColSegment *seg = it.extract();
1224     col_seg_grid->InsertBBox(true, true, seg);
1225   }
1226 }
1227 
1228 // Merge column blocks if a split is detected due to the presence of a
1229 // table. A text block is considered split if it has multiple
1230 // neighboring blocks above/below it, and at least one of the
1231 // neighboring blocks is of table type (has a high density of table
1232 // partitions). In this case neighboring blocks in the direction
1233 // (above/below) of the table block are merged with the text block.
1234 
1235 // Comment: This method does not handle split due to a full page table
1236 // since table columns in this case do not have a text column on which
1237 // split decision can be based.
GridMergeColumnBlocks()1238 void TableFinder::GridMergeColumnBlocks() {
1239   int margin = gridsize();
1240 
1241   // Iterate the Column Blocks in the grid.
1242   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch(
1243       &col_seg_grid_);
1244   gsearch.StartFullSearch();
1245   ColSegment *seg;
1246   while ((seg = gsearch.NextFullSearch()) != nullptr) {
1247     if (seg->type() != COL_TEXT) {
1248       continue; // only consider text blocks for split detection
1249     }
1250     bool neighbor_found = false;
1251     bool modified = false; // Modified at least once
1252     // keep expanding current box as long as neighboring table columns
1253     // are found above or below it.
1254     do {
1255       TBOX box = seg->bounding_box();
1256       // slightly expand the search region vertically
1257       int top_range =
1258           std::min(box.top() + margin, static_cast<int>(tright().y()));
1259       int bottom_range =
1260           std::max(box.bottom() - margin, static_cast<int>(bleft().y()));
1261       box.set_top(top_range);
1262       box.set_bottom(bottom_range);
1263       neighbor_found = false;
1264       GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> rectsearch(
1265           &col_seg_grid_);
1266       rectsearch.StartRectSearch(box);
1267       ColSegment *neighbor = nullptr;
1268       while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1269         if (neighbor == seg) {
1270           continue;
1271         }
1272         const TBOX &neighbor_box = neighbor->bounding_box();
1273         // If the neighbor box significantly overlaps with the current
1274         // box (due to the expansion of the current box in the
1275         // previous iteration of this loop), remove the neighbor box
1276         // and expand the current box to include it.
1277         if (neighbor_box.overlap_fraction(box) >= 0.9) {
1278           seg->InsertBox(neighbor_box);
1279           modified = true;
1280           rectsearch.RemoveBBox();
1281           gsearch.RepositionIterator();
1282           delete neighbor;
1283           continue;
1284         }
1285         // Only expand if the neighbor box is of table type
1286         if (neighbor->type() != COL_TABLE) {
1287           continue;
1288         }
1289         // Insert the neighbor box into the current column block
1290         if (neighbor_box.major_x_overlap(box) && !box.contains(neighbor_box)) {
1291           seg->InsertBox(neighbor_box);
1292           neighbor_found = true;
1293           modified = true;
1294           rectsearch.RemoveBBox();
1295           gsearch.RepositionIterator();
1296           delete neighbor;
1297         }
1298       }
1299     } while (neighbor_found);
1300     if (modified) {
1301       // Because the box has changed, it has to be removed first.
1302       gsearch.RemoveBBox();
1303       col_seg_grid_.InsertBBox(true, true, seg);
1304       gsearch.RepositionIterator();
1305     }
1306   }
1307 }
1308 
1309 // Group horizontally overlapping table partitions into table columns.
1310 // TODO(faisal): This is too aggressive at the moment. The method should
1311 // consider more attributes to group table partitions together. Some common
1312 // errors are:
1313 //  1- page number is merged with a table column above it even
1314 //      if there is a large vertical gap between them.
1315 //  2- column headers go on to catch one of the columns arbitrarily
1316 //  3- an isolated noise blob near page top or bottom merges with the table
1317 //     column below/above it
1318 //  4- cells from two vertically adjacent tables merge together to make a
1319 //     single column resulting in merging of the two tables
GetTableColumns(ColSegment_LIST * table_columns)1320 void TableFinder::GetTableColumns(ColSegment_LIST *table_columns) {
1321   ColSegment_IT it(table_columns);
1322   // Iterate the ColPartitions in the grid.
1323   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(
1324       &clean_part_grid_);
1325   gsearch.StartFullSearch();
1326   ColPartition *part;
1327   while ((part = gsearch.NextFullSearch()) != nullptr) {
1328     if (part->inside_table_column() || part->type() != PT_TABLE) {
1329       continue; // prevent a partition to be assigned to multiple columns
1330     }
1331     const TBOX &box = part->bounding_box();
1332     auto *col = new ColSegment();
1333     col->InsertBox(box);
1334     part->set_inside_table_column(true);
1335     // Start a search below the current cell to find bottom neighbours
1336     // Note: a full search will always process things above it first, so
1337     // this should be starting at the highest cell and working its way down.
1338     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> vsearch(
1339         &clean_part_grid_);
1340     vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
1341     ColPartition *neighbor = nullptr;
1342     bool found_neighbours = false;
1343     while ((neighbor = vsearch.NextVerticalSearch(true)) != nullptr) {
1344       // only consider neighbors not assigned to any column yet
1345       if (neighbor->inside_table_column()) {
1346         continue;
1347       }
1348       // Horizontal lines should not break the flow
1349       if (neighbor->IsHorizontalLine()) {
1350         continue;
1351       }
1352       // presence of a non-table neighbor marks the end of current
1353       // table column
1354       if (neighbor->type() != PT_TABLE) {
1355         break;
1356       }
1357       // add the neighbor partition to the table column
1358       const TBOX &neighbor_box = neighbor->bounding_box();
1359       col->InsertBox(neighbor_box);
1360       neighbor->set_inside_table_column(true);
1361       found_neighbours = true;
1362     }
1363     if (found_neighbours) {
1364       it.add_after_then_move(col);
1365     } else {
1366       part->set_inside_table_column(false);
1367       delete col;
1368     }
1369   }
1370 }
1371 
1372 // Mark regions in a column that are x-bounded by the column boundaries and
1373 // y-bounded by the table columns' projection on the y-axis as table regions
GetTableRegions(ColSegment_LIST * table_columns,ColSegment_LIST * table_regions)1374 void TableFinder::GetTableRegions(ColSegment_LIST *table_columns,
1375                                   ColSegment_LIST *table_regions) {
1376   ColSegment_IT cit(table_columns);
1377   ColSegment_IT rit(table_regions);
1378   // Iterate through column blocks
1379   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch(
1380       &col_seg_grid_);
1381   gsearch.StartFullSearch();
1382   ColSegment *part;
1383   int page_height = tright().y() - bleft().y();
1384   ASSERT_HOST(page_height > 0);
1385   // create a bool array to hold projection on y-axis
1386   bool *table_region = new bool[page_height];
1387   while ((part = gsearch.NextFullSearch()) != nullptr) {
1388     const TBOX &part_box = part->bounding_box();
1389     // reset the projection array
1390     for (int i = 0; i < page_height; i++) {
1391       table_region[i] = false;
1392     }
1393     // iterate through all table columns to find regions in the current
1394     // page column block
1395     cit.move_to_first();
1396     for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1397       TBOX col_box = cit.data()->bounding_box();
1398       // find intersection region of table column and page column
1399       TBOX intersection_box = col_box.intersection(part_box);
1400       // project table column on the y-axis
1401       for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
1402         table_region[i - bleft().y()] = true;
1403       }
1404     }
1405     // set x-limits of table regions to page column width
1406     TBOX current_table_box;
1407     current_table_box.set_left(part_box.left());
1408     current_table_box.set_right(part_box.right());
1409     // go through the y-axis projection to find runs of table
1410     // regions. Each run makes one table region.
1411     for (int i = 1; i < page_height; i++) {
1412       // detect start of a table region
1413       if (!table_region[i - 1] && table_region[i]) {
1414         current_table_box.set_bottom(i + bleft().y());
1415       }
1416       // TODO(nbeato): Is it guaranteed that the last row is not a table region?
1417       // detect end of a table region
1418       if (table_region[i - 1] && !table_region[i]) {
1419         current_table_box.set_top(i + bleft().y());
1420         if (!current_table_box.null_box()) {
1421           auto *seg = new ColSegment();
1422           seg->InsertBox(current_table_box);
1423           rit.add_after_then_move(seg);
1424         }
1425       }
1426     }
1427   }
1428   delete[] table_region;
1429 }
1430 
1431 // Merge table regions corresponding to tables spanning multiple columns if
1432 // there is a colpartition (horizontal ruling line or normal text) that
1433 // touches both regions.
1434 // TODO(faisal): A rare error occurs if there are two horizontally adjacent
1435 // tables with aligned ruling lines. In this case, line finder returns a
1436 // single line and hence the tables get merged together
GridMergeTableRegions()1437 void TableFinder::GridMergeTableRegions() {
1438   // Iterate the table regions in the grid.
1439   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch(
1440       &table_grid_);
1441   gsearch.StartFullSearch();
1442   ColSegment *seg = nullptr;
1443   while ((seg = gsearch.NextFullSearch()) != nullptr) {
1444     bool neighbor_found = false;
1445     bool modified = false; // Modified at least once
1446     do {
1447       // Start a rectangle search x-bounded by the image and y by the table
1448       const TBOX &box = seg->bounding_box();
1449       TBOX search_region(box);
1450       search_region.set_left(bleft().x());
1451       search_region.set_right(tright().x());
1452       neighbor_found = false;
1453       GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> rectsearch(
1454           &table_grid_);
1455       rectsearch.StartRectSearch(search_region);
1456       ColSegment *neighbor = nullptr;
1457       while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1458         if (neighbor == seg) {
1459           continue;
1460         }
1461         const TBOX &neighbor_box = neighbor->bounding_box();
1462         // Check if a neighbor box has a large overlap with the table
1463         // region.  This may happen as a result of merging two table
1464         // regions in the previous iteration.
1465         if (neighbor_box.overlap_fraction(box) >= 0.9) {
1466           seg->InsertBox(neighbor_box);
1467           rectsearch.RemoveBBox();
1468           gsearch.RepositionIterator();
1469           delete neighbor;
1470           modified = true;
1471           continue;
1472         }
1473         // Check if two table regions belong together based on a common
1474         // horizontal ruling line
1475         if (BelongToOneTable(box, neighbor_box)) {
1476           seg->InsertBox(neighbor_box);
1477           neighbor_found = true;
1478           modified = true;
1479           rectsearch.RemoveBBox();
1480           gsearch.RepositionIterator();
1481           delete neighbor;
1482         }
1483       }
1484     } while (neighbor_found);
1485     if (modified) {
1486       // Because the box has changed, it has to be removed first.
1487       gsearch.RemoveBBox();
1488       table_grid_.InsertBBox(true, true, seg);
1489       gsearch.RepositionIterator();
1490     }
1491   }
1492 }
1493 
1494 // Decide if two table regions belong to one table based on a common
1495 // horizontal ruling line or another colpartition
BelongToOneTable(const TBOX & box1,const TBOX & box2)1496 bool TableFinder::BelongToOneTable(const TBOX &box1, const TBOX &box2) {
1497   // Check the obvious case. Most likely not true because overlapping boxes
1498   // should already be merged, but seems like a good thing to do in case things
1499   // change.
1500   if (box1.overlap(box2)) {
1501     return true;
1502   }
1503   // Check for ColPartitions spanning both table regions
1504   TBOX bbox = box1.bounding_union(box2);
1505   // Start a rect search on bbox
1506   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rectsearch(
1507       &clean_part_grid_);
1508   rectsearch.StartRectSearch(bbox);
1509   ColPartition *part = nullptr;
1510   while ((part = rectsearch.NextRectSearch()) != nullptr) {
1511     const TBOX &part_box = part->bounding_box();
1512     // return true if a colpartition spanning both table regions is found
1513     if (part_box.overlap(box1) && part_box.overlap(box2) &&
1514         !part->IsImageType()) {
1515       return true;
1516     }
1517   }
1518   return false;
1519 }
1520 
1521 // Adjust table boundaries by:
1522 //  - building a tight bounding box around all ColPartitions contained in it.
1523 //  - expanding table boundaries to include all colpartitions that overlap the
1524 //    table by more than half of their area
1525 //  - expanding table boundaries to include nearby horizontal rule lines
1526 //  - expanding table vertically to include left out column headers
1527 // TODO(faisal): Expansion of table boundaries is quite aggressive. It usually
1528 //               makes following errors:
1529 //  1- horizontal lines consisting of underlines are included in the table if
1530 //     they are close enough
1531 //  2- horizontal lines originating from noise tend to get merged with a table
1532 //     near the top of the page
1533 //  3- the criteria for including horizontal lines is very generous. Many times
1534 //     horizontal lines separating headers and footers get merged with a
1535 //     single-column table in a multi-column page thereby including text
1536 //     from the neighboring column inside the table
1537 //  4- the criteria for including left out column headers also tends to
1538 //     occasionally include text-lines above the tables, typically from
1539 //     table caption
AdjustTableBoundaries()1540 void TableFinder::AdjustTableBoundaries() {
1541   // Iterate the table regions in the grid
1542   ColSegment_CLIST adjusted_tables;
1543   ColSegment_C_IT it(&adjusted_tables);
1544   ColSegmentGridSearch gsearch(&table_grid_);
1545   gsearch.StartFullSearch();
1546   ColSegment *table = nullptr;
1547   while ((table = gsearch.NextFullSearch()) != nullptr) {
1548     const TBOX &table_box = table->bounding_box();
1549     TBOX grown_box = table_box;
1550     GrowTableBox(table_box, &grown_box);
1551     // To prevent a table from expanding again, do not insert the
1552     // modified box back to the grid. Instead move it to a list and
1553     // and remove it from the grid. The list is moved later back to the grid.
1554     if (!grown_box.null_box()) {
1555       auto *col = new ColSegment();
1556       col->InsertBox(grown_box);
1557       it.add_after_then_move(col);
1558     }
1559     gsearch.RemoveBBox();
1560     delete table;
1561   }
1562   // clear table grid to move final tables in it
1563   // TODO(nbeato): table_grid_ should already be empty. The above loop
1564   // removed everything. Maybe just assert it is empty?
1565   table_grid_.Clear();
1566   it.move_to_first();
1567   // move back final tables to table_grid_
1568   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1569     ColSegment *seg = it.extract();
1570     table_grid_.InsertBBox(true, true, seg);
1571   }
1572 }
1573 
GrowTableBox(const TBOX & table_box,TBOX * result_box)1574 void TableFinder::GrowTableBox(const TBOX &table_box, TBOX *result_box) {
1575   // TODO(nbeato): The growing code is a bit excessive right now.
1576   // By removing these lines, the partitions considered need
1577   // to have some overlap or be special cases. These lines could
1578   // be added again once a check is put in place to make sure that
1579   // growing tables don't stomp on a lot of non-table partitions.
1580 
1581   // search for horizontal ruling lines within the vertical margin
1582   // int vertical_margin = kRulingVerticalMargin * gridsize();
1583   TBOX search_box = table_box;
1584   // int top = MIN(search_box.top() + vertical_margin, tright().y());
1585   // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
1586   // search_box.set_top(top);
1587   // search_box.set_bottom(bottom);
1588 
1589   GrowTableToIncludePartials(table_box, search_box, result_box);
1590   GrowTableToIncludeLines(table_box, search_box, result_box);
1591   IncludeLeftOutColumnHeaders(result_box);
1592 }
1593 
1594 // Grow a table by increasing the size of the box to include
1595 // partitions with significant overlap with the table.
GrowTableToIncludePartials(const TBOX & table_box,const TBOX & search_range,TBOX * result_box)1596 void TableFinder::GrowTableToIncludePartials(const TBOX &table_box,
1597                                              const TBOX &search_range,
1598                                              TBOX *result_box) {
1599   // Rulings are in a different grid, so search 2 grids for rulings, text,
1600   // and table partitions that are not entirely within the new box.
1601   for (int i = 0; i < 2; ++i) {
1602     ColPartitionGrid *grid =
1603         (i == 0) ? &fragmented_text_grid_ : &leader_and_ruling_grid_;
1604     ColPartitionGridSearch rectsearch(grid);
1605     rectsearch.StartRectSearch(search_range);
1606     ColPartition *part = nullptr;
1607     while ((part = rectsearch.NextRectSearch()) != nullptr) {
1608       // Only include text and table types.
1609       if (part->IsImageType()) {
1610         continue;
1611       }
1612       const TBOX &part_box = part->bounding_box();
1613       // Include partition in the table if more than half of it
1614       // is covered by the table
1615       if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1616         *result_box = result_box->bounding_union(part_box);
1617         continue;
1618       }
1619     }
1620   }
1621 }
1622 
1623 // Grow a table by expanding to the extents of significantly
1624 // overlapping lines.
GrowTableToIncludeLines(const TBOX & table_box,const TBOX & search_range,TBOX * result_box)1625 void TableFinder::GrowTableToIncludeLines(const TBOX &table_box,
1626                                           const TBOX &search_range,
1627                                           TBOX *result_box) {
1628   ColPartitionGridSearch rsearch(&leader_and_ruling_grid_);
1629   rsearch.SetUniqueMode(true);
1630   rsearch.StartRectSearch(search_range);
1631   ColPartition *part = nullptr;
1632   while ((part = rsearch.NextRectSearch()) != nullptr) {
1633     // TODO(nbeato) This should also do vertical, but column
1634     // boundaries are breaking things. This function needs to be
1635     // updated to allow vertical lines as well.
1636     if (!part->IsLineType()) {
1637       continue;
1638     }
1639     // Avoid the following function call if the result of the
1640     // function is irrelevant.
1641     const TBOX &part_box = part->bounding_box();
1642     if (result_box->contains(part_box)) {
1643       continue;
1644     }
1645     // Include a partially overlapping horizontal line only if the
1646     // extra ColPartitions that will be included due to expansion
1647     // have large side spacing w.r.t. columns containing them.
1648     if (HLineBelongsToTable(*part, table_box)) {
1649       *result_box = result_box->bounding_union(part_box);
1650     }
1651     // TODO(nbeato): Vertical
1652   }
1653 }
1654 
1655 // Checks whether the horizontal line belong to the table by looking at the
1656 // side spacing of extra ColParitions that will be included in the table
1657 // due to expansion
HLineBelongsToTable(const ColPartition & part,const TBOX & table_box)1658 bool TableFinder::HLineBelongsToTable(const ColPartition &part,
1659                                       const TBOX &table_box) {
1660   if (!part.IsHorizontalLine()) {
1661     return false;
1662   }
1663   const TBOX &part_box = part.bounding_box();
1664   if (!part_box.major_x_overlap(table_box)) {
1665     return false;
1666   }
1667   // Do not consider top-most horizontal line since it usually
1668   // originates from noise.
1669   // TODO(nbeato): I had to comment this out because the ruling grid doesn't
1670   // have neighbors solved.
1671   // if (!part.nearest_neighbor_above())
1672   //   return false;
1673   const TBOX bbox = part_box.bounding_union(table_box);
1674   // In the "unioned table" box (the table extents expanded by the line),
1675   // keep track of how many partitions have significant padding to the left
1676   // and right. If more than half of the partitions covered by the new table
1677   // have significant spacing, the line belongs to the table and the table
1678   // grows to include all of the partitions.
1679   int num_extra_partitions = 0;
1680   int extra_space_to_right = 0;
1681   int extra_space_to_left = 0;
1682   // Rulings are in a different grid, so search 2 grids for rulings, text,
1683   // and table partitions that are introduced by the new box.
1684   for (int i = 0; i < 2; ++i) {
1685     ColPartitionGrid *grid =
1686         (i == 0) ? &clean_part_grid_ : &leader_and_ruling_grid_;
1687     // Start a rect search on bbox
1688     ColPartitionGridSearch rectsearch(grid);
1689     rectsearch.SetUniqueMode(true);
1690     rectsearch.StartRectSearch(bbox);
1691     ColPartition *extra_part = nullptr;
1692     while ((extra_part = rectsearch.NextRectSearch()) != nullptr) {
1693       // ColPartition already in table
1694       const TBOX &extra_part_box = extra_part->bounding_box();
1695       if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1696         continue;
1697       }
1698       // Non-text ColPartitions do not contribute
1699       if (extra_part->IsImageType()) {
1700         continue;
1701       }
1702       // Consider this partition.
1703       num_extra_partitions++;
1704       // presence of a table cell is a strong hint, so just increment the scores
1705       // without looking at the spacing.
1706       if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
1707         extra_space_to_right++;
1708         extra_space_to_left++;
1709         continue;
1710       }
1711       int space_threshold = kSideSpaceMargin * part.median_height();
1712       if (extra_part->space_to_right() > space_threshold) {
1713         extra_space_to_right++;
1714       }
1715       if (extra_part->space_to_left() > space_threshold) {
1716         extra_space_to_left++;
1717       }
1718     }
1719   }
1720   // tprintf("%d %d %d\n",
1721   // num_extra_partitions,extra_space_to_right,extra_space_to_left);
1722   return (extra_space_to_right > num_extra_partitions / 2) ||
1723          (extra_space_to_left > num_extra_partitions / 2);
1724 }
1725 
1726 // Look for isolated column headers above the given table box and
1727 // include them in the table
IncludeLeftOutColumnHeaders(TBOX * table_box)1728 void TableFinder::IncludeLeftOutColumnHeaders(TBOX *table_box) {
1729   // Start a search above the current table to look for column headers
1730   ColPartitionGridSearch vsearch(&clean_part_grid_);
1731   vsearch.StartVerticalSearch(table_box->left(), table_box->right(),
1732                               table_box->top());
1733   ColPartition *neighbor = nullptr;
1734   ColPartition *previous_neighbor = nullptr;
1735   while ((neighbor = vsearch.NextVerticalSearch(false)) != nullptr) {
1736     // Max distance to find a table heading.
1737     const int max_distance =
1738         kMaxColumnHeaderDistance * neighbor->median_height();
1739     int table_top = table_box->top();
1740     const TBOX &box = neighbor->bounding_box();
1741     // Do not continue if the next box is way above
1742     if (box.bottom() - table_top > max_distance) {
1743       break;
1744     }
1745     // Unconditionally include partitions of type TABLE or LINE
1746     // TODO(faisal): add some reasonable conditions here
1747     if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
1748       table_box->set_top(box.top());
1749       previous_neighbor = nullptr;
1750       continue;
1751     }
1752     // If there are two text partitions, one above the other, without a table
1753     // cell on their left or right side, consider them a barrier and quit
1754     if (previous_neighbor == nullptr) {
1755       previous_neighbor = neighbor;
1756     } else {
1757       const TBOX &previous_box = previous_neighbor->bounding_box();
1758       if (!box.major_y_overlap(previous_box)) {
1759         break;
1760       }
1761     }
1762   }
1763 }
1764 
1765 // Remove false alarms consisting of a single column based on their
1766 // projection on the x-axis. Projection of a real table on the x-axis
1767 // should have at least one zero-valley larger than the global median
1768 // x-height of the page.
DeleteSingleColumnTables()1769 void TableFinder::DeleteSingleColumnTables() {
1770   int page_width = tright().x() - bleft().x();
1771   ASSERT_HOST(page_width > 0);
1772   // create an integer array to hold projection on x-axis
1773   int *table_xprojection = new int[page_width];
1774   // Iterate through all tables in the table grid
1775   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> table_search(
1776       &table_grid_);
1777   table_search.StartFullSearch();
1778   ColSegment *table;
1779   while ((table = table_search.NextFullSearch()) != nullptr) {
1780     TBOX table_box = table->bounding_box();
1781     // reset the projection array
1782     for (int i = 0; i < page_width; i++) {
1783       table_xprojection[i] = 0;
1784     }
1785     // Start a rect search on table_box
1786     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rectsearch(
1787         &clean_part_grid_);
1788     rectsearch.SetUniqueMode(true);
1789     rectsearch.StartRectSearch(table_box);
1790     ColPartition *part;
1791     while ((part = rectsearch.NextRectSearch()) != nullptr) {
1792       if (!part->IsTextType()) {
1793         continue; // Do not consider non-text partitions
1794       }
1795       if (part->flow() == BTFT_LEADER) {
1796         continue; // Assume leaders are in tables
1797       }
1798       TBOX part_box = part->bounding_box();
1799       // Do not consider partitions partially covered by the table
1800       if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable) {
1801         continue;
1802       }
1803       BLOBNBOX_CLIST *part_boxes = part->boxes();
1804       BLOBNBOX_C_IT pit(part_boxes);
1805 
1806       // Make sure overlapping blobs don't artificially inflate the number
1807       // of rows in the table. This happens frequently with things such as
1808       // decimals and split characters. Do this by assuming the column
1809       // partition is sorted mostly left to right and just clip
1810       // bounding boxes by the previous box's extent.
1811       int next_position_to_write = 0;
1812 
1813       for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1814         BLOBNBOX *pblob = pit.data();
1815         // ignore blob height for the purpose of projection since we
1816         // are only interested in finding valleys
1817         int xstart = pblob->bounding_box().left();
1818         int xend = pblob->bounding_box().right();
1819 
1820         xstart = std::max(xstart, next_position_to_write);
1821         for (int i = xstart; i < xend; i++) {
1822           table_xprojection[i - bleft().x()]++;
1823         }
1824         next_position_to_write = xend;
1825       }
1826     }
1827     // Find largest valley between two reasonable peaks in the table
1828     if (!GapInXProjection(table_xprojection, page_width)) {
1829       table_search.RemoveBBox();
1830       delete table;
1831     }
1832   }
1833   delete[] table_xprojection;
1834 }
1835 
1836 // Return true if at least one gap larger than the global x-height
1837 // exists in the horizontal projection
GapInXProjection(int * xprojection,int length)1838 bool TableFinder::GapInXProjection(int *xprojection, int length) {
1839   // Find peak value of the histogram
1840   int peak_value = 0;
1841   for (int i = 0; i < length; i++) {
1842     if (xprojection[i] > peak_value) {
1843       peak_value = xprojection[i];
1844     }
1845   }
1846   // Peak value represents the maximum number of horizontally
1847   // overlapping colpartitions, so this can be considered as the
1848   // number of rows in the table
1849   if (peak_value < kMinRowsInTable) {
1850     return false;
1851   }
1852   double projection_threshold = kSmallTableProjectionThreshold * peak_value;
1853   if (peak_value >= kLargeTableRowCount) {
1854     projection_threshold = kLargeTableProjectionThreshold * peak_value;
1855   }
1856   // Threshold the histogram
1857   for (int i = 0; i < length; i++) {
1858     xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1859   }
1860   // Find the largest run of zeros between two ones
1861   int largest_gap = 0;
1862   int run_start = -1;
1863   for (int i = 1; i < length; i++) {
1864     // detect start of a run of zeros
1865     if (xprojection[i - 1] && !xprojection[i]) {
1866       run_start = i;
1867     }
1868     // detect end of a run of zeros and update the value of largest gap
1869     if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1870       int gap = i - run_start;
1871       if (gap > largest_gap) {
1872         largest_gap = gap;
1873       }
1874       run_start = -1;
1875     }
1876   }
1877   return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;
1878 }
1879 
1880 // Given the location of a table "guess", try to overlay a cellular
1881 // grid in the location, adjusting the boundaries.
1882 // TODO(nbeato): Falsely introduces:
1883 //   -headers/footers (not any worse, too much overlap destroys cells)
1884 //   -page numbers (not worse, included because maximize margins)
1885 //   -equations (nicely fit into a celluar grid, but more sparsely)
1886 //   -figures (random text box, also sparse)
1887 //   -small left-aligned text areas with overlapping positioned whitespace
1888 //       (rejected before)
1889 // Overall, this just needs some more work.
RecognizeTables()1890 void TableFinder::RecognizeTables() {
1891 #ifndef GRAPHICS_DISABLED
1892   ScrollView *table_win = nullptr;
1893   if (textord_show_tables) {
1894     table_win = MakeWindow(0, 0, "Table Structure");
1895     DisplayColPartitions(table_win, &fragmented_text_grid_, ScrollView::BLUE,
1896                          ScrollView::LIGHT_BLUE);
1897     // table_grid_.DisplayBoxes(table_win);
1898   }
1899 #endif
1900 
1901   TableRecognizer recognizer;
1902   recognizer.Init();
1903   recognizer.set_line_grid(&leader_and_ruling_grid_);
1904   recognizer.set_text_grid(&fragmented_text_grid_);
1905   recognizer.set_max_text_height(global_median_xheight_ * 2.0);
1906   recognizer.set_min_height(1.5 * gridheight());
1907   // Loop over all of the tables and try to fit them.
1908   // Store the good tables here.
1909   ColSegment_CLIST good_tables;
1910   ColSegment_C_IT good_it(&good_tables);
1911 
1912   ColSegmentGridSearch gsearch(&table_grid_);
1913   gsearch.StartFullSearch();
1914   ColSegment *found_table = nullptr;
1915   while ((found_table = gsearch.NextFullSearch()) != nullptr) {
1916     gsearch.RemoveBBox();
1917 
1918     // The goal is to make the tables persistent in a list.
1919     // When that happens, this will move into the search loop.
1920     const TBOX &found_box = found_table->bounding_box();
1921     StructuredTable *table_structure = recognizer.RecognizeTable(found_box);
1922 
1923     // Process a table. Good tables are inserted into the grid again later on
1924     // We can't change boxes in the grid while it is running a search.
1925     if (table_structure != nullptr) {
1926 #ifndef GRAPHICS_DISABLED
1927       if (textord_show_tables) {
1928         table_structure->Display(table_win, ScrollView::LIME_GREEN);
1929       }
1930 #endif
1931       found_table->set_bounding_box(table_structure->bounding_box());
1932       delete table_structure;
1933       good_it.add_after_then_move(found_table);
1934     } else {
1935       delete found_table;
1936     }
1937   }
1938   // TODO(nbeato): MERGE!! There is awesome info now available for merging.
1939 
1940   // At this point, the grid is empty. We can safely insert the good tables
1941   // back into grid.
1942   for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward()) {
1943     table_grid_.InsertBBox(true, true, good_it.extract());
1944   }
1945 }
1946 
1947 #ifndef GRAPHICS_DISABLED
1948 
1949 // Displays the column segments in some window.
DisplayColSegments(ScrollView * win,ColSegment_LIST * segments,ScrollView::Color color)1950 void TableFinder::DisplayColSegments(ScrollView *win, ColSegment_LIST *segments,
1951                                      ScrollView::Color color) {
1952   win->Pen(color);
1953   win->Brush(ScrollView::NONE);
1954   ColSegment_IT it(segments);
1955   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1956     ColSegment *col = it.data();
1957     const TBOX &box = col->bounding_box();
1958     int left_x = box.left();
1959     int right_x = box.right();
1960     int top_y = box.top();
1961     int bottom_y = box.bottom();
1962     win->Rectangle(left_x, bottom_y, right_x, top_y);
1963   }
1964   win->UpdateWindow();
1965 }
1966 
1967 // Displays the colpartitions using a new coloring on an existing window.
1968 // Note: This method is only for debug purpose during development and
1969 // would not be part of checked in code
DisplayColPartitions(ScrollView * win,ColPartitionGrid * grid,ScrollView::Color default_color,ScrollView::Color table_color)1970 void TableFinder::DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid,
1971                                        ScrollView::Color default_color,
1972                                        ScrollView::Color table_color) {
1973   ScrollView::Color color = default_color;
1974   // Iterate the ColPartitions in the grid.
1975   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(grid);
1976   gsearch.StartFullSearch();
1977   ColPartition *part = nullptr;
1978   while ((part = gsearch.NextFullSearch()) != nullptr) {
1979     color = default_color;
1980     if (part->type() == PT_TABLE) {
1981       color = table_color;
1982     }
1983 
1984     const TBOX &box = part->bounding_box();
1985     int left_x = box.left();
1986     int right_x = box.right();
1987     int top_y = box.top();
1988     int bottom_y = box.bottom();
1989     win->Brush(ScrollView::NONE);
1990     win->Pen(color);
1991     win->Rectangle(left_x, bottom_y, right_x, top_y);
1992   }
1993   win->UpdateWindow();
1994 }
1995 
DisplayColPartitions(ScrollView * win,ColPartitionGrid * grid,ScrollView::Color default_color)1996 void TableFinder::DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid,
1997                                        ScrollView::Color default_color) {
1998   DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);
1999 }
2000 
DisplayColPartitionConnections(ScrollView * win,ColPartitionGrid * grid,ScrollView::Color color)2001 void TableFinder::DisplayColPartitionConnections(ScrollView *win,
2002                                                  ColPartitionGrid *grid,
2003                                                  ScrollView::Color color) {
2004   // Iterate the ColPartitions in the grid.
2005   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(grid);
2006   gsearch.StartFullSearch();
2007   ColPartition *part = nullptr;
2008   while ((part = gsearch.NextFullSearch()) != nullptr) {
2009     const TBOX &box = part->bounding_box();
2010     int left_x = box.left();
2011     int right_x = box.right();
2012     int top_y = box.top();
2013     int bottom_y = box.bottom();
2014 
2015     ColPartition *upper_part = part->nearest_neighbor_above();
2016     if (upper_part) {
2017       const TBOX &upper_box = upper_part->bounding_box();
2018       int mid_x = (left_x + right_x) / 2;
2019       int mid_y = (top_y + bottom_y) / 2;
2020       int other_x = (upper_box.left() + upper_box.right()) / 2;
2021       int other_y = (upper_box.top() + upper_box.bottom()) / 2;
2022       win->Brush(ScrollView::NONE);
2023       win->Pen(color);
2024       win->Line(mid_x, mid_y, other_x, other_y);
2025     }
2026     ColPartition *lower_part = part->nearest_neighbor_below();
2027     if (lower_part) {
2028       const TBOX &lower_box = lower_part->bounding_box();
2029       int mid_x = (left_x + right_x) / 2;
2030       int mid_y = (top_y + bottom_y) / 2;
2031       int other_x = (lower_box.left() + lower_box.right()) / 2;
2032       int other_y = (lower_box.top() + lower_box.bottom()) / 2;
2033       win->Brush(ScrollView::NONE);
2034       win->Pen(color);
2035       win->Line(mid_x, mid_y, other_x, other_y);
2036     }
2037   }
2038   win->UpdateWindow();
2039 }
2040 
2041 #endif
2042 
2043 // Merge all colpartitions in table regions to make them a single
2044 // colpartition and revert types of isolated table cells not
2045 // assigned to any table to their original types.
MakeTableBlocks(ColPartitionGrid * grid,ColPartitionSet ** all_columns,const WidthCallback & width_cb)2046 void TableFinder::MakeTableBlocks(ColPartitionGrid *grid,
2047                                   ColPartitionSet **all_columns,
2048                                   const WidthCallback &width_cb) {
2049   // Since we have table blocks already, remove table tags from all
2050   // colpartitions
2051   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(grid);
2052   gsearch.StartFullSearch();
2053   ColPartition *part = nullptr;
2054 
2055   while ((part = gsearch.NextFullSearch()) != nullptr) {
2056     if (part->type() == PT_TABLE) {
2057       part->clear_table_type();
2058     }
2059   }
2060   // Now make a single colpartition out of each table block and remove
2061   // all colpartitions contained within a table
2062   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> table_search(
2063       &table_grid_);
2064   table_search.StartFullSearch();
2065   ColSegment *table;
2066   while ((table = table_search.NextFullSearch()) != nullptr) {
2067     const TBOX &table_box = table->bounding_box();
2068     // Start a rect search on table_box
2069     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rectsearch(
2070         grid);
2071     rectsearch.StartRectSearch(table_box);
2072     ColPartition *part;
2073     ColPartition *table_partition = nullptr;
2074     while ((part = rectsearch.NextRectSearch()) != nullptr) {
2075       // Do not consider image partitions
2076       if (!part->IsTextType()) {
2077         continue;
2078       }
2079       TBOX part_box = part->bounding_box();
2080       // Include partition in the table if more than half of it
2081       // is covered by the table
2082       if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
2083         rectsearch.RemoveBBox();
2084         if (table_partition) {
2085           table_partition->Absorb(part, width_cb);
2086         } else {
2087           table_partition = part;
2088         }
2089       }
2090     }
2091     // Insert table colpartition back to part_grid_
2092     if (table_partition) {
2093       // To match the columns used when transforming to blocks, the new table
2094       // partition must have its first and last column set at the grid y that
2095       // corresponds to its bottom.
2096       const TBOX &table_box = table_partition->bounding_box();
2097       int grid_x, grid_y;
2098       grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
2099       table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
2100       table_partition->set_table_type();
2101       table_partition->set_blob_type(BRT_TEXT);
2102       table_partition->set_flow(BTFT_CHAIN);
2103       table_partition->SetBlobTypes();
2104       grid->InsertBBox(true, true, table_partition);
2105     }
2106   }
2107 }
2108 
2109 //////// ColSegment code
2110 ////////
ColSegment()2111 ColSegment::ColSegment()
2112     : ELIST_LINK(),
2113       num_table_cells_(0),
2114       num_text_cells_(0),
2115       type_(COL_UNKNOWN) {}
2116 
2117 // Provides a color for BBGrid to draw the rectangle.
BoxColor() const2118 ScrollView::Color ColSegment::BoxColor() const {
2119   const ScrollView::Color kBoxColors[PT_COUNT] = {
2120       ScrollView::YELLOW,
2121       ScrollView::BLUE,
2122       ScrollView::YELLOW,
2123       ScrollView::MAGENTA,
2124   };
2125   return kBoxColors[type_];
2126 }
2127 
2128 // Insert a box into this column segment
InsertBox(const TBOX & other)2129 void ColSegment::InsertBox(const TBOX &other) {
2130   bounding_box_ = bounding_box_.bounding_union(other);
2131 }
2132 
2133 // Set column segment type based on the ratio of text and table partitions
2134 // in it.
set_type()2135 void ColSegment::set_type() {
2136   if (num_table_cells_ > kTableColumnThreshold * num_text_cells_) {
2137     type_ = COL_TABLE;
2138   } else if (num_text_cells_ > num_table_cells_) {
2139     type_ = COL_TEXT;
2140   } else {
2141     type_ = COL_MIXED;
2142   }
2143 }
2144 
2145 } // namespace tesseract.
2146