1 ///////////////////////////////////////////////////////////////////////
2 // File: tablefind.cpp
3 // Description: Helper classes to find tables from ColPartitions.
4 // Author: Faisal Shafait (faisal.shafait@dfki.de)
5 //
6 // (C) Copyright 2009, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18
19 #ifdef HAVE_CONFIG_H
20 # include "config_auto.h"
21 #endif
22
23 #include <algorithm>
24 #include <cmath>
25 #include <utility>
26 #include "tablefind.h"
27
28 #include <allheaders.h>
29
30 #include "colpartitionset.h"
31 #include "tablerecog.h"
32
33 namespace tesseract {
34
35 // These numbers are used to calculate the global median stats.
36 // They just set an upper bound on the stats objects.
37 // Maximum vertical spacing between neighbor partitions.
38 const int kMaxVerticalSpacing = 500;
39 // Maximum width of a blob in a partition.
40 const int kMaxBlobWidth = 500;
41
42 // Minimum whitespace size to split a partition (measured as a multiple
43 // of a partition's median width).
44 const double kSplitPartitionSize = 2.0;
45 // To insert text, the partition must satisfy these size constraints
46 // in AllowTextPartition(). The idea is to filter noise partitions
47 // determined by the size compared to the global medians.
48 // TODO(nbeato): Need to find good numbers again.
49 const double kAllowTextHeight = 0.5;
50 const double kAllowTextWidth = 0.6;
51 const double kAllowTextArea = 0.8;
52 // The same thing applies to blobs (to filter noise).
53 // TODO(nbeato): These numbers are a shot in the dark...
54 // height and width are 0.5 * gridsize() in colfind.cpp
55 // area is a rough guess for the size of a period.
56 const double kAllowBlobHeight = 0.3;
57 const double kAllowBlobWidth = 0.4;
58 const double kAllowBlobArea = 0.05;
59
60 // Minimum number of components in a text partition. A partition having fewer
61 // components than that is more likely a data partition and is a candidate
62 // table cell.
63 const int kMinBoxesInTextPartition = 10;
64
65 // Maximum number of components that a data partition can have
66 const int kMaxBoxesInDataPartition = 20;
67
68 // Maximum allowed gap in a text partitions as a multiple of its median size.
69 const double kMaxGapInTextPartition = 4.0;
70
71 // Minimum value that the maximum gap in a text partition should have as a
72 // factor of its median size.
73 const double kMinMaxGapInTextPartition = 0.5;
74
75 // The amount of overlap that is "normal" for adjacent blobs in a text
76 // partition. This is used to calculate gap between overlapping blobs.
77 const double kMaxBlobOverlapFactor = 4.0;
78
79 // Maximum x-height a table partition can have as a multiple of global
80 // median x-height
81 const double kMaxTableCellXheight = 2.0;
82
83 // Maximum line spacing between a table column header and column contents
84 // for merging the two (as a multiple of the partition's median_height).
85 const int kMaxColumnHeaderDistance = 4;
86
87 // Minimum ratio of num_table_partitions to num_text_partitions in a column
88 // block to be called it a table column
89 const double kTableColumnThreshold = 3.0;
90
91 // Search for horizontal ruling lines within the vertical margin as a
92 // multiple of grid size
93 // const int kRulingVerticalMargin = 3;
94
95 // Minimum overlap that a colpartition must have with a table region
96 // to become part of that table
97 const double kMinOverlapWithTable = 0.6;
98
99 // Maximum side space (distance from column boundary) that a typical
100 // text-line in flowing text should have as a multiple of its x-height
101 // (Median size).
102 const int kSideSpaceMargin = 10;
103
104 // Fraction of the peak of x-projection of a table region to set the
105 // threshold for the x-projection histogram
106 const double kSmallTableProjectionThreshold = 0.35;
107 const double kLargeTableProjectionThreshold = 0.45;
108 // Minimum number of rows required to look for more rows in the projection.
109 const int kLargeTableRowCount = 6;
110
111 // Minimum number of rows in a table
112 const int kMinRowsInTable = 3;
113
114 // The amount of padding (multiplied by global_median_xheight_ during use)
115 // that is vertically added to the search adjacent leader search during
116 // ColPartition marking.
117 const int kAdjacentLeaderSearchPadding = 2;
118
119 // Used when filtering false positives. When finding the last line
120 // of a paragraph (typically left-aligned), the previous line should have
121 // its center to the right of the last line by this scaled amount.
122 const double kParagraphEndingPreviousLineRatio = 1.3;
123
124 // The maximum amount of whitespace allowed left of a paragraph ending.
125 // Do not filter a ColPartition with more than this space left of it.
126 const double kMaxParagraphEndingLeftSpaceMultiple = 3.0;
127
128 // Used when filtering false positives. The last line of a paragraph
129 // should be preceded by a line that is predominantly text. This is the
130 // ratio of text to whitespace (to the right of the text) that is required
131 // for the previous line to be a text.
132 const double kMinParagraphEndingTextToWhitespaceRatio = 3.0;
133
134 // When counting table columns, this is the required gap between two columns
135 // (it is multiplied by global_median_xheight_).
136 const double kMaxXProjectionGapFactor = 2.0;
137
138 // Used for similarity in partitions using stroke width. Values copied
139 // from ColFind.cpp in Ray's CL.
140 const double kStrokeWidthFractionalTolerance = 0.25;
141 const double kStrokeWidthConstantTolerance = 2.0;
142
143 #ifndef GRAPHICS_DISABLED
144 static BOOL_VAR(textord_show_tables, false, "Show table regions (ScrollView)");
145 static BOOL_VAR(textord_tablefind_show_mark, false,
146 "Debug table marking steps in detail (ScrollView)");
147 static BOOL_VAR(textord_tablefind_show_stats, false,
148 "Show page stats used in table finding (ScrollView)");
149 #endif
150 static BOOL_VAR(textord_tablefind_recognize_tables, false,
151 "Enables the table recognizer for table layout and filtering.");
152
153 // Templated helper function used to create destructor callbacks for the
154 // BBGrid::ClearGridData() method.
155 template <typename T>
DeleteObject(T * object)156 void DeleteObject(T *object) {
157 delete object;
158 }
159
TableFinder()160 TableFinder::TableFinder()
161 : resolution_(0),
162 global_median_xheight_(0),
163 global_median_blob_width_(0),
164 global_median_ledding_(0),
165 left_to_right_language_(true) {}
166
~TableFinder()167 TableFinder::~TableFinder() {
168 // ColPartitions and ColSegments created by this class for storage in grids
169 // need to be deleted explicitly.
170 clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
171 leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
172 fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
173 col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
174 table_grid_.ClearGridData(&DeleteObject<ColSegment>);
175 }
176
set_left_to_right_language(bool order)177 void TableFinder::set_left_to_right_language(bool order) {
178 left_to_right_language_ = order;
179 }
180
Init(int grid_size,const ICOORD & bottom_left,const ICOORD & top_right)181 void TableFinder::Init(int grid_size, const ICOORD &bottom_left,
182 const ICOORD &top_right) {
183 // Initialize clean partitions list and grid
184 clean_part_grid_.Init(grid_size, bottom_left, top_right);
185 leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
186 fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
187 col_seg_grid_.Init(grid_size, bottom_left, top_right);
188 table_grid_.Init(grid_size, bottom_left, top_right);
189 }
190
191 // Copy cleaned partitions from part_grid_ to clean_part_grid_ and
192 // insert leaders and rulers into the leader_and_ruling_grid_
InsertCleanPartitions(ColPartitionGrid * grid,TO_BLOCK * block)193 void TableFinder::InsertCleanPartitions(ColPartitionGrid *grid,
194 TO_BLOCK *block) {
195 // Calculate stats. This lets us filter partitions in AllowTextPartition()
196 // and filter blobs in AllowBlob().
197 SetGlobalSpacings(grid);
198
199 // Iterate the ColPartitions in the grid.
200 ColPartitionGridSearch gsearch(grid);
201 gsearch.SetUniqueMode(true);
202 gsearch.StartFullSearch();
203 ColPartition *part = nullptr;
204 while ((part = gsearch.NextFullSearch()) != nullptr) {
205 // Reject partitions with nothing useful inside of them.
206 if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0) {
207 continue;
208 }
209 ColPartition *clean_part = part->ShallowCopy();
210 ColPartition *leader_part = nullptr;
211 if (part->IsLineType()) {
212 InsertRulingPartition(clean_part);
213 continue;
214 }
215 // Insert all non-text partitions to clean_parts
216 if (!part->IsTextType()) {
217 InsertImagePartition(clean_part);
218 continue;
219 }
220 // Insert text colpartitions after removing noisy components from them
221 // The leaders are split into a separate grid.
222 BLOBNBOX_CLIST *part_boxes = part->boxes();
223 BLOBNBOX_C_IT pit(part_boxes);
224 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
225 BLOBNBOX *pblob = pit.data();
226 // Bad blobs... happens in UNLV set.
227 // news.3G1, page 17 (around x=6)
228 if (!AllowBlob(*pblob)) {
229 continue;
230 }
231 if (pblob->flow() == BTFT_LEADER) {
232 if (leader_part == nullptr) {
233 leader_part = part->ShallowCopy();
234 leader_part->set_flow(BTFT_LEADER);
235 }
236 leader_part->AddBox(pblob);
237 } else if (pblob->region_type() != BRT_NOISE) {
238 clean_part->AddBox(pblob);
239 }
240 }
241 clean_part->ComputeLimits();
242 ColPartition *fragmented = clean_part->CopyButDontOwnBlobs();
243 InsertTextPartition(clean_part);
244 SplitAndInsertFragmentedTextPartition(fragmented);
245 if (leader_part != nullptr) {
246 // TODO(nbeato): Note that ComputeLimits does not update the column
247 // information. So the leader may appear to span more columns than it
248 // really does later on when IsInSameColumnAs gets called to test
249 // for adjacent leaders.
250 leader_part->ComputeLimits();
251 InsertLeaderPartition(leader_part);
252 }
253 }
254
255 // Make the partition partners better for upper and lower neighbors.
256 clean_part_grid_.FindPartitionPartners();
257 clean_part_grid_.RefinePartitionPartners(false);
258 }
259
260 // High level function to perform table detection
LocateTables(ColPartitionGrid * grid,ColPartitionSet ** all_columns,WidthCallback width_cb,const FCOORD & reskew)261 void TableFinder::LocateTables(ColPartitionGrid *grid,
262 ColPartitionSet **all_columns,
263 WidthCallback width_cb, const FCOORD &reskew) {
264 // initialize spacing, neighbors, and columns
265 InitializePartitions(all_columns);
266
267 #ifndef GRAPHICS_DISABLED
268 if (textord_show_tables) {
269 ScrollView *table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
270 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
271 DisplayColPartitions(table_win, &leader_and_ruling_grid_,
272 ScrollView::AQUAMARINE);
273 DisplayColPartitionConnections(table_win, &clean_part_grid_,
274 ScrollView::ORANGE);
275
276 table_win = MakeWindow(100, 300, "Fragmented Text");
277 DisplayColPartitions(table_win, &fragmented_text_grid_, ScrollView::BLUE);
278 }
279 #endif // !GRAPHICS_DISABLED
280
281 // mark, filter, and smooth candidate table partitions
282 MarkTablePartitions();
283
284 // Make single-column blocks from good_columns_ partitions. col_segments are
285 // moved to a grid later which takes the ownership
286 ColSegment_LIST column_blocks;
287 GetColumnBlocks(all_columns, &column_blocks);
288 // Set the ratio of candidate table partitions in each column
289 SetColumnsType(&column_blocks);
290
291 // Move column segments to col_seg_grid_
292 MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
293
294 // Detect split in column layout that might have occurred due to the
295 // presence of a table. In such a case, merge the corresponding columns.
296 GridMergeColumnBlocks();
297
298 // Group horizontally overlapping table partitions into table columns.
299 // table_columns created here get deleted at the end of this method.
300 ColSegment_LIST table_columns;
301 GetTableColumns(&table_columns);
302
303 // Within each column, mark the range table regions occupy based on the
304 // table columns detected. table_regions are moved to a grid later which
305 // takes the ownership
306 ColSegment_LIST table_regions;
307 GetTableRegions(&table_columns, &table_regions);
308
309 #ifndef GRAPHICS_DISABLED
310 if (textord_tablefind_show_mark) {
311 ScrollView *table_win = MakeWindow(1200, 300, "Table Columns and Regions");
312 DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
313 DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
314 }
315 #endif // !GRAPHICS_DISABLED
316
317 // Merge table regions across columns for tables spanning multiple
318 // columns
319 MoveColSegmentsToGrid(&table_regions, &table_grid_);
320 GridMergeTableRegions();
321
322 // Adjust table boundaries by including nearby horizontal lines and left
323 // out column headers
324 AdjustTableBoundaries();
325 GridMergeTableRegions();
326
327 if (textord_tablefind_recognize_tables) {
328 // Remove false alarms consisting of a single column
329 DeleteSingleColumnTables();
330
331 #ifndef GRAPHICS_DISABLED
332 if (textord_show_tables) {
333 ScrollView *table_win = MakeWindow(1200, 300, "Detected Table Locations");
334 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
335 DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
336 table_grid_.DisplayBoxes(table_win);
337 }
338 #endif // !GRAPHICS_DISABLED
339
340 // Find table grid structure and reject tables that are malformed.
341 RecognizeTables();
342 GridMergeTableRegions();
343 RecognizeTables();
344
345 #ifndef GRAPHICS_DISABLED
346 if (textord_show_tables) {
347 ScrollView *table_win = MakeWindow(1400, 600, "Recognized Tables");
348 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE,
349 ScrollView::BLUE);
350 table_grid_.DisplayBoxes(table_win);
351 }
352 #endif // !GRAPHICS_DISABLED
353 } else {
354 // Remove false alarms consisting of a single column
355 // TODO(nbeato): verify this is a NOP after structured table rejection.
356 // Right now it isn't. If the recognize function is doing what it is
357 // supposed to do, this function is obsolete.
358 DeleteSingleColumnTables();
359
360 #ifndef GRAPHICS_DISABLED
361 if (textord_show_tables) {
362 ScrollView *table_win = MakeWindow(1500, 300, "Detected Tables");
363 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE,
364 ScrollView::BLUE);
365 table_grid_.DisplayBoxes(table_win);
366 }
367 #endif // !GRAPHICS_DISABLED
368 }
369
370 // Merge all colpartitions in table regions to make them a single
371 // colpartition and revert types of isolated table cells not
372 // assigned to any table to their original types.
373 MakeTableBlocks(grid, all_columns, width_cb);
374 }
375 // All grids have the same dimensions. The clean_part_grid_ sizes are set from
376 // the part_grid_ that is passed to InsertCleanPartitions, which was the same as
377 // the grid that is the base of ColumnFinder. Just return the clean_part_grid_
378 // dimensions instead of duplicated memory.
gridsize() const379 int TableFinder::gridsize() const {
380 return clean_part_grid_.gridsize();
381 }
gridwidth() const382 int TableFinder::gridwidth() const {
383 return clean_part_grid_.gridwidth();
384 }
gridheight() const385 int TableFinder::gridheight() const {
386 return clean_part_grid_.gridheight();
387 }
bleft() const388 const ICOORD &TableFinder::bleft() const {
389 return clean_part_grid_.bleft();
390 }
tright() const391 const ICOORD &TableFinder::tright() const {
392 return clean_part_grid_.tright();
393 }
394
InsertTextPartition(ColPartition * part)395 void TableFinder::InsertTextPartition(ColPartition *part) {
396 ASSERT_HOST(part != nullptr);
397 if (AllowTextPartition(*part)) {
398 clean_part_grid_.InsertBBox(true, true, part);
399 } else {
400 delete part;
401 }
402 }
InsertFragmentedTextPartition(ColPartition * part)403 void TableFinder::InsertFragmentedTextPartition(ColPartition *part) {
404 ASSERT_HOST(part != nullptr);
405 if (AllowTextPartition(*part)) {
406 fragmented_text_grid_.InsertBBox(true, true, part);
407 } else {
408 delete part;
409 }
410 }
InsertLeaderPartition(ColPartition * part)411 void TableFinder::InsertLeaderPartition(ColPartition *part) {
412 ASSERT_HOST(part != nullptr);
413 if (!part->IsEmpty() && part->bounding_box().area() > 0) {
414 leader_and_ruling_grid_.InsertBBox(true, true, part);
415 } else {
416 delete part;
417 }
418 }
InsertRulingPartition(ColPartition * part)419 void TableFinder::InsertRulingPartition(ColPartition *part) {
420 leader_and_ruling_grid_.InsertBBox(true, true, part);
421 }
InsertImagePartition(ColPartition * part)422 void TableFinder::InsertImagePartition(ColPartition *part) {
423 // NOTE: If images are placed into a different grid in the future,
424 // the function SetPartitionSpacings needs to be updated. It should
425 // be the only thing that cares about image partitions.
426 clean_part_grid_.InsertBBox(true, true, part);
427 }
428
429 // Splits a partition into its "words". The splits happen
430 // at locations with wide inter-blob spacing. This is useful
431 // because it allows the table recognize to "cut through" the
432 // text lines on the page. The assumption is that a table
433 // will have several lines with similar overlapping whitespace
434 // whereas text will not have this type of property.
435 // Note: The code Assumes that blobs are sorted by the left side x!
436 // This will not work (as well) if the blobs are sorted by center/right.
SplitAndInsertFragmentedTextPartition(ColPartition * part)437 void TableFinder::SplitAndInsertFragmentedTextPartition(ColPartition *part) {
438 ASSERT_HOST(part != nullptr);
439 // Bye bye empty partitions!
440 if (part->boxes()->empty()) {
441 delete part;
442 return;
443 }
444
445 // The AllowBlob function prevents this.
446 ASSERT_HOST(part->median_width() > 0);
447 const double kThreshold = part->median_width() * kSplitPartitionSize;
448
449 ColPartition *right_part = part;
450 bool found_split = true;
451 while (found_split) {
452 found_split = false;
453 BLOBNBOX_C_IT box_it(right_part->boxes());
454 // Blobs are sorted left side first. If blobs overlap,
455 // the previous blob may have a "more right" right side.
456 // Account for this by always keeping the largest "right"
457 // so far.
458 int previous_right = INT32_MIN;
459
460 // Look for the next split in the partition.
461 for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
462 const TBOX &box = box_it.data()->bounding_box();
463 if (previous_right != INT32_MIN &&
464 box.left() - previous_right > kThreshold) {
465 // We have a split position. Split the partition in two pieces.
466 // Insert the left piece in the grid and keep processing the right.
467 int mid_x = (box.left() + previous_right) / 2;
468 ColPartition *left_part = right_part;
469 right_part = left_part->SplitAt(mid_x);
470
471 InsertFragmentedTextPartition(left_part);
472 found_split = true;
473 break;
474 }
475
476 // The right side of the previous blobs.
477 previous_right = std::max(previous_right, static_cast<int>(box.right()));
478 }
479 }
480 // When a split is not found, the right part is minimized
481 // as much as possible, so process it.
482 InsertFragmentedTextPartition(right_part);
483 }
484
485 // Some simple criteria to filter out now. We want to make sure the
486 // average blob size in the partition is consistent with the
487 // global page stats.
488 // The area metric will almost always pass for multi-blob partitions.
489 // It is useful when filtering out noise caused by an isolated blob.
AllowTextPartition(const ColPartition & part) const490 bool TableFinder::AllowTextPartition(const ColPartition &part) const {
491 const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
492 const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
493 const int median_area = global_median_xheight_ * global_median_blob_width_;
494 const double kAreaPerBlobRequired = median_area * kAllowTextArea;
495 // Keep comparisons strictly greater to disallow 0!
496 return part.median_height() > kHeightRequired &&
497 part.median_width() > kWidthRequired &&
498 part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
499 }
500
501 // Same as above, applied to blobs. Keep in mind that
502 // leaders, commas, and periods are important in tables.
AllowBlob(const BLOBNBOX & blob) const503 bool TableFinder::AllowBlob(const BLOBNBOX &blob) const {
504 const TBOX &box = blob.bounding_box();
505 const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
506 const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
507 const int median_area = global_median_xheight_ * global_median_blob_width_;
508 const double kAreaRequired = median_area * kAllowBlobArea;
509 // Keep comparisons strictly greater to disallow 0!
510 return box.height() > kHeightRequired && box.width() > kWidthRequired &&
511 box.area() > kAreaRequired;
512 }
513
514 // TODO(nbeato): The grid that makes the window doesn't seem to matter.
515 // The only downside is that window messages will be caught by
516 // clean_part_grid_ instead of a useful object. This is a temporary solution
517 // for the debug windows created by the TableFinder.
518 #ifndef GRAPHICS_DISABLED
MakeWindow(int x,int y,const char * window_name)519 ScrollView *TableFinder::MakeWindow(int x, int y, const char *window_name) {
520 return clean_part_grid_.MakeWindow(x, y, window_name);
521 }
522 #endif
523
524 // Make single-column blocks from good_columns_ partitions.
GetColumnBlocks(ColPartitionSet ** all_columns,ColSegment_LIST * column_blocks)525 void TableFinder::GetColumnBlocks(ColPartitionSet **all_columns,
526 ColSegment_LIST *column_blocks) {
527 for (int i = 0; i < gridheight(); ++i) {
528 ColPartitionSet *columns = all_columns[i];
529 if (columns != nullptr) {
530 ColSegment_LIST new_blocks;
531 // Get boxes from the current vertical position on the grid
532 columns->GetColumnBoxes(i * gridsize(), (i + 1) * gridsize(),
533 &new_blocks);
534 // Merge the new_blocks boxes into column_blocks if they are well-aligned
535 GroupColumnBlocks(&new_blocks, column_blocks);
536 }
537 }
538 }
539
540 // Merge column segments into the current list if they are well aligned.
GroupColumnBlocks(ColSegment_LIST * new_blocks,ColSegment_LIST * column_blocks)541 void TableFinder::GroupColumnBlocks(ColSegment_LIST *new_blocks,
542 ColSegment_LIST *column_blocks) {
543 ColSegment_IT src_it(new_blocks);
544 ColSegment_IT dest_it(column_blocks);
545 // iterate through the source list
546 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
547 ColSegment *src_seg = src_it.data();
548 const TBOX &src_box = src_seg->bounding_box();
549 bool match_found = false;
550 // iterate through the destination list to find a matching column block
551 for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
552 ColSegment *dest_seg = dest_it.data();
553 TBOX dest_box = dest_seg->bounding_box();
554 if (ConsecutiveBoxes(src_box, dest_box)) {
555 // If matching block is found, insert the current block into it
556 // and delete the source block.
557 dest_seg->InsertBox(src_box);
558 match_found = true;
559 delete src_it.extract();
560 break;
561 }
562 }
563 // If no match is found, just append the source block to column_blocks
564 if (!match_found) {
565 dest_it.add_after_then_move(src_it.extract());
566 }
567 }
568 }
569
570 // are the two boxes immediate neighbors along the vertical direction
ConsecutiveBoxes(const TBOX & b1,const TBOX & b2)571 bool TableFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) {
572 int x_margin = 20;
573 int y_margin = 5;
574 return (abs(b1.left() - b2.left()) < x_margin) &&
575 (abs(b1.right() - b2.right()) < x_margin) &&
576 (abs(b1.top() - b2.bottom()) < y_margin ||
577 abs(b2.top() - b1.bottom()) < y_margin);
578 }
579
580 // Set up info for clean_part_grid_ partitions to be valid during detection
581 // code.
InitializePartitions(ColPartitionSet ** all_columns)582 void TableFinder::InitializePartitions(ColPartitionSet **all_columns) {
583 FindNeighbors();
584 SetPartitionSpacings(&clean_part_grid_, all_columns);
585 SetGlobalSpacings(&clean_part_grid_);
586 }
587
588 // Set left, right and top, bottom spacings of each colpartition.
SetPartitionSpacings(ColPartitionGrid * grid,ColPartitionSet ** all_columns)589 void TableFinder::SetPartitionSpacings(ColPartitionGrid *grid,
590 ColPartitionSet **all_columns) {
591 // Iterate the ColPartitions in the grid.
592 ColPartitionGridSearch gsearch(grid);
593 gsearch.StartFullSearch();
594 ColPartition *part = nullptr;
595 while ((part = gsearch.NextFullSearch()) != nullptr) {
596 ColPartitionSet *columns = all_columns[gsearch.GridY()];
597 TBOX box = part->bounding_box();
598 int y = part->MidY();
599 ColPartition *left_column = columns->ColumnContaining(box.left(), y);
600 ColPartition *right_column = columns->ColumnContaining(box.right(), y);
601 // set distance from left column as space to the left
602 if (left_column) {
603 int left_space = std::max(0, box.left() - left_column->LeftAtY(y));
604 part->set_space_to_left(left_space);
605 }
606 // set distance from right column as space to the right
607 if (right_column) {
608 int right_space = std::max(0, right_column->RightAtY(y) - box.right());
609 part->set_space_to_right(right_space);
610 }
611
612 // Look for images that may be closer.
613 // NOTE: used to be part_grid_, might cause issues now
614 ColPartitionGridSearch hsearch(grid);
615 hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
616 ColPartition *neighbor = nullptr;
617 while ((neighbor = hsearch.NextSideSearch(true)) != nullptr) {
618 if (neighbor->type() == PT_PULLOUT_IMAGE ||
619 neighbor->type() == PT_FLOWING_IMAGE ||
620 neighbor->type() == PT_HEADING_IMAGE) {
621 int right = neighbor->bounding_box().right();
622 if (right < box.left()) {
623 int space = std::min(box.left() - right, part->space_to_left());
624 part->set_space_to_left(space);
625 }
626 }
627 }
628 hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
629 neighbor = nullptr;
630 while ((neighbor = hsearch.NextSideSearch(false)) != nullptr) {
631 if (neighbor->type() == PT_PULLOUT_IMAGE ||
632 neighbor->type() == PT_FLOWING_IMAGE ||
633 neighbor->type() == PT_HEADING_IMAGE) {
634 int left = neighbor->bounding_box().left();
635 if (left > box.right()) {
636 int space = std::min(left - box.right(), part->space_to_right());
637 part->set_space_to_right(space);
638 }
639 }
640 }
641
642 ColPartition *upper_part = part->SingletonPartner(true);
643 if (upper_part) {
644 int space =
645 std::max(0, static_cast<int>(upper_part->bounding_box().bottom() -
646 part->bounding_box().bottom()));
647 part->set_space_above(space);
648 } else {
649 // TODO(nbeato): What constitutes a good value?
650 // 0 is the default value when not set, explicitly noting it needs to
651 // be something else.
652 part->set_space_above(INT32_MAX);
653 }
654
655 ColPartition *lower_part = part->SingletonPartner(false);
656 if (lower_part) {
657 int space =
658 std::max(0, static_cast<int>(part->bounding_box().bottom() -
659 lower_part->bounding_box().bottom()));
660 part->set_space_below(space);
661 } else {
662 // TODO(nbeato): What constitutes a good value?
663 // 0 is the default value when not set, explicitly noting it needs to
664 // be something else.
665 part->set_space_below(INT32_MAX);
666 }
667 }
668 }
669
670 // Set spacing and closest neighbors above and below a given colpartition.
SetVerticalSpacing(ColPartition * part)671 void TableFinder::SetVerticalSpacing(ColPartition *part) {
672 TBOX box = part->bounding_box();
673 int top_range =
674 std::min(box.top() + kMaxVerticalSpacing, static_cast<int>(tright().y()));
675 int bottom_range = std::max(box.bottom() - kMaxVerticalSpacing,
676 static_cast<int>(bleft().y()));
677 box.set_top(top_range);
678 box.set_bottom(bottom_range);
679
680 TBOX part_box = part->bounding_box();
681 // Start a rect search
682 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rectsearch(
683 &clean_part_grid_);
684 rectsearch.StartRectSearch(box);
685 ColPartition *neighbor;
686 int min_space_above = kMaxVerticalSpacing;
687 int min_space_below = kMaxVerticalSpacing;
688 ColPartition *above_neighbor = nullptr;
689 ColPartition *below_neighbor = nullptr;
690 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
691 if (neighbor == part) {
692 continue;
693 }
694 TBOX neighbor_box = neighbor->bounding_box();
695 if (neighbor_box.major_x_overlap(part_box)) {
696 int gap = abs(part->median_bottom() - neighbor->median_bottom());
697 // If neighbor is below current partition
698 if (neighbor_box.top() < part_box.bottom() && gap < min_space_below) {
699 min_space_below = gap;
700 below_neighbor = neighbor;
701 } // If neighbor is above current partition
702 else if (part_box.top() < neighbor_box.bottom() &&
703 gap < min_space_above) {
704 min_space_above = gap;
705 above_neighbor = neighbor;
706 }
707 }
708 }
709 part->set_space_above(min_space_above);
710 part->set_space_below(min_space_below);
711 part->set_nearest_neighbor_above(above_neighbor);
712 part->set_nearest_neighbor_below(below_neighbor);
713 }
714
715 // Set global spacing and x-height estimates
SetGlobalSpacings(ColPartitionGrid * grid)716 void TableFinder::SetGlobalSpacings(ColPartitionGrid *grid) {
717 STATS xheight_stats(0, kMaxVerticalSpacing + 1);
718 STATS width_stats(0, kMaxBlobWidth + 1);
719 STATS ledding_stats(0, kMaxVerticalSpacing + 1);
720 // Iterate the ColPartitions in the grid.
721 ColPartitionGridSearch gsearch(grid);
722 gsearch.SetUniqueMode(true);
723 gsearch.StartFullSearch();
724 ColPartition *part = nullptr;
725 while ((part = gsearch.NextFullSearch()) != nullptr) {
726 // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
727 // ComputeLimits needs to get called somewhere outside of TableFinder
728 // to make sure the partitions are properly initialized.
729 // When this is called, SmoothPartitionPartners dies in an assert after
730 // table find runs. Alternative solution.
731 // part->ComputeLimits();
732 if (part->IsTextType()) {
733 // xheight_stats.add(part->median_height(), part->boxes_count());
734 // width_stats.add(part->median_width(), part->boxes_count());
735
736 // This loop can be removed when above issues are fixed.
737 // Replace it with the 2 lines commented out above.
738 BLOBNBOX_C_IT it(part->boxes());
739 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
740 xheight_stats.add(it.data()->bounding_box().height(), 1);
741 width_stats.add(it.data()->bounding_box().width(), 1);
742 }
743
744 ledding_stats.add(part->space_above(), 1);
745 ledding_stats.add(part->space_below(), 1);
746 }
747 }
748 // Set estimates based on median of statistics obtained
749 set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
750 set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
751 set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
752 #ifndef GRAPHICS_DISABLED
753 if (textord_tablefind_show_stats) {
754 const char *kWindowName = "X-height (R), X-width (G), and ledding (B)";
755 ScrollView *stats_win = MakeWindow(500, 10, kWindowName);
756 xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
757 width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
758 ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
759 }
760 #endif // !GRAPHICS_DISABLED
761 }
762
set_global_median_xheight(int xheight)763 void TableFinder::set_global_median_xheight(int xheight) {
764 global_median_xheight_ = xheight;
765 }
set_global_median_blob_width(int width)766 void TableFinder::set_global_median_blob_width(int width) {
767 global_median_blob_width_ = width;
768 }
set_global_median_ledding(int ledding)769 void TableFinder::set_global_median_ledding(int ledding) {
770 global_median_ledding_ = ledding;
771 }
772
FindNeighbors()773 void TableFinder::FindNeighbors() {
774 ColPartitionGridSearch gsearch(&clean_part_grid_);
775 gsearch.StartFullSearch();
776 ColPartition *part = nullptr;
777 while ((part = gsearch.NextFullSearch()) != nullptr) {
778 // TODO(nbeato): Rename this function, meaning is different now.
779 // IT is finding nearest neighbors its own way
780 // SetVerticalSpacing(part);
781
782 ColPartition *upper = part->SingletonPartner(true);
783 if (upper) {
784 part->set_nearest_neighbor_above(upper);
785 }
786
787 ColPartition *lower = part->SingletonPartner(false);
788 if (lower) {
789 part->set_nearest_neighbor_below(lower);
790 }
791 }
792 }
793
794 // High level interface. Input is an unmarked ColPartitionGrid
795 // (namely, clean_part_grid_). Partitions are identified using local
796 // information and filter/smoothed. The function exit should contain
797 // a good sampling of the table partitions.
MarkTablePartitions()798 void TableFinder::MarkTablePartitions() {
799 MarkPartitionsUsingLocalInformation();
800 #ifndef GRAPHICS_DISABLED
801 if (textord_tablefind_show_mark) {
802 ScrollView *table_win = MakeWindow(300, 300, "Initial Table Partitions");
803 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
804 DisplayColPartitions(table_win, &leader_and_ruling_grid_,
805 ScrollView::AQUAMARINE);
806 }
807 #endif
808 FilterFalseAlarms();
809 #ifndef GRAPHICS_DISABLED
810 if (textord_tablefind_show_mark) {
811 ScrollView *table_win = MakeWindow(600, 300, "Filtered Table Partitions");
812 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
813 DisplayColPartitions(table_win, &leader_and_ruling_grid_,
814 ScrollView::AQUAMARINE);
815 }
816 #endif
817 SmoothTablePartitionRuns();
818 #ifndef GRAPHICS_DISABLED
819 if (textord_tablefind_show_mark) {
820 ScrollView *table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
821 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
822 DisplayColPartitions(table_win, &leader_and_ruling_grid_,
823 ScrollView::AQUAMARINE);
824 }
825 #endif
826 FilterFalseAlarms();
827 #ifndef GRAPHICS_DISABLED
828 if (textord_tablefind_show_mark || textord_show_tables) {
829 ScrollView *table_win = MakeWindow(900, 300, "Final Table Partitions");
830 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
831 DisplayColPartitions(table_win, &leader_and_ruling_grid_,
832 ScrollView::AQUAMARINE);
833 }
834 #endif
835 }
836
837 // These types of partitions are marked as table partitions:
838 // 1- Partitions that have at lease one large gap between words
839 // 2- Partitions that consist of only one word (no significant gap
840 // between components)
841 // 3- Partitions that vertically overlap with other partitions within the
842 // same column.
843 // 4- Partitions with leaders before/after them.
MarkPartitionsUsingLocalInformation()844 void TableFinder::MarkPartitionsUsingLocalInformation() {
845 // Iterate the ColPartitions in the grid.
846 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(
847 &clean_part_grid_);
848 gsearch.StartFullSearch();
849 ColPartition *part = nullptr;
850 while ((part = gsearch.NextFullSearch()) != nullptr) {
851 if (!part->IsTextType()) { // Only consider text partitions
852 continue;
853 }
854 // Only consider partitions in dominant font size or smaller
855 if (part->median_height() > kMaxTableCellXheight * global_median_xheight_) {
856 continue;
857 }
858 // Mark partitions with a large gap, or no significant gap as
859 // table partitions.
860 // Comments: It produces several false alarms at:
861 // - last line of a paragraph (fixed)
862 // - single word section headings
863 // - page headers and footers
864 // - numbered equations
865 // - line drawing regions
866 // TODO(faisal): detect and fix above-mentioned cases
867 if (HasWideOrNoInterWordGap(part) || HasLeaderAdjacent(*part)) {
868 part->set_table_type();
869 }
870 }
871 }
872
873 // Check if the partition has at least one large gap between words or no
874 // significant gap at all
HasWideOrNoInterWordGap(ColPartition * part) const875 bool TableFinder::HasWideOrNoInterWordGap(ColPartition *part) const {
876 // Should only get text partitions.
877 ASSERT_HOST(part->IsTextType());
878 // Blob access
879 BLOBNBOX_CLIST *part_boxes = part->boxes();
880 BLOBNBOX_C_IT it(part_boxes);
881 // Check if this is a relatively small partition (such as a single word)
882 if (part->bounding_box().width() <
883 kMinBoxesInTextPartition * part->median_height() &&
884 part_boxes->length() < kMinBoxesInTextPartition) {
885 return true;
886 }
887
888 // Variables used to compute inter-blob spacing.
889 int current_x0 = -1;
890 int current_x1 = -1;
891 int previous_x1 = -1;
892 // Stores the maximum gap detected.
893 int largest_partition_gap_found = -1;
894 // Text partition gap limits. If this is text (and not a table),
895 // there should be at least one gap larger than min_gap and no gap
896 // larger than max_gap.
897 const double max_gap = kMaxGapInTextPartition * part->median_height();
898 const double min_gap = kMinMaxGapInTextPartition * part->median_height();
899
900 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
901 BLOBNBOX *blob = it.data();
902 current_x0 = blob->bounding_box().left();
903 current_x1 = blob->bounding_box().right();
904 if (previous_x1 != -1) {
905 int gap = current_x0 - previous_x1;
906
907 // TODO(nbeato): Boxes may overlap? Huh?
908 // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
909 // on the top right of the page are filtered out with this line.
910 // Note 2: Iterating over blobs in a partition, so we are looking for
911 // spacing between the words.
912 if (gap < 0) {
913 // More likely case, the blobs slightly overlap. This can happen
914 // with diacritics (accents) or broken alphabet symbols (characters).
915 // Merge boxes together by taking max of right sides.
916 if (-gap < part->median_height() * kMaxBlobOverlapFactor) {
917 previous_x1 = std::max(previous_x1, current_x1);
918 continue;
919 }
920 // Extreme case, blobs overlap significantly in the same partition...
921 // This should not happen often (if at all), but it does.
922 // TODO(nbeato): investigate cases when this happens.
923 else {
924 // The behavior before was to completely ignore this case.
925 }
926 }
927
928 // If a large enough gap is found, mark it as a table cell (return true)
929 if (gap > max_gap) {
930 return true;
931 }
932 if (gap > largest_partition_gap_found) {
933 largest_partition_gap_found = gap;
934 }
935 }
936 previous_x1 = current_x1;
937 }
938 // Since no large gap was found, return false if the partition is too
939 // long to be a data cell
940 if (part->bounding_box().width() >
941 kMaxBoxesInDataPartition * part->median_height() ||
942 part_boxes->length() > kMaxBoxesInDataPartition) {
943 return false;
944 }
945
946 // A partition may be a single blob. In this case, it's an isolated symbol
947 // or non-text (such as a ruling or image).
948 // Detect these as table partitions? Shouldn't this be case by case?
949 // The behavior before was to ignore this, making max_partition_gap < 0
950 // and implicitly return true. Just making it explicit.
951 if (largest_partition_gap_found == -1) {
952 return true;
953 }
954
955 // return true if the maximum gap found is smaller than the minimum allowed
956 // max_gap in a text partition. This indicates that there is no significant
957 // space in the partition, hence it is likely a single word.
958 return largest_partition_gap_found < min_gap;
959 }
960
961 // A criteria for possible tables is that a table may have leaders
962 // between data cells. An aggressive solution to find such tables is to
963 // explicitly mark partitions that have adjacent leaders.
964 // Note that this includes overlapping leaders. However, it does not
965 // include leaders in different columns on the page.
966 // Possible false-positive will include lists, such as a table of contents.
967 // As these arise, the aggressive nature of this search may need to be
968 // trimmed down.
HasLeaderAdjacent(const ColPartition & part)969 bool TableFinder::HasLeaderAdjacent(const ColPartition &part) {
970 if (part.flow() == BTFT_LEADER) {
971 return true;
972 }
973 // Search range is left and right bounded by an offset of the
974 // median xheight. This offset is to allow some tolerance to the
975 // the leaders on the page in the event that the alignment is still
976 // a bit off.
977 const TBOX &box = part.bounding_box();
978 const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_;
979 const int top = box.top() + search_size;
980 const int bottom = box.bottom() - search_size;
981 ColPartitionGridSearch hsearch(&leader_and_ruling_grid_);
982 for (int direction = 0; direction < 2; ++direction) {
983 bool right_to_left = (direction == 0);
984 int x = right_to_left ? box.right() : box.left();
985 hsearch.StartSideSearch(x, bottom, top);
986 ColPartition *leader = nullptr;
987 while ((leader = hsearch.NextSideSearch(right_to_left)) != nullptr) {
988 // The leader could be a horizontal ruling in the grid.
989 // Make sure it is actually a leader.
990 if (leader->flow() != BTFT_LEADER) {
991 continue;
992 }
993 // This should not happen, they are in different grids.
994 ASSERT_HOST(&part != leader);
995 // Make sure the leader shares a page column with the partition,
996 // otherwise we are spreading across columns.
997 if (!part.IsInSameColumnAs(*leader)) {
998 break;
999 }
1000 // There should be a significant vertical overlap
1001 if (!leader->VSignificantCoreOverlap(part)) {
1002 continue;
1003 }
1004 // Leader passed all tests, so it is adjacent.
1005 return true;
1006 }
1007 }
1008 // No leaders are adjacent to the given partition.
1009 return false;
1010 }
1011
1012 // Filter individual text partitions marked as table partitions
1013 // consisting of paragraph endings, small section headings, and
1014 // headers and footers.
FilterFalseAlarms()1015 void TableFinder::FilterFalseAlarms() {
1016 FilterParagraphEndings();
1017 FilterHeaderAndFooter();
1018 // TODO(nbeato): Fully justified text as non-table?
1019 }
1020
FilterParagraphEndings()1021 void TableFinder::FilterParagraphEndings() {
1022 // Detect last line of paragraph
1023 // Iterate the ColPartitions in the grid.
1024 ColPartitionGridSearch gsearch(&clean_part_grid_);
1025 gsearch.StartFullSearch();
1026 ColPartition *part = nullptr;
1027 while ((part = gsearch.NextFullSearch()) != nullptr) {
1028 if (part->type() != PT_TABLE) {
1029 continue; // Consider only table partitions
1030 }
1031
1032 // Paragraph ending should have flowing text above it.
1033 ColPartition *upper_part = part->nearest_neighbor_above();
1034 if (!upper_part) {
1035 continue;
1036 }
1037 if (upper_part->type() != PT_FLOWING_TEXT) {
1038 continue;
1039 }
1040 if (upper_part->bounding_box().width() < 2 * part->bounding_box().width()) {
1041 continue;
1042 }
1043 // Check if its the last line of a paragraph.
1044 // In most cases, a paragraph ending should be left-aligned to text line
1045 // above it. Sometimes, it could be a 2 line paragraph, in which case
1046 // the line above it is indented.
1047 // To account for that, check if the partition center is to
1048 // the left of the one above it.
1049 int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;
1050 int upper_mid = (upper_part->bounding_box().left() +
1051 upper_part->bounding_box().right()) /
1052 2;
1053 int current_spacing = 0; // spacing of the current line to margin
1054 int upper_spacing = 0; // spacing of the previous line to the margin
1055 if (left_to_right_language_) {
1056 // Left to right languages, use mid - left to figure out the distance
1057 // the middle is from the left margin.
1058 int left = std::min(part->bounding_box().left(),
1059 upper_part->bounding_box().left());
1060 current_spacing = mid - left;
1061 upper_spacing = upper_mid - left;
1062 } else {
1063 // Right to left languages, use right - mid to figure out the distance
1064 // the middle is from the right margin.
1065 int right = std::max(part->bounding_box().right(),
1066 upper_part->bounding_box().right());
1067 current_spacing = right - mid;
1068 upper_spacing = right - upper_mid;
1069 }
1070 if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing) {
1071 continue;
1072 }
1073
1074 // Paragraphs should have similar fonts.
1075 if (!part->MatchingSizes(*upper_part) ||
1076 !part->MatchingStrokeWidth(*upper_part, kStrokeWidthFractionalTolerance,
1077 kStrokeWidthConstantTolerance)) {
1078 continue;
1079 }
1080
1081 // The last line of a paragraph should be left aligned.
1082 // TODO(nbeato): This would be untrue if the text was right aligned.
1083 // How often is that?
1084 if (part->space_to_left() >
1085 kMaxParagraphEndingLeftSpaceMultiple * part->median_height()) {
1086 continue;
1087 }
1088 // The line above it should be right aligned (assuming justified format).
1089 // Since we can't assume justified text, we compare whitespace to text.
1090 // The above line should have majority spanning text (or the current
1091 // line could have fit on the previous line). So compare
1092 // whitespace to text.
1093 if (upper_part->bounding_box().width() <
1094 kMinParagraphEndingTextToWhitespaceRatio *
1095 upper_part->space_to_right()) {
1096 continue;
1097 }
1098
1099 // Ledding above the line should be less than ledding below
1100 if (part->space_above() >= part->space_below() ||
1101 part->space_above() > 2 * global_median_ledding_) {
1102 continue;
1103 }
1104
1105 // If all checks failed, it is probably text.
1106 part->clear_table_type();
1107 }
1108 }
1109
FilterHeaderAndFooter()1110 void TableFinder::FilterHeaderAndFooter() {
1111 // Consider top-most text colpartition as header and bottom most as footer
1112 ColPartition *header = nullptr;
1113 ColPartition *footer = nullptr;
1114 int max_top = INT32_MIN;
1115 int min_bottom = INT32_MAX;
1116 ColPartitionGridSearch gsearch(&clean_part_grid_);
1117 gsearch.StartFullSearch();
1118 ColPartition *part = nullptr;
1119 while ((part = gsearch.NextFullSearch()) != nullptr) {
1120 if (!part->IsTextType()) {
1121 continue; // Consider only text partitions
1122 }
1123 int top = part->bounding_box().top();
1124 int bottom = part->bounding_box().bottom();
1125 if (top > max_top) {
1126 max_top = top;
1127 header = part;
1128 }
1129 if (bottom < min_bottom) {
1130 min_bottom = bottom;
1131 footer = part;
1132 }
1133 }
1134 if (header) {
1135 header->clear_table_type();
1136 }
1137 if (footer) {
1138 footer->clear_table_type();
1139 }
1140 }
1141
1142 // Mark all ColPartitions as table cells that have a table cell above
1143 // and below them
1144 // TODO(faisal): This is too aggressive at the moment. The method needs to
1145 // consider spacing and alignment as well. Detection of false alarm table cells
1146 // should also be done as part of it.
SmoothTablePartitionRuns()1147 void TableFinder::SmoothTablePartitionRuns() {
1148 // Iterate the ColPartitions in the grid.
1149 ColPartitionGridSearch gsearch(&clean_part_grid_);
1150 gsearch.StartFullSearch();
1151 ColPartition *part = nullptr;
1152 while ((part = gsearch.NextFullSearch()) != nullptr) {
1153 if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN) {
1154 continue; // Consider only text partitions
1155 }
1156 ColPartition *upper_part = part->nearest_neighbor_above();
1157 ColPartition *lower_part = part->nearest_neighbor_below();
1158 if (!upper_part || !lower_part) {
1159 continue;
1160 }
1161 if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE) {
1162 part->set_table_type();
1163 }
1164 }
1165
1166 // Pass 2, do the opposite. If both the upper and lower neighbors
1167 // exist and are not tables, this probably shouldn't be a table.
1168 gsearch.StartFullSearch();
1169 part = nullptr;
1170 while ((part = gsearch.NextFullSearch()) != nullptr) {
1171 if (part->type() != PT_TABLE) {
1172 continue; // Consider only text partitions
1173 }
1174 ColPartition *upper_part = part->nearest_neighbor_above();
1175 ColPartition *lower_part = part->nearest_neighbor_below();
1176
1177 // table can't be by itself
1178 if ((upper_part && upper_part->type() != PT_TABLE) &&
1179 (lower_part && lower_part->type() != PT_TABLE)) {
1180 part->clear_table_type();
1181 }
1182 }
1183 }
1184
1185 // Set the type of a column segment based on the ratio of table to text cells
SetColumnsType(ColSegment_LIST * column_blocks)1186 void TableFinder::SetColumnsType(ColSegment_LIST *column_blocks) {
1187 ColSegment_IT it(column_blocks);
1188 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1189 ColSegment *seg = it.data();
1190 TBOX box = seg->bounding_box();
1191 int num_table_cells = 0;
1192 int num_text_cells = 0;
1193 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rsearch(
1194 &clean_part_grid_);
1195 rsearch.SetUniqueMode(true);
1196 rsearch.StartRectSearch(box);
1197 ColPartition *part = nullptr;
1198 while ((part = rsearch.NextRectSearch()) != nullptr) {
1199 if (part->type() == PT_TABLE) {
1200 num_table_cells++;
1201 } else if (part->type() == PT_FLOWING_TEXT) {
1202 num_text_cells++;
1203 }
1204 }
1205 // If a column block has no text or table partition in it, it is not needed
1206 // for table detection.
1207 if (!num_table_cells && !num_text_cells) {
1208 delete it.extract();
1209 } else {
1210 seg->set_num_table_cells(num_table_cells);
1211 seg->set_num_text_cells(num_text_cells);
1212 // set column type based on the ratio of table to text cells
1213 seg->set_type();
1214 }
1215 }
1216 }
1217
1218 // Move column blocks to grid
MoveColSegmentsToGrid(ColSegment_LIST * segments,ColSegmentGrid * col_seg_grid)1219 void TableFinder::MoveColSegmentsToGrid(ColSegment_LIST *segments,
1220 ColSegmentGrid *col_seg_grid) {
1221 ColSegment_IT it(segments);
1222 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1223 ColSegment *seg = it.extract();
1224 col_seg_grid->InsertBBox(true, true, seg);
1225 }
1226 }
1227
1228 // Merge column blocks if a split is detected due to the presence of a
1229 // table. A text block is considered split if it has multiple
1230 // neighboring blocks above/below it, and at least one of the
1231 // neighboring blocks is of table type (has a high density of table
1232 // partitions). In this case neighboring blocks in the direction
1233 // (above/below) of the table block are merged with the text block.
1234
1235 // Comment: This method does not handle split due to a full page table
1236 // since table columns in this case do not have a text column on which
1237 // split decision can be based.
GridMergeColumnBlocks()1238 void TableFinder::GridMergeColumnBlocks() {
1239 int margin = gridsize();
1240
1241 // Iterate the Column Blocks in the grid.
1242 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch(
1243 &col_seg_grid_);
1244 gsearch.StartFullSearch();
1245 ColSegment *seg;
1246 while ((seg = gsearch.NextFullSearch()) != nullptr) {
1247 if (seg->type() != COL_TEXT) {
1248 continue; // only consider text blocks for split detection
1249 }
1250 bool neighbor_found = false;
1251 bool modified = false; // Modified at least once
1252 // keep expanding current box as long as neighboring table columns
1253 // are found above or below it.
1254 do {
1255 TBOX box = seg->bounding_box();
1256 // slightly expand the search region vertically
1257 int top_range =
1258 std::min(box.top() + margin, static_cast<int>(tright().y()));
1259 int bottom_range =
1260 std::max(box.bottom() - margin, static_cast<int>(bleft().y()));
1261 box.set_top(top_range);
1262 box.set_bottom(bottom_range);
1263 neighbor_found = false;
1264 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> rectsearch(
1265 &col_seg_grid_);
1266 rectsearch.StartRectSearch(box);
1267 ColSegment *neighbor = nullptr;
1268 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1269 if (neighbor == seg) {
1270 continue;
1271 }
1272 const TBOX &neighbor_box = neighbor->bounding_box();
1273 // If the neighbor box significantly overlaps with the current
1274 // box (due to the expansion of the current box in the
1275 // previous iteration of this loop), remove the neighbor box
1276 // and expand the current box to include it.
1277 if (neighbor_box.overlap_fraction(box) >= 0.9) {
1278 seg->InsertBox(neighbor_box);
1279 modified = true;
1280 rectsearch.RemoveBBox();
1281 gsearch.RepositionIterator();
1282 delete neighbor;
1283 continue;
1284 }
1285 // Only expand if the neighbor box is of table type
1286 if (neighbor->type() != COL_TABLE) {
1287 continue;
1288 }
1289 // Insert the neighbor box into the current column block
1290 if (neighbor_box.major_x_overlap(box) && !box.contains(neighbor_box)) {
1291 seg->InsertBox(neighbor_box);
1292 neighbor_found = true;
1293 modified = true;
1294 rectsearch.RemoveBBox();
1295 gsearch.RepositionIterator();
1296 delete neighbor;
1297 }
1298 }
1299 } while (neighbor_found);
1300 if (modified) {
1301 // Because the box has changed, it has to be removed first.
1302 gsearch.RemoveBBox();
1303 col_seg_grid_.InsertBBox(true, true, seg);
1304 gsearch.RepositionIterator();
1305 }
1306 }
1307 }
1308
1309 // Group horizontally overlapping table partitions into table columns.
1310 // TODO(faisal): This is too aggressive at the moment. The method should
1311 // consider more attributes to group table partitions together. Some common
1312 // errors are:
1313 // 1- page number is merged with a table column above it even
1314 // if there is a large vertical gap between them.
1315 // 2- column headers go on to catch one of the columns arbitrarily
1316 // 3- an isolated noise blob near page top or bottom merges with the table
1317 // column below/above it
1318 // 4- cells from two vertically adjacent tables merge together to make a
1319 // single column resulting in merging of the two tables
GetTableColumns(ColSegment_LIST * table_columns)1320 void TableFinder::GetTableColumns(ColSegment_LIST *table_columns) {
1321 ColSegment_IT it(table_columns);
1322 // Iterate the ColPartitions in the grid.
1323 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(
1324 &clean_part_grid_);
1325 gsearch.StartFullSearch();
1326 ColPartition *part;
1327 while ((part = gsearch.NextFullSearch()) != nullptr) {
1328 if (part->inside_table_column() || part->type() != PT_TABLE) {
1329 continue; // prevent a partition to be assigned to multiple columns
1330 }
1331 const TBOX &box = part->bounding_box();
1332 auto *col = new ColSegment();
1333 col->InsertBox(box);
1334 part->set_inside_table_column(true);
1335 // Start a search below the current cell to find bottom neighbours
1336 // Note: a full search will always process things above it first, so
1337 // this should be starting at the highest cell and working its way down.
1338 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> vsearch(
1339 &clean_part_grid_);
1340 vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
1341 ColPartition *neighbor = nullptr;
1342 bool found_neighbours = false;
1343 while ((neighbor = vsearch.NextVerticalSearch(true)) != nullptr) {
1344 // only consider neighbors not assigned to any column yet
1345 if (neighbor->inside_table_column()) {
1346 continue;
1347 }
1348 // Horizontal lines should not break the flow
1349 if (neighbor->IsHorizontalLine()) {
1350 continue;
1351 }
1352 // presence of a non-table neighbor marks the end of current
1353 // table column
1354 if (neighbor->type() != PT_TABLE) {
1355 break;
1356 }
1357 // add the neighbor partition to the table column
1358 const TBOX &neighbor_box = neighbor->bounding_box();
1359 col->InsertBox(neighbor_box);
1360 neighbor->set_inside_table_column(true);
1361 found_neighbours = true;
1362 }
1363 if (found_neighbours) {
1364 it.add_after_then_move(col);
1365 } else {
1366 part->set_inside_table_column(false);
1367 delete col;
1368 }
1369 }
1370 }
1371
1372 // Mark regions in a column that are x-bounded by the column boundaries and
1373 // y-bounded by the table columns' projection on the y-axis as table regions
GetTableRegions(ColSegment_LIST * table_columns,ColSegment_LIST * table_regions)1374 void TableFinder::GetTableRegions(ColSegment_LIST *table_columns,
1375 ColSegment_LIST *table_regions) {
1376 ColSegment_IT cit(table_columns);
1377 ColSegment_IT rit(table_regions);
1378 // Iterate through column blocks
1379 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch(
1380 &col_seg_grid_);
1381 gsearch.StartFullSearch();
1382 ColSegment *part;
1383 int page_height = tright().y() - bleft().y();
1384 ASSERT_HOST(page_height > 0);
1385 // create a bool array to hold projection on y-axis
1386 bool *table_region = new bool[page_height];
1387 while ((part = gsearch.NextFullSearch()) != nullptr) {
1388 const TBOX &part_box = part->bounding_box();
1389 // reset the projection array
1390 for (int i = 0; i < page_height; i++) {
1391 table_region[i] = false;
1392 }
1393 // iterate through all table columns to find regions in the current
1394 // page column block
1395 cit.move_to_first();
1396 for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1397 TBOX col_box = cit.data()->bounding_box();
1398 // find intersection region of table column and page column
1399 TBOX intersection_box = col_box.intersection(part_box);
1400 // project table column on the y-axis
1401 for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
1402 table_region[i - bleft().y()] = true;
1403 }
1404 }
1405 // set x-limits of table regions to page column width
1406 TBOX current_table_box;
1407 current_table_box.set_left(part_box.left());
1408 current_table_box.set_right(part_box.right());
1409 // go through the y-axis projection to find runs of table
1410 // regions. Each run makes one table region.
1411 for (int i = 1; i < page_height; i++) {
1412 // detect start of a table region
1413 if (!table_region[i - 1] && table_region[i]) {
1414 current_table_box.set_bottom(i + bleft().y());
1415 }
1416 // TODO(nbeato): Is it guaranteed that the last row is not a table region?
1417 // detect end of a table region
1418 if (table_region[i - 1] && !table_region[i]) {
1419 current_table_box.set_top(i + bleft().y());
1420 if (!current_table_box.null_box()) {
1421 auto *seg = new ColSegment();
1422 seg->InsertBox(current_table_box);
1423 rit.add_after_then_move(seg);
1424 }
1425 }
1426 }
1427 }
1428 delete[] table_region;
1429 }
1430
1431 // Merge table regions corresponding to tables spanning multiple columns if
1432 // there is a colpartition (horizontal ruling line or normal text) that
1433 // touches both regions.
1434 // TODO(faisal): A rare error occurs if there are two horizontally adjacent
1435 // tables with aligned ruling lines. In this case, line finder returns a
1436 // single line and hence the tables get merged together
GridMergeTableRegions()1437 void TableFinder::GridMergeTableRegions() {
1438 // Iterate the table regions in the grid.
1439 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch(
1440 &table_grid_);
1441 gsearch.StartFullSearch();
1442 ColSegment *seg = nullptr;
1443 while ((seg = gsearch.NextFullSearch()) != nullptr) {
1444 bool neighbor_found = false;
1445 bool modified = false; // Modified at least once
1446 do {
1447 // Start a rectangle search x-bounded by the image and y by the table
1448 const TBOX &box = seg->bounding_box();
1449 TBOX search_region(box);
1450 search_region.set_left(bleft().x());
1451 search_region.set_right(tright().x());
1452 neighbor_found = false;
1453 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> rectsearch(
1454 &table_grid_);
1455 rectsearch.StartRectSearch(search_region);
1456 ColSegment *neighbor = nullptr;
1457 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1458 if (neighbor == seg) {
1459 continue;
1460 }
1461 const TBOX &neighbor_box = neighbor->bounding_box();
1462 // Check if a neighbor box has a large overlap with the table
1463 // region. This may happen as a result of merging two table
1464 // regions in the previous iteration.
1465 if (neighbor_box.overlap_fraction(box) >= 0.9) {
1466 seg->InsertBox(neighbor_box);
1467 rectsearch.RemoveBBox();
1468 gsearch.RepositionIterator();
1469 delete neighbor;
1470 modified = true;
1471 continue;
1472 }
1473 // Check if two table regions belong together based on a common
1474 // horizontal ruling line
1475 if (BelongToOneTable(box, neighbor_box)) {
1476 seg->InsertBox(neighbor_box);
1477 neighbor_found = true;
1478 modified = true;
1479 rectsearch.RemoveBBox();
1480 gsearch.RepositionIterator();
1481 delete neighbor;
1482 }
1483 }
1484 } while (neighbor_found);
1485 if (modified) {
1486 // Because the box has changed, it has to be removed first.
1487 gsearch.RemoveBBox();
1488 table_grid_.InsertBBox(true, true, seg);
1489 gsearch.RepositionIterator();
1490 }
1491 }
1492 }
1493
1494 // Decide if two table regions belong to one table based on a common
1495 // horizontal ruling line or another colpartition
BelongToOneTable(const TBOX & box1,const TBOX & box2)1496 bool TableFinder::BelongToOneTable(const TBOX &box1, const TBOX &box2) {
1497 // Check the obvious case. Most likely not true because overlapping boxes
1498 // should already be merged, but seems like a good thing to do in case things
1499 // change.
1500 if (box1.overlap(box2)) {
1501 return true;
1502 }
1503 // Check for ColPartitions spanning both table regions
1504 TBOX bbox = box1.bounding_union(box2);
1505 // Start a rect search on bbox
1506 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rectsearch(
1507 &clean_part_grid_);
1508 rectsearch.StartRectSearch(bbox);
1509 ColPartition *part = nullptr;
1510 while ((part = rectsearch.NextRectSearch()) != nullptr) {
1511 const TBOX &part_box = part->bounding_box();
1512 // return true if a colpartition spanning both table regions is found
1513 if (part_box.overlap(box1) && part_box.overlap(box2) &&
1514 !part->IsImageType()) {
1515 return true;
1516 }
1517 }
1518 return false;
1519 }
1520
1521 // Adjust table boundaries by:
1522 // - building a tight bounding box around all ColPartitions contained in it.
1523 // - expanding table boundaries to include all colpartitions that overlap the
1524 // table by more than half of their area
1525 // - expanding table boundaries to include nearby horizontal rule lines
1526 // - expanding table vertically to include left out column headers
1527 // TODO(faisal): Expansion of table boundaries is quite aggressive. It usually
1528 // makes following errors:
1529 // 1- horizontal lines consisting of underlines are included in the table if
1530 // they are close enough
1531 // 2- horizontal lines originating from noise tend to get merged with a table
1532 // near the top of the page
1533 // 3- the criteria for including horizontal lines is very generous. Many times
1534 // horizontal lines separating headers and footers get merged with a
1535 // single-column table in a multi-column page thereby including text
1536 // from the neighboring column inside the table
1537 // 4- the criteria for including left out column headers also tends to
1538 // occasionally include text-lines above the tables, typically from
1539 // table caption
AdjustTableBoundaries()1540 void TableFinder::AdjustTableBoundaries() {
1541 // Iterate the table regions in the grid
1542 ColSegment_CLIST adjusted_tables;
1543 ColSegment_C_IT it(&adjusted_tables);
1544 ColSegmentGridSearch gsearch(&table_grid_);
1545 gsearch.StartFullSearch();
1546 ColSegment *table = nullptr;
1547 while ((table = gsearch.NextFullSearch()) != nullptr) {
1548 const TBOX &table_box = table->bounding_box();
1549 TBOX grown_box = table_box;
1550 GrowTableBox(table_box, &grown_box);
1551 // To prevent a table from expanding again, do not insert the
1552 // modified box back to the grid. Instead move it to a list and
1553 // and remove it from the grid. The list is moved later back to the grid.
1554 if (!grown_box.null_box()) {
1555 auto *col = new ColSegment();
1556 col->InsertBox(grown_box);
1557 it.add_after_then_move(col);
1558 }
1559 gsearch.RemoveBBox();
1560 delete table;
1561 }
1562 // clear table grid to move final tables in it
1563 // TODO(nbeato): table_grid_ should already be empty. The above loop
1564 // removed everything. Maybe just assert it is empty?
1565 table_grid_.Clear();
1566 it.move_to_first();
1567 // move back final tables to table_grid_
1568 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1569 ColSegment *seg = it.extract();
1570 table_grid_.InsertBBox(true, true, seg);
1571 }
1572 }
1573
GrowTableBox(const TBOX & table_box,TBOX * result_box)1574 void TableFinder::GrowTableBox(const TBOX &table_box, TBOX *result_box) {
1575 // TODO(nbeato): The growing code is a bit excessive right now.
1576 // By removing these lines, the partitions considered need
1577 // to have some overlap or be special cases. These lines could
1578 // be added again once a check is put in place to make sure that
1579 // growing tables don't stomp on a lot of non-table partitions.
1580
1581 // search for horizontal ruling lines within the vertical margin
1582 // int vertical_margin = kRulingVerticalMargin * gridsize();
1583 TBOX search_box = table_box;
1584 // int top = MIN(search_box.top() + vertical_margin, tright().y());
1585 // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
1586 // search_box.set_top(top);
1587 // search_box.set_bottom(bottom);
1588
1589 GrowTableToIncludePartials(table_box, search_box, result_box);
1590 GrowTableToIncludeLines(table_box, search_box, result_box);
1591 IncludeLeftOutColumnHeaders(result_box);
1592 }
1593
1594 // Grow a table by increasing the size of the box to include
1595 // partitions with significant overlap with the table.
GrowTableToIncludePartials(const TBOX & table_box,const TBOX & search_range,TBOX * result_box)1596 void TableFinder::GrowTableToIncludePartials(const TBOX &table_box,
1597 const TBOX &search_range,
1598 TBOX *result_box) {
1599 // Rulings are in a different grid, so search 2 grids for rulings, text,
1600 // and table partitions that are not entirely within the new box.
1601 for (int i = 0; i < 2; ++i) {
1602 ColPartitionGrid *grid =
1603 (i == 0) ? &fragmented_text_grid_ : &leader_and_ruling_grid_;
1604 ColPartitionGridSearch rectsearch(grid);
1605 rectsearch.StartRectSearch(search_range);
1606 ColPartition *part = nullptr;
1607 while ((part = rectsearch.NextRectSearch()) != nullptr) {
1608 // Only include text and table types.
1609 if (part->IsImageType()) {
1610 continue;
1611 }
1612 const TBOX &part_box = part->bounding_box();
1613 // Include partition in the table if more than half of it
1614 // is covered by the table
1615 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1616 *result_box = result_box->bounding_union(part_box);
1617 continue;
1618 }
1619 }
1620 }
1621 }
1622
1623 // Grow a table by expanding to the extents of significantly
1624 // overlapping lines.
GrowTableToIncludeLines(const TBOX & table_box,const TBOX & search_range,TBOX * result_box)1625 void TableFinder::GrowTableToIncludeLines(const TBOX &table_box,
1626 const TBOX &search_range,
1627 TBOX *result_box) {
1628 ColPartitionGridSearch rsearch(&leader_and_ruling_grid_);
1629 rsearch.SetUniqueMode(true);
1630 rsearch.StartRectSearch(search_range);
1631 ColPartition *part = nullptr;
1632 while ((part = rsearch.NextRectSearch()) != nullptr) {
1633 // TODO(nbeato) This should also do vertical, but column
1634 // boundaries are breaking things. This function needs to be
1635 // updated to allow vertical lines as well.
1636 if (!part->IsLineType()) {
1637 continue;
1638 }
1639 // Avoid the following function call if the result of the
1640 // function is irrelevant.
1641 const TBOX &part_box = part->bounding_box();
1642 if (result_box->contains(part_box)) {
1643 continue;
1644 }
1645 // Include a partially overlapping horizontal line only if the
1646 // extra ColPartitions that will be included due to expansion
1647 // have large side spacing w.r.t. columns containing them.
1648 if (HLineBelongsToTable(*part, table_box)) {
1649 *result_box = result_box->bounding_union(part_box);
1650 }
1651 // TODO(nbeato): Vertical
1652 }
1653 }
1654
1655 // Checks whether the horizontal line belong to the table by looking at the
1656 // side spacing of extra ColParitions that will be included in the table
1657 // due to expansion
HLineBelongsToTable(const ColPartition & part,const TBOX & table_box)1658 bool TableFinder::HLineBelongsToTable(const ColPartition &part,
1659 const TBOX &table_box) {
1660 if (!part.IsHorizontalLine()) {
1661 return false;
1662 }
1663 const TBOX &part_box = part.bounding_box();
1664 if (!part_box.major_x_overlap(table_box)) {
1665 return false;
1666 }
1667 // Do not consider top-most horizontal line since it usually
1668 // originates from noise.
1669 // TODO(nbeato): I had to comment this out because the ruling grid doesn't
1670 // have neighbors solved.
1671 // if (!part.nearest_neighbor_above())
1672 // return false;
1673 const TBOX bbox = part_box.bounding_union(table_box);
1674 // In the "unioned table" box (the table extents expanded by the line),
1675 // keep track of how many partitions have significant padding to the left
1676 // and right. If more than half of the partitions covered by the new table
1677 // have significant spacing, the line belongs to the table and the table
1678 // grows to include all of the partitions.
1679 int num_extra_partitions = 0;
1680 int extra_space_to_right = 0;
1681 int extra_space_to_left = 0;
1682 // Rulings are in a different grid, so search 2 grids for rulings, text,
1683 // and table partitions that are introduced by the new box.
1684 for (int i = 0; i < 2; ++i) {
1685 ColPartitionGrid *grid =
1686 (i == 0) ? &clean_part_grid_ : &leader_and_ruling_grid_;
1687 // Start a rect search on bbox
1688 ColPartitionGridSearch rectsearch(grid);
1689 rectsearch.SetUniqueMode(true);
1690 rectsearch.StartRectSearch(bbox);
1691 ColPartition *extra_part = nullptr;
1692 while ((extra_part = rectsearch.NextRectSearch()) != nullptr) {
1693 // ColPartition already in table
1694 const TBOX &extra_part_box = extra_part->bounding_box();
1695 if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1696 continue;
1697 }
1698 // Non-text ColPartitions do not contribute
1699 if (extra_part->IsImageType()) {
1700 continue;
1701 }
1702 // Consider this partition.
1703 num_extra_partitions++;
1704 // presence of a table cell is a strong hint, so just increment the scores
1705 // without looking at the spacing.
1706 if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
1707 extra_space_to_right++;
1708 extra_space_to_left++;
1709 continue;
1710 }
1711 int space_threshold = kSideSpaceMargin * part.median_height();
1712 if (extra_part->space_to_right() > space_threshold) {
1713 extra_space_to_right++;
1714 }
1715 if (extra_part->space_to_left() > space_threshold) {
1716 extra_space_to_left++;
1717 }
1718 }
1719 }
1720 // tprintf("%d %d %d\n",
1721 // num_extra_partitions,extra_space_to_right,extra_space_to_left);
1722 return (extra_space_to_right > num_extra_partitions / 2) ||
1723 (extra_space_to_left > num_extra_partitions / 2);
1724 }
1725
1726 // Look for isolated column headers above the given table box and
1727 // include them in the table
IncludeLeftOutColumnHeaders(TBOX * table_box)1728 void TableFinder::IncludeLeftOutColumnHeaders(TBOX *table_box) {
1729 // Start a search above the current table to look for column headers
1730 ColPartitionGridSearch vsearch(&clean_part_grid_);
1731 vsearch.StartVerticalSearch(table_box->left(), table_box->right(),
1732 table_box->top());
1733 ColPartition *neighbor = nullptr;
1734 ColPartition *previous_neighbor = nullptr;
1735 while ((neighbor = vsearch.NextVerticalSearch(false)) != nullptr) {
1736 // Max distance to find a table heading.
1737 const int max_distance =
1738 kMaxColumnHeaderDistance * neighbor->median_height();
1739 int table_top = table_box->top();
1740 const TBOX &box = neighbor->bounding_box();
1741 // Do not continue if the next box is way above
1742 if (box.bottom() - table_top > max_distance) {
1743 break;
1744 }
1745 // Unconditionally include partitions of type TABLE or LINE
1746 // TODO(faisal): add some reasonable conditions here
1747 if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
1748 table_box->set_top(box.top());
1749 previous_neighbor = nullptr;
1750 continue;
1751 }
1752 // If there are two text partitions, one above the other, without a table
1753 // cell on their left or right side, consider them a barrier and quit
1754 if (previous_neighbor == nullptr) {
1755 previous_neighbor = neighbor;
1756 } else {
1757 const TBOX &previous_box = previous_neighbor->bounding_box();
1758 if (!box.major_y_overlap(previous_box)) {
1759 break;
1760 }
1761 }
1762 }
1763 }
1764
1765 // Remove false alarms consisting of a single column based on their
1766 // projection on the x-axis. Projection of a real table on the x-axis
1767 // should have at least one zero-valley larger than the global median
1768 // x-height of the page.
DeleteSingleColumnTables()1769 void TableFinder::DeleteSingleColumnTables() {
1770 int page_width = tright().x() - bleft().x();
1771 ASSERT_HOST(page_width > 0);
1772 // create an integer array to hold projection on x-axis
1773 int *table_xprojection = new int[page_width];
1774 // Iterate through all tables in the table grid
1775 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> table_search(
1776 &table_grid_);
1777 table_search.StartFullSearch();
1778 ColSegment *table;
1779 while ((table = table_search.NextFullSearch()) != nullptr) {
1780 TBOX table_box = table->bounding_box();
1781 // reset the projection array
1782 for (int i = 0; i < page_width; i++) {
1783 table_xprojection[i] = 0;
1784 }
1785 // Start a rect search on table_box
1786 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rectsearch(
1787 &clean_part_grid_);
1788 rectsearch.SetUniqueMode(true);
1789 rectsearch.StartRectSearch(table_box);
1790 ColPartition *part;
1791 while ((part = rectsearch.NextRectSearch()) != nullptr) {
1792 if (!part->IsTextType()) {
1793 continue; // Do not consider non-text partitions
1794 }
1795 if (part->flow() == BTFT_LEADER) {
1796 continue; // Assume leaders are in tables
1797 }
1798 TBOX part_box = part->bounding_box();
1799 // Do not consider partitions partially covered by the table
1800 if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable) {
1801 continue;
1802 }
1803 BLOBNBOX_CLIST *part_boxes = part->boxes();
1804 BLOBNBOX_C_IT pit(part_boxes);
1805
1806 // Make sure overlapping blobs don't artificially inflate the number
1807 // of rows in the table. This happens frequently with things such as
1808 // decimals and split characters. Do this by assuming the column
1809 // partition is sorted mostly left to right and just clip
1810 // bounding boxes by the previous box's extent.
1811 int next_position_to_write = 0;
1812
1813 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1814 BLOBNBOX *pblob = pit.data();
1815 // ignore blob height for the purpose of projection since we
1816 // are only interested in finding valleys
1817 int xstart = pblob->bounding_box().left();
1818 int xend = pblob->bounding_box().right();
1819
1820 xstart = std::max(xstart, next_position_to_write);
1821 for (int i = xstart; i < xend; i++) {
1822 table_xprojection[i - bleft().x()]++;
1823 }
1824 next_position_to_write = xend;
1825 }
1826 }
1827 // Find largest valley between two reasonable peaks in the table
1828 if (!GapInXProjection(table_xprojection, page_width)) {
1829 table_search.RemoveBBox();
1830 delete table;
1831 }
1832 }
1833 delete[] table_xprojection;
1834 }
1835
1836 // Return true if at least one gap larger than the global x-height
1837 // exists in the horizontal projection
GapInXProjection(int * xprojection,int length)1838 bool TableFinder::GapInXProjection(int *xprojection, int length) {
1839 // Find peak value of the histogram
1840 int peak_value = 0;
1841 for (int i = 0; i < length; i++) {
1842 if (xprojection[i] > peak_value) {
1843 peak_value = xprojection[i];
1844 }
1845 }
1846 // Peak value represents the maximum number of horizontally
1847 // overlapping colpartitions, so this can be considered as the
1848 // number of rows in the table
1849 if (peak_value < kMinRowsInTable) {
1850 return false;
1851 }
1852 double projection_threshold = kSmallTableProjectionThreshold * peak_value;
1853 if (peak_value >= kLargeTableRowCount) {
1854 projection_threshold = kLargeTableProjectionThreshold * peak_value;
1855 }
1856 // Threshold the histogram
1857 for (int i = 0; i < length; i++) {
1858 xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1859 }
1860 // Find the largest run of zeros between two ones
1861 int largest_gap = 0;
1862 int run_start = -1;
1863 for (int i = 1; i < length; i++) {
1864 // detect start of a run of zeros
1865 if (xprojection[i - 1] && !xprojection[i]) {
1866 run_start = i;
1867 }
1868 // detect end of a run of zeros and update the value of largest gap
1869 if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1870 int gap = i - run_start;
1871 if (gap > largest_gap) {
1872 largest_gap = gap;
1873 }
1874 run_start = -1;
1875 }
1876 }
1877 return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;
1878 }
1879
1880 // Given the location of a table "guess", try to overlay a cellular
1881 // grid in the location, adjusting the boundaries.
1882 // TODO(nbeato): Falsely introduces:
1883 // -headers/footers (not any worse, too much overlap destroys cells)
1884 // -page numbers (not worse, included because maximize margins)
1885 // -equations (nicely fit into a celluar grid, but more sparsely)
1886 // -figures (random text box, also sparse)
1887 // -small left-aligned text areas with overlapping positioned whitespace
1888 // (rejected before)
1889 // Overall, this just needs some more work.
RecognizeTables()1890 void TableFinder::RecognizeTables() {
1891 #ifndef GRAPHICS_DISABLED
1892 ScrollView *table_win = nullptr;
1893 if (textord_show_tables) {
1894 table_win = MakeWindow(0, 0, "Table Structure");
1895 DisplayColPartitions(table_win, &fragmented_text_grid_, ScrollView::BLUE,
1896 ScrollView::LIGHT_BLUE);
1897 // table_grid_.DisplayBoxes(table_win);
1898 }
1899 #endif
1900
1901 TableRecognizer recognizer;
1902 recognizer.Init();
1903 recognizer.set_line_grid(&leader_and_ruling_grid_);
1904 recognizer.set_text_grid(&fragmented_text_grid_);
1905 recognizer.set_max_text_height(global_median_xheight_ * 2.0);
1906 recognizer.set_min_height(1.5 * gridheight());
1907 // Loop over all of the tables and try to fit them.
1908 // Store the good tables here.
1909 ColSegment_CLIST good_tables;
1910 ColSegment_C_IT good_it(&good_tables);
1911
1912 ColSegmentGridSearch gsearch(&table_grid_);
1913 gsearch.StartFullSearch();
1914 ColSegment *found_table = nullptr;
1915 while ((found_table = gsearch.NextFullSearch()) != nullptr) {
1916 gsearch.RemoveBBox();
1917
1918 // The goal is to make the tables persistent in a list.
1919 // When that happens, this will move into the search loop.
1920 const TBOX &found_box = found_table->bounding_box();
1921 StructuredTable *table_structure = recognizer.RecognizeTable(found_box);
1922
1923 // Process a table. Good tables are inserted into the grid again later on
1924 // We can't change boxes in the grid while it is running a search.
1925 if (table_structure != nullptr) {
1926 #ifndef GRAPHICS_DISABLED
1927 if (textord_show_tables) {
1928 table_structure->Display(table_win, ScrollView::LIME_GREEN);
1929 }
1930 #endif
1931 found_table->set_bounding_box(table_structure->bounding_box());
1932 delete table_structure;
1933 good_it.add_after_then_move(found_table);
1934 } else {
1935 delete found_table;
1936 }
1937 }
1938 // TODO(nbeato): MERGE!! There is awesome info now available for merging.
1939
1940 // At this point, the grid is empty. We can safely insert the good tables
1941 // back into grid.
1942 for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward()) {
1943 table_grid_.InsertBBox(true, true, good_it.extract());
1944 }
1945 }
1946
1947 #ifndef GRAPHICS_DISABLED
1948
1949 // Displays the column segments in some window.
DisplayColSegments(ScrollView * win,ColSegment_LIST * segments,ScrollView::Color color)1950 void TableFinder::DisplayColSegments(ScrollView *win, ColSegment_LIST *segments,
1951 ScrollView::Color color) {
1952 win->Pen(color);
1953 win->Brush(ScrollView::NONE);
1954 ColSegment_IT it(segments);
1955 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1956 ColSegment *col = it.data();
1957 const TBOX &box = col->bounding_box();
1958 int left_x = box.left();
1959 int right_x = box.right();
1960 int top_y = box.top();
1961 int bottom_y = box.bottom();
1962 win->Rectangle(left_x, bottom_y, right_x, top_y);
1963 }
1964 win->UpdateWindow();
1965 }
1966
1967 // Displays the colpartitions using a new coloring on an existing window.
1968 // Note: This method is only for debug purpose during development and
1969 // would not be part of checked in code
DisplayColPartitions(ScrollView * win,ColPartitionGrid * grid,ScrollView::Color default_color,ScrollView::Color table_color)1970 void TableFinder::DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid,
1971 ScrollView::Color default_color,
1972 ScrollView::Color table_color) {
1973 ScrollView::Color color = default_color;
1974 // Iterate the ColPartitions in the grid.
1975 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(grid);
1976 gsearch.StartFullSearch();
1977 ColPartition *part = nullptr;
1978 while ((part = gsearch.NextFullSearch()) != nullptr) {
1979 color = default_color;
1980 if (part->type() == PT_TABLE) {
1981 color = table_color;
1982 }
1983
1984 const TBOX &box = part->bounding_box();
1985 int left_x = box.left();
1986 int right_x = box.right();
1987 int top_y = box.top();
1988 int bottom_y = box.bottom();
1989 win->Brush(ScrollView::NONE);
1990 win->Pen(color);
1991 win->Rectangle(left_x, bottom_y, right_x, top_y);
1992 }
1993 win->UpdateWindow();
1994 }
1995
DisplayColPartitions(ScrollView * win,ColPartitionGrid * grid,ScrollView::Color default_color)1996 void TableFinder::DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid,
1997 ScrollView::Color default_color) {
1998 DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);
1999 }
2000
DisplayColPartitionConnections(ScrollView * win,ColPartitionGrid * grid,ScrollView::Color color)2001 void TableFinder::DisplayColPartitionConnections(ScrollView *win,
2002 ColPartitionGrid *grid,
2003 ScrollView::Color color) {
2004 // Iterate the ColPartitions in the grid.
2005 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(grid);
2006 gsearch.StartFullSearch();
2007 ColPartition *part = nullptr;
2008 while ((part = gsearch.NextFullSearch()) != nullptr) {
2009 const TBOX &box = part->bounding_box();
2010 int left_x = box.left();
2011 int right_x = box.right();
2012 int top_y = box.top();
2013 int bottom_y = box.bottom();
2014
2015 ColPartition *upper_part = part->nearest_neighbor_above();
2016 if (upper_part) {
2017 const TBOX &upper_box = upper_part->bounding_box();
2018 int mid_x = (left_x + right_x) / 2;
2019 int mid_y = (top_y + bottom_y) / 2;
2020 int other_x = (upper_box.left() + upper_box.right()) / 2;
2021 int other_y = (upper_box.top() + upper_box.bottom()) / 2;
2022 win->Brush(ScrollView::NONE);
2023 win->Pen(color);
2024 win->Line(mid_x, mid_y, other_x, other_y);
2025 }
2026 ColPartition *lower_part = part->nearest_neighbor_below();
2027 if (lower_part) {
2028 const TBOX &lower_box = lower_part->bounding_box();
2029 int mid_x = (left_x + right_x) / 2;
2030 int mid_y = (top_y + bottom_y) / 2;
2031 int other_x = (lower_box.left() + lower_box.right()) / 2;
2032 int other_y = (lower_box.top() + lower_box.bottom()) / 2;
2033 win->Brush(ScrollView::NONE);
2034 win->Pen(color);
2035 win->Line(mid_x, mid_y, other_x, other_y);
2036 }
2037 }
2038 win->UpdateWindow();
2039 }
2040
2041 #endif
2042
2043 // Merge all colpartitions in table regions to make them a single
2044 // colpartition and revert types of isolated table cells not
2045 // assigned to any table to their original types.
MakeTableBlocks(ColPartitionGrid * grid,ColPartitionSet ** all_columns,const WidthCallback & width_cb)2046 void TableFinder::MakeTableBlocks(ColPartitionGrid *grid,
2047 ColPartitionSet **all_columns,
2048 const WidthCallback &width_cb) {
2049 // Since we have table blocks already, remove table tags from all
2050 // colpartitions
2051 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(grid);
2052 gsearch.StartFullSearch();
2053 ColPartition *part = nullptr;
2054
2055 while ((part = gsearch.NextFullSearch()) != nullptr) {
2056 if (part->type() == PT_TABLE) {
2057 part->clear_table_type();
2058 }
2059 }
2060 // Now make a single colpartition out of each table block and remove
2061 // all colpartitions contained within a table
2062 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> table_search(
2063 &table_grid_);
2064 table_search.StartFullSearch();
2065 ColSegment *table;
2066 while ((table = table_search.NextFullSearch()) != nullptr) {
2067 const TBOX &table_box = table->bounding_box();
2068 // Start a rect search on table_box
2069 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rectsearch(
2070 grid);
2071 rectsearch.StartRectSearch(table_box);
2072 ColPartition *part;
2073 ColPartition *table_partition = nullptr;
2074 while ((part = rectsearch.NextRectSearch()) != nullptr) {
2075 // Do not consider image partitions
2076 if (!part->IsTextType()) {
2077 continue;
2078 }
2079 TBOX part_box = part->bounding_box();
2080 // Include partition in the table if more than half of it
2081 // is covered by the table
2082 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
2083 rectsearch.RemoveBBox();
2084 if (table_partition) {
2085 table_partition->Absorb(part, width_cb);
2086 } else {
2087 table_partition = part;
2088 }
2089 }
2090 }
2091 // Insert table colpartition back to part_grid_
2092 if (table_partition) {
2093 // To match the columns used when transforming to blocks, the new table
2094 // partition must have its first and last column set at the grid y that
2095 // corresponds to its bottom.
2096 const TBOX &table_box = table_partition->bounding_box();
2097 int grid_x, grid_y;
2098 grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
2099 table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
2100 table_partition->set_table_type();
2101 table_partition->set_blob_type(BRT_TEXT);
2102 table_partition->set_flow(BTFT_CHAIN);
2103 table_partition->SetBlobTypes();
2104 grid->InsertBBox(true, true, table_partition);
2105 }
2106 }
2107 }
2108
2109 //////// ColSegment code
2110 ////////
ColSegment()2111 ColSegment::ColSegment()
2112 : ELIST_LINK(),
2113 num_table_cells_(0),
2114 num_text_cells_(0),
2115 type_(COL_UNKNOWN) {}
2116
2117 // Provides a color for BBGrid to draw the rectangle.
BoxColor() const2118 ScrollView::Color ColSegment::BoxColor() const {
2119 const ScrollView::Color kBoxColors[PT_COUNT] = {
2120 ScrollView::YELLOW,
2121 ScrollView::BLUE,
2122 ScrollView::YELLOW,
2123 ScrollView::MAGENTA,
2124 };
2125 return kBoxColors[type_];
2126 }
2127
2128 // Insert a box into this column segment
InsertBox(const TBOX & other)2129 void ColSegment::InsertBox(const TBOX &other) {
2130 bounding_box_ = bounding_box_.bounding_union(other);
2131 }
2132
2133 // Set column segment type based on the ratio of text and table partitions
2134 // in it.
set_type()2135 void ColSegment::set_type() {
2136 if (num_table_cells_ > kTableColumnThreshold * num_text_cells_) {
2137 type_ = COL_TABLE;
2138 } else if (num_text_cells_ > num_table_cells_) {
2139 type_ = COL_TEXT;
2140 } else {
2141 type_ = COL_MIXED;
2142 }
2143 }
2144
2145 } // namespace tesseract.
2146