1 ///////////////////////////////////////////////////////////////////////
2 // File:        tablerecog.cpp
3 // Description: Helper class to help structure table areas. Given an bounding
4 //              box from TableFinder, the TableRecognizer should give a
5 //              StructuredTable (maybe a list in the future) of "good" tables
6 //              in that area.
7 // Author:      Nicholas Beato
8 //
9 // (C) Copyright 2009, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
20 ///////////////////////////////////////////////////////////////////////
21 
22 #ifdef HAVE_CONFIG_H
23 #  include "config_auto.h"
24 #endif
25 
26 #include "tablerecog.h"
27 
28 #include <algorithm>
29 
30 namespace tesseract {
31 
32 // The amount of space required between the ColPartitions in 2 columns
33 // of a non-lined table as a multiple of the median width.
34 const double kHorizontalSpacing = 0.30;
35 // The amount of space required between the ColPartitions in 2 rows
36 // of a non-lined table as multiples of the median height.
37 const double kVerticalSpacing = -0.2;
38 // The number of cells that the grid lines may intersect.
39 // See FindCellSplitLocations for explanation.
40 const int kCellSplitRowThreshold = 0;
41 const int kCellSplitColumnThreshold = 0;
42 // For "lined tables", the number of required lines. Currently a guess.
43 const int kLinedTableMinVerticalLines = 3;
44 const int kLinedTableMinHorizontalLines = 3;
45 // Number of columns required, as a fraction of the most columns found.
46 // None of these are tweaked at all.
47 const double kRequiredColumns = 0.7;
48 // The tolerance for comparing margins of potential tables.
49 const double kMarginFactor = 1.1;
50 // The first and last row should be consistent cell height.
51 // This factor is the first and last row cell height max.
52 const double kMaxRowSize = 2.5;
53 // Number of filled columns required to form a strong table row.
54 // For small tables, this is an absolute number.
55 const double kGoodRowNumberOfColumnsSmall[] = {2, 2, 2, 2, 2, 3, 3};
56 // For large tables, it is a relative number
57 const double kGoodRowNumberOfColumnsLarge = 0.7;
58 // The amount of area that must be covered in a cell by ColPartitions to
59 // be considered "filled"
60 const double kMinFilledArea = 0.35;
61 
62 ////////
63 //////// StructuredTable Class
64 ////////
65 
StructuredTable()66 StructuredTable::StructuredTable()
67     : text_grid_(nullptr)
68     , line_grid_(nullptr)
69     , is_lined_(false)
70     , space_above_(0)
71     , space_below_(0)
72     , space_left_(0)
73     , space_right_(0)
74     , median_cell_height_(0)
75     , median_cell_width_(0)
76     , max_text_height_(INT32_MAX) {}
77 
Init()78 void StructuredTable::Init() {}
79 
set_text_grid(ColPartitionGrid * text_grid)80 void StructuredTable::set_text_grid(ColPartitionGrid *text_grid) {
81   text_grid_ = text_grid;
82 }
set_line_grid(ColPartitionGrid * line_grid)83 void StructuredTable::set_line_grid(ColPartitionGrid *line_grid) {
84   line_grid_ = line_grid;
85 }
set_max_text_height(int height)86 void StructuredTable::set_max_text_height(int height) {
87   max_text_height_ = height;
88 }
is_lined() const89 bool StructuredTable::is_lined() const {
90   return is_lined_;
91 }
row_count() const92 unsigned StructuredTable::row_count() const {
93   return cell_y_.empty() ? 0 : cell_y_.size() - 1;
94 }
column_count() const95 unsigned StructuredTable::column_count() const {
96   return cell_x_.empty() ? 0 : cell_x_.size() - 1;
97 }
cell_count() const98 unsigned StructuredTable::cell_count() const {
99   return row_count() * column_count();
100 }
set_bounding_box(const TBOX & box)101 void StructuredTable::set_bounding_box(const TBOX &box) {
102   bounding_box_ = box;
103 }
bounding_box() const104 const TBOX &StructuredTable::bounding_box() const {
105   return bounding_box_;
106 }
median_cell_height()107 int StructuredTable::median_cell_height() {
108   return median_cell_height_;
109 }
median_cell_width()110 int StructuredTable::median_cell_width() {
111   return median_cell_width_;
112 }
row_height(unsigned row) const113 int StructuredTable::row_height(unsigned row) const {
114   ASSERT_HOST(row < row_count());
115   return cell_y_[row + 1] - cell_y_[row];
116 }
column_width(unsigned column) const117 int StructuredTable::column_width(unsigned column) const {
118   ASSERT_HOST(column < column_count());
119   return cell_x_[column + 1] - cell_x_[column];
120 }
space_above() const121 int StructuredTable::space_above() const {
122   return space_above_;
123 }
space_below() const124 int StructuredTable::space_below() const {
125   return space_below_;
126 }
127 
128 // At this point, we know that the lines are contained
129 // by the box (by FindLinesBoundingBox).
130 // So try to find the cell structure and make sure it works out.
131 // The assumption is that all lines span the table. If this
132 // assumption fails, the VerifyLinedTable method will
133 // abort the lined table. The TableRecognizer will fall
134 // back on FindWhitespacedStructure.
FindLinedStructure()135 bool StructuredTable::FindLinedStructure() {
136   ClearStructure();
137 
138   // Search for all of the lines in the current box.
139   // Update the cellular structure with the exact lines.
140   ColPartitionGridSearch box_search(line_grid_);
141   box_search.SetUniqueMode(true);
142   box_search.StartRectSearch(bounding_box_);
143   ColPartition *line = nullptr;
144 
145   while ((line = box_search.NextRectSearch()) != nullptr) {
146     if (line->IsHorizontalLine()) {
147       cell_y_.push_back(line->MidY());
148     }
149     if (line->IsVerticalLine()) {
150       cell_x_.push_back(line->MidX());
151     }
152   }
153 
154   // HasSignificantLines should guarantee cells.
155   // Because that code is a different class, just gracefully
156   // return false. This could be an assert.
157   if (cell_x_.size() < 3 || cell_y_.size() < 3) {
158     return false;
159   }
160 
161   // Sort and remove duplicates that may have occurred due to split lines.
162   std::sort(cell_x_.begin(), cell_x_.end());
163   auto last_x = std::unique(cell_x_.begin(), cell_x_.end());
164   cell_x_.erase(last_x, cell_x_.end());
165   std::sort(cell_y_.begin(), cell_y_.end());
166   auto last_y = std::unique(cell_y_.begin(), cell_y_.end());
167   cell_y_.erase(last_y, cell_y_.end());
168 
169   // The border should be the extents of line boxes, not middle.
170   cell_x_[0] = bounding_box_.left();
171   cell_x_[cell_x_.size() - 1] = bounding_box_.right();
172   cell_y_[0] = bounding_box_.bottom();
173   cell_y_[cell_y_.size() - 1] = bounding_box_.top();
174 
175   // Remove duplicates that may have occurred due to moving the borders.
176   last_x = std::unique(cell_x_.begin(), cell_x_.end());
177   cell_x_.erase(last_x, cell_x_.end());
178   last_y = std::unique(cell_y_.begin(), cell_y_.end());
179   cell_y_.erase(last_y, cell_y_.end());
180 
181   CalculateMargins();
182   CalculateStats();
183   is_lined_ = VerifyLinedTableCells();
184   return is_lined_;
185 }
186 
187 // Finds the cellular structure given a particular box.
FindWhitespacedStructure()188 bool StructuredTable::FindWhitespacedStructure() {
189   ClearStructure();
190   FindWhitespacedColumns();
191   FindWhitespacedRows();
192 
193   if (!VerifyWhitespacedTable()) {
194     return false;
195   } else {
196     bounding_box_.set_left(cell_x_[0]);
197     bounding_box_.set_right(cell_x_[cell_x_.size() - 1]);
198     bounding_box_.set_bottom(cell_y_[0]);
199     bounding_box_.set_top(cell_y_[cell_y_.size() - 1]);
200     AbsorbNearbyLines();
201     CalculateMargins();
202     CalculateStats();
203     return true;
204   }
205 }
206 
207 // Tests if a partition fits inside the table structure.
208 // Partitions must fully span a grid line in order to intersect it.
209 // This means that a partition does not intersect a line
210 // that it "just" touches. This is mainly because the assumption
211 // throughout the code is that "0" distance is a very very small space.
DoesPartitionFit(const ColPartition & part) const212 bool StructuredTable::DoesPartitionFit(const ColPartition &part) const {
213   const TBOX &box = part.bounding_box();
214   for (int i : cell_x_) {
215     if (box.left() < i && i < box.right()) {
216       return false;
217     }
218   }
219   for (int i : cell_y_) {
220     if (box.bottom() < i && i < box.top()) {
221       return false;
222     }
223   }
224   return true;
225 }
226 
227 // Checks if a sub-table has multiple data cells filled.
CountFilledCells()228 int StructuredTable::CountFilledCells() {
229   return CountFilledCells(0, row_count() - 1, 0, column_count() - 1);
230 }
CountFilledCellsInRow(int row)231 int StructuredTable::CountFilledCellsInRow(int row) {
232   return CountFilledCells(row, row, 0, column_count() - 1);
233 }
CountFilledCellsInColumn(int column)234 int StructuredTable::CountFilledCellsInColumn(int column) {
235   return CountFilledCells(0, row_count() - 1, column, column);
236 }
CountFilledCells(unsigned row_start,unsigned row_end,unsigned column_start,unsigned column_end)237 int StructuredTable::CountFilledCells(unsigned row_start, unsigned row_end, unsigned column_start,
238                                       unsigned column_end) {
239   ASSERT_HOST(row_start <= row_end && row_end < row_count());
240   ASSERT_HOST(column_start <= column_end && column_end < column_count());
241   int cell_count = 0;
242   TBOX cell_box;
243   for (unsigned row = row_start; row <= row_end; ++row) {
244     cell_box.set_bottom(cell_y_[row]);
245     cell_box.set_top(cell_y_[row + 1]);
246     for (unsigned col = column_start; col <= column_end; ++col) {
247       cell_box.set_left(cell_x_[col]);
248       cell_box.set_right(cell_x_[col + 1]);
249       if (CountPartitions(cell_box) > 0) {
250         ++cell_count;
251       }
252     }
253   }
254   return cell_count;
255 }
256 
257 // Makes sure that at least one cell in a row has substantial area filled.
258 // This can filter out large whitespace caused by growing tables too far
259 // and page numbers.
VerifyRowFilled(int row)260 bool StructuredTable::VerifyRowFilled(int row) {
261   for (unsigned i = 0; i < column_count(); ++i) {
262     auto area_filled = CalculateCellFilledPercentage(row, i);
263     if (area_filled >= kMinFilledArea) {
264       return true;
265     }
266   }
267   return false;
268 }
269 
270 // Finds the filled area in a cell.
271 // Assume ColPartitions do not overlap for simplicity (even though they do).
CalculateCellFilledPercentage(unsigned row,unsigned column)272 double StructuredTable::CalculateCellFilledPercentage(unsigned row, unsigned column) {
273   ASSERT_HOST(row <= row_count());
274   ASSERT_HOST(column <= column_count());
275   const TBOX kCellBox(cell_x_[column], cell_y_[row], cell_x_[column + 1], cell_y_[row + 1]);
276   ASSERT_HOST(!kCellBox.null_box());
277 
278   ColPartitionGridSearch gsearch(text_grid_);
279   gsearch.SetUniqueMode(true);
280   gsearch.StartRectSearch(kCellBox);
281   double area_covered = 0;
282   ColPartition *text = nullptr;
283   while ((text = gsearch.NextRectSearch()) != nullptr) {
284     if (text->IsTextType()) {
285       area_covered += text->bounding_box().intersection(kCellBox).area();
286     }
287   }
288   const int32_t current_area = kCellBox.area();
289   if (current_area == 0) {
290     return 1.0;
291   }
292   return std::min(1.0, area_covered / current_area);
293 }
294 
295 #ifndef GRAPHICS_DISABLED
296 
Display(ScrollView * window,ScrollView::Color color)297 void StructuredTable::Display(ScrollView *window, ScrollView::Color color) {
298   window->Brush(ScrollView::NONE);
299   window->Pen(color);
300   window->Rectangle(bounding_box_.left(), bounding_box_.bottom(), bounding_box_.right(),
301                     bounding_box_.top());
302   for (int i : cell_x_) {
303     window->Line(i, bounding_box_.bottom(), i, bounding_box_.top());
304   }
305   for (int i : cell_y_) {
306     window->Line(bounding_box_.left(), i, bounding_box_.right(), i);
307   }
308   window->UpdateWindow();
309 }
310 
311 #endif
312 
313 // Clear structure information.
ClearStructure()314 void StructuredTable::ClearStructure() {
315   cell_x_.clear();
316   cell_y_.clear();
317   is_lined_ = false;
318   space_above_ = 0;
319   space_below_ = 0;
320   space_left_ = 0;
321   space_right_ = 0;
322   median_cell_height_ = 0;
323   median_cell_width_ = 0;
324 }
325 
326 // When a table has lines, the lines should not intersect any partitions.
327 // The following function makes sure the previous assumption is met.
VerifyLinedTableCells()328 bool StructuredTable::VerifyLinedTableCells() {
329   // Function only called when lines exist.
330   ASSERT_HOST(cell_y_.size() >= 2 && cell_x_.size() >= 2);
331   for (int i : cell_y_) {
332     if (CountHorizontalIntersections(i) > 0) {
333       return false;
334     }
335   }
336   for (int i : cell_x_) {
337     if (CountVerticalIntersections(i) > 0) {
338       return false;
339     }
340   }
341   return true;
342 }
343 
344 // TODO(nbeato): Could be much better than this.
345 // Examples:
346 //   - Caclulate the percentage of filled cells.
347 //   - Calculate the average number of ColPartitions per cell.
348 //   - Calculate the number of cells per row with partitions.
349 //   - Check if ColPartitions in adjacent cells are similar.
350 //   - Check that all columns are at least a certain width.
351 //   - etc.
VerifyWhitespacedTable()352 bool StructuredTable::VerifyWhitespacedTable() {
353   // criteria for a table, must be at least 2x3 or 3x2
354   return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6;
355 }
356 
357 // Finds vertical splits in the ColPartitions of text_grid_ by considering
358 // all possible "good" guesses. A good guess is just the left/right sides of
359 // the partitions, since these locations will uniquely define where the
360 // extremal values where the splits can occur. The split happens
361 // in the middle of the two nearest partitions.
FindWhitespacedColumns()362 void StructuredTable::FindWhitespacedColumns() {
363   // Set of the extents of all partitions on the page.
364   std::vector<int> left_sides;
365   std::vector<int> right_sides;
366 
367   // Look at each text partition. We want to find the partitions
368   // that have extremal left/right sides. These will give us a basis
369   // for the table columns.
370   ColPartitionGridSearch gsearch(text_grid_);
371   gsearch.SetUniqueMode(true);
372   gsearch.StartRectSearch(bounding_box_);
373   ColPartition *text = nullptr;
374   while ((text = gsearch.NextRectSearch()) != nullptr) {
375     if (!text->IsTextType()) {
376       continue;
377     }
378 
379     ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right());
380     int spacing = static_cast<int>(text->median_width() * kHorizontalSpacing / 2.0 + 0.5);
381     left_sides.push_back(text->bounding_box().left() - spacing);
382     right_sides.push_back(text->bounding_box().right() + spacing);
383   }
384   // It causes disaster below, so avoid it!
385   if (left_sides.empty() || right_sides.empty()) {
386     return;
387   }
388 
389   // Since data may be inserted in grid order, we sort the left/right sides.
390   std::sort(left_sides.begin(), left_sides.end());
391   std::sort(right_sides.begin(), right_sides.end());
392 
393   // At this point, in the "merged list", we expect to have a left side,
394   // followed by either more left sides or a right side. The last number
395   // should be a right side. We find places where the splits occur by looking
396   // for "valleys". If we want to force gap sizes or allow overlap, change
397   // the spacing above. If you want to let lines "slice" partitions as long
398   // as it is infrequent, change the following function.
399   FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold, &cell_x_);
400 }
401 
402 // Finds horizontal splits in the ColPartitions of text_grid_ by considering
403 // all possible "good" guesses. A good guess is just the bottom/top sides of
404 // the partitions, since these locations will uniquely define where the
405 // extremal values where the splits can occur. The split happens
406 // in the middle of the two nearest partitions.
FindWhitespacedRows()407 void StructuredTable::FindWhitespacedRows() {
408   // Set of the extents of all partitions on the page.
409   std::vector<int> bottom_sides;
410   std::vector<int> top_sides;
411   // We will be "shrinking" partitions, so keep the min/max around to
412   // make sure the bottom/top lines do not intersect text.
413   int min_bottom = INT32_MAX;
414   int max_top = INT32_MIN;
415 
416   // Look at each text partition. We want to find the partitions
417   // that have extremal bottom/top sides. These will give us a basis
418   // for the table rows. Because the textlines can be skewed and close due
419   // to warping, the height of the partitions is toned down a little bit.
420   ColPartitionGridSearch gsearch(text_grid_);
421   gsearch.SetUniqueMode(true);
422   gsearch.StartRectSearch(bounding_box_);
423   ColPartition *text = nullptr;
424   while ((text = gsearch.NextRectSearch()) != nullptr) {
425     if (!text->IsTextType()) {
426       continue;
427     }
428 
429     ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top());
430     min_bottom = std::min(min_bottom, static_cast<int>(text->bounding_box().bottom()));
431     max_top = std::max(max_top, static_cast<int>(text->bounding_box().top()));
432 
433     // Ignore "tall" text partitions, as these are usually false positive
434     // vertical text or multiple lines pulled together.
435     if (text->bounding_box().height() > max_text_height_) {
436       continue;
437     }
438 
439     int spacing = static_cast<int>(text->bounding_box().height() * kVerticalSpacing / 2.0 + 0.5);
440     int bottom = text->bounding_box().bottom() - spacing;
441     int top = text->bounding_box().top() + spacing;
442     // For horizontal text, the factor can be negative. This should
443     // probably cause a warning or failure. I haven't actually checked if
444     // it happens.
445     if (bottom >= top) {
446       continue;
447     }
448 
449     bottom_sides.push_back(bottom);
450     top_sides.push_back(top);
451   }
452   // It causes disaster below, so avoid it!
453   if (bottom_sides.empty() || top_sides.empty()) {
454     return;
455   }
456 
457   // Since data may be inserted in grid order, we sort the bottom/top sides.
458   std::sort(bottom_sides.begin(), bottom_sides.end());
459   std::sort(top_sides.begin(), top_sides.end());
460 
461   // At this point, in the "merged list", we expect to have a bottom side,
462   // followed by either more bottom sides or a top side. The last number
463   // should be a top side. We find places where the splits occur by looking
464   // for "valleys". If we want to force gap sizes or allow overlap, change
465   // the spacing above. If you want to let lines "slice" partitions as long
466   // as it is infrequent, change the following function.
467   FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold, &cell_y_);
468 
469   // Recover the min/max correctly since it was shifted.
470   cell_y_[0] = min_bottom;
471   cell_y_[cell_y_.size() - 1] = max_top;
472 }
473 
CalculateMargins()474 void StructuredTable::CalculateMargins() {
475   space_above_ = INT32_MAX;
476   space_below_ = INT32_MAX;
477   space_right_ = INT32_MAX;
478   space_left_ = INT32_MAX;
479   UpdateMargins(text_grid_);
480   UpdateMargins(line_grid_);
481 }
482 // Finds the nearest partition in grid to the table
483 // boundaries and updates the margin.
UpdateMargins(ColPartitionGrid * grid)484 void StructuredTable::UpdateMargins(ColPartitionGrid *grid) {
485   int below = FindVerticalMargin(grid, bounding_box_.bottom(), true);
486   space_below_ = std::min(space_below_, below);
487   int above = FindVerticalMargin(grid, bounding_box_.top(), false);
488   space_above_ = std::min(space_above_, above);
489   int left = FindHorizontalMargin(grid, bounding_box_.left(), true);
490   space_left_ = std::min(space_left_, left);
491   int right = FindHorizontalMargin(grid, bounding_box_.right(), false);
492   space_right_ = std::min(space_right_, right);
493 }
FindVerticalMargin(ColPartitionGrid * grid,int border,bool decrease) const494 int StructuredTable::FindVerticalMargin(ColPartitionGrid *grid, int border, bool decrease) const {
495   ColPartitionGridSearch gsearch(grid);
496   gsearch.SetUniqueMode(true);
497   gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), border);
498   ColPartition *part = nullptr;
499   while ((part = gsearch.NextVerticalSearch(decrease)) != nullptr) {
500     if (!part->IsTextType() && !part->IsHorizontalLine()) {
501       continue;
502     }
503     int distance =
504         decrease ? border - part->bounding_box().top() : part->bounding_box().bottom() - border;
505     if (distance >= 0) {
506       return distance;
507     }
508   }
509   return INT32_MAX;
510 }
FindHorizontalMargin(ColPartitionGrid * grid,int border,bool decrease) const511 int StructuredTable::FindHorizontalMargin(ColPartitionGrid *grid, int border, bool decrease) const {
512   ColPartitionGridSearch gsearch(grid);
513   gsearch.SetUniqueMode(true);
514   gsearch.StartSideSearch(border, bounding_box_.bottom(), bounding_box_.top());
515   ColPartition *part = nullptr;
516   while ((part = gsearch.NextSideSearch(decrease)) != nullptr) {
517     if (!part->IsTextType() && !part->IsVerticalLine()) {
518       continue;
519     }
520     int distance =
521         decrease ? border - part->bounding_box().right() : part->bounding_box().left() - border;
522     if (distance >= 0) {
523       return distance;
524     }
525   }
526   return INT32_MAX;
527 }
528 
CalculateStats()529 void StructuredTable::CalculateStats() {
530   const int kMaxCellHeight = 1000;
531   const int kMaxCellWidth = 1000;
532   STATS height_stats(0, kMaxCellHeight + 1);
533   STATS width_stats(0, kMaxCellWidth + 1);
534 
535   for (unsigned i = 0; i < row_count(); ++i) {
536     height_stats.add(row_height(i), column_count());
537   }
538   for (unsigned i = 0; i < column_count(); ++i) {
539     width_stats.add(column_width(i), row_count());
540   }
541 
542   median_cell_height_ = static_cast<int>(height_stats.median() + 0.5);
543   median_cell_width_ = static_cast<int>(width_stats.median() + 0.5);
544 }
545 
546 // Looks for grid lines near the current bounding box and
547 // grows the bounding box to include them if no intersections
548 // will occur as a result. This is necessary because the margins
549 // are calculated relative to the closest line/text. If the
550 // line isn't absorbed, the margin will be the distance to the line.
AbsorbNearbyLines()551 void StructuredTable::AbsorbNearbyLines() {
552   ColPartitionGridSearch gsearch(line_grid_);
553   gsearch.SetUniqueMode(true);
554 
555   // Is the closest line above good? Loop multiple times for tables with
556   // multi-line (sometimes 2) borders. Limit the number of lines by
557   // making sure they stay within a table cell or so.
558   ColPartition *line = nullptr;
559   gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), bounding_box_.top());
560   while ((line = gsearch.NextVerticalSearch(false)) != nullptr) {
561     if (!line->IsHorizontalLine()) {
562       break;
563     }
564     TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1, bounding_box_.right(),
565                      line->MidY());
566     if (text_search.height() > median_cell_height_ * 2) {
567       break;
568     }
569     if (CountPartitions(text_search) > 0) {
570       break;
571     }
572     bounding_box_.set_top(line->MidY());
573   }
574   // As above, is the closest line below good?
575   line = nullptr;
576   gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), bounding_box_.bottom());
577   while ((line = gsearch.NextVerticalSearch(true)) != nullptr) {
578     if (!line->IsHorizontalLine()) {
579       break;
580     }
581     TBOX text_search(bounding_box_.left(), line->MidY(), bounding_box_.right(),
582                      bounding_box_.bottom() - 1);
583     if (text_search.height() > median_cell_height_ * 2) {
584       break;
585     }
586     if (CountPartitions(text_search) > 0) {
587       break;
588     }
589     bounding_box_.set_bottom(line->MidY());
590   }
591   // TODO(nbeato): vertical lines
592 }
593 
594 // This function will find all "0 valleys" (of any length) given two
595 // arrays. The arrays are the mins and maxes of partitions (either
596 // left and right or bottom and top). Since the min/max lists are generated
597 // with pairs of increasing integers, we can make some assumptions in
598 // the function about ordering of the overall list, which are shown in the
599 // asserts.
600 // The algorithm works as follows:
601 //   While there are numbers to process, take the smallest number.
602 //     If it is from the min_list, increment the "hill" counter.
603 //     Otherwise, decrement the "hill" counter.
604 //     In the process of doing this, keep track of "crossing" the
605 //     desired height.
606 // The first/last items are extremal values of the list and known.
607 // NOTE: This function assumes the lists are sorted!
FindCellSplitLocations(const std::vector<int> & min_list,const std::vector<int> & max_list,int max_merged,std::vector<int> * locations)608 void StructuredTable::FindCellSplitLocations(const std::vector<int> &min_list,
609                                              const std::vector<int> &max_list, int max_merged,
610                                              std::vector<int> *locations) {
611   locations->clear();
612   ASSERT_HOST(min_list.size() == max_list.size());
613   if (min_list.empty()) {
614     return;
615   }
616   ASSERT_HOST(min_list.at(0) < max_list.at(0));
617   ASSERT_HOST(min_list.at(min_list.size() - 1) < max_list.at(max_list.size() - 1));
618 
619   locations->push_back(min_list.at(0));
620   unsigned min_index = 0;
621   unsigned max_index = 0;
622   int stacked_partitions = 0;
623   int last_cross_position = INT32_MAX;
624   // max_index will expire after min_index.
625   // However, we can't "increase" the hill size if min_index expired.
626   // So finish processing when min_index expires.
627   while (min_index < min_list.size()) {
628     // Increase the hill count.
629     if (min_list[min_index] < max_list[max_index]) {
630       ++stacked_partitions;
631       if (last_cross_position != INT32_MAX && stacked_partitions > max_merged) {
632         int mid = (last_cross_position + min_list[min_index]) / 2;
633         locations->push_back(mid);
634         last_cross_position = INT32_MAX;
635       }
636       ++min_index;
637     } else {
638       // Decrease the hill count.
639       --stacked_partitions;
640       if (last_cross_position == INT32_MAX && stacked_partitions <= max_merged) {
641         last_cross_position = max_list[max_index];
642       }
643       ++max_index;
644     }
645   }
646   locations->push_back(max_list.at(max_list.size() - 1));
647 }
648 
649 // Counts the number of partitions in the table
650 // box that intersection the given x value.
CountVerticalIntersections(int x)651 int StructuredTable::CountVerticalIntersections(int x) {
652   int count = 0;
653   // Make a small box to keep the search time down.
654   const int kGridSize = text_grid_->gridsize();
655   TBOX vertical_box = bounding_box_;
656   vertical_box.set_left(x - kGridSize);
657   vertical_box.set_right(x + kGridSize);
658 
659   ColPartitionGridSearch gsearch(text_grid_);
660   gsearch.SetUniqueMode(true);
661   gsearch.StartRectSearch(vertical_box);
662   ColPartition *text = nullptr;
663   while ((text = gsearch.NextRectSearch()) != nullptr) {
664     if (!text->IsTextType()) {
665       continue;
666     }
667     const TBOX &box = text->bounding_box();
668     if (box.left() < x && x < box.right()) {
669       ++count;
670     }
671   }
672   return count;
673 }
674 
675 // Counts the number of partitions in the table
676 // box that intersection the given y value.
CountHorizontalIntersections(int y)677 int StructuredTable::CountHorizontalIntersections(int y) {
678   int count = 0;
679   // Make a small box to keep the search time down.
680   const int kGridSize = text_grid_->gridsize();
681   TBOX horizontal_box = bounding_box_;
682   horizontal_box.set_bottom(y - kGridSize);
683   horizontal_box.set_top(y + kGridSize);
684 
685   ColPartitionGridSearch gsearch(text_grid_);
686   gsearch.SetUniqueMode(true);
687   gsearch.StartRectSearch(horizontal_box);
688   ColPartition *text = nullptr;
689   while ((text = gsearch.NextRectSearch()) != nullptr) {
690     if (!text->IsTextType()) {
691       continue;
692     }
693 
694     const TBOX &box = text->bounding_box();
695     if (box.bottom() < y && y < box.top()) {
696       ++count;
697     }
698   }
699   return count;
700 }
701 
702 // Counts how many text partitions are in this box.
703 // This is used to count partitions in cells, as that can indicate
704 // how "strong" a potential table row/column (or even full table) actually is.
CountPartitions(const TBOX & box)705 int StructuredTable::CountPartitions(const TBOX &box) {
706   ColPartitionGridSearch gsearch(text_grid_);
707   gsearch.SetUniqueMode(true);
708   gsearch.StartRectSearch(box);
709   int count = 0;
710   ColPartition *text = nullptr;
711   while ((text = gsearch.NextRectSearch()) != nullptr) {
712     if (text->IsTextType()) {
713       ++count;
714     }
715   }
716   return count;
717 }
718 
719 ////////
720 //////// TableRecognizer Class
721 ////////
722 
Init()723 void TableRecognizer::Init() {}
724 
set_text_grid(ColPartitionGrid * text_grid)725 void TableRecognizer::set_text_grid(ColPartitionGrid *text_grid) {
726   text_grid_ = text_grid;
727 }
set_line_grid(ColPartitionGrid * line_grid)728 void TableRecognizer::set_line_grid(ColPartitionGrid *line_grid) {
729   line_grid_ = line_grid;
730 }
set_min_height(int height)731 void TableRecognizer::set_min_height(int height) {
732   min_height_ = height;
733 }
set_min_width(int width)734 void TableRecognizer::set_min_width(int width) {
735   min_width_ = width;
736 }
set_max_text_height(int height)737 void TableRecognizer::set_max_text_height(int height) {
738   max_text_height_ = height;
739 }
740 
RecognizeTable(const TBOX & guess)741 StructuredTable *TableRecognizer::RecognizeTable(const TBOX &guess) {
742   auto *table = new StructuredTable();
743   table->Init();
744   table->set_text_grid(text_grid_);
745   table->set_line_grid(line_grid_);
746   table->set_max_text_height(max_text_height_);
747 
748   // Try to solve this simple case, a table with *both*
749   // vertical and horizontal lines.
750   if (RecognizeLinedTable(guess, table)) {
751     return table;
752   }
753 
754   // Fallback to whitespace if that failed.
755   // TODO(nbeato): Break this apart to take advantage of horizontal
756   // lines or vertical lines when present.
757   if (RecognizeWhitespacedTable(guess, table)) {
758     return table;
759   }
760 
761   // No table found...
762   delete table;
763   return nullptr;
764 }
765 
RecognizeLinedTable(const TBOX & guess_box,StructuredTable * table)766 bool TableRecognizer::RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table) {
767   if (!HasSignificantLines(guess_box)) {
768     return false;
769   }
770   TBOX line_bound = guess_box;
771   if (!FindLinesBoundingBox(&line_bound)) {
772     return false;
773   }
774   table->set_bounding_box(line_bound);
775   return table->FindLinedStructure();
776 }
777 
778 // Quick implementation. Just count the number of lines in the box.
779 // A better implementation would counter intersections and look for connected
780 // components. It could even go as far as finding similar length lines.
781 // To account for these possible issues, the VerifyLinedTableCells function
782 // will reject lined tables that cause intersections with text on the page.
783 // TODO(nbeato): look for "better" lines
HasSignificantLines(const TBOX & guess)784 bool TableRecognizer::HasSignificantLines(const TBOX &guess) {
785   ColPartitionGridSearch box_search(line_grid_);
786   box_search.SetUniqueMode(true);
787   box_search.StartRectSearch(guess);
788   ColPartition *line = nullptr;
789   int vertical_count = 0;
790   int horizontal_count = 0;
791 
792   while ((line = box_search.NextRectSearch()) != nullptr) {
793     if (line->IsHorizontalLine()) {
794       ++horizontal_count;
795     }
796     if (line->IsVerticalLine()) {
797       ++vertical_count;
798     }
799   }
800 
801   return vertical_count >= kLinedTableMinVerticalLines &&
802          horizontal_count >= kLinedTableMinHorizontalLines;
803 }
804 
805 // Given a bounding box with a bunch of horizontal / vertical lines,
806 // we just find the extents of all of these lines iteratively.
807 // The box will be at least as large as guess. This
808 // could possibly be a bad assumption.
809 // It is guaranteed to halt in at least O(n * gridarea) where n
810 // is the number of lines.
811 // The assumption is that growing the box iteratively will add lines
812 // several times, but eventually we'll find the extents.
813 //
814 // For tables, the approach is a bit aggressive, a single line (which could be
815 // noise or a column ruling) can destroy the table inside.
816 //
817 // TODO(nbeato): This is a quick first implementation.
818 // A better implementation would actually look for consistency
819 // in extents of the lines and find the extents using lines
820 // that clearly describe the table. This would allow the
821 // lines to "vote" for height/width. An approach like
822 // this would solve issues with page layout rulings.
823 // I haven't looked for these issues yet, so I can't even
824 // say they happen confidently.
FindLinesBoundingBox(TBOX * bounding_box)825 bool TableRecognizer::FindLinesBoundingBox(TBOX *bounding_box) {
826   // The first iteration will tell us if there are lines
827   // present and shrink the box to a minimal iterative size.
828   if (!FindLinesBoundingBoxIteration(bounding_box)) {
829     return false;
830   }
831 
832   // Keep growing until the area of the table stabilizes.
833   // The box can only get bigger, increasing area.
834   bool changed = true;
835   while (changed) {
836     changed = false;
837     int old_area = bounding_box->area();
838     bool check = FindLinesBoundingBoxIteration(bounding_box);
839     // At this point, the function will return true.
840     ASSERT_HOST(check);
841     ASSERT_HOST(bounding_box->area() >= old_area);
842     changed = (bounding_box->area() > old_area);
843   }
844 
845   return true;
846 }
847 
FindLinesBoundingBoxIteration(TBOX * bounding_box)848 bool TableRecognizer::FindLinesBoundingBoxIteration(TBOX *bounding_box) {
849   // Search for all of the lines in the current box, keeping track of extents.
850   ColPartitionGridSearch box_search(line_grid_);
851   box_search.SetUniqueMode(true);
852   box_search.StartRectSearch(*bounding_box);
853   ColPartition *line = nullptr;
854   bool first_line = true;
855 
856   while ((line = box_search.NextRectSearch()) != nullptr) {
857     if (line->IsLineType()) {
858       if (first_line) {
859         // The first iteration can shrink the box.
860         *bounding_box = line->bounding_box();
861         first_line = false;
862       } else {
863         *bounding_box += line->bounding_box();
864       }
865     }
866   }
867   return !first_line;
868 }
869 
870 // The goal of this function is to move the table boundaries around and find
871 // a table that maximizes the whitespace around the table while maximizing
872 // the cellular structure. As a result, it gets confused by headers, footers,
873 // and merged columns (text that crosses columns). There is a tolerance
874 // that allows a few partitions to count towards potential cell merges.
875 // It's the max_merged parameter to FindPartitionLocations.
876 // It can work, but it needs some false positive remove on boundaries.
877 // For now, the grid structure must not intersect any partitions.
878 // Also, small tolerance is added to the horizontal lines for tightly packed
879 // tables. The tolerance is added by adjusting the bounding boxes of the
880 // partitions (in FindHorizontalPartitions). The current implementation
881 // only adjusts the vertical extents of the table.
882 //
883 // Also note. This was hacked at a lot. It could probably use some
884 // more hacking at to find a good set of border conditions and then a
885 // nice clean up.
RecognizeWhitespacedTable(const TBOX & guess_box,StructuredTable * table)886 bool TableRecognizer::RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table) {
887   TBOX best_box = guess_box; // Best borders known.
888   int best_below = 0;        // Margin size above best table.
889   int best_above = 0;        // Margin size below best table.
890   TBOX adjusted = guess_box; // The search box.
891 
892   // We assume that the guess box is somewhat accurate, so we don't allow
893   // the adjusted border to pass half of the guessed area. This prevents
894   // "negative" tables from forming.
895   const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2;
896   // Keeps track of the most columns in an accepted table. The resulting table
897   // may be less than the max, but we don't want to stray too far.
898   unsigned best_cols = 0;
899   // Make sure we find a good border.
900   bool found_good_border = false;
901 
902   // Find the bottom of the table by trying a few different locations. For
903   // each location, the top, left, and right are fixed. We start the search
904   // in a smaller table to favor best_cols getting a good estimate sooner.
905   int last_bottom = INT32_MAX;
906   int bottom =
907       NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY - min_height_ / 2, true);
908   int top =
909       NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false);
910   adjusted.set_top(top);
911 
912   // Headers/footers can be spaced far from everything.
913   // Make sure that the space below is greater than the space above
914   // the lowest row.
915   int previous_below = 0;
916   const int kMaxChances = 10;
917   int chances = kMaxChances;
918   while (bottom != last_bottom) {
919     adjusted.set_bottom(bottom);
920 
921     if (adjusted.height() >= min_height_) {
922       // Try to fit the grid on the current box. We give it a chance
923       // if the number of columns didn't significantly drop.
924       table->set_bounding_box(adjusted);
925       if (table->FindWhitespacedStructure() &&
926           table->column_count() >= best_cols * kRequiredColumns) {
927         if (false && IsWeakTableRow(table, 0)) {
928           // Currently buggy, but was looking promising so disabled.
929           --chances;
930         } else {
931           // We favor 2 things,
932           //   1- Adding rows that have partitioned data.
933           //   2- Better margins (to find header/footer).
934           // For better tables, we just look for multiple cells in the
935           // bottom row with data in them.
936           // For margins, the space below the last row should
937           // be better than a table with the last row removed.
938           chances = kMaxChances;
939           double max_row_height = kMaxRowSize * table->median_cell_height();
940           if ((table->space_below() * kMarginFactor >= best_below &&
941                table->space_below() >= previous_below) ||
942               (table->CountFilledCellsInRow(0) > 1 && table->row_height(0) < max_row_height)) {
943             best_box.set_bottom(bottom);
944             best_below = table->space_below();
945             best_cols = std::max(table->column_count(), best_cols);
946             found_good_border = true;
947           }
948         }
949         previous_below = table->space_below();
950       } else {
951         --chances;
952       }
953     }
954     if (chances <= 0) {
955       break;
956     }
957 
958     last_bottom = bottom;
959     bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_bottom, true);
960   }
961   if (!found_good_border) {
962     return false;
963   }
964 
965   // TODO(nbeato) comments: follow modified code above... put it in a function!
966   found_good_border = false;
967   int last_top = INT32_MIN;
968   top =
969       NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false);
970   int previous_above = 0;
971   chances = kMaxChances;
972 
973   adjusted.set_bottom(best_box.bottom());
974   while (last_top != top) {
975     adjusted.set_top(top);
976     if (adjusted.height() >= min_height_) {
977       table->set_bounding_box(adjusted);
978       if (table->FindWhitespacedStructure() &&
979           table->column_count() >= best_cols * kRequiredColumns) {
980         int last_row = table->row_count() - 1;
981         if (false && IsWeakTableRow(table, last_row)) {
982           // Currently buggy, but was looking promising so disabled.
983           --chances;
984         } else {
985           chances = kMaxChances;
986           double max_row_height = kMaxRowSize * table->median_cell_height();
987           if ((table->space_above() * kMarginFactor >= best_above &&
988                table->space_above() >= previous_above) ||
989               (table->CountFilledCellsInRow(last_row) > 1 &&
990                table->row_height(last_row) < max_row_height)) {
991             best_box.set_top(top);
992             best_above = table->space_above();
993             best_cols = std::max(table->column_count(), best_cols);
994             found_good_border = true;
995           }
996         }
997         previous_above = table->space_above();
998       } else {
999         --chances;
1000       }
1001     }
1002     if (chances <= 0) {
1003       break;
1004     }
1005 
1006     last_top = top;
1007     top = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_top, false);
1008   }
1009 
1010   if (!found_good_border) {
1011     return false;
1012   }
1013 
1014   // If we get here, this shouldn't happen. It can be an assert, but
1015   // I haven't tested it enough to make it crash things.
1016   if (best_box.null_box()) {
1017     return false;
1018   }
1019 
1020   // Given the best locations, fit the box to those locations.
1021   table->set_bounding_box(best_box);
1022   return table->FindWhitespacedStructure();
1023 }
1024 
1025 // Finds the closest value to y that can safely cause a horizontal
1026 // split in the partitions.
1027 // This function has been buggy and not as reliable as I would've
1028 // liked. I suggest finding all of the splits using the
1029 // FindPartitionLocations once and then just keeping the results
1030 // of that function cached somewhere.
NextHorizontalSplit(int left,int right,int y,bool top_to_bottom)1031 int TableRecognizer::NextHorizontalSplit(int left, int right, int y, bool top_to_bottom) {
1032   ColPartitionGridSearch gsearch(text_grid_);
1033   gsearch.SetUniqueMode(true);
1034   gsearch.StartVerticalSearch(left, right, y);
1035   ColPartition *text = nullptr;
1036   int last_y = y;
1037   while ((text = gsearch.NextVerticalSearch(top_to_bottom)) != nullptr) {
1038     if (!text->IsTextType() || !text->IsHorizontalType()) {
1039       continue;
1040     }
1041     if (text->bounding_box().height() > max_text_height_) {
1042       continue;
1043     }
1044 
1045     const TBOX &text_box = text->bounding_box();
1046     if (top_to_bottom && (last_y >= y || last_y <= text_box.top())) {
1047       last_y = std::min(last_y, static_cast<int>(text_box.bottom()));
1048       continue;
1049     }
1050     if (!top_to_bottom && (last_y <= y || last_y >= text_box.bottom())) {
1051       last_y = std::max(last_y, static_cast<int>(text_box.top()));
1052       continue;
1053     }
1054 
1055     return last_y;
1056   }
1057   // If none is found, we at least want to preserve the min/max,
1058   // which defines the overlap of y with the last partition in the grid.
1059   return last_y;
1060 }
1061 
1062 // Code is buggy right now. It is disabled in the calling function.
1063 // It seems like sometimes the row that is passed in is not correct
1064 // sometimes (like a phantom row is introduced). There's something going
1065 // on in the cell_y_ data member before this is called... not certain.
IsWeakTableRow(StructuredTable * table,int row)1066 bool TableRecognizer::IsWeakTableRow(StructuredTable *table, int row) {
1067   if (!table->VerifyRowFilled(row)) {
1068     return false;
1069   }
1070 
1071   double threshold;
1072   if (table->column_count() < countof(kGoodRowNumberOfColumnsSmall)) {
1073     threshold = kGoodRowNumberOfColumnsSmall[table->column_count()];
1074   } else {
1075     threshold = table->column_count() * kGoodRowNumberOfColumnsLarge;
1076   }
1077 
1078   return table->CountFilledCellsInRow(row) < threshold;
1079 }
1080 
1081 } // namespace tesseract
1082