1 ///////////////////////////////////////////////////////////////////////
2 // File:        tabfind.cpp
3 // Description: Subclass of BBGrid to find vertically aligned blobs.
4 // Author:      Ray Smith
5 //
6 // (C) Copyright 2008, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18 
19 #ifdef HAVE_CONFIG_H
20 #  include "config_auto.h"
21 #endif
22 
23 #include "alignedblob.h"
24 #include "colpartitiongrid.h"
25 #include "detlinefit.h"
26 #include "host.h" // for NearlyEqual
27 #include "linefind.h"
28 #include "tabfind.h"
29 
30 #include <algorithm>
31 
32 namespace tesseract {
33 
34 // Multiple of box size to search for initial gaps.
35 const int kTabRadiusFactor = 5;
36 // Min and Max multiple of height to search vertically when extrapolating.
37 const int kMinVerticalSearch = 3;
38 const int kMaxVerticalSearch = 12;
39 const int kMaxRaggedSearch = 25;
40 // Minimum number of lines in a column width to make it interesting.
41 const int kMinLinesInColumn = 10;
42 // Minimum width of a column to be interesting.
43 const int kMinColumnWidth = 200;
44 // Minimum fraction of total column lines for a column to be interesting.
45 const double kMinFractionalLinesInColumn = 0.125;
46 // Fraction of height used as alignment tolerance for aligned tabs.
47 const double kAlignedFraction = 0.03125;
48 // Maximum gutter width (in absolute inch) that we care about
49 const double kMaxGutterWidthAbsolute = 2.00;
50 // Multiplier of gridsize for min gutter width of TT_MAYBE_RAGGED blobs.
51 const int kRaggedGutterMultiple = 5;
52 // Min aspect ratio of tall objects to be considered a separator line.
53 // (These will be ignored in searching the gutter for obstructions.)
54 const double kLineFragmentAspectRatio = 10.0;
55 // Min number of points to accept after evaluation.
56 const int kMinEvaluatedTabs = 3;
57 // Up to 30 degrees is allowed for rotations of diacritic blobs.
58 // Keep this value slightly larger than kCosSmallAngle in blobbox.cpp
59 // so that the assert there never fails.
60 const double kCosMaxSkewAngle = 0.866025;
61 
62 static BOOL_VAR(textord_tabfind_show_initialtabs, false, "Show tab candidates");
63 static BOOL_VAR(textord_tabfind_show_finaltabs, false, "Show tab vectors");
64 
TabFind(int gridsize,const ICOORD & bleft,const ICOORD & tright,TabVector_LIST * vlines,int vertical_x,int vertical_y,int resolution)65 TabFind::TabFind(int gridsize, const ICOORD &bleft, const ICOORD &tright, TabVector_LIST *vlines,
66                  int vertical_x, int vertical_y, int resolution)
67     : AlignedBlob(gridsize, bleft, tright)
68     , resolution_(resolution)
69     , image_origin_(0, tright.y() - 1)
70     , v_it_(&vectors_) {
71   width_cb_ = nullptr;
72   v_it_.add_list_after(vlines);
73   SetVerticalSkewAndParallelize(vertical_x, vertical_y);
74   using namespace std::placeholders; // for _1
75   width_cb_ = std::bind(&TabFind::CommonWidth, this, _1);
76 }
77 
78 TabFind::~TabFind() = default;
79 
80 ///////////////// PUBLIC functions (mostly used by TabVector). //////////////
81 
82 // Insert a list of blobs into the given grid (not necessarily this).
83 // If take_ownership is true, then the blobs are removed from the source list.
84 // See InsertBlob for the other arguments.
85 // It would seem to make more sense to swap this and grid, but this way
86 // around allows grid to not be derived from TabFind, eg a ColPartitionGrid,
87 // while the grid that provides the tab stops(this) has to be derived from
88 // TabFind.
InsertBlobsToGrid(bool h_spread,bool v_spread,BLOBNBOX_LIST * blobs,BBGrid<BLOBNBOX,BLOBNBOX_CLIST,BLOBNBOX_C_IT> * grid)89 void TabFind::InsertBlobsToGrid(bool h_spread, bool v_spread, BLOBNBOX_LIST *blobs,
90                                 BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> *grid) {
91   BLOBNBOX_IT blob_it(blobs);
92   int b_count = 0;
93   int reject_count = 0;
94   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
95     BLOBNBOX *blob = blob_it.data();
96     //    if (InsertBlob(true, true, blob, grid)) {
97     if (InsertBlob(h_spread, v_spread, blob, grid)) {
98       ++b_count;
99     } else {
100       ++reject_count;
101     }
102   }
103   if (textord_debug_tabfind) {
104     tprintf("Inserted %d blobs into grid, %d rejected.\n", b_count, reject_count);
105   }
106 }
107 
108 // Insert a single blob into the given grid (not necessarily this).
109 // If h_spread, then all cells covered horizontally by the box are
110 // used, otherwise, just the bottom-left. Similarly for v_spread.
111 // A side effect is that the left and right rule edges of the blob are
112 // set according to the tab vectors in this (not grid).
InsertBlob(bool h_spread,bool v_spread,BLOBNBOX * blob,BBGrid<BLOBNBOX,BLOBNBOX_CLIST,BLOBNBOX_C_IT> * grid)113 bool TabFind::InsertBlob(bool h_spread, bool v_spread, BLOBNBOX *blob,
114                          BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> *grid) {
115   TBOX box = blob->bounding_box();
116   blob->set_left_rule(LeftEdgeForBox(box, false, false));
117   blob->set_right_rule(RightEdgeForBox(box, false, false));
118   blob->set_left_crossing_rule(LeftEdgeForBox(box, true, false));
119   blob->set_right_crossing_rule(RightEdgeForBox(box, true, false));
120   if (blob->joined_to_prev()) {
121     return false;
122   }
123   grid->InsertBBox(h_spread, v_spread, blob);
124   return true;
125 }
126 
127 // Calls SetBlobRuleEdges for all the blobs in the given block.
SetBlockRuleEdges(TO_BLOCK * block)128 void TabFind::SetBlockRuleEdges(TO_BLOCK *block) {
129   SetBlobRuleEdges(&block->blobs);
130   SetBlobRuleEdges(&block->small_blobs);
131   SetBlobRuleEdges(&block->noise_blobs);
132   SetBlobRuleEdges(&block->large_blobs);
133 }
134 
135 // Sets the left and right rule and crossing_rules for the blobs in the given
136 // list by fiding the next outermost tabvectors for each blob.
SetBlobRuleEdges(BLOBNBOX_LIST * blobs)137 void TabFind::SetBlobRuleEdges(BLOBNBOX_LIST *blobs) {
138   BLOBNBOX_IT blob_it(blobs);
139   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
140     BLOBNBOX *blob = blob_it.data();
141     TBOX box = blob->bounding_box();
142     blob->set_left_rule(LeftEdgeForBox(box, false, false));
143     blob->set_right_rule(RightEdgeForBox(box, false, false));
144     blob->set_left_crossing_rule(LeftEdgeForBox(box, true, false));
145     blob->set_right_crossing_rule(RightEdgeForBox(box, true, false));
146   }
147 }
148 
149 // Returns the gutter width of the given TabVector between the given y limits.
150 // Also returns x-shift to be added to the vector to clear any intersecting
151 // blobs. The shift is deducted from the returned gutter.
152 // If ignore_unmergeables is true, then blobs of UnMergeableType are
153 // ignored as if they don't exist. (Used for text on image.)
154 // max_gutter_width is used as the maximum width worth searching for in case
155 // there is nothing near the TabVector.
GutterWidth(int bottom_y,int top_y,const TabVector & v,bool ignore_unmergeables,int max_gutter_width,int * required_shift)156 int TabFind::GutterWidth(int bottom_y, int top_y, const TabVector &v, bool ignore_unmergeables,
157                          int max_gutter_width, int *required_shift) {
158   bool right_to_left = v.IsLeftTab();
159   int bottom_x = v.XAtY(bottom_y);
160   int top_x = v.XAtY(top_y);
161   int start_x = right_to_left ? std::max(top_x, bottom_x) : std::min(top_x, bottom_x);
162   BlobGridSearch sidesearch(this);
163   sidesearch.StartSideSearch(start_x, bottom_y, top_y);
164   int min_gap = max_gutter_width;
165   *required_shift = 0;
166   BLOBNBOX *blob = nullptr;
167   while ((blob = sidesearch.NextSideSearch(right_to_left)) != nullptr) {
168     const TBOX &box = blob->bounding_box();
169     if (box.bottom() >= top_y || box.top() <= bottom_y) {
170       continue; // Doesn't overlap enough.
171     }
172     if (box.height() >= gridsize() * 2 && box.height() > box.width() * kLineFragmentAspectRatio) {
173       // Skip likely separator line residue.
174       continue;
175     }
176     if (ignore_unmergeables && BLOBNBOX::UnMergeableType(blob->region_type())) {
177       continue; // Skip non-text if required.
178     }
179     int mid_y = (box.bottom() + box.top()) / 2;
180     // We use the x at the mid-y so that the required_shift guarantees
181     // to clear all the blobs on the tab-stop. If we use the min/max
182     // of x at top/bottom of the blob, then exactness would be required,
183     // which is not a good thing.
184     int tab_x = v.XAtY(mid_y);
185     int gap;
186     if (right_to_left) {
187       gap = tab_x - box.right();
188       if (gap < 0 && box.left() - tab_x < *required_shift) {
189         *required_shift = box.left() - tab_x;
190       }
191     } else {
192       gap = box.left() - tab_x;
193       if (gap < 0 && box.right() - tab_x > *required_shift) {
194         *required_shift = box.right() - tab_x;
195       }
196     }
197     if (gap > 0 && gap < min_gap) {
198       min_gap = gap;
199     }
200   }
201   // Result may be negative, in which case,  this is a really bad tabstop.
202   return min_gap - abs(*required_shift);
203 }
204 
205 // Find the gutter width and distance to inner neighbour for the given blob.
GutterWidthAndNeighbourGap(int tab_x,int mean_height,int max_gutter,bool left,BLOBNBOX * bbox,int * gutter_width,int * neighbour_gap)206 void TabFind::GutterWidthAndNeighbourGap(int tab_x, int mean_height, int max_gutter, bool left,
207                                          BLOBNBOX *bbox, int *gutter_width, int *neighbour_gap) {
208   const TBOX &box = bbox->bounding_box();
209   // The gutter and internal sides of the box.
210   int gutter_x = left ? box.left() : box.right();
211   int internal_x = left ? box.right() : box.left();
212   // On ragged edges, the gutter side of the box is away from the tabstop.
213   int tab_gap = left ? gutter_x - tab_x : tab_x - gutter_x;
214   *gutter_width = max_gutter;
215   // If the box is away from the tabstop, we need to increase
216   // the allowed gutter width.
217   if (tab_gap > 0) {
218     *gutter_width += tab_gap;
219   }
220   bool debug = WithinTestRegion(2, box.left(), box.bottom());
221   if (debug) {
222     tprintf("Looking in gutter\n");
223   }
224   // Find the nearest blob on the outside of the column.
225   BLOBNBOX *gutter_bbox = AdjacentBlob(bbox, left, bbox->flow() == BTFT_TEXT_ON_IMAGE, 0.0,
226                                        *gutter_width, box.top(), box.bottom());
227   if (gutter_bbox != nullptr) {
228     const TBOX &gutter_box = gutter_bbox->bounding_box();
229     *gutter_width = left ? tab_x - gutter_box.right() : gutter_box.left() - tab_x;
230   }
231   if (*gutter_width >= max_gutter) {
232     // If there is no box because a tab was in the way, get the tab coord.
233     TBOX gutter_box(box);
234     if (left) {
235       gutter_box.set_left(tab_x - max_gutter - 1);
236       gutter_box.set_right(tab_x - max_gutter);
237       int tab_gutter = RightEdgeForBox(gutter_box, true, false);
238       if (tab_gutter < tab_x - 1) {
239         *gutter_width = tab_x - tab_gutter;
240       }
241     } else {
242       gutter_box.set_left(tab_x + max_gutter);
243       gutter_box.set_right(tab_x + max_gutter + 1);
244       int tab_gutter = LeftEdgeForBox(gutter_box, true, false);
245       if (tab_gutter > tab_x + 1) {
246         *gutter_width = tab_gutter - tab_x;
247       }
248     }
249   }
250   if (*gutter_width > max_gutter) {
251     *gutter_width = max_gutter;
252   }
253   // Now look for a neighbour on the inside.
254   if (debug) {
255     tprintf("Looking for neighbour\n");
256   }
257   BLOBNBOX *neighbour = AdjacentBlob(bbox, !left, bbox->flow() == BTFT_TEXT_ON_IMAGE, 0.0,
258                                      *gutter_width, box.top(), box.bottom());
259   int neighbour_edge = left ? RightEdgeForBox(box, true, false) : LeftEdgeForBox(box, true, false);
260   if (neighbour != nullptr) {
261     const TBOX &n_box = neighbour->bounding_box();
262     if (debug) {
263       tprintf("Found neighbour:");
264       n_box.print();
265     }
266     if (left && n_box.left() < neighbour_edge) {
267       neighbour_edge = n_box.left();
268     } else if (!left && n_box.right() > neighbour_edge) {
269       neighbour_edge = n_box.right();
270     }
271   }
272   *neighbour_gap = left ? neighbour_edge - internal_x : internal_x - neighbour_edge;
273 }
274 
275 // Return the x-coord that corresponds to the right edge for the given
276 // box. If there is a rule line to the right that vertically overlaps it,
277 // then return the x-coord of the rule line, otherwise return the right
278 // edge of the page. For details see RightTabForBox below.
RightEdgeForBox(const TBOX & box,bool crossing,bool extended)279 int TabFind::RightEdgeForBox(const TBOX &box, bool crossing, bool extended) {
280   TabVector *v = RightTabForBox(box, crossing, extended);
281   return v == nullptr ? tright_.x() : v->XAtY((box.top() + box.bottom()) / 2);
282 }
283 // As RightEdgeForBox, but finds the left Edge instead.
LeftEdgeForBox(const TBOX & box,bool crossing,bool extended)284 int TabFind::LeftEdgeForBox(const TBOX &box, bool crossing, bool extended) {
285   TabVector *v = LeftTabForBox(box, crossing, extended);
286   return v == nullptr ? bleft_.x() : v->XAtY((box.top() + box.bottom()) / 2);
287 }
288 
289 // This comment documents how this function works.
290 // For its purpose and arguments, see the comment in tabfind.h.
291 // TabVectors are stored sorted by perpendicular distance of middle from
292 // the global mean vertical vector. Since the individual vectors can have
293 // differing directions, their XAtY for a given y is not necessarily in the
294 // right order. Therefore the search has to be run with a margin.
295 // The middle of a vector that passes through (x,y) cannot be higher than
296 // halfway from y to the top, or lower than halfway from y to the bottom
297 // of the coordinate range; therefore, the search margin is the range of
298 // sort keys between these halfway points. Any vector with a sort key greater
299 // than the upper margin must be to the right of x at y, and likewise any
300 // vector with a sort key less than the lower margin must pass to the left
301 // of x at y.
RightTabForBox(const TBOX & box,bool crossing,bool extended)302 TabVector *TabFind::RightTabForBox(const TBOX &box, bool crossing, bool extended) {
303   if (v_it_.empty()) {
304     return nullptr;
305   }
306   int top_y = box.top();
307   int bottom_y = box.bottom();
308   int mid_y = (top_y + bottom_y) / 2;
309   int right = crossing ? (box.left() + box.right()) / 2 : box.right();
310   int min_key, max_key;
311   SetupTabSearch(right, mid_y, &min_key, &max_key);
312   // Position the iterator at the first TabVector with sort_key >= min_key.
313   while (!v_it_.at_first() && v_it_.data()->sort_key() >= min_key) {
314     v_it_.backward();
315   }
316   while (!v_it_.at_last() && v_it_.data()->sort_key() < min_key) {
317     v_it_.forward();
318   }
319   // Find the leftmost tab vector that overlaps and has XAtY(mid_y) >= right.
320   TabVector *best_v = nullptr;
321   int best_x = -1;
322   int key_limit = -1;
323   do {
324     TabVector *v = v_it_.data();
325     int x = v->XAtY(mid_y);
326     if (x >= right && (v->VOverlap(top_y, bottom_y) > 0 ||
327                        (extended && v->ExtendedOverlap(top_y, bottom_y) > 0))) {
328       if (best_v == nullptr || x < best_x) {
329         best_v = v;
330         best_x = x;
331         // We can guarantee that no better vector can be found if the
332         // sort key exceeds that of the best by max_key - min_key.
333         key_limit = v->sort_key() + max_key - min_key;
334       }
335     }
336     // Break when the search is done to avoid wrapping the iterator and
337     // thereby potentially slowing the next search.
338     if (v_it_.at_last() || (best_v != nullptr && v->sort_key() > key_limit)) {
339       break; // Prevent restarting list for next call.
340     }
341     v_it_.forward();
342   } while (!v_it_.at_first());
343   return best_v;
344 }
345 
346 // As RightTabForBox, but finds the left TabVector instead.
LeftTabForBox(const TBOX & box,bool crossing,bool extended)347 TabVector *TabFind::LeftTabForBox(const TBOX &box, bool crossing, bool extended) {
348   if (v_it_.empty()) {
349     return nullptr;
350   }
351   int top_y = box.top();
352   int bottom_y = box.bottom();
353   int mid_y = (top_y + bottom_y) / 2;
354   int left = crossing ? (box.left() + box.right()) / 2 : box.left();
355   int min_key, max_key;
356   SetupTabSearch(left, mid_y, &min_key, &max_key);
357   // Position the iterator at the last TabVector with sort_key <= max_key.
358   while (!v_it_.at_last() && v_it_.data()->sort_key() <= max_key) {
359     v_it_.forward();
360   }
361   while (!v_it_.at_first() && v_it_.data()->sort_key() > max_key) {
362     v_it_.backward();
363   }
364   // Find the rightmost tab vector that overlaps and has XAtY(mid_y) <= left.
365   TabVector *best_v = nullptr;
366   int best_x = -1;
367   int key_limit = -1;
368   do {
369     TabVector *v = v_it_.data();
370     int x = v->XAtY(mid_y);
371     if (x <= left && (v->VOverlap(top_y, bottom_y) > 0 ||
372                       (extended && v->ExtendedOverlap(top_y, bottom_y) > 0))) {
373       if (best_v == nullptr || x > best_x) {
374         best_v = v;
375         best_x = x;
376         // We can guarantee that no better vector can be found if the
377         // sort key is less than that of the best by max_key - min_key.
378         key_limit = v->sort_key() - (max_key - min_key);
379       }
380     }
381     // Break when the search is done to avoid wrapping the iterator and
382     // thereby potentially slowing the next search.
383     if (v_it_.at_first() || (best_v != nullptr && v->sort_key() < key_limit)) {
384       break; // Prevent restarting list for next call.
385     }
386     v_it_.backward();
387   } while (!v_it_.at_last());
388   return best_v;
389 }
390 
391 // Return true if the given width is close to one of the common
392 // widths in column_widths_.
CommonWidth(int width)393 bool TabFind::CommonWidth(int width) {
394   width /= kColumnWidthFactor;
395   ICOORDELT_IT it(&column_widths_);
396   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
397     ICOORDELT *w = it.data();
398     if (w->x() - 1 <= width && width <= w->y() + 1) {
399       return true;
400     }
401   }
402   return false;
403 }
404 
405 // Return true if the sizes are more than a
406 // factor of 2 different.
DifferentSizes(int size1,int size2)407 bool TabFind::DifferentSizes(int size1, int size2) {
408   return size1 > size2 * 2 || size2 > size1 * 2;
409 }
410 
411 // Return true if the sizes are more than a
412 // factor of 5 different.
VeryDifferentSizes(int size1,int size2)413 bool TabFind::VeryDifferentSizes(int size1, int size2) {
414   return size1 > size2 * 5 || size2 > size1 * 5;
415 }
416 
417 ///////////////// PROTECTED functions (used by ColumnFinder). //////////////
418 
419 // Top-level function to find TabVectors in an input page block.
420 // Returns false if the detected skew angle is impossible.
421 // Applies the detected skew angle to deskew the tabs, blobs and part_grid.
FindTabVectors(TabVector_LIST * hlines,BLOBNBOX_LIST * image_blobs,TO_BLOCK * block,int min_gutter_width,double tabfind_aligned_gap_fraction,ColPartitionGrid * part_grid,FCOORD * deskew,FCOORD * reskew)422 bool TabFind::FindTabVectors(TabVector_LIST *hlines, BLOBNBOX_LIST *image_blobs, TO_BLOCK *block,
423                              int min_gutter_width, double tabfind_aligned_gap_fraction,
424                              ColPartitionGrid *part_grid, FCOORD *deskew, FCOORD *reskew) {
425   ScrollView *tab_win =
426       FindInitialTabVectors(image_blobs, min_gutter_width, tabfind_aligned_gap_fraction, block);
427   ComputeColumnWidths(tab_win, part_grid);
428   TabVector::MergeSimilarTabVectors(vertical_skew_, &vectors_, this);
429   SortVectors();
430   CleanupTabs();
431   if (!Deskew(hlines, image_blobs, block, deskew, reskew)) {
432     return false; // Skew angle is too large.
433   }
434   part_grid->Deskew(*deskew);
435   ApplyTabConstraints();
436 #ifndef GRAPHICS_DISABLED
437   if (textord_tabfind_show_finaltabs) {
438     tab_win = MakeWindow(640, 50, "FinalTabs");
439     DisplayBoxes(tab_win);
440     DisplayTabs("FinalTabs", tab_win);
441     tab_win = DisplayTabVectors(tab_win);
442   }
443 #endif // !GRAPHICS_DISABLED
444   return true;
445 }
446 
447 // Top-level function to not find TabVectors in an input page block,
448 // but setup for single column mode.
DontFindTabVectors(BLOBNBOX_LIST * image_blobs,TO_BLOCK * block,FCOORD * deskew,FCOORD * reskew)449 void TabFind::DontFindTabVectors(BLOBNBOX_LIST *image_blobs, TO_BLOCK *block, FCOORD *deskew,
450                                  FCOORD *reskew) {
451   InsertBlobsToGrid(false, false, image_blobs, this);
452   InsertBlobsToGrid(true, false, &block->blobs, this);
453   deskew->set_x(1.0f);
454   deskew->set_y(0.0f);
455   reskew->set_x(1.0f);
456   reskew->set_y(0.0f);
457 }
458 
459 // Cleans up the lists of blobs in the block ready for use by TabFind.
460 // Large blobs that look like text are moved to the main blobs list.
461 // Main blobs that are superseded by the image blobs are deleted.
TidyBlobs(TO_BLOCK * block)462 void TabFind::TidyBlobs(TO_BLOCK *block) {
463   BLOBNBOX_IT large_it = &block->large_blobs;
464   BLOBNBOX_IT blob_it = &block->blobs;
465   int b_count = 0;
466   for (large_it.mark_cycle_pt(); !large_it.cycled_list(); large_it.forward()) {
467     BLOBNBOX *large_blob = large_it.data();
468     if (large_blob->owner() != nullptr) {
469       blob_it.add_to_end(large_it.extract());
470       ++b_count;
471     }
472   }
473   if (textord_debug_tabfind) {
474     tprintf("Moved %d large blobs to normal list\n", b_count);
475 #ifndef GRAPHICS_DISABLED
476     ScrollView *rej_win = MakeWindow(500, 300, "Image blobs");
477     block->plot_graded_blobs(rej_win);
478     block->plot_noise_blobs(rej_win);
479     rej_win->Update();
480 #endif // !GRAPHICS_DISABLED
481   }
482   block->DeleteUnownedNoise();
483 }
484 
485 // Helper function to setup search limits for *TabForBox.
SetupTabSearch(int x,int y,int * min_key,int * max_key)486 void TabFind::SetupTabSearch(int x, int y, int *min_key, int *max_key) {
487   int key1 = TabVector::SortKey(vertical_skew_, x, (y + tright_.y()) / 2);
488   int key2 = TabVector::SortKey(vertical_skew_, x, (y + bleft_.y()) / 2);
489   *min_key = std::min(key1, key2);
490   *max_key = std::max(key1, key2);
491 }
492 
493 #ifndef GRAPHICS_DISABLED
494 
DisplayTabVectors(ScrollView * tab_win)495 ScrollView *TabFind::DisplayTabVectors(ScrollView *tab_win) {
496   // For every vector, display it.
497   TabVector_IT it(&vectors_);
498   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
499     TabVector *vector = it.data();
500     vector->Display(tab_win);
501   }
502   tab_win->Update();
503   return tab_win;
504 }
505 
506 #endif
507 
508 // PRIVATE CODE.
509 //
510 // First part of FindTabVectors, which may be used twice if the text
511 // is mostly of vertical alignment.
FindInitialTabVectors(BLOBNBOX_LIST * image_blobs,int min_gutter_width,double tabfind_aligned_gap_fraction,TO_BLOCK * block)512 ScrollView *TabFind::FindInitialTabVectors(BLOBNBOX_LIST *image_blobs, int min_gutter_width,
513                                            double tabfind_aligned_gap_fraction, TO_BLOCK *block) {
514 #ifndef GRAPHICS_DISABLED
515   if (textord_tabfind_show_initialtabs) {
516     ScrollView *line_win = MakeWindow(0, 0, "VerticalLines");
517     line_win = DisplayTabVectors(line_win);
518   }
519 #endif
520   // Prepare the grid.
521   if (image_blobs != nullptr) {
522     InsertBlobsToGrid(true, false, image_blobs, this);
523   }
524   InsertBlobsToGrid(true, false, &block->blobs, this);
525   ScrollView *initial_win = FindTabBoxes(min_gutter_width, tabfind_aligned_gap_fraction);
526   FindAllTabVectors(min_gutter_width);
527 
528   TabVector::MergeSimilarTabVectors(vertical_skew_, &vectors_, this);
529   SortVectors();
530   EvaluateTabs();
531 #ifndef GRAPHICS_DISABLED
532   if (textord_tabfind_show_initialtabs && initial_win != nullptr) {
533     initial_win = DisplayTabVectors(initial_win);
534   }
535 #endif
536   MarkVerticalText();
537   return initial_win;
538 }
539 
540 #ifndef GRAPHICS_DISABLED
541 
542 // Helper displays all the boxes in the given vector on the given window.
DisplayBoxVector(const std::vector<BLOBNBOX * > & boxes,ScrollView * win)543 static void DisplayBoxVector(const std::vector<BLOBNBOX *> &boxes, ScrollView *win) {
544   for (auto boxe : boxes) {
545     TBOX box = boxe->bounding_box();
546     int left_x = box.left();
547     int right_x = box.right();
548     int top_y = box.top();
549     int bottom_y = box.bottom();
550     ScrollView::Color box_color = boxe->BoxColor();
551     win->Pen(box_color);
552     win->Rectangle(left_x, bottom_y, right_x, top_y);
553   }
554   win->Update();
555 }
556 
557 #endif // !GRAPHICS_DISABLED
558 
559 // For each box in the grid, decide whether it is a candidate tab-stop,
560 // and if so add it to the left/right tab boxes.
FindTabBoxes(int min_gutter_width,double tabfind_aligned_gap_fraction)561 ScrollView *TabFind::FindTabBoxes(int min_gutter_width, double tabfind_aligned_gap_fraction) {
562   left_tab_boxes_.clear();
563   right_tab_boxes_.clear();
564   // For every bbox in the grid, determine whether it uses a tab on an edge.
565   GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> gsearch(this);
566   gsearch.StartFullSearch();
567   BLOBNBOX *bbox;
568   while ((bbox = gsearch.NextFullSearch()) != nullptr) {
569     if (TestBoxForTabs(bbox, min_gutter_width, tabfind_aligned_gap_fraction)) {
570       // If it is any kind of tab, insert it into the vectors.
571       if (bbox->left_tab_type() != TT_NONE) {
572         left_tab_boxes_.push_back(bbox);
573       }
574       if (bbox->right_tab_type() != TT_NONE) {
575         right_tab_boxes_.push_back(bbox);
576       }
577     }
578   }
579   // Sort left tabs by left and right by right to see the outermost one first
580   // on a ragged tab.
581   std::sort(left_tab_boxes_.begin(), left_tab_boxes_.end(), StdSortByBoxLeft<BLOBNBOX>);
582   std::sort(right_tab_boxes_.begin(), right_tab_boxes_.end(), StdSortRightToLeft<BLOBNBOX>);
583   ScrollView *tab_win = nullptr;
584 #ifndef GRAPHICS_DISABLED
585   if (textord_tabfind_show_initialtabs) {
586     tab_win = MakeWindow(0, 100, "InitialTabs");
587     tab_win->Pen(ScrollView::BLUE);
588     tab_win->Brush(ScrollView::NONE);
589     // Display the left and right tab boxes.
590     DisplayBoxVector(left_tab_boxes_, tab_win);
591     DisplayBoxVector(right_tab_boxes_, tab_win);
592     tab_win = DisplayTabs("Tabs", tab_win);
593   }
594 #endif // !GRAPHICS_DISABLED
595   return tab_win;
596 }
597 
TestBoxForTabs(BLOBNBOX * bbox,int min_gutter_width,double tabfind_aligned_gap_fraction)598 bool TabFind::TestBoxForTabs(BLOBNBOX *bbox, int min_gutter_width,
599                              double tabfind_aligned_gap_fraction) {
600   GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> radsearch(this);
601   TBOX box = bbox->bounding_box();
602   // If there are separator lines, get the column edges.
603   int left_column_edge = bbox->left_rule();
604   int right_column_edge = bbox->right_rule();
605   // The edges of the bounding box of the blob being processed.
606   int left_x = box.left();
607   int right_x = box.right();
608   int top_y = box.top();
609   int bottom_y = box.bottom();
610   int height = box.height();
611   bool debug = WithinTestRegion(3, left_x, top_y);
612   if (debug) {
613     tprintf("Column edges for blob at (%d,%d)->(%d,%d) are [%d, %d]\n", left_x, top_y, right_x,
614             bottom_y, left_column_edge, right_column_edge);
615   }
616   // Compute a search radius based on a multiple of the height.
617   int radius = (height * kTabRadiusFactor + gridsize_ - 1) / gridsize_;
618   radsearch.StartRadSearch((left_x + right_x) / 2, (top_y + bottom_y) / 2, radius);
619   // In Vertical Page mode, once we have an estimate of the vertical line
620   // spacing, the minimum amount of gutter space before a possible tab is
621   // increased under the assumption that column partition is always larger
622   // than line spacing.
623   int min_spacing = static_cast<int>(height * tabfind_aligned_gap_fraction);
624   if (min_gutter_width > min_spacing) {
625     min_spacing = min_gutter_width;
626   }
627   int min_ragged_gutter = kRaggedGutterMultiple * gridsize();
628   if (min_gutter_width > min_ragged_gutter) {
629     min_ragged_gutter = min_gutter_width;
630   }
631   int target_right = left_x - min_spacing;
632   int target_left = right_x + min_spacing;
633   // We will be evaluating whether the left edge could be a left tab, and
634   // whether the right edge could be a right tab.
635   // A box can be a tab if its bool is_(left/right)_tab remains true, meaning
636   // that no blobs have been found in the gutter during the radial search.
637   // A box can also be a tab if there are objects in the gutter only above
638   // or only below, and there are aligned objects on the opposite side, but
639   // not too many unaligned objects. The maybe_(left/right)_tab_up counts
640   // aligned objects above and negatively counts unaligned objects above,
641   // and is set to -INT32_MAX if a gutter object is found above.
642   // The other 3 maybe ints work similarly for the other sides.
643   // These conditions are very strict, to minimize false positives, and really
644   // only aligned tabs and outermost ragged tab blobs will qualify, so we
645   // also have maybe_ragged_left/right with less stringent rules.
646   // A blob that is maybe_ragged_left/right will be further qualified later,
647   // using the min_ragged_gutter.
648   bool is_left_tab = true;
649   bool is_right_tab = true;
650   bool maybe_ragged_left = true;
651   bool maybe_ragged_right = true;
652   int maybe_left_tab_up = 0;
653   int maybe_right_tab_up = 0;
654   int maybe_left_tab_down = 0;
655   int maybe_right_tab_down = 0;
656   if (bbox->leader_on_left()) {
657     is_left_tab = false;
658     maybe_ragged_left = false;
659     maybe_left_tab_up = -INT32_MAX;
660     maybe_left_tab_down = -INT32_MAX;
661   }
662   if (bbox->leader_on_right()) {
663     is_right_tab = false;
664     maybe_ragged_right = false;
665     maybe_right_tab_up = -INT32_MAX;
666     maybe_right_tab_down = -INT32_MAX;
667   }
668   int alignment_tolerance = static_cast<int>(resolution_ * kAlignedFraction);
669   BLOBNBOX *neighbour = nullptr;
670   while ((neighbour = radsearch.NextRadSearch()) != nullptr) {
671     if (neighbour == bbox) {
672       continue;
673     }
674     TBOX nbox = neighbour->bounding_box();
675     int n_left = nbox.left();
676     int n_right = nbox.right();
677     if (debug) {
678       tprintf("Neighbour at (%d,%d)->(%d,%d)\n", n_left, nbox.bottom(), n_right, nbox.top());
679     }
680     // If the neighbouring blob is the wrong side of a separator line, then it
681     // "doesn't exist" as far as we are concerned.
682     if (n_right > right_column_edge || n_left < left_column_edge ||
683         left_x < neighbour->left_rule() || right_x > neighbour->right_rule()) {
684       continue; // Separator line in the way.
685     }
686     int n_mid_x = (n_left + n_right) / 2;
687     int n_mid_y = (nbox.top() + nbox.bottom()) / 2;
688     if (n_mid_x <= left_x && n_right >= target_right) {
689       if (debug) {
690         tprintf("Not a left tab\n");
691       }
692       is_left_tab = false;
693       if (n_mid_y < top_y) {
694         maybe_left_tab_down = -INT32_MAX;
695       }
696       if (n_mid_y > bottom_y) {
697         maybe_left_tab_up = -INT32_MAX;
698       }
699     } else if (NearlyEqual(left_x, n_left, alignment_tolerance)) {
700       if (debug) {
701         tprintf("Maybe a left tab\n");
702       }
703       if (n_mid_y > top_y && maybe_left_tab_up > -INT32_MAX) {
704         ++maybe_left_tab_up;
705       }
706       if (n_mid_y < bottom_y && maybe_left_tab_down > -INT32_MAX) {
707         ++maybe_left_tab_down;
708       }
709     } else if (n_left < left_x && n_right >= left_x) {
710       // Overlaps but not aligned so negative points on a maybe.
711       if (debug) {
712         tprintf("Maybe Not a left tab\n");
713       }
714       if (n_mid_y > top_y && maybe_left_tab_up > -INT32_MAX) {
715         --maybe_left_tab_up;
716       }
717       if (n_mid_y < bottom_y && maybe_left_tab_down > -INT32_MAX) {
718         --maybe_left_tab_down;
719       }
720     }
721     if (n_left < left_x && nbox.y_overlap(box) && n_right >= target_right) {
722       maybe_ragged_left = false;
723       if (debug) {
724         tprintf("Not a ragged left\n");
725       }
726     }
727     if (n_mid_x >= right_x && n_left <= target_left) {
728       if (debug) {
729         tprintf("Not a right tab\n");
730       }
731       is_right_tab = false;
732       if (n_mid_y < top_y) {
733         maybe_right_tab_down = -INT32_MAX;
734       }
735       if (n_mid_y > bottom_y) {
736         maybe_right_tab_up = -INT32_MAX;
737       }
738     } else if (NearlyEqual(right_x, n_right, alignment_tolerance)) {
739       if (debug) {
740         tprintf("Maybe a right tab\n");
741       }
742       if (n_mid_y > top_y && maybe_right_tab_up > -INT32_MAX) {
743         ++maybe_right_tab_up;
744       }
745       if (n_mid_y < bottom_y && maybe_right_tab_down > -INT32_MAX) {
746         ++maybe_right_tab_down;
747       }
748     } else if (n_right > right_x && n_left <= right_x) {
749       // Overlaps but not aligned so negative points on a maybe.
750       if (debug) {
751         tprintf("Maybe Not a right tab\n");
752       }
753       if (n_mid_y > top_y && maybe_right_tab_up > -INT32_MAX) {
754         --maybe_right_tab_up;
755       }
756       if (n_mid_y < bottom_y && maybe_right_tab_down > -INT32_MAX) {
757         --maybe_right_tab_down;
758       }
759     }
760     if (n_right > right_x && nbox.y_overlap(box) && n_left <= target_left) {
761       maybe_ragged_right = false;
762       if (debug) {
763         tprintf("Not a ragged right\n");
764       }
765     }
766     if (maybe_left_tab_down == -INT32_MAX && maybe_left_tab_up == -INT32_MAX &&
767         maybe_right_tab_down == -INT32_MAX && maybe_right_tab_up == -INT32_MAX) {
768       break;
769     }
770   }
771   if (is_left_tab || maybe_left_tab_up > 1 || maybe_left_tab_down > 1) {
772     bbox->set_left_tab_type(TT_MAYBE_ALIGNED);
773   } else if (maybe_ragged_left && ConfirmRaggedLeft(bbox, min_ragged_gutter)) {
774     bbox->set_left_tab_type(TT_MAYBE_RAGGED);
775   } else {
776     bbox->set_left_tab_type(TT_NONE);
777   }
778   if (is_right_tab || maybe_right_tab_up > 1 || maybe_right_tab_down > 1) {
779     bbox->set_right_tab_type(TT_MAYBE_ALIGNED);
780   } else if (maybe_ragged_right && ConfirmRaggedRight(bbox, min_ragged_gutter)) {
781     bbox->set_right_tab_type(TT_MAYBE_RAGGED);
782   } else {
783     bbox->set_right_tab_type(TT_NONE);
784   }
785   if (debug) {
786     tprintf("Left result = %s, Right result=%s\n",
787             bbox->left_tab_type() == TT_MAYBE_ALIGNED
788                 ? "Aligned"
789                 : (bbox->left_tab_type() == TT_MAYBE_RAGGED ? "Ragged" : "None"),
790             bbox->right_tab_type() == TT_MAYBE_ALIGNED
791                 ? "Aligned"
792                 : (bbox->right_tab_type() == TT_MAYBE_RAGGED ? "Ragged" : "None"));
793   }
794   return bbox->left_tab_type() != TT_NONE || bbox->right_tab_type() != TT_NONE;
795 }
796 
797 // Returns true if there is nothing in the rectangle of width min_gutter to
798 // the left of bbox.
ConfirmRaggedLeft(BLOBNBOX * bbox,int min_gutter)799 bool TabFind::ConfirmRaggedLeft(BLOBNBOX *bbox, int min_gutter) {
800   TBOX search_box(bbox->bounding_box());
801   search_box.set_right(search_box.left());
802   search_box.set_left(search_box.left() - min_gutter);
803   return NothingYOverlapsInBox(search_box, bbox->bounding_box());
804 }
805 
806 // Returns true if there is nothing in the rectangle of width min_gutter to
807 // the right of bbox.
ConfirmRaggedRight(BLOBNBOX * bbox,int min_gutter)808 bool TabFind::ConfirmRaggedRight(BLOBNBOX *bbox, int min_gutter) {
809   TBOX search_box(bbox->bounding_box());
810   search_box.set_left(search_box.right());
811   search_box.set_right(search_box.right() + min_gutter);
812   return NothingYOverlapsInBox(search_box, bbox->bounding_box());
813 }
814 
815 // Returns true if there is nothing in the given search_box that vertically
816 // overlaps target_box other than target_box itself.
NothingYOverlapsInBox(const TBOX & search_box,const TBOX & target_box)817 bool TabFind::NothingYOverlapsInBox(const TBOX &search_box, const TBOX &target_box) {
818   BlobGridSearch rsearch(this);
819   rsearch.StartRectSearch(search_box);
820   BLOBNBOX *blob;
821   while ((blob = rsearch.NextRectSearch()) != nullptr) {
822     const TBOX &box = blob->bounding_box();
823     if (box.y_overlap(target_box) && !(box == target_box)) {
824       return false;
825     }
826   }
827   return true;
828 }
829 
FindAllTabVectors(int min_gutter_width)830 void TabFind::FindAllTabVectors(int min_gutter_width) {
831   // A list of vectors that will be created in estimating the skew.
832   TabVector_LIST dummy_vectors;
833   // An estimate of the vertical direction, revised as more lines are added.
834   int vertical_x = 0;
835   int vertical_y = 1;
836   // Find an estimate of the vertical direction by finding some tab vectors.
837   // Slowly up the search size until we get some vectors.
838   for (int search_size = kMinVerticalSearch; search_size < kMaxVerticalSearch;
839        search_size += kMinVerticalSearch) {
840     int vector_count = FindTabVectors(search_size, TA_LEFT_ALIGNED, min_gutter_width,
841                                       &dummy_vectors, &vertical_x, &vertical_y);
842     vector_count += FindTabVectors(search_size, TA_RIGHT_ALIGNED, min_gutter_width, &dummy_vectors,
843                                    &vertical_x, &vertical_y);
844     if (vector_count > 0) {
845       break;
846     }
847   }
848   // Get rid of the test vectors and reset the types of the tabs.
849   dummy_vectors.clear();
850   for (auto bbox : left_tab_boxes_) {
851     if (bbox->left_tab_type() == TT_CONFIRMED) {
852       bbox->set_left_tab_type(TT_MAYBE_ALIGNED);
853     }
854   }
855   for (auto bbox : right_tab_boxes_) {
856     if (bbox->right_tab_type() == TT_CONFIRMED) {
857       bbox->set_right_tab_type(TT_MAYBE_ALIGNED);
858     }
859   }
860   if (textord_debug_tabfind) {
861     tprintf("Beginning real tab search with vertical = %d,%d...\n", vertical_x, vertical_y);
862   }
863   // Now do the real thing ,but keep the vectors in the dummy_vectors list
864   // until they are all done, so we don't get the tab vectors confused with
865   // the rule line vectors.
866   FindTabVectors(kMaxVerticalSearch, TA_LEFT_ALIGNED, min_gutter_width, &dummy_vectors, &vertical_x,
867                  &vertical_y);
868   FindTabVectors(kMaxVerticalSearch, TA_RIGHT_ALIGNED, min_gutter_width, &dummy_vectors,
869                  &vertical_x, &vertical_y);
870   FindTabVectors(kMaxRaggedSearch, TA_LEFT_RAGGED, min_gutter_width, &dummy_vectors, &vertical_x,
871                  &vertical_y);
872   FindTabVectors(kMaxRaggedSearch, TA_RIGHT_RAGGED, min_gutter_width, &dummy_vectors, &vertical_x,
873                  &vertical_y);
874   // Now add the vectors to the vectors_ list.
875   TabVector_IT v_it(&vectors_);
876   v_it.add_list_after(&dummy_vectors);
877   // Now use the summed (mean) vertical vector as the direction for everything.
878   SetVerticalSkewAndParallelize(vertical_x, vertical_y);
879 }
880 
881 // Helper for FindAllTabVectors finds the vectors of a particular type.
FindTabVectors(int search_size_multiple,TabAlignment alignment,int min_gutter_width,TabVector_LIST * vectors,int * vertical_x,int * vertical_y)882 int TabFind::FindTabVectors(int search_size_multiple, TabAlignment alignment, int min_gutter_width,
883                             TabVector_LIST *vectors, int *vertical_x, int *vertical_y) {
884   TabVector_IT vector_it(vectors);
885   int vector_count = 0;
886   // Search the right or left tab boxes, looking for tab vectors.
887   bool right = alignment == TA_RIGHT_ALIGNED || alignment == TA_RIGHT_RAGGED;
888   const std::vector<BLOBNBOX *> &boxes = right ? right_tab_boxes_ : left_tab_boxes_;
889   for (auto bbox : boxes) {
890     if ((!right && bbox->left_tab_type() == TT_MAYBE_ALIGNED) ||
891         (right && bbox->right_tab_type() == TT_MAYBE_ALIGNED)) {
892       TabVector *vector = FindTabVector(search_size_multiple, min_gutter_width, alignment, bbox,
893                                         vertical_x, vertical_y);
894       if (vector != nullptr) {
895         ++vector_count;
896         vector_it.add_to_end(vector);
897       }
898     }
899   }
900   return vector_count;
901 }
902 
903 // Finds a vector corresponding to a tabstop running through the
904 // given box of the given alignment type.
905 // search_size_multiple is a multiple of height used to control
906 // the size of the search.
907 // vertical_x and y are updated with an estimate of the real
908 // vertical direction. (skew finding.)
909 // Returns nullptr if no decent tabstop can be found.
FindTabVector(int search_size_multiple,int min_gutter_width,TabAlignment alignment,BLOBNBOX * bbox,int * vertical_x,int * vertical_y)910 TabVector *TabFind::FindTabVector(int search_size_multiple, int min_gutter_width,
911                                   TabAlignment alignment, BLOBNBOX *bbox, int *vertical_x,
912                                   int *vertical_y) {
913   int height = std::max(static_cast<int>(bbox->bounding_box().height()), gridsize());
914   AlignedBlobParams align_params(*vertical_x, *vertical_y, height, search_size_multiple,
915                                  min_gutter_width, resolution_, alignment);
916   // FindVerticalAlignment is in the parent (AlignedBlob) class.
917   return FindVerticalAlignment(align_params, bbox, vertical_x, vertical_y);
918 }
919 
920 // Set the vertical_skew_ member from the given vector and refit
921 // all vectors parallel to the skew vector.
SetVerticalSkewAndParallelize(int vertical_x,int vertical_y)922 void TabFind::SetVerticalSkewAndParallelize(int vertical_x, int vertical_y) {
923   // Fit the vertical vector into an ICOORD, which is 16 bit.
924   vertical_skew_.set_with_shrink(vertical_x, vertical_y);
925   if (textord_debug_tabfind) {
926     tprintf("Vertical skew vector=(%d,%d)\n", vertical_skew_.x(), vertical_skew_.y());
927   }
928   v_it_.set_to_list(&vectors_);
929   for (v_it_.mark_cycle_pt(); !v_it_.cycled_list(); v_it_.forward()) {
930     TabVector *v = v_it_.data();
931     v->Fit(vertical_skew_, true);
932   }
933   // Now sort the vectors as their direction has potentially changed.
934   SortVectors();
935 }
936 
937 // Sort all the current vectors using the given vertical direction vector.
SortVectors()938 void TabFind::SortVectors() {
939   vectors_.sort(TabVector::SortVectorsByKey);
940   v_it_.set_to_list(&vectors_);
941 }
942 
943 // Evaluate all the current tab vectors.
EvaluateTabs()944 void TabFind::EvaluateTabs() {
945   TabVector_IT rule_it(&vectors_);
946   for (rule_it.mark_cycle_pt(); !rule_it.cycled_list(); rule_it.forward()) {
947     TabVector *tab = rule_it.data();
948     if (!tab->IsSeparator()) {
949       tab->Evaluate(vertical_skew_, this);
950       if (tab->BoxCount() < kMinEvaluatedTabs) {
951         if (textord_debug_tabfind > 2) {
952           tab->Print("Too few boxes");
953         }
954         delete rule_it.extract();
955         v_it_.set_to_list(&vectors_);
956       } else if (WithinTestRegion(3, tab->startpt().x(), tab->startpt().y())) {
957         tab->Print("Evaluated tab");
958       }
959     }
960   }
961 }
962 
963 // Trace textlines from one side to the other of each tab vector, saving
964 // the most frequent column widths found in a list so that a given width
965 // can be tested for being a common width with a simple callback function.
ComputeColumnWidths(ScrollView * tab_win,ColPartitionGrid * part_grid)966 void TabFind::ComputeColumnWidths(ScrollView *tab_win, ColPartitionGrid *part_grid) {
967 #ifndef GRAPHICS_DISABLED
968   if (tab_win != nullptr) {
969     tab_win->Pen(ScrollView::WHITE);
970   }
971 #endif // !GRAPHICS_DISABLED
972   // Accumulate column sections into a STATS
973   int col_widths_size = (tright_.x() - bleft_.x()) / kColumnWidthFactor;
974   STATS col_widths(0, col_widths_size + 1);
975   ApplyPartitionsToColumnWidths(part_grid, &col_widths);
976 #ifndef GRAPHICS_DISABLED
977   if (tab_win != nullptr) {
978     tab_win->Update();
979   }
980 #endif // !GRAPHICS_DISABLED
981   if (textord_debug_tabfind > 1) {
982     col_widths.print();
983   }
984   // Now make a list of column widths.
985   MakeColumnWidths(col_widths_size, &col_widths);
986   // Turn the column width into a range.
987   ApplyPartitionsToColumnWidths(part_grid, nullptr);
988 }
989 
990 // Finds column width and:
991 //   if col_widths is not null (pass1):
992 //     pair-up tab vectors with existing ColPartitions and accumulate widths.
993 //   else (pass2):
994 //     find the largest real partition width for each recorded column width,
995 //     to be used as the minimum acceptable width.
ApplyPartitionsToColumnWidths(ColPartitionGrid * part_grid,STATS * col_widths)996 void TabFind::ApplyPartitionsToColumnWidths(ColPartitionGrid *part_grid, STATS *col_widths) {
997   // For every ColPartition in the part_grid, add partners to the tabvectors
998   // and accumulate the column widths.
999   ColPartitionGridSearch gsearch(part_grid);
1000   gsearch.StartFullSearch();
1001   ColPartition *part;
1002   while ((part = gsearch.NextFullSearch()) != nullptr) {
1003     BLOBNBOX_C_IT blob_it(part->boxes());
1004     if (blob_it.empty()) {
1005       continue;
1006     }
1007     BLOBNBOX *left_blob = blob_it.data();
1008     blob_it.move_to_last();
1009     BLOBNBOX *right_blob = blob_it.data();
1010     TabVector *left_vector = LeftTabForBox(left_blob->bounding_box(), true, false);
1011     if (left_vector == nullptr || left_vector->IsRightTab()) {
1012       continue;
1013     }
1014     TabVector *right_vector = RightTabForBox(right_blob->bounding_box(), true, false);
1015     if (right_vector == nullptr || right_vector->IsLeftTab()) {
1016       continue;
1017     }
1018 
1019     int line_left = left_vector->XAtY(left_blob->bounding_box().bottom());
1020     int line_right = right_vector->XAtY(right_blob->bounding_box().bottom());
1021     // Add to STATS of measurements if the width is significant.
1022     int width = line_right - line_left;
1023     if (col_widths != nullptr) {
1024       AddPartnerVector(left_blob, right_blob, left_vector, right_vector);
1025       if (width >= kMinColumnWidth) {
1026         col_widths->add(width / kColumnWidthFactor, 1);
1027       }
1028     } else {
1029       width /= kColumnWidthFactor;
1030       ICOORDELT_IT it(&column_widths_);
1031       for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1032         ICOORDELT *w = it.data();
1033         if (NearlyEqual<int>(width, w->y(), 1)) {
1034           int true_width = part->bounding_box().width() / kColumnWidthFactor;
1035           if (true_width <= w->y() && true_width > w->x()) {
1036             w->set_x(true_width);
1037           }
1038           break;
1039         }
1040       }
1041     }
1042   }
1043 }
1044 
1045 // Helper makes the list of common column widths in column_widths_ from the
1046 // input col_widths. Destroys the content of col_widths by repeatedly
1047 // finding the mode and erasing the peak.
MakeColumnWidths(int col_widths_size,STATS * col_widths)1048 void TabFind::MakeColumnWidths(int col_widths_size, STATS *col_widths) {
1049   ICOORDELT_IT w_it(&column_widths_);
1050   int total_col_count = col_widths->get_total();
1051   while (col_widths->get_total() > 0) {
1052     int width = col_widths->mode();
1053     int col_count = col_widths->pile_count(width);
1054     col_widths->add(width, -col_count);
1055     // Get the entire peak.
1056     for (int left = width - 1; left > 0 && col_widths->pile_count(left) > 0; --left) {
1057       int new_count = col_widths->pile_count(left);
1058       col_count += new_count;
1059       col_widths->add(left, -new_count);
1060     }
1061     for (int right = width + 1; right < col_widths_size && col_widths->pile_count(right) > 0;
1062          ++right) {
1063       int new_count = col_widths->pile_count(right);
1064       col_count += new_count;
1065       col_widths->add(right, -new_count);
1066     }
1067     if (col_count > kMinLinesInColumn &&
1068         col_count > kMinFractionalLinesInColumn * total_col_count) {
1069       auto *w = new ICOORDELT(0, width);
1070       w_it.add_after_then_move(w);
1071       if (textord_debug_tabfind) {
1072         tprintf("Column of width %d has %d = %.2f%% lines\n", width * kColumnWidthFactor, col_count,
1073                 100.0 * col_count / total_col_count);
1074       }
1075     }
1076   }
1077 }
1078 
1079 // Mark blobs as being in a vertical text line where that is the case.
1080 // Returns true if the majority of the image is vertical text lines.
MarkVerticalText()1081 void TabFind::MarkVerticalText() {
1082   if (textord_debug_tabfind) {
1083     tprintf("Checking for vertical lines\n");
1084   }
1085   BlobGridSearch gsearch(this);
1086   gsearch.StartFullSearch();
1087   BLOBNBOX *blob = nullptr;
1088   while ((blob = gsearch.NextFullSearch()) != nullptr) {
1089     if (blob->region_type() < BRT_UNKNOWN) {
1090       continue;
1091     }
1092     if (blob->UniquelyVertical()) {
1093       blob->set_region_type(BRT_VERT_TEXT);
1094     }
1095   }
1096 }
1097 
FindMedianGutterWidth(TabVector_LIST * lines)1098 int TabFind::FindMedianGutterWidth(TabVector_LIST *lines) {
1099   TabVector_IT it(lines);
1100   int prev_right = -1;
1101   int max_gap = static_cast<int>(kMaxGutterWidthAbsolute * resolution_);
1102   STATS gaps(0, max_gap);
1103   STATS heights(0, max_gap);
1104   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1105     TabVector *v = it.data();
1106     TabVector *partner = v->GetSinglePartner();
1107     if (!v->IsLeftTab() || v->IsSeparator() || !partner) {
1108       continue;
1109     }
1110     heights.add(partner->startpt().x() - v->startpt().x(), 1);
1111     if (prev_right > 0 && v->startpt().x() > prev_right) {
1112       gaps.add(v->startpt().x() - prev_right, 1);
1113     }
1114     prev_right = partner->startpt().x();
1115   }
1116   if (textord_debug_tabfind) {
1117     tprintf("TabGutter total %d  median_gap %.2f  median_hgt %.2f\n", gaps.get_total(),
1118             gaps.median(), heights.median());
1119   }
1120   if (gaps.get_total() < kMinLinesInColumn) {
1121     return 0;
1122   }
1123   return static_cast<int>(gaps.median());
1124 }
1125 
1126 // Find the next adjacent (looking to the left or right) blob on this text
1127 // line, with the constraint that it must vertically significantly overlap
1128 // the [top_y, bottom_y] range.
1129 // If ignore_images is true, then blobs with aligned_text() < 0 are treated
1130 // as if they do not exist.
AdjacentBlob(const BLOBNBOX * bbox,bool look_left,bool ignore_images,double min_overlap_fraction,int gap_limit,int top_y,int bottom_y)1131 BLOBNBOX *TabFind::AdjacentBlob(const BLOBNBOX *bbox, bool look_left, bool ignore_images,
1132                                 double min_overlap_fraction, int gap_limit, int top_y,
1133                                 int bottom_y) {
1134   GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> sidesearch(this);
1135   const TBOX &box = bbox->bounding_box();
1136   int left = box.left();
1137   int right = box.right();
1138   int mid_x = (left + right) / 2;
1139   sidesearch.StartSideSearch(mid_x, bottom_y, top_y);
1140   int best_gap = 0;
1141   bool debug = WithinTestRegion(3, left, bottom_y);
1142   BLOBNBOX *result = nullptr;
1143   BLOBNBOX *neighbour = nullptr;
1144   while ((neighbour = sidesearch.NextSideSearch(look_left)) != nullptr) {
1145     if (debug) {
1146       tprintf("Adjacent blob: considering box:");
1147       neighbour->bounding_box().print();
1148     }
1149     if (neighbour == bbox || (ignore_images && neighbour->region_type() < BRT_UNKNOWN)) {
1150       continue;
1151     }
1152     const TBOX &nbox = neighbour->bounding_box();
1153     int n_top_y = nbox.top();
1154     int n_bottom_y = nbox.bottom();
1155     int v_overlap = std::min(n_top_y, top_y) - std::max(n_bottom_y, bottom_y);
1156     int height = top_y - bottom_y;
1157     int n_height = n_top_y - n_bottom_y;
1158     if (v_overlap > min_overlap_fraction * std::min(height, n_height) &&
1159         (min_overlap_fraction == 0.0 || !DifferentSizes(height, n_height))) {
1160       int n_left = nbox.left();
1161       int n_right = nbox.right();
1162       int h_gap = std::max(n_left, left) - std::min(n_right, right);
1163       int n_mid_x = (n_left + n_right) / 2;
1164       if (look_left == (n_mid_x < mid_x) && n_mid_x != mid_x) {
1165         if (h_gap > gap_limit) {
1166           // Hit a big gap before next tab so don't return anything.
1167           if (debug) {
1168             tprintf("Giving up due to big gap = %d vs %d\n", h_gap, gap_limit);
1169           }
1170           return result;
1171         }
1172         if (h_gap > 0 && (look_left ? neighbour->right_tab_type() : neighbour->left_tab_type()) >=
1173                              TT_CONFIRMED) {
1174           // Hit a tab facing the wrong way. Stop in case we are crossing
1175           // the column boundary.
1176           if (debug) {
1177             tprintf("Collision with like tab of type %d at %d,%d\n",
1178                     look_left ? neighbour->right_tab_type() : neighbour->left_tab_type(), n_left,
1179                     nbox.bottom());
1180           }
1181           return result;
1182         }
1183         // This is a good fit to the line. Continue with this
1184         // neighbour as the bbox if the best gap.
1185         if (result == nullptr || h_gap < best_gap) {
1186           if (debug) {
1187             tprintf("Good result\n");
1188           }
1189           result = neighbour;
1190           best_gap = h_gap;
1191         } else {
1192           // The new one is worse, so we probably already have the best result.
1193           return result;
1194         }
1195       } else if (debug) {
1196         tprintf("Wrong way\n");
1197       }
1198     } else if (debug) {
1199       tprintf("Insufficient overlap\n");
1200     }
1201   }
1202   if (WithinTestRegion(3, left, box.top())) {
1203     tprintf("Giving up due to end of search\n");
1204   }
1205   return result; // Hit the edge and found nothing.
1206 }
1207 
1208 // Add a bi-directional partner relationship between the left
1209 // and the right. If one (or both) of the vectors is a separator,
1210 // extend a nearby extendable vector or create a new one of the
1211 // correct type, using the given left or right blob as a guide.
AddPartnerVector(BLOBNBOX * left_blob,BLOBNBOX * right_blob,TabVector * left,TabVector * right)1212 void TabFind::AddPartnerVector(BLOBNBOX *left_blob, BLOBNBOX *right_blob, TabVector *left,
1213                                TabVector *right) {
1214   const TBOX &left_box = left_blob->bounding_box();
1215   const TBOX &right_box = right_blob->bounding_box();
1216   if (left->IsSeparator()) {
1217     // Try to find a nearby left edge to extend.
1218     TabVector *v = LeftTabForBox(left_box, true, true);
1219     if (v != nullptr && v != left && v->IsLeftTab() &&
1220         v->XAtY(left_box.top()) > left->XAtY(left_box.top())) {
1221       left = v; // Found a good replacement.
1222       left->ExtendToBox(left_blob);
1223     } else {
1224       // Fake a vector.
1225       left = new TabVector(*left, TA_LEFT_RAGGED, vertical_skew_, left_blob);
1226       vectors_.add_sorted(TabVector::SortVectorsByKey, left);
1227       v_it_.move_to_first();
1228     }
1229   }
1230   if (right->IsSeparator()) {
1231     // Try to find a nearby left edge to extend.
1232     if (WithinTestRegion(3, right_box.right(), right_box.bottom())) {
1233       tprintf("Box edge (%d,%d-%d)", right_box.right(), right_box.bottom(), right_box.top());
1234       right->Print(" looking for improvement for");
1235     }
1236     TabVector *v = RightTabForBox(right_box, true, true);
1237     if (v != nullptr && v != right && v->IsRightTab() &&
1238         v->XAtY(right_box.top()) < right->XAtY(right_box.top())) {
1239       right = v; // Found a good replacement.
1240       right->ExtendToBox(right_blob);
1241       if (WithinTestRegion(3, right_box.right(), right_box.bottom())) {
1242         right->Print("Extended vector");
1243       }
1244     } else {
1245       // Fake a vector.
1246       right = new TabVector(*right, TA_RIGHT_RAGGED, vertical_skew_, right_blob);
1247       vectors_.add_sorted(TabVector::SortVectorsByKey, right);
1248       v_it_.move_to_first();
1249       if (WithinTestRegion(3, right_box.right(), right_box.bottom())) {
1250         right->Print("Created new vector");
1251       }
1252     }
1253   }
1254   left->AddPartner(right);
1255   right->AddPartner(left);
1256 }
1257 
1258 // Remove separators and unused tabs from the main vectors_ list
1259 // to the dead_vectors_ list.
CleanupTabs()1260 void TabFind::CleanupTabs() {
1261   // TODO(rays) Before getting rid of separators and unused vectors, it
1262   // would be useful to try moving ragged vectors outwards to see if this
1263   // allows useful extension. Could be combined with checking ends of partners.
1264   TabVector_IT it(&vectors_);
1265   TabVector_IT dead_it(&dead_vectors_);
1266   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1267     TabVector *v = it.data();
1268     if (v->IsSeparator() || v->Partnerless()) {
1269       dead_it.add_after_then_move(it.extract());
1270       v_it_.set_to_list(&vectors_);
1271     } else {
1272       v->FitAndEvaluateIfNeeded(vertical_skew_, this);
1273     }
1274   }
1275 }
1276 
1277 // Apply the given rotation to the given list of blobs.
RotateBlobList(const FCOORD & rotation,BLOBNBOX_LIST * blobs)1278 void TabFind::RotateBlobList(const FCOORD &rotation, BLOBNBOX_LIST *blobs) {
1279   BLOBNBOX_IT it(blobs);
1280   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1281     it.data()->rotate_box(rotation);
1282   }
1283 }
1284 
1285 // Recreate the grid with deskewed BLOBNBOXes.
1286 // Returns false if the detected skew angle is impossible.
Deskew(TabVector_LIST * hlines,BLOBNBOX_LIST * image_blobs,TO_BLOCK * block,FCOORD * deskew,FCOORD * reskew)1287 bool TabFind::Deskew(TabVector_LIST *hlines, BLOBNBOX_LIST *image_blobs, TO_BLOCK *block,
1288                      FCOORD *deskew, FCOORD *reskew) {
1289   ComputeDeskewVectors(deskew, reskew);
1290   if (deskew->x() < kCosMaxSkewAngle) {
1291     return false;
1292   }
1293   RotateBlobList(*deskew, image_blobs);
1294   RotateBlobList(*deskew, &block->blobs);
1295   RotateBlobList(*deskew, &block->small_blobs);
1296   RotateBlobList(*deskew, &block->noise_blobs);
1297 
1298   // Rotate the horizontal vectors. The vertical vectors don't need
1299   // rotating as they can just be refitted.
1300   TabVector_IT h_it(hlines);
1301   for (h_it.mark_cycle_pt(); !h_it.cycled_list(); h_it.forward()) {
1302     TabVector *h = h_it.data();
1303     h->Rotate(*deskew);
1304   }
1305   TabVector_IT d_it(&dead_vectors_);
1306   for (d_it.mark_cycle_pt(); !d_it.cycled_list(); d_it.forward()) {
1307     TabVector *d = d_it.data();
1308     d->Rotate(*deskew);
1309   }
1310   SetVerticalSkewAndParallelize(0, 1);
1311   // Rebuild the grid to the new size.
1312   TBOX grid_box(bleft_, tright_);
1313   grid_box.rotate_large(*deskew);
1314   Init(gridsize(), grid_box.botleft(), grid_box.topright());
1315   InsertBlobsToGrid(false, false, image_blobs, this);
1316   InsertBlobsToGrid(true, false, &block->blobs, this);
1317   return true;
1318 }
1319 
1320 // Flip the vertical and horizontal lines and rotate the grid ready
1321 // for working on the rotated image.
1322 // This also makes parameter adjustments for FindInitialTabVectors().
ResetForVerticalText(const FCOORD & rotate,const FCOORD & rerotate,TabVector_LIST * horizontal_lines,int * min_gutter_width)1323 void TabFind::ResetForVerticalText(const FCOORD &rotate, const FCOORD &rerotate,
1324                                    TabVector_LIST *horizontal_lines, int *min_gutter_width) {
1325   // Rotate the horizontal and vertical vectors and swap them over.
1326   // Only the separators are kept and rotated; other tabs are used
1327   // to estimate the gutter width then thrown away.
1328   TabVector_LIST ex_verticals;
1329   TabVector_IT ex_v_it(&ex_verticals);
1330   TabVector_LIST vlines;
1331   TabVector_IT v_it(&vlines);
1332   while (!v_it_.empty()) {
1333     TabVector *v = v_it_.extract();
1334     if (v->IsSeparator()) {
1335       v->Rotate(rotate);
1336       ex_v_it.add_after_then_move(v);
1337     } else {
1338       v_it.add_after_then_move(v);
1339     }
1340     v_it_.forward();
1341   }
1342 
1343   // Adjust the min gutter width for better tabbox selection
1344   // in 2nd call to FindInitialTabVectors().
1345   int median_gutter = FindMedianGutterWidth(&vlines);
1346   if (median_gutter > *min_gutter_width) {
1347     *min_gutter_width = median_gutter;
1348   }
1349 
1350   TabVector_IT h_it(horizontal_lines);
1351   for (h_it.mark_cycle_pt(); !h_it.cycled_list(); h_it.forward()) {
1352     TabVector *h = h_it.data();
1353     h->Rotate(rotate);
1354   }
1355   v_it_.add_list_after(horizontal_lines);
1356   v_it_.move_to_first();
1357   h_it.set_to_list(horizontal_lines);
1358   h_it.add_list_after(&ex_verticals);
1359 
1360   // Rebuild the grid to the new size.
1361   TBOX grid_box(bleft(), tright());
1362   grid_box.rotate_large(rotate);
1363   Init(gridsize(), grid_box.botleft(), grid_box.topright());
1364 }
1365 
1366 // Clear the grid and get rid of the tab vectors, but not separators,
1367 // ready to start again.
Reset()1368 void TabFind::Reset() {
1369   v_it_.move_to_first();
1370   for (v_it_.mark_cycle_pt(); !v_it_.cycled_list(); v_it_.forward()) {
1371     if (!v_it_.data()->IsSeparator()) {
1372       delete v_it_.extract();
1373     }
1374   }
1375   Clear();
1376 }
1377 
1378 // Reflect the separator tab vectors and the grids in the y-axis.
1379 // Can only be called after Reset!
ReflectInYAxis()1380 void TabFind::ReflectInYAxis() {
1381   TabVector_LIST temp_list;
1382   TabVector_IT temp_it(&temp_list);
1383   v_it_.move_to_first();
1384   // The TabVector list only contains vertical lines, but they need to be
1385   // reflected and the list needs to be reversed, so they are still in
1386   // sort_key order.
1387   while (!v_it_.empty()) {
1388     TabVector *v = v_it_.extract();
1389     v_it_.forward();
1390     v->ReflectInYAxis();
1391     temp_it.add_before_then_move(v);
1392   }
1393   v_it_.add_list_after(&temp_list);
1394   v_it_.move_to_first();
1395   // Reset this grid with reflected bounding boxes.
1396   TBOX grid_box(bleft(), tright());
1397   int tmp = grid_box.left();
1398   grid_box.set_left(-grid_box.right());
1399   grid_box.set_right(-tmp);
1400   Init(gridsize(), grid_box.botleft(), grid_box.topright());
1401 }
1402 
1403 // Compute the rotation required to deskew, and its inverse rotation.
ComputeDeskewVectors(FCOORD * deskew,FCOORD * reskew)1404 void TabFind::ComputeDeskewVectors(FCOORD *deskew, FCOORD *reskew) {
1405   double length = vertical_skew_ % vertical_skew_;
1406   length = sqrt(length);
1407   deskew->set_x(static_cast<float>(vertical_skew_.y() / length));
1408   deskew->set_y(static_cast<float>(vertical_skew_.x() / length));
1409   reskew->set_x(deskew->x());
1410   reskew->set_y(-deskew->y());
1411 }
1412 
1413 // Compute and apply constraints to the end positions of TabVectors so
1414 // that where possible partners end at the same y coordinate.
ApplyTabConstraints()1415 void TabFind::ApplyTabConstraints() {
1416   TabVector_IT it(&vectors_);
1417   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1418     TabVector *v = it.data();
1419     v->SetupConstraints();
1420   }
1421   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1422     TabVector *v = it.data();
1423     // With the first and last partner, we want a common bottom and top,
1424     // respectively, and for each change of partner, we want a common
1425     // top of first with bottom of next.
1426     v->SetupPartnerConstraints();
1427   }
1428   // TODO(rays) The back-to-back pairs should really be done like the
1429   // front-to-front pairs, but there is no convenient way of producing the
1430   // list of partners like there is with the front-to-front.
1431   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1432     TabVector *v = it.data();
1433     if (!v->IsRightTab()) {
1434       continue;
1435     }
1436     // For each back-to-back pair of vectors, try for common top and bottom.
1437     TabVector_IT partner_it(it);
1438     for (partner_it.forward(); !partner_it.at_first(); partner_it.forward()) {
1439       TabVector *partner = partner_it.data();
1440       if (!partner->IsLeftTab() || !v->VOverlap(*partner)) {
1441         continue;
1442       }
1443       v->SetupPartnerConstraints(partner);
1444     }
1445   }
1446   // Now actually apply the constraints to get common start/end points.
1447   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1448     TabVector *v = it.data();
1449     if (!v->IsSeparator()) {
1450       v->ApplyConstraints();
1451     }
1452   }
1453   // TODO(rays) Where constraint application fails, it would be good to try
1454   // checking the ends to see if they really should be moved.
1455 }
1456 
1457 } // namespace tesseract.
1458