1 ///////////////////////////////////////////////////////////////////////
2 // File:        tabvector.cpp
3 // Description: Class to hold a near-vertical vector representing a tab-stop.
4 // Author:      Ray Smith
5 //
6 // (C) Copyright 2008, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18 
19 #ifdef HAVE_CONFIG_H
20 #  include "config_auto.h"
21 #endif
22 
23 #include "blobbox.h"
24 #include "colfind.h"
25 #include "colpartitionset.h"
26 #include "detlinefit.h"
27 #include "helpers.h" // for IntCastRounded
28 #include "statistc.h"
29 #include "tabvector.h"
30 
31 #include <algorithm>
32 
33 namespace tesseract {
34 
35 // Multiple of height used as a gutter for evaluation search.
36 const int kGutterMultiple = 4;
37 // Multiple of neighbour gap that we expect the gutter gap to be at minimum.
38 const int kGutterToNeighbourRatio = 3;
39 // Pixel distance for tab vectors to be considered the same.
40 const int kSimilarVectorDist = 10;
41 // Pixel distance for ragged tab vectors to be considered the same if there
42 // is nothing in the overlap box
43 const int kSimilarRaggedDist = 50;
44 // Max multiple of height to allow filling in between blobs when evaluating.
45 const int kMaxFillinMultiple = 11;
46 // Min fraction of mean gutter size to allow a gutter on a good tab blob.
47 const double kMinGutterFraction = 0.5;
48 // Multiple of 1/n lines as a minimum gutter in evaluation.
49 const double kLineCountReciprocal = 4.0;
50 // Constant add-on for minimum gutter for aligned tabs.
51 const double kMinAlignedGutter = 0.25;
52 // Constant add-on for minimum gutter for ragged tabs.
53 const double kMinRaggedGutter = 1.5;
54 
55 double_VAR(textord_tabvector_vertical_gap_fraction, 0.5,
56            "max fraction of mean blob width allowed for vertical gaps in "
57            "vertical text");
58 
59 double_VAR(textord_tabvector_vertical_box_ratio, 0.5,
60            "Fraction of box matches required to declare a line vertical");
61 
62 // Create a constraint for the top or bottom of this TabVector.
CreateConstraint(TabVector * vector,bool is_top)63 void TabConstraint::CreateConstraint(TabVector *vector, bool is_top) {
64   auto *constraint = new TabConstraint(vector, is_top);
65   auto *constraints = new TabConstraint_LIST;
66   TabConstraint_IT it(constraints);
67   it.add_to_end(constraint);
68   if (is_top) {
69     vector->set_top_constraints(constraints);
70   } else {
71     vector->set_bottom_constraints(constraints);
72   }
73 }
74 
75 // Test to see if the constraints are compatible enough to merge.
CompatibleConstraints(TabConstraint_LIST * list1,TabConstraint_LIST * list2)76 bool TabConstraint::CompatibleConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2) {
77   if (list1 == list2) {
78     return false;
79   }
80   int y_min = -INT32_MAX;
81   int y_max = INT32_MAX;
82   if (textord_debug_tabfind > 3) {
83     tprintf("Testing constraint compatibility\n");
84   }
85   GetConstraints(list1, &y_min, &y_max);
86   GetConstraints(list2, &y_min, &y_max);
87   if (textord_debug_tabfind > 3) {
88     tprintf("Resulting range = [%d,%d]\n", y_min, y_max);
89   }
90   return y_max >= y_min;
91 }
92 
93 // Merge the lists of constraints and update the TabVector pointers.
94 // The second list is deleted.
MergeConstraints(TabConstraint_LIST * list1,TabConstraint_LIST * list2)95 void TabConstraint::MergeConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2) {
96   if (list1 == list2) {
97     return;
98   }
99   TabConstraint_IT it(list2);
100   if (textord_debug_tabfind > 3) {
101     tprintf("Merging constraints\n");
102   }
103   // The vectors of all constraints on list2 are now going to be on list1.
104   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
105     TabConstraint *constraint = it.data();
106     if (textord_debug_tabfind > 3) {
107       constraint->vector_->Print("Merge");
108     }
109     if (constraint->is_top_) {
110       constraint->vector_->set_top_constraints(list1);
111     } else {
112       constraint->vector_->set_bottom_constraints(list1);
113     }
114   }
115   it = list1;
116   it.add_list_before(list2);
117   delete list2;
118 }
119 
120 // Set all the tops and bottoms as appropriate to a mean of the
121 // constrained range. Delete all the constraints and list.
ApplyConstraints(TabConstraint_LIST * constraints)122 void TabConstraint::ApplyConstraints(TabConstraint_LIST *constraints) {
123   int y_min = -INT32_MAX;
124   int y_max = INT32_MAX;
125   GetConstraints(constraints, &y_min, &y_max);
126   int y = (y_min + y_max) / 2;
127   TabConstraint_IT it(constraints);
128   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
129     TabConstraint *constraint = it.data();
130     TabVector *v = constraint->vector_;
131     if (constraint->is_top_) {
132       v->SetYEnd(y);
133       v->set_top_constraints(nullptr);
134     } else {
135       v->SetYStart(y);
136       v->set_bottom_constraints(nullptr);
137     }
138   }
139   delete constraints;
140 }
141 
TabConstraint(TabVector * vector,bool is_top)142 TabConstraint::TabConstraint(TabVector *vector, bool is_top) : vector_(vector), is_top_(is_top) {
143   if (is_top) {
144     y_min_ = vector->endpt().y();
145     y_max_ = vector->extended_ymax();
146   } else {
147     y_max_ = vector->startpt().y();
148     y_min_ = vector->extended_ymin();
149   }
150 }
151 
152 // Get the max of the mins and the min of the maxes.
GetConstraints(TabConstraint_LIST * constraints,int * y_min,int * y_max)153 void TabConstraint::GetConstraints(TabConstraint_LIST *constraints, int *y_min, int *y_max) {
154   TabConstraint_IT it(constraints);
155   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
156     TabConstraint *constraint = it.data();
157     if (textord_debug_tabfind > 3) {
158       tprintf("Constraint is [%d,%d]", constraint->y_min_, constraint->y_max_);
159       constraint->vector_->Print(" for");
160     }
161     *y_min = std::max(*y_min, constraint->y_min_);
162     *y_max = std::min(*y_max, constraint->y_max_);
163   }
164 }
165 
166 // The constructor is private. See the bottom of the file...
167 
168 // Public factory to build a TabVector from a list of boxes.
169 // The TabVector will be of the given alignment type.
170 // The input vertical vector is used in fitting, and the output
171 // vertical_x, vertical_y have the resulting line vector added to them
172 // if the alignment is not ragged.
173 // The extended_start_y and extended_end_y are the maximum possible
174 // extension to the line segment that can be used to align with others.
175 // The input CLIST of BLOBNBOX good_points is consumed and taken over.
FitVector(TabAlignment alignment,ICOORD vertical,int extended_start_y,int extended_end_y,BLOBNBOX_CLIST * good_points,int * vertical_x,int * vertical_y)176 TabVector *TabVector::FitVector(TabAlignment alignment, ICOORD vertical, int extended_start_y,
177                                 int extended_end_y, BLOBNBOX_CLIST *good_points, int *vertical_x,
178                                 int *vertical_y) {
179   auto *vector = new TabVector(extended_start_y, extended_end_y, alignment, good_points);
180   if (!vector->Fit(vertical, false)) {
181     delete vector;
182     return nullptr;
183   }
184   if (!vector->IsRagged()) {
185     vertical = vector->endpt_ - vector->startpt_;
186     int weight = vector->BoxCount();
187     *vertical_x += vertical.x() * weight;
188     *vertical_y += vertical.y() * weight;
189   }
190   return vector;
191 }
192 
193 // Build a ragged TabVector by copying another's direction, shifting it
194 // to match the given blob, and making its initial extent the height
195 // of the blob, but its extended bounds from the bounds of the original.
TabVector(const TabVector & src,TabAlignment alignment,const ICOORD & vertical_skew,BLOBNBOX * blob)196 TabVector::TabVector(const TabVector &src, TabAlignment alignment, const ICOORD &vertical_skew,
197                      BLOBNBOX *blob)
198     : extended_ymin_(src.extended_ymin_)
199     , extended_ymax_(src.extended_ymax_)
200     , needs_refit_(true)
201     , needs_evaluation_(true)
202     , alignment_(alignment) {
203   BLOBNBOX_C_IT it(&boxes_);
204   it.add_to_end(blob);
205   TBOX box = blob->bounding_box();
206   if (IsLeftTab()) {
207     startpt_ = box.botleft();
208     endpt_ = box.topleft();
209   } else {
210     startpt_ = box.botright();
211     endpt_ = box.topright();
212   }
213   sort_key_ =
214       SortKey(vertical_skew, (startpt_.x() + endpt_.x()) / 2, (startpt_.y() + endpt_.y()) / 2);
215   if (textord_debug_tabfind > 3) {
216     Print("Constructed a new tab vector:");
217   }
218 }
219 
220 // Copies basic attributes of a tab vector for simple operations.
221 // Copies things such startpt, endpt, range.
222 // Does not copy things such as partners, boxes, or constraints.
223 // This is useful if you only need vector information for processing, such
224 // as in the table detection code.
ShallowCopy() const225 TabVector *TabVector::ShallowCopy() const {
226   auto *copy = new TabVector();
227   copy->startpt_ = startpt_;
228   copy->endpt_ = endpt_;
229   copy->alignment_ = alignment_;
230   copy->extended_ymax_ = extended_ymax_;
231   copy->extended_ymin_ = extended_ymin_;
232   copy->intersects_other_lines_ = intersects_other_lines_;
233   return copy;
234 }
235 
236 // Extend this vector to include the supplied blob if it doesn't
237 // already have it.
ExtendToBox(BLOBNBOX * new_blob)238 void TabVector::ExtendToBox(BLOBNBOX *new_blob) {
239   TBOX new_box = new_blob->bounding_box();
240   BLOBNBOX_C_IT it(&boxes_);
241   if (!it.empty()) {
242     BLOBNBOX *blob = it.data();
243     TBOX box = blob->bounding_box();
244     while (!it.at_last() && box.top() <= new_box.top()) {
245       if (blob == new_blob) {
246         return; // We have it already.
247       }
248       it.forward();
249       blob = it.data();
250       box = blob->bounding_box();
251     }
252     if (box.top() >= new_box.top()) {
253       it.add_before_stay_put(new_blob);
254       needs_refit_ = true;
255       return;
256     }
257   }
258   needs_refit_ = true;
259   it.add_after_stay_put(new_blob);
260 }
261 
262 // Set the ycoord of the start and move the xcoord to match.
SetYStart(int start_y)263 void TabVector::SetYStart(int start_y) {
264   startpt_.set_x(XAtY(start_y));
265   startpt_.set_y(start_y);
266 }
267 // Set the ycoord of the end and move the xcoord to match.
SetYEnd(int end_y)268 void TabVector::SetYEnd(int end_y) {
269   endpt_.set_x(XAtY(end_y));
270   endpt_.set_y(end_y);
271 }
272 
273 // Rotate the ends by the given vector. Auto flip start and end if needed.
Rotate(const FCOORD & rotation)274 void TabVector::Rotate(const FCOORD &rotation) {
275   startpt_.rotate(rotation);
276   endpt_.rotate(rotation);
277   int dx = endpt_.x() - startpt_.x();
278   int dy = endpt_.y() - startpt_.y();
279   if ((dy < 0 && abs(dy) > abs(dx)) || (dx < 0 && abs(dx) > abs(dy))) {
280     // Need to flip start/end.
281     ICOORD tmp = startpt_;
282     startpt_ = endpt_;
283     endpt_ = tmp;
284   }
285 }
286 
287 // Setup the initial constraints, being the limits of
288 // the vector and the extended ends.
SetupConstraints()289 void TabVector::SetupConstraints() {
290   TabConstraint::CreateConstraint(this, false);
291   TabConstraint::CreateConstraint(this, true);
292 }
293 
294 // Setup the constraints between the partners of this TabVector.
SetupPartnerConstraints()295 void TabVector::SetupPartnerConstraints() {
296   // With the first and last partner, we want a common bottom and top,
297   // respectively, and for each change of partner, we want a common
298   // top of first with bottom of next.
299   TabVector_C_IT it(&partners_);
300   TabVector *prev_partner = nullptr;
301   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
302     TabVector *partner = it.data();
303     if (partner->top_constraints_ == nullptr || partner->bottom_constraints_ == nullptr) {
304       partner->Print("Impossible: has no constraints");
305       Print("This vector has it as a partner");
306       continue;
307     }
308     if (prev_partner == nullptr) {
309       // This is the first partner, so common bottom.
310       if (TabConstraint::CompatibleConstraints(bottom_constraints_, partner->bottom_constraints_)) {
311         TabConstraint::MergeConstraints(bottom_constraints_, partner->bottom_constraints_);
312       }
313     } else {
314       // We need prev top to be common with partner bottom.
315       if (TabConstraint::CompatibleConstraints(prev_partner->top_constraints_,
316                                                partner->bottom_constraints_)) {
317         TabConstraint::MergeConstraints(prev_partner->top_constraints_,
318                                         partner->bottom_constraints_);
319       }
320     }
321     prev_partner = partner;
322     if (it.at_last()) {
323       // This is the last partner, so common top.
324       if (TabConstraint::CompatibleConstraints(top_constraints_, partner->top_constraints_)) {
325         TabConstraint::MergeConstraints(top_constraints_, partner->top_constraints_);
326       }
327     }
328   }
329 }
330 
331 // Setup the constraints between this and its partner.
SetupPartnerConstraints(TabVector * partner)332 void TabVector::SetupPartnerConstraints(TabVector *partner) {
333   if (TabConstraint::CompatibleConstraints(bottom_constraints_, partner->bottom_constraints_)) {
334     TabConstraint::MergeConstraints(bottom_constraints_, partner->bottom_constraints_);
335   }
336   if (TabConstraint::CompatibleConstraints(top_constraints_, partner->top_constraints_)) {
337     TabConstraint::MergeConstraints(top_constraints_, partner->top_constraints_);
338   }
339 }
340 
341 // Use the constraints to modify the top and bottom.
ApplyConstraints()342 void TabVector::ApplyConstraints() {
343   if (top_constraints_ != nullptr) {
344     TabConstraint::ApplyConstraints(top_constraints_);
345   }
346   if (bottom_constraints_ != nullptr) {
347     TabConstraint::ApplyConstraints(bottom_constraints_);
348   }
349 }
350 
351 // Merge close tab vectors of the same side that overlap.
MergeSimilarTabVectors(const ICOORD & vertical,TabVector_LIST * vectors,BlobGrid * grid)352 void TabVector::MergeSimilarTabVectors(const ICOORD &vertical, TabVector_LIST *vectors,
353                                        BlobGrid *grid) {
354   TabVector_IT it1(vectors);
355   for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {
356     TabVector *v1 = it1.data();
357     TabVector_IT it2(it1);
358     for (it2.forward(); !it2.at_first(); it2.forward()) {
359       TabVector *v2 = it2.data();
360       if (v2->SimilarTo(vertical, *v1, grid)) {
361         // Merge into the forward one, in case the combined vector now
362         // overlaps one in between.
363         if (textord_debug_tabfind) {
364           v2->Print("Merging");
365           v1->Print("by deleting");
366         }
367         v2->MergeWith(vertical, it1.extract());
368         if (textord_debug_tabfind) {
369           v2->Print("Producing");
370         }
371         ICOORD merged_vector = v2->endpt();
372         merged_vector -= v2->startpt();
373         if (textord_debug_tabfind && abs(merged_vector.x()) > 100) {
374           v2->Print("Garbage result of merge?");
375         }
376         break;
377       }
378     }
379   }
380 }
381 
382 // Return true if this vector is the same side, overlaps, and close
383 // enough to the other to be merged.
SimilarTo(const ICOORD & vertical,const TabVector & other,BlobGrid * grid) const384 bool TabVector::SimilarTo(const ICOORD &vertical, const TabVector &other, BlobGrid *grid) const {
385   if ((IsRightTab() && other.IsRightTab()) || (IsLeftTab() && other.IsLeftTab())) {
386     // If they don't overlap, at least in extensions, then there is no chance.
387     if (ExtendedOverlap(other.extended_ymax_, other.extended_ymin_) < 0) {
388       return false;
389     }
390     // A fast approximation to the scale factor of the sort_key_.
391     int v_scale = abs(vertical.y());
392     if (v_scale == 0) {
393       v_scale = 1;
394     }
395     // If they are close enough, then OK.
396     if (sort_key_ + kSimilarVectorDist * v_scale >= other.sort_key_ &&
397         sort_key_ - kSimilarVectorDist * v_scale <= other.sort_key_) {
398       return true;
399     }
400     // Ragged tabs get a bigger threshold.
401     if (!IsRagged() || !other.IsRagged() ||
402         sort_key_ + kSimilarRaggedDist * v_scale < other.sort_key_ ||
403         sort_key_ - kSimilarRaggedDist * v_scale > other.sort_key_) {
404       return false;
405     }
406     if (grid == nullptr) {
407       // There is nothing else to test!
408       return true;
409     }
410     // If there is nothing in the rectangle between the vector that is going to
411     // move, and the place it is moving to, then they can be merged.
412     // Setup a vertical search for any blob.
413     const TabVector *mover = (IsRightTab() && sort_key_ < other.sort_key_) ? this : &other;
414     int top_y = mover->endpt_.y();
415     int bottom_y = mover->startpt_.y();
416     int left = std::min(mover->XAtY(top_y), mover->XAtY(bottom_y));
417     int right = std::max(mover->XAtY(top_y), mover->XAtY(bottom_y));
418     int shift = abs(sort_key_ - other.sort_key_) / v_scale;
419     if (IsRightTab()) {
420       right += shift;
421     } else {
422       left -= shift;
423     }
424 
425     GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> vsearch(grid);
426     vsearch.StartVerticalSearch(left, right, top_y);
427     BLOBNBOX *blob;
428     while ((blob = vsearch.NextVerticalSearch(true)) != nullptr) {
429       const TBOX &box = blob->bounding_box();
430       if (box.top() > bottom_y) {
431         return true; // Nothing found.
432       }
433       if (box.bottom() < top_y) {
434         continue; // Doesn't overlap.
435       }
436       int left_at_box = XAtY(box.bottom());
437       int right_at_box = left_at_box;
438       if (IsRightTab()) {
439         right_at_box += shift;
440       } else {
441         left_at_box -= shift;
442       }
443       if (std::min(right_at_box, static_cast<int>(box.right())) >
444           std::max(left_at_box, static_cast<int>(box.left()))) {
445         return false;
446       }
447     }
448     return true; // Nothing found.
449   }
450   return false;
451 }
452 
453 // Eat the other TabVector into this and delete it.
MergeWith(const ICOORD & vertical,TabVector * other)454 void TabVector::MergeWith(const ICOORD &vertical, TabVector *other) {
455   extended_ymin_ = std::min(extended_ymin_, other->extended_ymin_);
456   extended_ymax_ = std::max(extended_ymax_, other->extended_ymax_);
457   if (other->IsRagged()) {
458     alignment_ = other->alignment_;
459   }
460   // Merge sort the two lists of boxes.
461   BLOBNBOX_C_IT it1(&boxes_);
462   BLOBNBOX_C_IT it2(&other->boxes_);
463   while (!it2.empty()) {
464     BLOBNBOX *bbox2 = it2.extract();
465     it2.forward();
466     TBOX box2 = bbox2->bounding_box();
467     BLOBNBOX *bbox1 = it1.data();
468     TBOX box1 = bbox1->bounding_box();
469     while (box1.bottom() < box2.bottom() && !it1.at_last()) {
470       it1.forward();
471       bbox1 = it1.data();
472       box1 = bbox1->bounding_box();
473     }
474     if (box1.bottom() < box2.bottom()) {
475       it1.add_to_end(bbox2);
476     } else if (bbox1 != bbox2) {
477       it1.add_before_stay_put(bbox2);
478     }
479   }
480   Fit(vertical, true);
481   other->Delete(this);
482 }
483 
484 // Add a new element to the list of partner TabVectors.
485 // Partners must be added in order of increasing y coordinate of the text line
486 // that makes them partners.
487 // Groups of identical partners are merged into one.
AddPartner(TabVector * partner)488 void TabVector::AddPartner(TabVector *partner) {
489   if (IsSeparator() || partner->IsSeparator()) {
490     return;
491   }
492   TabVector_C_IT it(&partners_);
493   if (!it.empty()) {
494     it.move_to_last();
495     if (it.data() == partner) {
496       return;
497     }
498   }
499   it.add_after_then_move(partner);
500 }
501 
502 // Return true if other is a partner of this.
IsAPartner(const TabVector * other)503 bool TabVector::IsAPartner(const TabVector *other) {
504   TabVector_C_IT it(&partners_);
505   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
506     if (it.data() == other) {
507       return true;
508     }
509   }
510   return false;
511 }
512 
513 // These names must be synced with the TabAlignment enum in tabvector.h.
514 static const char *const kAlignmentNames[] = {"Left Aligned",  "Left Ragged",  "Center",
515                                               "Right Aligned", "Right Ragged", "Separator"};
516 
517 // Print basic information about this tab vector.
Print(const char * prefix)518 void TabVector::Print(const char *prefix) {
519   tprintf(
520       "%s %s (%d,%d)->(%d,%d) w=%d s=%d, sort key=%d, boxes=%d,"
521       " partners=%d\n",
522       prefix, kAlignmentNames[alignment_], startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y(),
523       mean_width_, percent_score_, sort_key_, boxes_.length(), partners_.length());
524 }
525 
526 // Print basic information about this tab vector and every box in it.
Debug(const char * prefix)527 void TabVector::Debug(const char *prefix) {
528   Print(prefix);
529   BLOBNBOX_C_IT it(&boxes_);
530   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
531     BLOBNBOX *bbox = it.data();
532     const TBOX &box = bbox->bounding_box();
533     tprintf("Box at (%d,%d)->(%d,%d)\n", box.left(), box.bottom(), box.right(), box.top());
534   }
535 }
536 
537 #ifndef GRAPHICS_DISABLED
538 
539 // Draw this tabvector in place in the given window.
Display(ScrollView * tab_win)540 void TabVector::Display(ScrollView *tab_win) {
541   if (textord_debug_printable) {
542     tab_win->Pen(ScrollView::BLUE);
543   } else if (alignment_ == TA_LEFT_ALIGNED) {
544     tab_win->Pen(ScrollView::LIME_GREEN);
545   } else if (alignment_ == TA_LEFT_RAGGED) {
546     tab_win->Pen(ScrollView::DARK_GREEN);
547   } else if (alignment_ == TA_RIGHT_ALIGNED) {
548     tab_win->Pen(ScrollView::PINK);
549   } else if (alignment_ == TA_RIGHT_RAGGED) {
550     tab_win->Pen(ScrollView::CORAL);
551   } else {
552     tab_win->Pen(ScrollView::WHITE);
553   }
554   tab_win->Line(startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y());
555   tab_win->Pen(ScrollView::GREY);
556   tab_win->Line(startpt_.x(), startpt_.y(), startpt_.x(), extended_ymin_);
557   tab_win->Line(endpt_.x(), extended_ymax_, endpt_.x(), endpt_.y());
558   auto score_string = std::to_string(percent_score_);
559   tab_win->TextAttributes("Times", 50, false, false, false);
560   tab_win->Text(startpt_.x(), startpt_.y(), score_string.c_str());
561 }
562 
563 #endif
564 
565 // Refit the line and/or re-evaluate the vector if the dirty flags are set.
FitAndEvaluateIfNeeded(const ICOORD & vertical,TabFind * finder)566 void TabVector::FitAndEvaluateIfNeeded(const ICOORD &vertical, TabFind *finder) {
567   if (needs_refit_) {
568     Fit(vertical, true);
569   }
570   if (needs_evaluation_) {
571     Evaluate(vertical, finder);
572   }
573 }
574 
575 // Evaluate the vector in terms of coverage of its length by good-looking
576 // box edges. A good looking box is one where its nearest neighbour on the
577 // inside is nearer than half the distance its nearest neighbour on the
578 // outside of the putative column. Bad boxes are removed from the line.
579 // A second pass then further filters boxes by requiring that the gutter
580 // width be a minimum fraction of the mean gutter along the line.
Evaluate(const ICOORD & vertical,TabFind * finder)581 void TabVector::Evaluate(const ICOORD &vertical, TabFind *finder) {
582   bool debug = false;
583   needs_evaluation_ = false;
584   int length = endpt_.y() - startpt_.y();
585   if (length == 0 || boxes_.empty()) {
586     percent_score_ = 0;
587     Print("Zero length in evaluate");
588     return;
589   }
590   // Compute the mean box height.
591   BLOBNBOX_C_IT it(&boxes_);
592   int mean_height = 0;
593   int height_count = 0;
594   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
595     BLOBNBOX *bbox = it.data();
596     const TBOX &box = bbox->bounding_box();
597     int height = box.height();
598     mean_height += height;
599     ++height_count;
600   }
601   if (height_count > 0) {
602     mean_height /= height_count;
603   }
604   int max_gutter = kGutterMultiple * mean_height;
605   if (IsRagged()) {
606     // Ragged edges face a tougher test in that the gap must always be within
607     // the height of the blob.
608     max_gutter = kGutterToNeighbourRatio * mean_height;
609   }
610 
611   STATS gutters(0, max_gutter + 1);
612   // Evaluate the boxes for their goodness, calculating the coverage as we go.
613   // Remove boxes that are not good and shorten the list to the first and
614   // last good boxes.
615   int num_deleted_boxes = 0;
616   bool text_on_image = false;
617   int good_length = 0;
618   const TBOX *prev_good_box = nullptr;
619   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
620     BLOBNBOX *bbox = it.data();
621     const TBOX &box = bbox->bounding_box();
622     int mid_y = (box.top() + box.bottom()) / 2;
623     if (TabFind::WithinTestRegion(2, XAtY(box.bottom()), box.bottom())) {
624       if (!debug) {
625         tprintf("After already deleting %d boxes, ", num_deleted_boxes);
626         Print("Starting evaluation");
627       }
628       debug = true;
629     }
630     // A good box is one where the nearest neighbour on the inside is closer
631     // than half the distance to the nearest neighbour on the outside
632     // (of the putative column).
633     bool left = IsLeftTab();
634     int tab_x = XAtY(mid_y);
635     int gutter_width;
636     int neighbour_gap;
637     finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left, bbox, &gutter_width,
638                                        &neighbour_gap);
639     if (debug) {
640       tprintf("Box (%d,%d)->(%d,%d) has gutter %d, ndist %d\n", box.left(), box.bottom(),
641               box.right(), box.top(), gutter_width, neighbour_gap);
642     }
643     // Now we can make the test.
644     if (neighbour_gap * kGutterToNeighbourRatio <= gutter_width) {
645       // A good box contributes its height to the good_length.
646       good_length += box.top() - box.bottom();
647       gutters.add(gutter_width, 1);
648       // Two good boxes together contribute the gap between them
649       // to the good_length as well, as long as the gap is not
650       // too big.
651       if (prev_good_box != nullptr) {
652         int vertical_gap = box.bottom() - prev_good_box->top();
653         double size1 = sqrt(static_cast<double>(prev_good_box->area()));
654         double size2 = sqrt(static_cast<double>(box.area()));
655         if (vertical_gap < kMaxFillinMultiple * std::min(size1, size2)) {
656           good_length += vertical_gap;
657         }
658         if (debug) {
659           tprintf("Box and prev good, gap=%d, target %g, goodlength=%d\n", vertical_gap,
660                   kMaxFillinMultiple * std::min(size1, size2), good_length);
661         }
662       } else {
663         // Adjust the start to the first good box.
664         SetYStart(box.bottom());
665       }
666       prev_good_box = &box;
667       if (bbox->flow() == BTFT_TEXT_ON_IMAGE) {
668         text_on_image = true;
669       }
670     } else {
671       // Get rid of boxes that are not good.
672       if (debug) {
673         tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, ndist %d\n", box.left(), box.bottom(),
674                 box.right(), box.top(), gutter_width, neighbour_gap);
675       }
676       it.extract();
677       ++num_deleted_boxes;
678     }
679   }
680   if (debug) {
681     Print("Evaluating:");
682   }
683   // If there are any good boxes, do it again, except this time get rid of
684   // boxes that have a gutter that is a small fraction of the mean gutter.
685   // This filters out ends that run into a coincidental gap in the text.
686   int search_top = endpt_.y();
687   int search_bottom = startpt_.y();
688   int median_gutter = IntCastRounded(gutters.median());
689   if (gutters.get_total() > 0) {
690     prev_good_box = nullptr;
691     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
692       BLOBNBOX *bbox = it.data();
693       const TBOX &box = bbox->bounding_box();
694       int mid_y = (box.top() + box.bottom()) / 2;
695       // A good box is one where the gutter width is at least some constant
696       // fraction of the mean gutter width.
697       bool left = IsLeftTab();
698       int tab_x = XAtY(mid_y);
699       int max_gutter = kGutterMultiple * mean_height;
700       if (IsRagged()) {
701         // Ragged edges face a tougher test in that the gap must always be
702         // within the height of the blob.
703         max_gutter = kGutterToNeighbourRatio * mean_height;
704       }
705       int gutter_width;
706       int neighbour_gap;
707       finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left, bbox, &gutter_width,
708                                          &neighbour_gap);
709       // Now we can make the test.
710       if (gutter_width >= median_gutter * kMinGutterFraction) {
711         if (prev_good_box == nullptr) {
712           // Adjust the start to the first good box.
713           SetYStart(box.bottom());
714           search_bottom = box.top();
715         }
716         prev_good_box = &box;
717         search_top = box.bottom();
718       } else {
719         // Get rid of boxes that are not good.
720         if (debug) {
721           tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, mean gutter %d\n", box.left(),
722                   box.bottom(), box.right(), box.top(), gutter_width, median_gutter);
723         }
724         it.extract();
725         ++num_deleted_boxes;
726       }
727     }
728   }
729   // If there has been a good box, adjust the end.
730   if (prev_good_box != nullptr) {
731     SetYEnd(prev_good_box->top());
732     // Compute the percentage of the vector that is occupied by good boxes.
733     int length = endpt_.y() - startpt_.y();
734     percent_score_ = 100 * good_length / length;
735     if (num_deleted_boxes > 0) {
736       needs_refit_ = true;
737       FitAndEvaluateIfNeeded(vertical, finder);
738       if (boxes_.empty()) {
739         return;
740       }
741     }
742     // Test the gutter over the whole vector, instead of just at the boxes.
743     int required_shift;
744     if (search_bottom > search_top) {
745       search_bottom = startpt_.y();
746       search_top = endpt_.y();
747     }
748     double min_gutter_width = kLineCountReciprocal / boxes_.length();
749     min_gutter_width += IsRagged() ? kMinRaggedGutter : kMinAlignedGutter;
750     min_gutter_width *= mean_height;
751     int max_gutter_width = IntCastRounded(min_gutter_width) + 1;
752     if (median_gutter > max_gutter_width) {
753       max_gutter_width = median_gutter;
754     }
755     int gutter_width = finder->GutterWidth(search_bottom, search_top, *this, text_on_image,
756                                            max_gutter_width, &required_shift);
757     if (gutter_width < min_gutter_width) {
758       if (debug) {
759         tprintf("Rejecting bad tab Vector with %d gutter vs %g min\n", gutter_width,
760                 min_gutter_width);
761       }
762       boxes_.shallow_clear();
763       percent_score_ = 0;
764     } else if (debug) {
765       tprintf("Final gutter %d, vs limit of %g, required shift = %d\n", gutter_width,
766               min_gutter_width, required_shift);
767     }
768   } else {
769     // There are no good boxes left, so score is 0.
770     percent_score_ = 0;
771   }
772 
773   if (debug) {
774     Print("Evaluation complete:");
775   }
776 }
777 
778 // (Re)Fit a line to the stored points. Returns false if the line
779 // is degenerate. Although the TabVector code mostly doesn't care about the
780 // direction of lines, XAtY would give silly results for a horizontal line.
781 // The class is mostly aimed at use for vertical lines representing
782 // horizontal tab stops.
Fit(ICOORD vertical,bool force_parallel)783 bool TabVector::Fit(ICOORD vertical, bool force_parallel) {
784   needs_refit_ = false;
785   if (boxes_.empty()) {
786     // Don't refit something with no boxes, as that only happens
787     // in Evaluate, and we don't want to end up with a zero vector.
788     if (!force_parallel) {
789       return false;
790     }
791     // If we are forcing parallel, then we just need to set the sort_key_.
792     ICOORD midpt = startpt_;
793     midpt += endpt_;
794     midpt /= 2;
795     sort_key_ = SortKey(vertical, midpt.x(), midpt.y());
796     return startpt_.y() != endpt_.y();
797   }
798   if (!force_parallel && !IsRagged()) {
799     // Use a fitted line as the vertical.
800     DetLineFit linepoints;
801     BLOBNBOX_C_IT it(&boxes_);
802     // Fit a line to all the boxes in the list.
803     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
804       BLOBNBOX *bbox = it.data();
805       const TBOX &box = bbox->bounding_box();
806       int x1 = IsRightTab() ? box.right() : box.left();
807       ICOORD boxpt(x1, box.bottom());
808       linepoints.Add(boxpt);
809       if (it.at_last()) {
810         ICOORD top_pt(x1, box.top());
811         linepoints.Add(top_pt);
812       }
813     }
814     linepoints.Fit(&startpt_, &endpt_);
815     if (startpt_.y() != endpt_.y()) {
816       vertical = endpt_;
817       vertical -= startpt_;
818     }
819   }
820   int start_y = startpt_.y();
821   int end_y = endpt_.y();
822   sort_key_ = IsLeftTab() ? INT32_MAX : -INT32_MAX;
823   BLOBNBOX_C_IT it(&boxes_);
824   // Choose a line parallel to the vertical such that all boxes are on the
825   // correct side of it.
826   mean_width_ = 0;
827   int width_count = 0;
828   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
829     BLOBNBOX *bbox = it.data();
830     const TBOX &box = bbox->bounding_box();
831     mean_width_ += box.width();
832     ++width_count;
833     int x1 = IsRightTab() ? box.right() : box.left();
834     // Test both the bottom and the top, as one will be more extreme, depending
835     // on the direction of skew.
836     int bottom_y = box.bottom();
837     int top_y = box.top();
838     int key = SortKey(vertical, x1, bottom_y);
839     if (IsLeftTab() == (key < sort_key_)) {
840       sort_key_ = key;
841       startpt_ = ICOORD(x1, bottom_y);
842     }
843     key = SortKey(vertical, x1, top_y);
844     if (IsLeftTab() == (key < sort_key_)) {
845       sort_key_ = key;
846       startpt_ = ICOORD(x1, top_y);
847     }
848     if (it.at_first()) {
849       start_y = bottom_y;
850     }
851     if (it.at_last()) {
852       end_y = top_y;
853     }
854   }
855   if (width_count > 0) {
856     mean_width_ = (mean_width_ + width_count - 1) / width_count;
857   }
858   endpt_ = startpt_ + vertical;
859   needs_evaluation_ = true;
860   if (start_y != end_y) {
861     // Set the ends of the vector to fully include the first and last blobs.
862     startpt_.set_x(XAtY(vertical, sort_key_, start_y));
863     startpt_.set_y(start_y);
864     endpt_.set_x(XAtY(vertical, sort_key_, end_y));
865     endpt_.set_y(end_y);
866     return true;
867   }
868   return false;
869 }
870 
871 // Returns the singleton partner if there is one, or nullptr otherwise.
GetSinglePartner()872 TabVector *TabVector::GetSinglePartner() {
873   if (!partners_.singleton()) {
874     return nullptr;
875   }
876   TabVector_C_IT partner_it(&partners_);
877   TabVector *partner = partner_it.data();
878   return partner;
879 }
880 
881 // Return the partner of this TabVector if the vector qualifies as
882 // being a vertical text line, otherwise nullptr.
VerticalTextlinePartner()883 TabVector *TabVector::VerticalTextlinePartner() {
884   if (!partners_.singleton()) {
885     return nullptr;
886   }
887   TabVector_C_IT partner_it(&partners_);
888   TabVector *partner = partner_it.data();
889   BLOBNBOX_C_IT box_it1(&boxes_);
890   BLOBNBOX_C_IT box_it2(&partner->boxes_);
891   // Count how many boxes are also in the other list.
892   // At the same time, gather the mean width and median vertical gap.
893   if (textord_debug_tabfind > 1) {
894     Print("Testing for vertical text");
895     partner->Print("           partner");
896   }
897   int num_matched = 0;
898   int num_unmatched = 0;
899   int total_widths = 0;
900   int width = startpt().x() - partner->startpt().x();
901   if (width < 0) {
902     width = -width;
903   }
904   STATS gaps(0, width * 2);
905   BLOBNBOX *prev_bbox = nullptr;
906   box_it2.mark_cycle_pt();
907   for (box_it1.mark_cycle_pt(); !box_it1.cycled_list(); box_it1.forward()) {
908     BLOBNBOX *bbox = box_it1.data();
909     TBOX box = bbox->bounding_box();
910     if (prev_bbox != nullptr) {
911       gaps.add(box.bottom() - prev_bbox->bounding_box().top(), 1);
912     }
913     while (!box_it2.cycled_list() && box_it2.data() != bbox &&
914            box_it2.data()->bounding_box().bottom() < box.bottom()) {
915       box_it2.forward();
916     }
917     if (!box_it2.cycled_list() && box_it2.data() == bbox && bbox->region_type() >= BRT_UNKNOWN &&
918         (prev_bbox == nullptr || prev_bbox->region_type() >= BRT_UNKNOWN)) {
919       ++num_matched;
920     } else {
921       ++num_unmatched;
922     }
923     total_widths += box.width();
924     prev_bbox = bbox;
925   }
926   if (num_unmatched + num_matched == 0) {
927     return nullptr;
928   }
929   double avg_width = total_widths * 1.0 / (num_unmatched + num_matched);
930   double max_gap = textord_tabvector_vertical_gap_fraction * avg_width;
931   int min_box_match =
932       static_cast<int>((num_matched + num_unmatched) * textord_tabvector_vertical_box_ratio);
933   bool is_vertical =
934       (gaps.get_total() > 0 && num_matched >= min_box_match && gaps.median() <= max_gap);
935   if (textord_debug_tabfind > 1) {
936     tprintf(
937         "gaps=%d, matched=%d, unmatched=%d, min_match=%d "
938         "median gap=%.2f, width=%.2f max_gap=%.2f Vertical=%s\n",
939         gaps.get_total(), num_matched, num_unmatched, min_box_match, gaps.median(), avg_width,
940         max_gap, is_vertical ? "Yes" : "No");
941   }
942   return (is_vertical) ? partner : nullptr;
943 }
944 
945 // The constructor is private.
TabVector(int extended_ymin,int extended_ymax,TabAlignment alignment,BLOBNBOX_CLIST * boxes)946 TabVector::TabVector(int extended_ymin, int extended_ymax, TabAlignment alignment,
947                      BLOBNBOX_CLIST *boxes)
948     : extended_ymin_(extended_ymin)
949     , extended_ymax_(extended_ymax)
950     , sort_key_(0)
951     , percent_score_(0)
952     , mean_width_(0)
953     , needs_refit_(true)
954     , needs_evaluation_(true)
955     , alignment_(alignment)
956     , top_constraints_(nullptr)
957     , bottom_constraints_(nullptr) {
958   BLOBNBOX_C_IT it(&boxes_);
959   it.add_list_after(boxes);
960 }
961 
962 // Delete this, but first, repoint all the partners to point to
963 // replacement. If replacement is nullptr, then partner relationships
964 // are removed.
Delete(TabVector * replacement)965 void TabVector::Delete(TabVector *replacement) {
966   TabVector_C_IT it(&partners_);
967   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
968     TabVector *partner = it.data();
969     TabVector_C_IT p_it(&partner->partners_);
970     // If partner already has replacement in its list, then make
971     // replacement null, and just remove this TabVector when we find it.
972     TabVector *partner_replacement = replacement;
973     for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
974       TabVector *p_partner = p_it.data();
975       if (p_partner == partner_replacement) {
976         partner_replacement = nullptr;
977         break;
978       }
979     }
980     // Remove all references to this, and replace with replacement if not
981     // nullptr.
982     for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
983       TabVector *p_partner = p_it.data();
984       if (p_partner == this) {
985         p_it.extract();
986         if (partner_replacement != nullptr) {
987           p_it.add_before_stay_put(partner_replacement);
988         }
989       }
990     }
991     if (partner_replacement != nullptr) {
992       partner_replacement->AddPartner(partner);
993     }
994   }
995   delete this;
996 }
997 
998 } // namespace tesseract.
999