1 /**********************************************************************
2  * File:        blobbox.h  (Formerly blobnbox.h)
3  * Description: Code for the textord blob class.
4  * Author:      Ray Smith
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #ifndef BLOBBOX_H
20 #define BLOBBOX_H
21 
22 #include "elst.h"       // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
23 #include "elst2.h"      // for ELIST2_ITERATOR, ELIST2IZEH, ELIST2_LINK
24 #include "errcode.h"    // for ASSERT_HOST
25 #include "ocrblock.h"   // for BLOCK
26 #include "params.h"     // for DoubleParam, double_VAR_H
27 #include "pdblock.h"    // for PDBLK
28 #include "points.h"     // for FCOORD, ICOORD, ICOORDELT_LIST
29 #include "quspline.h"   // for QSPLINE
30 #include "rect.h"       // for TBOX
31 #include "scrollview.h" // for ScrollView, ScrollView::Color
32 #include "statistc.h"   // for STATS
33 #include "stepblob.h"   // for C_BLOB
34 #include "tprintf.h"    // for tprintf
35 #include "werd.h"       // for WERD_LIST
36 
37 #include <cinttypes> // for PRId32
38 #include <cmath>     // for std::sqrt
39 #include <cstdint>   // for int16_t, int32_t
40 
41 struct Pix;
42 
43 namespace tesseract {
44 
45 class C_OUTLINE;
46 
// Result of a pitch decision (see the pitch_decision members of TO_ROW and
// TO_BLOCK). The values are written out explicitly and match the original
// implicit numbering.
enum PITCH_TYPE {
  PITCH_DUNNO = 0,       // insufficient data
  PITCH_DEF_FIXED = 1,   // definitely fixed
  PITCH_MAYBE_FIXED = 2, // could be
  PITCH_DEF_PROP = 3,    // NOTE(review): presumably "definitely proportional" - confirm
  PITCH_MAYBE_PROP = 4,  // NOTE(review): presumably "could be proportional" - confirm
  PITCH_CORR_FIXED = 5,  // NOTE(review): meaning of CORR is not documented here - confirm
  PITCH_CORR_PROP = 6    // NOTE(review): meaning of CORR is not documented here - confirm
};
56 
// Tab-stop classification for one side of a BLOBNBOX.
// The numeric order is significant: it is used for deleting dead-ends in the
// search, so the ALIGNED, CONFIRMED and VLINE members must stay greater than
// the non-aligned, unset and deleted members. The values are therefore
// spelled out explicitly.
enum TabType {
  TT_NONE = 0,          // Not a tab.
  TT_DELETED = 1,       // Not a tab after detailed analysis.
  TT_MAYBE_RAGGED = 2,  // Initial designation of a tab-stop candidate.
  TT_MAYBE_ALIGNED = 3, // Initial designation of a tab-stop candidate.
  TT_CONFIRMED = 4,     // Aligned with neighbours.
  TT_VLINE = 5          // Detected as a vertical line.
};
69 
// Region classification of a BLOBNBOX.
// Invariant: every text type is greater than BRT_UNKNOWN and every image
// type is less than it.
// Keep in sync with kBlobTypes in colpartition.cpp, with BoxColor, and with
// the static *Type predicates of BLOBNBOX.
enum BlobRegionType {
  BRT_NOISE = 0,     // Neither text nor image.
  BRT_HLINE = 1,     // Horizontal separator line.
  BRT_VLINE = 2,     // Vertical separator line.
  BRT_RECTIMAGE = 3, // Rectangular image.
  BRT_POLYIMAGE = 4, // Non-rectangular image.
  BRT_UNKNOWN = 5,   // Not determined yet.
  BRT_VERT_TEXT = 6, // Vertical alignment, not necessarily vertically oriented.
  BRT_TEXT = 7,      // Convincing text.

  BRT_COUNT // Number of possibilities; left implicit so it follows the last member.
};
86 
// Index into arrays that hold one entry per neighbour direction.
// NOTE: the values must stay as they are, because XOR with 2 is used to flip
// a direction to its opposite (0 <-> 2 and 1 <-> 3).
enum BlobNeighbourDir {
  BND_LEFT = 0,
  BND_BELOW = 1,
  BND_RIGHT = 2,
  BND_ABOVE = 3,
  BND_COUNT // Number of directions; also the size of the per-direction arrays.
};
90 
// Special kinds of text character, such as a math symbol or italic.
enum BlobSpecialTextType {
  BSTT_NONE = 0,    // No special.
  BSTT_ITALIC = 1,  // Italic style.
  BSTT_DIGIT = 2,   // Digit symbols.
  BSTT_MATH = 3,    // Mathematical symbols (not including digit).
  BSTT_UNCLEAR = 4, // Characters with low recognition rate.
  BSTT_SKIP = 5,    // Characters that we skip labeling (usually too small).
  BSTT_COUNT        // Number of types; left implicit so it follows the last member.
};
101 
DirOtherWay(BlobNeighbourDir dir)102 inline BlobNeighbourDir DirOtherWay(BlobNeighbourDir dir) {
103   return static_cast<BlobNeighbourDir>(dir ^ 2);
104 }
105 
// Quality of the neighbouring information for a chain of connected
// components, either horizontally or vertically. ColPartition also uses it
// for the collection of blobs within, which should all have the same value
// in most cases.
enum BlobTextFlowType {
  BTFT_NONE = 0,          // No text flow set yet.
  BTFT_NONTEXT = 1,       // Flow too poor to be likely text.
  BTFT_NEIGHBOURS = 2,    // Neighbours support flow in this direction.
  BTFT_CHAIN = 3,         // There is a weak chain of text in this direction.
  BTFT_STRONG_CHAIN = 4,  // There is a strong chain of text in this direction.
  BTFT_TEXT_ON_IMAGE = 5, // There is a strong chain of text on an image.
  BTFT_LEADER = 6,        // Leader dots/dashes etc.
  BTFT_COUNT              // Number of types; left implicit so it follows the last member.
};
120 
121 // Returns true if type1 dominates type2 in a merge. Mostly determined by the
122 // ordering of the enum, LEADER is weak and dominates nothing.
123 // The function is anti-symmetric (t1 > t2) === !(t2 > t1), except that
124 // this cannot be true if t1 == t2, so the result is undefined.
DominatesInMerge(BlobTextFlowType type1,BlobTextFlowType type2)125 inline bool DominatesInMerge(BlobTextFlowType type1, BlobTextFlowType type2) {
126   // LEADER always loses.
127   if (type1 == BTFT_LEADER) {
128     return false;
129   }
130   if (type2 == BTFT_LEADER) {
131     return true;
132   }
133   // With those out of the way, the ordering of the enum determines the result.
134   return type1 >= type2;
135 }
136 
// Forward declaration: BLOBNBOX only holds a ColPartition pointer (its owner).
class ColPartition;

// BLOBNBOX has to be declared before ELISTIZEH can name it.
class BLOBNBOX;
// NOTE(review): presumably this is what declares BLOBNBOX_LIST and
// BLOBNBOX_IT, both of which are used further down - confirm in elst.h.
ELISTIZEH(BLOBNBOX)
141 class BLOBNBOX : public ELIST_LINK {
142 public:
143   BLOBNBOX() {
144     ReInit();
145   }
146   explicit BLOBNBOX(C_BLOB *srcblob) {
147     box = srcblob->bounding_box();
148     ReInit();
149     cblob_ptr = srcblob;
150     area = static_cast<int>(srcblob->area());
151   }
152   ~BLOBNBOX() {
153     if (owns_cblob_) {
154       delete cblob_ptr;
155     }
156   }
157 
158   static void clear_blobnboxes(BLOBNBOX_LIST *boxes) {
159     BLOBNBOX_IT it = boxes;
160     // A BLOBNBOX generally doesn't own its blobs, so if they do, you
161     // have to delete them explicitly.
162     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
163       BLOBNBOX *box = it.data();
164       // TODO: remove next line, currently still needed for resultiterator_test.
165       delete box->remove_cblob();
166     }
167   }
168 
169   static BLOBNBOX *RealBlob(C_OUTLINE *outline) {
170     auto *blob = new C_BLOB(outline);
171     return new BLOBNBOX(blob);
172   }
173 
174   // Rotates the box and the underlying blob.
175   void rotate(FCOORD rotation);
176 
177   // Methods that act on the box without touching the underlying blob.
178   // Reflect the box in the y-axis, leaving the underlying blob untouched.
179   void reflect_box_in_y_axis();
180   // Rotates the box by the angle given by rotation.
181   // If the blob is a diacritic, then only small rotations for skew
182   // correction can be applied.
183   void rotate_box(FCOORD rotation);
184   // Moves just the box by the given vector.
185   void translate_box(ICOORD v) {
186     if (IsDiacritic()) {
187       box.move(v);
188       base_char_top_ += v.y();
189       base_char_bottom_ += v.y();
190     } else {
191       box.move(v);
192       set_diacritic_box(box);
193     }
194   }
195   void merge(BLOBNBOX *nextblob);
196   void really_merge(BLOBNBOX *other);
197   void chop(                 // fake chop blob
198       BLOBNBOX_IT *start_it, // location of this
199       BLOBNBOX_IT *blob_it,  // iterator
200       FCOORD rotation,       // for landscape
201       float xheight);        // line height
202 
203   void NeighbourGaps(int gaps[BND_COUNT]) const;
204   void MinMaxGapsClipped(int *h_min, int *h_max, int *v_min, int *v_max) const;
205   void CleanNeighbours();
206   // Returns positive if there is at least one side neighbour that has a
207   // similar stroke width and is not on the other side of a rule line.
208   int GoodTextBlob() const;
209   // Returns the number of side neighbours that are of type BRT_NOISE.
210   int NoisyNeighbours() const;
211 
212   // Returns true if the blob is noise and has no owner.
213   bool DeletableNoise() const {
214     return owner() == nullptr && region_type() == BRT_NOISE;
215   }
216 
217   // Returns true, and sets vert_possible/horz_possible if the blob has some
218   // feature that makes it individually appear to flow one way.
219   // eg if it has a high aspect ratio, yet has a complex shape, such as a
220   // joined word in Latin, Arabic, or Hindi, rather than being a -, I, l, 1.
221   bool DefiniteIndividualFlow();
222 
223   // Returns true if there is no tabstop violation in merging this and other.
224   bool ConfirmNoTabViolation(const BLOBNBOX &other) const;
225 
226   // Returns true if other has a similar stroke width to this.
227   bool MatchingStrokeWidth(const BLOBNBOX &other, double fractional_tolerance,
228                            double constant_tolerance) const;
229 
230   // Returns a bounding box of the outline contained within the
231   // given horizontal range.
232   TBOX BoundsWithinLimits(int left, int right);
233 
234   // Estimates and stores the baseline position based on the shape of the
235   // outline.
236   void EstimateBaselinePosition();
237 
238   // Simple accessors.
239   const TBOX &bounding_box() const {
240     return box;
241   }
242   // Set the bounding box. Use with caution.
243   // Normally use compute_bounding_box instead.
244   void set_bounding_box(const TBOX &new_box) {
245     box = new_box;
246     base_char_top_ = box.top();
247     base_char_bottom_ = box.bottom();
248   }
249   void compute_bounding_box() {
250     box = cblob_ptr->bounding_box();
251     base_char_top_ = box.top();
252     base_char_bottom_ = box.bottom();
253     baseline_y_ = box.bottom();
254   }
255   const TBOX &reduced_box() const {
256     return red_box;
257   }
258   void set_reduced_box(TBOX new_box) {
259     red_box = new_box;
260     reduced = true;
261   }
262   int32_t enclosed_area() const {
263     return area;
264   }
265   bool joined_to_prev() const {
266     return joined;
267   }
268   bool red_box_set() const {
269     return reduced;
270   }
271   int repeated_set() const {
272     return repeated_set_;
273   }
274   void set_repeated_set(int set_id) {
275     repeated_set_ = set_id;
276   }
277   C_BLOB *cblob() const {
278     return cblob_ptr;
279   }
280   C_BLOB *remove_cblob() {
281     auto blob = cblob_ptr;
282     cblob_ptr = nullptr;
283     owns_cblob_ = false;
284     return blob;
285   }
286   TabType left_tab_type() const {
287     return left_tab_type_;
288   }
289   void set_left_tab_type(TabType new_type) {
290     left_tab_type_ = new_type;
291   }
292   TabType right_tab_type() const {
293     return right_tab_type_;
294   }
295   void set_right_tab_type(TabType new_type) {
296     right_tab_type_ = new_type;
297   }
298   BlobRegionType region_type() const {
299     return region_type_;
300   }
301   void set_region_type(BlobRegionType new_type) {
302     region_type_ = new_type;
303   }
304   BlobSpecialTextType special_text_type() const {
305     return spt_type_;
306   }
307   void set_special_text_type(BlobSpecialTextType new_type) {
308     spt_type_ = new_type;
309   }
310   BlobTextFlowType flow() const {
311     return flow_;
312   }
313   void set_flow(BlobTextFlowType value) {
314     flow_ = value;
315   }
316   bool vert_possible() const {
317     return vert_possible_;
318   }
319   void set_vert_possible(bool value) {
320     vert_possible_ = value;
321   }
322   bool horz_possible() const {
323     return horz_possible_;
324   }
325   void set_horz_possible(bool value) {
326     horz_possible_ = value;
327   }
328   int left_rule() const {
329     return left_rule_;
330   }
331   void set_left_rule(int new_left) {
332     left_rule_ = new_left;
333   }
334   int right_rule() const {
335     return right_rule_;
336   }
337   void set_right_rule(int new_right) {
338     right_rule_ = new_right;
339   }
340   int left_crossing_rule() const {
341     return left_crossing_rule_;
342   }
343   void set_left_crossing_rule(int new_left) {
344     left_crossing_rule_ = new_left;
345   }
346   int right_crossing_rule() const {
347     return right_crossing_rule_;
348   }
349   void set_right_crossing_rule(int new_right) {
350     right_crossing_rule_ = new_right;
351   }
352   float horz_stroke_width() const {
353     return horz_stroke_width_;
354   }
355   void set_horz_stroke_width(float width) {
356     horz_stroke_width_ = width;
357   }
358   float vert_stroke_width() const {
359     return vert_stroke_width_;
360   }
361   void set_vert_stroke_width(float width) {
362     vert_stroke_width_ = width;
363   }
364   float area_stroke_width() const {
365     return area_stroke_width_;
366   }
367   tesseract::ColPartition *owner() const {
368     return owner_;
369   }
370   void set_owner(tesseract::ColPartition *new_owner) {
371     owner_ = new_owner;
372   }
373   bool leader_on_left() const {
374     return leader_on_left_;
375   }
376   void set_leader_on_left(bool flag) {
377     leader_on_left_ = flag;
378   }
379   bool leader_on_right() const {
380     return leader_on_right_;
381   }
382   void set_leader_on_right(bool flag) {
383     leader_on_right_ = flag;
384   }
385   BLOBNBOX *neighbour(BlobNeighbourDir n) const {
386     return neighbours_[n];
387   }
388   bool good_stroke_neighbour(BlobNeighbourDir n) const {
389     return good_stroke_neighbours_[n];
390   }
391   void set_neighbour(BlobNeighbourDir n, BLOBNBOX *neighbour, bool good) {
392     neighbours_[n] = neighbour;
393     good_stroke_neighbours_[n] = good;
394   }
395   bool IsDiacritic() const {
396     return base_char_top_ != box.top() || base_char_bottom_ != box.bottom();
397   }
398   int base_char_top() const {
399     return base_char_top_;
400   }
401   int base_char_bottom() const {
402     return base_char_bottom_;
403   }
404   int baseline_position() const {
405     return baseline_y_;
406   }
407   int line_crossings() const {
408     return line_crossings_;
409   }
410   void set_line_crossings(int value) {
411     line_crossings_ = value;
412   }
413   void set_diacritic_box(const TBOX &diacritic_box) {
414     base_char_top_ = diacritic_box.top();
415     base_char_bottom_ = diacritic_box.bottom();
416   }
417   BLOBNBOX *base_char_blob() const {
418     return base_char_blob_;
419   }
420   void set_base_char_blob(BLOBNBOX *blob) {
421     base_char_blob_ = blob;
422   }
423   void set_owns_cblob(bool value) {
424     owns_cblob_ = value;
425   }
426 
427   bool UniquelyVertical() const {
428     return vert_possible_ && !horz_possible_;
429   }
430   bool UniquelyHorizontal() const {
431     return horz_possible_ && !vert_possible_;
432   }
433 
434   // Returns true if the region type is text.
435   static bool IsTextType(BlobRegionType type) {
436     return type == BRT_TEXT || type == BRT_VERT_TEXT;
437   }
438   // Returns true if the region type is image.
439   static bool IsImageType(BlobRegionType type) {
440     return type == BRT_RECTIMAGE || type == BRT_POLYIMAGE;
441   }
442   // Returns true if the region type is line.
443   static bool IsLineType(BlobRegionType type) {
444     return type == BRT_HLINE || type == BRT_VLINE;
445   }
446   // Returns true if the region type cannot be merged.
447   static bool UnMergeableType(BlobRegionType type) {
448     return IsLineType(type) || IsImageType(type);
449   }
450   // Helper to call CleanNeighbours on all blobs on the list.
451   static void CleanNeighbours(BLOBNBOX_LIST *blobs);
452   // Helper to delete all the deletable blobs on the list.
453   static void DeleteNoiseBlobs(BLOBNBOX_LIST *blobs);
454   // Helper to compute edge offsets for  all the blobs on the list.
455   // See coutln.h for an explanation of edge offsets.
456   static void ComputeEdgeOffsets(Image thresholds, Image grey, BLOBNBOX_LIST *blobs);
457 
458 #ifndef GRAPHICS_DISABLED
459   // Helper to draw all the blobs on the list in the given body_colour,
460   // with child outlines in the child_colour.
461   static void PlotBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour,
462                         ScrollView::Color child_colour, ScrollView *win);
463   // Helper to draw only DeletableNoise blobs (unowned, BRT_NOISE) on the
464   // given list in the given body_colour, with child outlines in the
465   // child_colour.
466   static void PlotNoiseBlobs(BLOBNBOX_LIST *list, ScrollView::Color body_colour,
467                              ScrollView::Color child_colour, ScrollView *win);
468 
469   static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type);
470 
471   // Keep in sync with BlobRegionType.
472   ScrollView::Color BoxColor() const;
473 
474   void plot(ScrollView *window,              // window to draw in
475             ScrollView::Color blob_colour,   // for outer bits
476             ScrollView::Color child_colour); // for holes
477 #endif
478 
479   // Initializes members set by StrokeWidth and beyond, without discarding
480   // stored area and strokewidth values, which are expensive to calculate.
481   void ReInit() {
482     joined = false;
483     reduced = false;
484     repeated_set_ = 0;
485     left_tab_type_ = TT_NONE;
486     right_tab_type_ = TT_NONE;
487     region_type_ = BRT_UNKNOWN;
488     flow_ = BTFT_NONE;
489     spt_type_ = BSTT_SKIP;
490     left_rule_ = 0;
491     right_rule_ = 0;
492     left_crossing_rule_ = 0;
493     right_crossing_rule_ = 0;
494     if (area_stroke_width_ == 0.0f && area > 0 && cblob() != nullptr && cblob()->perimeter() != 0) {
495       area_stroke_width_ = 2.0f * area / cblob()->perimeter();
496     }
497     owner_ = nullptr;
498     base_char_top_ = box.top();
499     base_char_bottom_ = box.bottom();
500     baseline_y_ = box.bottom();
501     line_crossings_ = 0;
502     base_char_blob_ = nullptr;
503     horz_possible_ = false;
504     vert_possible_ = false;
505     leader_on_left_ = false;
506     leader_on_right_ = false;
507     ClearNeighbours();
508   }
509 
510   void ClearNeighbours() {
511     for (int n = 0; n < BND_COUNT; ++n) {
512       neighbours_[n] = nullptr;
513       good_stroke_neighbours_[n] = false;
514     }
515   }
516 
517 private:
518   C_BLOB *cblob_ptr = nullptr;               // edgestep blob
519   TBOX box;                                  // bounding box
520   TBOX red_box;                              // bounding box
521   int32_t area = 0;                          // enclosed area
522   int32_t repeated_set_ = 0;                 // id of the set of repeated blobs
523   TabType left_tab_type_ = TT_NONE;          // Indicates tab-stop assessment
524   TabType right_tab_type_ = TT_NONE;         // Indicates tab-stop assessment
525   BlobRegionType region_type_ = BRT_UNKNOWN; // Type of region this blob belongs to
526   BlobTextFlowType flow_ = BTFT_NONE;        // Quality of text flow.
527   BlobSpecialTextType spt_type_;             // Special text type.
528   bool joined = false;                       // joined to prev
529   bool reduced = false;                      // reduced box set
530   int16_t left_rule_ = 0;                    // x-coord of nearest but not crossing rule line
531   int16_t right_rule_ = 0;                   // x-coord of nearest but not crossing rule line
532   int16_t left_crossing_rule_;               // x-coord of nearest or crossing rule line
533   int16_t right_crossing_rule_;              // x-coord of nearest or crossing rule line
534   int16_t base_char_top_;                    // y-coord of top/bottom of diacritic base,
535   int16_t base_char_bottom_;                 // if it exists else top/bottom of this blob.
536   int16_t baseline_y_;                       // Estimate of baseline position.
537   int32_t line_crossings_;                   // Number of line intersections touched.
538   BLOBNBOX *base_char_blob_;                 // The blob that was the base char.
539   tesseract::ColPartition *owner_;           // Who will delete me when I am not needed
540   BLOBNBOX *neighbours_[BND_COUNT];
541   float horz_stroke_width_ = 0.0f; // Median horizontal stroke width
542   float vert_stroke_width_ = 0.0f; // Median vertical stroke width
543   float area_stroke_width_ = 0.0f; // Stroke width from area/perimeter ratio.
544   bool good_stroke_neighbours_[BND_COUNT];
545   bool horz_possible_;   // Could be part of horizontal flow.
546   bool vert_possible_;   // Could be part of vertical flow.
547   bool leader_on_left_;  // There is a leader to the left.
548   bool leader_on_right_; // There is a leader to the right.
549   // Iff true, then the destructor should delete the cblob_ptr.
550   // TODO(rays) migrate all uses to correctly setting this flag instead of
551   // deleting the C_BLOB before deleting the BLOBNBOX.
552   bool owns_cblob_ = false;
553 };
554 
555 class TO_ROW : public ELIST2_LINK {
556 public:
557   static const int kErrorWeight = 3;
558 
TO_ROW()559   TO_ROW() {
560     clear();
561   }                   // empty
562   TO_ROW(             // constructor
563       BLOBNBOX *blob, // from first blob
564       float top,      // of row //target height
565       float bottom, float row_size);
566 
567   void print() const;
max_y()568   float max_y() const { // access function
569     return y_max;
570   }
min_y()571   float min_y() const {
572     return y_min;
573   }
mean_y()574   float mean_y() const {
575     return (y_min + y_max) / 2.0f;
576   }
initial_min_y()577   float initial_min_y() const {
578     return initial_y_min;
579   }
line_m()580   float line_m() const { // access to line fit
581     return m;
582   }
line_c()583   float line_c() const {
584     return c;
585   }
line_error()586   float line_error() const {
587     return error;
588   }
parallel_c()589   float parallel_c() const {
590     return para_c;
591   }
parallel_error()592   float parallel_error() const {
593     return para_error;
594   }
believability()595   float believability() const { // baseline goodness
596     return credibility;
597   }
intercept()598   float intercept() const { // real parallel_c
599     return y_origin;
600   }
601   void add_blob(      // put in row
602       BLOBNBOX *blob, // blob to add
603       float top,      // of row //target height
604       float bottom, float row_size);
605   void insert_blob( // put in row in order
606       BLOBNBOX *blob);
607 
blob_list()608   BLOBNBOX_LIST *blob_list() { // get list
609     return &blobs;
610   }
611 
set_line(float new_m,float new_c,float new_error)612   void set_line(   // set line spec
613       float new_m, // line to set
614       float new_c, float new_error) {
615     m = new_m;
616     c = new_c;
617     error = new_error;
618   }
set_parallel_line(float gradient,float new_c,float new_error)619   void set_parallel_line( // set fixed gradient line
620       float gradient,     // page gradient
621       float new_c, float new_error) {
622     para_c = new_c;
623     para_error = new_error;
624     credibility = blobs.length() - kErrorWeight * new_error;
625     y_origin = new_c / std::sqrt(1 + gradient * gradient);
626     // real intercept
627   }
set_limits(float new_min,float new_max)628   void set_limits(     // set min,max
629       float new_min,   // bottom and
630       float new_max) { // top of row
631     y_min = new_min;
632     y_max = new_max;
633   }
634   void compute_vertical_projection();
635   // get projection
636 
rep_chars_marked()637   bool rep_chars_marked() const {
638     return num_repeated_sets_ != -1;
639   }
clear_rep_chars_marked()640   void clear_rep_chars_marked() {
641     num_repeated_sets_ = -1;
642   }
num_repeated_sets()643   int num_repeated_sets() const {
644     return num_repeated_sets_;
645   }
set_num_repeated_sets(int num_sets)646   void set_num_repeated_sets(int num_sets) {
647     num_repeated_sets_ = num_sets;
648   }
649 
650   // true when dead
651   bool merged = false;
652   bool all_caps;             // had no ascenders
653   bool used_dm_model;        // in guessing pitch
654   int16_t projection_left;   // start of projection
655   int16_t projection_right;  // start of projection
656   PITCH_TYPE pitch_decision; // how strong is decision
657   float fixed_pitch;         // pitch or 0
658   float fp_space;            // sp if fixed pitch
659   float fp_nonsp;            // nonsp if fixed pitch
660   float pr_space;            // sp if prop
661   float pr_nonsp;            // non sp if prop
662   float spacing;             // to "next" row
663   float xheight;             // of line
664   int xheight_evidence;      // number of blobs of height xheight
665   float ascrise;             // ascenders
666   float descdrop;            // descenders
667   float body_size;           // of CJK characters.  Assumed to be
668                              // xheight+ascrise for non-CJK text.
669   int32_t min_space;         // min size for real space
670   int32_t max_nonspace;      // max size of non-space
671   int32_t space_threshold;   // space vs nonspace
672   float kern_size;           // average non-space
673   float space_size;          // average space
674   WERD_LIST rep_words;       // repeated chars
675   ICOORDELT_LIST char_cells; // fixed pitch cells
676   QSPLINE baseline;          // curved baseline
677   STATS projection;          // vertical projection
678 
679 private:
680   void clear(); // clear all values to reasonable defaults
681 
682   BLOBNBOX_LIST blobs; // blobs in row
683   float y_min;         // coords
684   float y_max;
685   float initial_y_min;
686   float m, c;   // line spec
687   float error;  // line error
688   float para_c; // constrained fit
689   float para_error;
690   float y_origin;         // rotated para_c;
691   float credibility;      // baseline believability
692   int num_repeated_sets_; // number of sets of repeated blobs
693                           // set to -1 if we have not searched
694                           // for repeated blobs in this row yet
695 };
696 
// NOTE(review): presumably this is what declares TO_ROW_LIST and TO_ROW_IT,
// both of which TO_BLOCK uses below - confirm in elst2.h.
ELIST2IZEH(TO_ROW)
698 class TESS_API TO_BLOCK : public ELIST_LINK {
699 public:
700   TO_BLOCK() : pitch_decision(PITCH_DUNNO) {
701     clear();
702   }                      // empty
703   TO_BLOCK(              // constructor
704       BLOCK *src_block); // real block
705   ~TO_BLOCK();
706 
707   void clear(); // clear all scalar members.
708 
709   TO_ROW_LIST *get_rows() { // access function
710     return &row_list;
711   }
712 
713   // Rotate all the blobnbox lists and the underlying block. Then update the
714   // median size statistic from the blobs list.
715   void rotate(const FCOORD &rotation) {
716     BLOBNBOX_LIST *blobnbox_list[] = {&blobs,       &underlines,  &noise_blobs,
717                                       &small_blobs, &large_blobs, nullptr};
718     for (BLOBNBOX_LIST **list = blobnbox_list; *list != nullptr; ++list) {
719       BLOBNBOX_IT it(*list);
720       for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
721         it.data()->rotate(rotation);
722       }
723     }
724     // Rotate the block
725     ASSERT_HOST(block->pdblk.poly_block() != nullptr);
726     block->rotate(rotation);
727     // Update the median size statistic from the blobs list.
728     STATS widths(0, block->pdblk.bounding_box().width());
729     STATS heights(0, block->pdblk.bounding_box().height());
730     BLOBNBOX_IT blob_it(&blobs);
731     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
732       widths.add(blob_it.data()->bounding_box().width(), 1);
733       heights.add(blob_it.data()->bounding_box().height(), 1);
734     }
735     block->set_median_size(static_cast<int>(widths.median() + 0.5),
736                            static_cast<int>(heights.median() + 0.5));
737   }
738 
739   void print_rows() { // debug info
740     TO_ROW_IT row_it = &row_list;
741     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
742       auto row = row_it.data();
743       tprintf("Row range (%g,%g), para_c=%g, blobcount=%" PRId32 "\n",
744               static_cast<double>(row->min_y()),
745               static_cast<double>(row->max_y()),
746               static_cast<double>(row->parallel_c()),
747               row->blob_list()->length());
748     }
749   }
750 
751   // Reorganizes the blob lists with a different definition of small, medium
752   // and large, compared to the original definition.
753   // Height is still the primary filter key, but medium width blobs of small
754   // height become medium, and very wide blobs of small height stay small.
755   void ReSetAndReFilterBlobs();
756 
757   // Deletes noise blobs from all lists where not owned by a ColPartition.
758   void DeleteUnownedNoise();
759 
760   // Computes and stores the edge offsets on each blob for use in feature
761   // extraction, using greyscale if the supplied grey and thresholds pixes
762   // are 8-bit or otherwise (if nullptr or not 8 bit) the original binary
763   // edge step outlines.
764   // Thresholds must either be the same size as grey or an integer down-scale
765   // of grey.
766   // See coutln.h for an explanation of edge offsets.
767   void ComputeEdgeOffsets(Image thresholds, Image grey);
768 
769 #ifndef GRAPHICS_DISABLED
770   // Draw the noise blobs from all lists in red.
771   void plot_noise_blobs(ScrollView *to_win);
772   // Draw the blobs on on the various lists in the block in different colors.
773   void plot_graded_blobs(ScrollView *to_win);
774 #endif
775 
776   BLOBNBOX_LIST blobs;       // medium size
777   BLOBNBOX_LIST underlines;  // underline blobs
778   BLOBNBOX_LIST noise_blobs; // very small
779   BLOBNBOX_LIST small_blobs; // fairly small
780   BLOBNBOX_LIST large_blobs; // big blobs
781   BLOCK *block;              // real block
782   PITCH_TYPE pitch_decision; // how strong is decision
783   float line_spacing;        // estimate
784   // line_size is a lower-bound estimate of the font size in pixels of
785   // the text in the block (with ascenders and descenders), being a small
786   // (1.25) multiple of the median height of filtered blobs.
787   // In most cases the font size will be bigger, but it will be closer
788   // if the text is allcaps, or in a no-x-height script.
789   float line_size;       // estimate
790   float max_blob_size;   // line assignment limit
791   float baseline_offset; // phase shift
792   float xheight;         // median blob size
793   float fixed_pitch;     // pitch or 0
794   float kern_size;       // average non-space
795   float space_size;      // average space
796   int32_t min_space;     // min definite space
797   int32_t max_nonspace;  // max definite
798   float fp_space;        // sp if fixed pitch
799   float fp_nonsp;        // nonsp if fixed pitch
800   float pr_space;        // sp if prop
801   float pr_nonsp;        // non sp if prop
802   TO_ROW *key_row;       // starting row
803 
804 private:
805   TO_ROW_LIST row_list; // temporary rows
806 };
807 
// NOTE(review): presumably declares the TO_BLOCK list and iterator types in
// the same way as the other ELISTIZEH uses in this file - confirm in elst.h.
ELISTIZEH(TO_BLOCK)
809 void find_cblob_limits( // get y limits
810     C_BLOB *blob,       // blob to search
811     float leftx,        // x limits
812     float rightx,
813     FCOORD rotation, // for landscape
814     float &ymin,     // output y limits
815     float &ymax);
816 void find_cblob_vlimits( // get y limits
817     C_BLOB *blob,        // blob to search
818     float leftx,         // x limits
819     float rightx,
820     float &ymin, // output y limits
821     float &ymax);
822 void find_cblob_hlimits( // get x limits
823     C_BLOB *blob,        // blob to search
824     float bottomy,       // y limits
825     float topy,
826     float &xmin, // output x limits
827     float &xymax);
828 C_BLOB *crotate_cblob( // rotate it
829     C_BLOB *blob,      // blob to search
830     FCOORD rotation    // for landscape
831 );
832 TBOX box_next(      // get bounding box
833     BLOBNBOX_IT *it // iterator to blobds
834 );
835 TBOX box_next_pre_chopped( // get bounding box
836     BLOBNBOX_IT *it        // iterator to blobds
837 );
838 void vertical_cblob_projection( // project outlines
839     C_BLOB *blob,               // blob to project
840     STATS *stats                // output
841 );
842 void vertical_coutline_projection( // project outlines
843     C_OUTLINE *outline,            // outline to project
844     STATS *stats                   // output
845 );
846 #ifndef GRAPHICS_DISABLED
847 void plot_blob_list(ScrollView *win,                 // window to draw in
848                     BLOBNBOX_LIST *list,             // blob list
849                     ScrollView::Color body_colour,   // colour to draw
850                     ScrollView::Color child_colour); // colour of child
851 #endif                                               // !GRAPHICS_DISABLED
852 
853 } // namespace tesseract
854 
855 #endif
856