1 /**********************************************************************
2  * File: ratngs.cpp  (Formerly ratings.c)
3  * Description: Code to manipulate the BLOB_CHOICE and WERD_CHOICE classes.
4  * Author: Ray Smith
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #ifdef HAVE_CONFIG_H
20 #  include "config_auto.h"
21 #endif
22 
23 #include "ratngs.h"
24 
25 #include "blobs.h"
26 #include "matrix.h"
27 #include "normalis.h" // kBlnBaselineOffset.
28 #include "unicharset.h"
29 
30 #include <algorithm>
31 #include <cmath>
32 #include <string>
33 #include <vector>
34 
35 namespace tesseract {
36 
37 const float WERD_CHOICE::kBadRating = 100000.0;
38 // Min offset in baseline-normalized coords to make a character a subscript.
39 const int kMinSubscriptOffset = 20;
40 // Min offset in baseline-normalized coords to make a character a superscript.
41 const int kMinSuperscriptOffset = 20;
42 // Max y of bottom of a drop-cap blob.
43 const int kMaxDropCapBottom = -128;
44 // Max fraction of x-height to use as denominator in measuring x-height overlap.
45 const double kMaxOverlapDenominator = 0.125;
46 // Min fraction of x-height range that should be in agreement for matching
47 // x-heights.
48 const double kMinXHeightMatch = 0.5;
49 // Max tolerance on baseline position as a fraction of x-height for matching
50 // baselines.
51 const double kMaxBaselineDrift = 0.0625;
52 
// Printable names of the permuter types, one constant per type.
static constexpr char kPermuterTypeNoPerm[] = "None";
static constexpr char kPermuterTypePuncPerm[] = "Punctuation";
static constexpr char kPermuterTypeTopPerm[] = "Top Choice";
static constexpr char kPermuterTypeLowerPerm[] = "Top Lower Case";
static constexpr char kPermuterTypeUpperPerm[] = "Top Upper Case";
static constexpr char kPermuterTypeNgramPerm[] = "Ngram";
static constexpr char kPermuterTypeNumberPerm[] = "Number";
static constexpr char kPermuterTypeUserPatPerm[] = "User Pattern";
static constexpr char kPermuterTypeSysDawgPerm[] = "System Dictionary";
static constexpr char kPermuterTypeDocDawgPerm[] = "Document Dictionary";
static constexpr char kPermuterTypeUserDawgPerm[] = "User Dictionary";
static constexpr char kPermuterTypeFreqDawgPerm[] = "Frequent Words Dictionary";
static constexpr char kPermuterTypeCompoundPerm[] = "Compound";

// Lookup table from the numeric permuter value (the array index) to its name.
// The order of the entries is therefore significant.
static constexpr const char *kPermuterTypeNames[] = {
    /*  0 */ kPermuterTypeNoPerm,
    /*  1 */ kPermuterTypePuncPerm,
    /*  2 */ kPermuterTypeTopPerm,
    /*  3 */ kPermuterTypeLowerPerm,
    /*  4 */ kPermuterTypeUpperPerm,
    /*  5 */ kPermuterTypeNgramPerm,
    /*  6 */ kPermuterTypeNumberPerm,
    /*  7 */ kPermuterTypeUserPatPerm,
    /*  8 */ kPermuterTypeSysDawgPerm,
    /*  9 */ kPermuterTypeDocDawgPerm,
    /* 10 */ kPermuterTypeUserDawgPerm,
    /* 11 */ kPermuterTypeFreqDawgPerm,
    /* 12 */ kPermuterTypeCompoundPerm,
};
82 
83 /**
84  * BLOB_CHOICE::BLOB_CHOICE
85  *
86  * Constructor to build a BLOB_CHOICE from a char, rating and certainty.
87  */
BLOB_CHOICE(UNICHAR_ID src_unichar_id,float src_rating,float src_cert,int src_script_id,float min_xheight,float max_xheight,float yshift,BlobChoiceClassifier c)88 BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
89                          float src_rating,          // rating
90                          float src_cert,            // certainty
91                          int src_script_id,         // script
92                          float min_xheight,         // min xheight allowed
93                          float max_xheight,         // max xheight by this char
94                          float yshift,              // yshift out of position
95                          BlobChoiceClassifier c) {  // adapted match or other
96   unichar_id_ = src_unichar_id;
97   rating_ = src_rating;
98   certainty_ = src_cert;
99   fontinfo_id_ = -1;
100   fontinfo_id2_ = -1;
101   script_id_ = src_script_id;
102   min_xheight_ = min_xheight;
103   max_xheight_ = max_xheight;
104   yshift_ = yshift;
105   classifier_ = c;
106 }
107 
108 /**
109  * BLOB_CHOICE::BLOB_CHOICE
110  *
111  * Constructor to build a BLOB_CHOICE from another BLOB_CHOICE.
112  */
BLOB_CHOICE(const BLOB_CHOICE & other)113 BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) : ELIST_LINK(other) {
114   unichar_id_ = other.unichar_id();
115   rating_ = other.rating();
116   certainty_ = other.certainty();
117   fontinfo_id_ = other.fontinfo_id();
118   fontinfo_id2_ = other.fontinfo_id2();
119   script_id_ = other.script_id();
120   matrix_cell_ = other.matrix_cell_;
121   min_xheight_ = other.min_xheight_;
122   max_xheight_ = other.max_xheight_;
123   yshift_ = other.yshift();
124   classifier_ = other.classifier_;
125 #ifndef DISABLED_LEGACY_ENGINE
126   fonts_ = other.fonts_;
127 #endif // ndef DISABLED_LEGACY_ENGINE
128 }
129 
130 // Copy assignment operator.
operator =(const BLOB_CHOICE & other)131 BLOB_CHOICE &BLOB_CHOICE::operator=(const BLOB_CHOICE &other) {
132   ELIST_LINK::operator=(other);
133   unichar_id_ = other.unichar_id();
134   rating_ = other.rating();
135   certainty_ = other.certainty();
136   fontinfo_id_ = other.fontinfo_id();
137   fontinfo_id2_ = other.fontinfo_id2();
138   script_id_ = other.script_id();
139   matrix_cell_ = other.matrix_cell_;
140   min_xheight_ = other.min_xheight_;
141   max_xheight_ = other.max_xheight_;
142   yshift_ = other.yshift();
143   classifier_ = other.classifier_;
144 #ifndef DISABLED_LEGACY_ENGINE
145   fonts_ = other.fonts_;
146 #endif // ndef DISABLED_LEGACY_ENGINE
147   return *this;
148 }
149 
150 // Returns true if *this and other agree on the baseline and x-height
151 // to within some tolerance based on a given estimate of the x-height.
PosAndSizeAgree(const BLOB_CHOICE & other,float x_height,bool debug) const152 bool BLOB_CHOICE::PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const {
153   double baseline_diff = std::fabs(yshift() - other.yshift());
154   if (baseline_diff > kMaxBaselineDrift * x_height) {
155     if (debug) {
156       tprintf("Baseline diff %g for %d v %d\n", baseline_diff, unichar_id_, other.unichar_id_);
157     }
158     return false;
159   }
160   double this_range = max_xheight() - min_xheight();
161   double other_range = other.max_xheight() - other.min_xheight();
162   double denominator =
163       ClipToRange(std::min(this_range, other_range), 1.0, kMaxOverlapDenominator * x_height);
164   double overlap =
165       std::min(max_xheight(), other.max_xheight()) - std::max(min_xheight(), other.min_xheight());
166   overlap /= denominator;
167   if (debug) {
168     tprintf("PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n", unichar_id_,
169             other.unichar_id_, baseline_diff, this_range, other_range, denominator, overlap);
170   }
171 
172   return overlap >= kMinXHeightMatch;
173 }
174 
175 // Helper to find the BLOB_CHOICE in the bc_list that matches the given
176 // unichar_id, or nullptr if there is no match.
FindMatchingChoice(UNICHAR_ID char_id,BLOB_CHOICE_LIST * bc_list)177 BLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list) {
178   // Find the corresponding best BLOB_CHOICE.
179   BLOB_CHOICE_IT choice_it(bc_list);
180   for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
181     BLOB_CHOICE *choice = choice_it.data();
182     if (choice->unichar_id() == char_id) {
183       return choice;
184     }
185   }
186   return nullptr;
187 }
188 
permuter_name(uint8_t permuter)189 const char *WERD_CHOICE::permuter_name(uint8_t permuter) {
190   return kPermuterTypeNames[permuter];
191 }
192 
ScriptPosToString(enum ScriptPos script_pos)193 const char *ScriptPosToString(enum ScriptPos script_pos) {
194   switch (script_pos) {
195     case SP_NORMAL:
196       return "NORM";
197     case SP_SUBSCRIPT:
198       return "SUB";
199     case SP_SUPERSCRIPT:
200       return "SUPER";
201     case SP_DROPCAP:
202       return "DROPC";
203   }
204   return "SP_UNKNOWN";
205 }
206 
207 /**
208  * WERD_CHOICE::WERD_CHOICE
209  *
210  * Constructor to build a WERD_CHOICE from the given string.
211  * The function assumes that src_string is not nullptr.
212  */
WERD_CHOICE(const char * src_string,const UNICHARSET & unicharset)213 WERD_CHOICE::WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset)
214     : unicharset_(&unicharset) {
215   std::vector<UNICHAR_ID> encoding;
216   std::vector<char> lengths;
217   std::string cleaned = unicharset.CleanupString(src_string);
218   if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths, nullptr)) {
219     lengths.push_back('\0');
220     std::string src_lengths = &lengths[0];
221     this->init(cleaned.c_str(), src_lengths.c_str(), 0.0, 0.0, NO_PERM);
222   } else { // There must have been an invalid unichar in the string.
223     this->init(8);
224     this->make_bad();
225   }
226 }
227 
228 /**
229  * WERD_CHOICE::init
230  *
231  * Helper function to build a WERD_CHOICE from the given string,
232  * fragment lengths, rating, certainty and permuter.
233  *
234  * The function assumes that src_string is not nullptr.
235  * src_lengths argument could be nullptr, in which case the unichars
236  * in src_string are assumed to all be of length 1.
237  */
init(const char * src_string,const char * src_lengths,float src_rating,float src_certainty,uint8_t src_permuter)238 void WERD_CHOICE::init(const char *src_string, const char *src_lengths, float src_rating,
239                        float src_certainty, uint8_t src_permuter) {
240   int src_string_len = strlen(src_string);
241   if (src_string_len == 0) {
242     this->init(8);
243   } else {
244     this->init(src_lengths ? strlen(src_lengths) : src_string_len);
245     length_ = reserved_;
246     int offset = 0;
247     for (unsigned i = 0; i < length_; ++i) {
248       int unichar_length = src_lengths ? src_lengths[i] : 1;
249       unichar_ids_[i] = unicharset_->unichar_to_id(src_string + offset, unichar_length);
250       state_[i] = 1;
251       certainties_[i] = src_certainty;
252       offset += unichar_length;
253     }
254   }
255   adjust_factor_ = 1.0f;
256   rating_ = src_rating;
257   certainty_ = src_certainty;
258   permuter_ = src_permuter;
259   dangerous_ambig_found_ = false;
260 }
261 
/**
 * WERD_CHOICE::~WERD_CHOICE
 *
 * Compiler-generated destructor, defined out of line in this file.
 */
WERD_CHOICE::~WERD_CHOICE() = default;
266 
permuter_name() const267 const char *WERD_CHOICE::permuter_name() const {
268   return kPermuterTypeNames[permuter_];
269 }
270 
271 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
272 // taken from the appropriate cell in the ratings MATRIX.
273 // Borrowed pointer, so do not delete.
blob_choices(unsigned index,MATRIX * ratings) const274 BLOB_CHOICE_LIST *WERD_CHOICE::blob_choices(unsigned index, MATRIX *ratings) const {
275   MATRIX_COORD coord = MatrixCoord(index);
276   BLOB_CHOICE_LIST *result = ratings->get(coord.col, coord.row);
277   if (result == nullptr) {
278     result = new BLOB_CHOICE_LIST;
279     ratings->put(coord.col, coord.row, result);
280   }
281   return result;
282 }
283 
284 // Returns the MATRIX_COORD corresponding to the location in the ratings
285 // MATRIX for the given index into the word.
MatrixCoord(unsigned index) const286 MATRIX_COORD WERD_CHOICE::MatrixCoord(unsigned index) const {
287   int col = 0;
288   for (unsigned i = 0; i < index; ++i) {
289     col += state_[i];
290   }
291   int row = col + state_[index] - 1;
292   return MATRIX_COORD(col, row);
293 }
294 
295 // Sets the entries for the given index from the BLOB_CHOICE, assuming
296 // unit fragment lengths, but setting the state for this index to blob_count.
set_blob_choice(unsigned index,int blob_count,const BLOB_CHOICE * blob_choice)297 void WERD_CHOICE::set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice) {
298   unichar_ids_[index] = blob_choice->unichar_id();
299   script_pos_[index] = tesseract::SP_NORMAL;
300   state_[index] = blob_count;
301   certainties_[index] = blob_choice->certainty();
302 }
303 
304 /**
305  * contains_unichar_id
306  *
307  * Returns true if unichar_ids_ contain the given unichar_id, false otherwise.
308  */
contains_unichar_id(UNICHAR_ID unichar_id) const309 bool WERD_CHOICE::contains_unichar_id(UNICHAR_ID unichar_id) const {
310   for (unsigned i = 0; i < length_; ++i) {
311     if (unichar_ids_[i] == unichar_id) {
312       return true;
313     }
314   }
315   return false;
316 }
317 
318 /**
319  * remove_unichar_ids
320  *
321  * Removes num unichar ids starting from index start from unichar_ids_
322  * and updates length_ and fragment_lengths_ to reflect this change.
323  * Note: this function does not modify rating_ and certainty_.
324  */
remove_unichar_ids(unsigned start,int num)325 void WERD_CHOICE::remove_unichar_ids(unsigned start, int num) {
326   ASSERT_HOST(start + num <= length_);
327   // Accumulate the states to account for the merged blobs.
328   for (int i = 0; i < num; ++i) {
329     if (start > 0) {
330       state_[start - 1] += state_[start + i];
331     } else if (start + num < length_) {
332       state_[start + num] += state_[start + i];
333     }
334   }
335   for (unsigned i = start; i + num < length_; ++i) {
336     unichar_ids_[i] = unichar_ids_[i + num];
337     script_pos_[i] = script_pos_[i + num];
338     state_[i] = state_[i + num];
339     certainties_[i] = certainties_[i + num];
340   }
341   length_ -= num;
342 }
343 
344 /**
345  * reverse_and_mirror_unichar_ids
346  *
347  * Reverses and mirrors unichars in unichar_ids.
348  */
reverse_and_mirror_unichar_ids()349 void WERD_CHOICE::reverse_and_mirror_unichar_ids() {
350   for (unsigned i = 0; i < length_ / 2; ++i) {
351     UNICHAR_ID tmp_id = unichar_ids_[i];
352     unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_ - 1 - i]);
353     unichar_ids_[length_ - 1 - i] = unicharset_->get_mirror(tmp_id);
354   }
355   if (length_ % 2 != 0) {
356     unichar_ids_[length_ / 2] = unicharset_->get_mirror(unichar_ids_[length_ / 2]);
357   }
358 }
359 
360 /**
361  * punct_stripped
362  *
363  * Returns the half-open interval of unichar_id indices [start, end) which
364  * enclose the core portion of this word -- the part after stripping
365  * punctuation from the left and right.
366  */
punct_stripped(unsigned * start,unsigned * end) const367 void WERD_CHOICE::punct_stripped(unsigned *start, unsigned *end) const {
368   *start = 0;
369   *end = length();
370   while (*start < length() && unicharset()->get_ispunctuation(unichar_id(*start))) {
371     (*start)++;
372   }
373   while (*end > 0 && unicharset()->get_ispunctuation(unichar_id(*end - 1))) {
374     (*end)--;
375   }
376 }
377 
GetNonSuperscriptSpan(int * pstart,int * pend) const378 void WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const {
379   int end = length();
380   while (end > 0 && unicharset_->get_isdigit(unichar_ids_[end - 1]) &&
381          BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) {
382     end--;
383   }
384   int start = 0;
385   while (start < end && unicharset_->get_isdigit(unichar_ids_[start]) &&
386          BlobPosition(start) == tesseract::SP_SUPERSCRIPT) {
387     start++;
388   }
389   *pstart = start;
390   *pend = end;
391 }
392 
shallow_copy(unsigned start,unsigned end) const393 WERD_CHOICE WERD_CHOICE::shallow_copy(unsigned start, unsigned end) const {
394   ASSERT_HOST(start <= length_);
395   ASSERT_HOST(end <= length_);
396   if (end < start) {
397     end = start;
398   }
399   WERD_CHOICE retval(unicharset_, end - start);
400   for (auto i = start; i < end; i++) {
401     retval.append_unichar_id_space_allocated(unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
402   }
403   return retval;
404 }
405 
406 /**
407  * has_rtl_unichar_id
408  *
409  * Returns true if unichar_ids contain at least one "strongly" RTL unichar.
410  */
has_rtl_unichar_id() const411 bool WERD_CHOICE::has_rtl_unichar_id() const {
412   for (unsigned i = 0; i < length_; ++i) {
413     UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);
414     if (dir == UNICHARSET::U_RIGHT_TO_LEFT || dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) {
415       return true;
416     }
417   }
418   return false;
419 }
420 
421 /**
422  * string_and_lengths
423  *
424  * Populates the given word_str with unichars from unichar_ids and
425  * and word_lengths_str with the corresponding unichar lengths.
426  */
string_and_lengths(std::string * word_str,std::string * word_lengths_str) const427 void WERD_CHOICE::string_and_lengths(std::string *word_str, std::string *word_lengths_str) const {
428   *word_str = "";
429   if (word_lengths_str != nullptr) {
430     *word_lengths_str = "";
431   }
432   for (unsigned i = 0; i < length_; ++i) {
433     const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
434     *word_str += ch;
435     if (word_lengths_str != nullptr) {
436       *word_lengths_str += (char)strlen(ch);
437     }
438   }
439 }
440 
441 /**
442  * append_unichar_id
443  *
444  * Make sure there is enough space in the word for the new unichar id
445  * and call append_unichar_id_space_allocated().
446  */
append_unichar_id(UNICHAR_ID unichar_id,int blob_count,float rating,float certainty)447 void WERD_CHOICE::append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating,
448                                     float certainty) {
449   if (length_ == reserved_) {
450     this->double_the_size();
451   }
452   this->append_unichar_id_space_allocated(unichar_id, blob_count, rating, certainty);
453 }
454 
455 /**
456  * WERD_CHOICE::operator+=
457  *
458  * Cat a second word rating on the end of this current one.
459  * The ratings are added and the confidence is the min.
460  * If the permuters are NOT the same the permuter is set to COMPOUND_PERM
461  */
operator +=(const WERD_CHOICE & second)462 WERD_CHOICE &WERD_CHOICE::operator+=(const WERD_CHOICE &second) {
463   ASSERT_HOST(unicharset_ == second.unicharset_);
464   while (reserved_ < length_ + second.length()) {
465     this->double_the_size();
466   }
467   const std::vector<UNICHAR_ID> &other_unichar_ids = second.unichar_ids();
468   for (unsigned i = 0; i < second.length(); ++i) {
469     unichar_ids_[length_ + i] = other_unichar_ids[i];
470     state_[length_ + i] = second.state_[i];
471     certainties_[length_ + i] = second.certainties_[i];
472     script_pos_[length_ + i] = second.BlobPosition(i);
473   }
474   length_ += second.length();
475   if (second.adjust_factor_ > adjust_factor_) {
476     adjust_factor_ = second.adjust_factor_;
477   }
478   rating_ += second.rating();          // add ratings
479   if (second.certainty() < certainty_) { // take min
480     certainty_ = second.certainty();
481   }
482   if (second.dangerous_ambig_found_) {
483     dangerous_ambig_found_ = true;
484   }
485   if (permuter_ == NO_PERM) {
486     permuter_ = second.permuter();
487   } else if (second.permuter() != NO_PERM && second.permuter() != permuter_) {
488     permuter_ = COMPOUND_PERM;
489   }
490   return *this;
491 }
492 
493 /**
494  * WERD_CHOICE::operator=
495  *
496  * Allocate enough memory to hold a copy of source and copy over
497  * all the information from source to this WERD_CHOICE.
498  */
operator =(const WERD_CHOICE & source)499 WERD_CHOICE &WERD_CHOICE::operator=(const WERD_CHOICE &source) {
500   while (reserved_ < source.length()) {
501     this->double_the_size();
502   }
503 
504   unicharset_ = source.unicharset_;
505   const std::vector<UNICHAR_ID> &other_unichar_ids = source.unichar_ids();
506   for (unsigned i = 0; i < source.length(); ++i) {
507     unichar_ids_[i] = other_unichar_ids[i];
508     state_[i] = source.state_[i];
509     certainties_[i] = source.certainties_[i];
510     script_pos_[i] = source.BlobPosition(i);
511   }
512   length_ = source.length();
513   adjust_factor_ = source.adjust_factor_;
514   rating_ = source.rating();
515   certainty_ = source.certainty();
516   min_x_height_ = source.min_x_height();
517   max_x_height_ = source.max_x_height();
518   permuter_ = source.permuter();
519   dangerous_ambig_found_ = source.dangerous_ambig_found_;
520   return *this;
521 }
522 
523 // Sets up the script_pos_ member using the blobs_list to get the bln
524 // bounding boxes, *this to get the unichars, and this->unicharset
525 // to get the target positions. If small_caps is true, sub/super are not
526 // considered, but dropcaps are.
527 // NOTE: blobs_list should be the chopped_word blobs. (Fully segemented.)
SetScriptPositions(bool small_caps,TWERD * word,int debug)528 void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD *word, int debug) {
529   // Initialize to normal.
530   for (unsigned i = 0; i < length_; ++i) {
531     script_pos_[i] = tesseract::SP_NORMAL;
532   }
533   if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) {
534     return;
535   }
536 
537   unsigned position_counts[4] = {0, 0, 0, 0};
538 
539   int chunk_index = 0;
540   for (unsigned blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
541     TBLOB *tblob = word->blobs[chunk_index];
542     int uni_id = unichar_id(blob_index);
543     TBOX blob_box = tblob->bounding_box();
544     if (!state_.empty()) {
545       for (int i = 1; i < state_[blob_index]; ++i) {
546         ++chunk_index;
547         tblob = word->blobs[chunk_index];
548         blob_box += tblob->bounding_box();
549       }
550     }
551     script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box, uni_id);
552     if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {
553       script_pos_[blob_index] = tesseract::SP_NORMAL;
554     }
555     position_counts[script_pos_[blob_index]]++;
556   }
557   // If almost everything looks like a superscript or subscript,
558   // we most likely just got the baseline wrong.
559   if (4 * position_counts[tesseract::SP_SUBSCRIPT] > 3 * length_ ||
560       4 * position_counts[tesseract::SP_SUPERSCRIPT] > 3 * length_) {
561     if (debug >= 2) {
562       tprintf(
563           "Most characters of %s are subscript or superscript.\n"
564           "That seems wrong, so I'll assume we got the baseline wrong\n",
565           unichar_string().c_str());
566     }
567     for (unsigned i = 0; i < length_; i++) {
568       ScriptPos sp = script_pos_[i];
569       if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) {
570         ASSERT_HOST(position_counts[sp] > 0);
571 	position_counts[sp]--;
572         position_counts[tesseract::SP_NORMAL]++;
573         script_pos_[i] = tesseract::SP_NORMAL;
574       }
575     }
576   }
577 
578   if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) || debug >= 2) {
579     tprintf("SetScriptPosition on %s\n", unichar_string().c_str());
580     int chunk_index = 0;
581     for (unsigned blob_index = 0; blob_index < length_; ++blob_index) {
582       if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
583         TBLOB *tblob = word->blobs[chunk_index];
584         ScriptPositionOf(true, *unicharset_, tblob->bounding_box(), unichar_id(blob_index));
585       }
586       chunk_index += state_.empty() ? 1 :  state_[blob_index];
587     }
588   }
589 }
590 
591 // Sets all the script_pos_ positions to the given position.
SetAllScriptPositions(tesseract::ScriptPos position)592 void WERD_CHOICE::SetAllScriptPositions(tesseract::ScriptPos position) {
593   for (unsigned i = 0; i < length_; ++i) {
594     script_pos_[i] = position;
595   }
596 }
597 
598 /* static */
ScriptPositionOf(bool print_debug,const UNICHARSET & unicharset,const TBOX & blob_box,UNICHAR_ID unichar_id)599 ScriptPos WERD_CHOICE::ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset,
600                                         const TBOX &blob_box, UNICHAR_ID unichar_id) {
601   ScriptPos retval = tesseract::SP_NORMAL;
602   int top = blob_box.top();
603   int bottom = blob_box.bottom();
604   int min_bottom, max_bottom, min_top, max_top;
605   unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom, &min_top, &max_top);
606 
607   int sub_thresh_top = min_top - kMinSubscriptOffset;
608   int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
609   int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;
610   if (bottom <= kMaxDropCapBottom) {
611     retval = tesseract::SP_DROPCAP;
612   } else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
613     retval = tesseract::SP_SUBSCRIPT;
614   } else if (bottom > sup_thresh_bot) {
615     retval = tesseract::SP_SUPERSCRIPT;
616   }
617 
618   if (print_debug) {
619     const char *pos = ScriptPosToString(retval);
620     tprintf(
621         "%s Character %s[bot:%d top: %d]  "
622         "bot_range[%d,%d]  top_range[%d, %d] "
623         "sub_thresh[bot:%d top:%d]  sup_thresh_bot %d\n",
624         pos, unicharset.id_to_unichar(unichar_id), bottom, top, min_bottom, max_bottom, min_top,
625         max_top, sub_thresh_bot, sub_thresh_top, sup_thresh_bot);
626   }
627   return retval;
628 }
629 
630 // Returns the script-id (eg Han) of the dominant script in the word.
GetTopScriptID() const631 int WERD_CHOICE::GetTopScriptID() const {
632   unsigned max_script = unicharset_->get_script_table_size();
633   std::vector<unsigned> sid(max_script);
634   for (unsigned x = 0; x < length_; ++x) {
635     int script_id = unicharset_->get_script(unichar_id(x));
636     sid[script_id]++;
637   }
638   if (unicharset_->han_sid() != unicharset_->null_sid()) {
639     // Add the Hiragana & Katakana counts to Han and zero them out.
640     if (unicharset_->hiragana_sid() != unicharset_->null_sid()) {
641       sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()];
642       sid[unicharset_->hiragana_sid()] = 0;
643     }
644     if (unicharset_->katakana_sid() != unicharset_->null_sid()) {
645       sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()];
646       sid[unicharset_->katakana_sid()] = 0;
647     }
648   }
649   // Note that high script ID overrides lower one on a tie, thus biasing
650   // towards non-Common script (if sorted that way in unicharset file).
651   unsigned max_sid = 0;
652   for (unsigned x = 1; x < max_script; x++) {
653     if (sid[x] >= sid[max_sid]) {
654       max_sid = x;
655     }
656   }
657   if (sid[max_sid] < length_ / 2) {
658     max_sid = unicharset_->null_sid();
659   }
660   return max_sid;
661 }
662 
663 // Fixes the state_ for a chop at the given blob_posiiton.
UpdateStateForSplit(int blob_position)664 void WERD_CHOICE::UpdateStateForSplit(int blob_position) {
665   int total_chunks = 0;
666   for (unsigned i = 0; i < length_; ++i) {
667     total_chunks += state_[i];
668     if (total_chunks > blob_position) {
669       ++state_[i];
670       return;
671     }
672   }
673 }
674 
675 // Returns the sum of all the state elements, being the total number of blobs.
TotalOfStates() const676 unsigned WERD_CHOICE::TotalOfStates() const {
677   unsigned total_chunks = 0;
678   for (unsigned i = 0; i < length_; ++i) {
679     total_chunks += state_[i];
680   }
681   return total_chunks;
682 }
683 
684 /**
685  * WERD_CHOICE::print
686  *
687  * Print WERD_CHOICE to stdout.
688  */
print(const char * msg) const689 void WERD_CHOICE::print(const char *msg) const {
690   tprintf("%s : ", msg);
691   for (unsigned i = 0; i < length_; ++i) {
692     tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
693   }
694   tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n", rating_, certainty_,
695           adjust_factor_, permuter_, min_x_height_, max_x_height_, dangerous_ambig_found_);
696   tprintf("pos");
697   for (unsigned i = 0; i < length_; ++i) {
698     tprintf("\t%s", ScriptPosToString(script_pos_[i]));
699   }
700   tprintf("\nstr");
701   for (unsigned i = 0; i < length_; ++i) {
702     tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
703   }
704   tprintf("\nstate:");
705   for (unsigned i = 0; i < length_; ++i) {
706     tprintf("\t%d ", state_[i]);
707   }
708   tprintf("\nC");
709   for (unsigned i = 0; i < length_; ++i) {
710     tprintf("\t%.3f", certainties_[i]);
711   }
712   tprintf("\n");
713 }
714 
715 // Prints the segmentation state with an introductory message.
print_state(const char * msg) const716 void WERD_CHOICE::print_state(const char *msg) const {
717   tprintf("%s", msg);
718   for (unsigned i = 0; i < length_; ++i) {
719     tprintf(" %d", state_[i]);
720   }
721   tprintf("\n");
722 }
723 
724 #ifndef GRAPHICS_DISABLED
725 
726 // Displays the segmentation state of *this (if not the same as the last
727 // one displayed) and waits for a click in the window.
DisplaySegmentation(TWERD * word)728 void WERD_CHOICE::DisplaySegmentation(TWERD *word) {
729   // Number of different colors to draw with.
730   const int kNumColors = 6;
731   static ScrollView *segm_window = nullptr;
732   // Check the state against the static prev_drawn_state.
733   static std::vector<int> prev_drawn_state;
734   bool already_done = prev_drawn_state.size() == length_;
735   if (!already_done) {
736     prev_drawn_state.clear();
737     prev_drawn_state.resize(length_);
738   }
739   for (unsigned i = 0; i < length_; ++i) {
740     if (prev_drawn_state[i] != state_[i]) {
741       already_done = false;
742     }
743     prev_drawn_state[i] = state_[i];
744   }
745   if (already_done || word->blobs.empty()) {
746     return;
747   }
748 
749   // Create the window if needed.
750   if (segm_window == nullptr) {
751     segm_window = new ScrollView("Segmentation", 5, 10, 500, 256, 2000.0, 256.0, true);
752   } else {
753     segm_window->Clear();
754   }
755 
756   TBOX bbox;
757   int blob_index = 0;
758   for (unsigned c = 0; c < length_; ++c) {
759     auto color = static_cast<ScrollView::Color>(c % kNumColors + 3);
760     for (int i = 0; i < state_[c]; ++i, ++blob_index) {
761       TBLOB *blob = word->blobs[blob_index];
762       bbox += blob->bounding_box();
763       blob->plot(segm_window, color, color);
764     }
765   }
766   segm_window->ZoomToRectangle(bbox.left(), bbox.top(), bbox.right(), bbox.bottom());
767   segm_window->Update();
768   segm_window->Wait();
769 }
770 
771 #endif // !GRAPHICS_DISABLED
772 
EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE & word1,const WERD_CHOICE & word2)773 bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2) {
774   const UNICHARSET *uchset = word1.unicharset();
775   if (word2.unicharset() != uchset) {
776     return false;
777   }
778   unsigned w1start, w1end;
779   word1.punct_stripped(&w1start, &w1end);
780   unsigned w2start, w2end;
781   word2.punct_stripped(&w2start, &w2end);
782   if (w1end - w1start != w2end - w2start) {
783     return false;
784   }
785   for (unsigned i = 0; i < w1end - w1start; i++) {
786     if (uchset->to_lower(word1.unichar_id(w1start + i)) !=
787         uchset->to_lower(word2.unichar_id(w2start + i))) {
788       return false;
789     }
790   }
791   return true;
792 }
793 
794 /**
795  * print_ratings_list
796  *
797  * Send all the ratings out to the logfile.
798  *
799  * @param msg intro message
800  * @param ratings list of ratings
801  * @param current_unicharset unicharset that can be used
802  * for id-to-unichar conversion
803  */
print_ratings_list(const char * msg,BLOB_CHOICE_LIST * ratings,const UNICHARSET & current_unicharset)804 void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings,
805                         const UNICHARSET &current_unicharset) {
806   if (ratings->empty()) {
807     tprintf("%s:<none>\n", msg);
808     return;
809   }
810   if (*msg != '\0') {
811     tprintf("%s\n", msg);
812   }
813   BLOB_CHOICE_IT c_it;
814   c_it.set_to_list(ratings);
815   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
816     c_it.data()->print(&current_unicharset);
817     if (!c_it.at_last()) {
818       tprintf("\n");
819     }
820   }
821   tprintf("\n");
822   fflush(stdout);
823 }
824 
825 } // namespace tesseract
826