1 /**********************************************************************
2  * File:        ratngs.h  (Formerly ratings.h)
3  * Description: Definition of the WERD_CHOICE and BLOB_CHOICE classes.
4  * Author:      Ray Smith
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #ifndef RATNGS_H
20 #define RATNGS_H
21 
22 #ifdef HAVE_CONFIG_H
23 #  include "config_auto.h" // DISABLED_LEGACY_ENGINE
24 #endif
25 
26 #include "clst.h"
27 #include "elst.h"
28 #ifndef DISABLED_LEGACY_ENGINE
29 #  include "fontinfo.h"
30 #endif // undef DISABLED_LEGACY_ENGINE
31 #include "matrix.h"
32 #include "unicharset.h"
33 #include "werd.h"
34 
35 #include <tesseract/unichar.h>
36 
37 #include <cassert>
38 #include <cfloat> // for FLT_MAX
39 
40 namespace tesseract {
41 
42 class MATRIX;
43 struct TBLOB;
44 struct TWERD;
45 
46 // Enum to describe the source of a BLOB_CHOICE to make it possible to determine
47 // whether a blob has been classified by inspecting the BLOB_CHOICEs.
enum BlobChoiceClassifier {
  // Produced by the char_norm (static) classifier.
  BCC_STATIC_CLASSIFIER = 0,
  // Produced by the adaptive classifier.
  BCC_ADAPTED_CLASSIFIER = 1,
  // Backup result used when classification failed.
  BCC_SPECKLE_CLASSIFIER = 2,
  // Generated by ambiguity detection.
  BCC_AMBIG = 3,
  // Came from some other process.
  BCC_FAKE = 4,
};
55 
56 class BLOB_CHOICE : public ELIST_LINK {
57 public:
BLOB_CHOICE()58   BLOB_CHOICE() {
59     unichar_id_ = UNICHAR_SPACE;
60     fontinfo_id_ = -1;
61     fontinfo_id2_ = -1;
62     rating_ = 10.0;
63     certainty_ = -1.0;
64     script_id_ = -1;
65     min_xheight_ = 0.0f;
66     max_xheight_ = 0.0f;
67     yshift_ = 0.0f;
68     classifier_ = BCC_FAKE;
69   }
70   BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
71               float src_rating,          // rating
72               float src_cert,            // certainty
73               int script_id,             // script
74               float min_xheight,         // min xheight in image pixel units
75               float max_xheight,         // max xheight allowed by this char
76               float yshift,              // the larger of y shift (top or bottom)
77               BlobChoiceClassifier c);   // adapted match or other
78   BLOB_CHOICE(const BLOB_CHOICE &other);
79   ~BLOB_CHOICE() = default;
80 
unichar_id()81   UNICHAR_ID unichar_id() const {
82     return unichar_id_;
83   }
rating()84   float rating() const {
85     return rating_;
86   }
certainty()87   float certainty() const {
88     return certainty_;
89   }
fontinfo_id()90   int16_t fontinfo_id() const {
91     return fontinfo_id_;
92   }
fontinfo_id2()93   int16_t fontinfo_id2() const {
94     return fontinfo_id2_;
95   }
96 #ifndef DISABLED_LEGACY_ENGINE
fonts()97   const std::vector<ScoredFont> &fonts() const {
98     return fonts_;
99   }
set_fonts(const std::vector<ScoredFont> & fonts)100   void set_fonts(const std::vector<ScoredFont> &fonts) {
101     fonts_ = fonts;
102     int score1 = 0, score2 = 0;
103     fontinfo_id_ = -1;
104     fontinfo_id2_ = -1;
105     for (auto &f : fonts_) {
106       if (f.score > score1) {
107         score2 = score1;
108         fontinfo_id2_ = fontinfo_id_;
109         score1 = f.score;
110         fontinfo_id_ = f.fontinfo_id;
111       } else if (f.score > score2) {
112         score2 = f.score;
113         fontinfo_id2_ = f.fontinfo_id;
114       }
115     }
116   }
117 #endif // ndef DISABLED_LEGACY_ENGINE
script_id()118   int script_id() const {
119     return script_id_;
120   }
matrix_cell()121   const MATRIX_COORD &matrix_cell() {
122     return matrix_cell_;
123   }
min_xheight()124   float min_xheight() const {
125     return min_xheight_;
126   }
max_xheight()127   float max_xheight() const {
128     return max_xheight_;
129   }
yshift()130   float yshift() const {
131     return yshift_;
132   }
classifier()133   BlobChoiceClassifier classifier() const {
134     return classifier_;
135   }
IsAdapted()136   bool IsAdapted() const {
137     return classifier_ == BCC_ADAPTED_CLASSIFIER;
138   }
IsClassified()139   bool IsClassified() const {
140     return classifier_ == BCC_STATIC_CLASSIFIER || classifier_ == BCC_ADAPTED_CLASSIFIER ||
141            classifier_ == BCC_SPECKLE_CLASSIFIER;
142   }
143 
set_unichar_id(UNICHAR_ID newunichar_id)144   void set_unichar_id(UNICHAR_ID newunichar_id) {
145     unichar_id_ = newunichar_id;
146   }
set_rating(float newrat)147   void set_rating(float newrat) {
148     rating_ = newrat;
149   }
set_certainty(float newrat)150   void set_certainty(float newrat) {
151     certainty_ = newrat;
152   }
set_script(int newscript_id)153   void set_script(int newscript_id) {
154     script_id_ = newscript_id;
155   }
set_matrix_cell(int col,int row)156   void set_matrix_cell(int col, int row) {
157     matrix_cell_.col = col;
158     matrix_cell_.row = row;
159   }
set_classifier(BlobChoiceClassifier classifier)160   void set_classifier(BlobChoiceClassifier classifier) {
161     classifier_ = classifier;
162   }
deep_copy(const BLOB_CHOICE * src)163   static BLOB_CHOICE *deep_copy(const BLOB_CHOICE *src) {
164     auto *choice = new BLOB_CHOICE;
165     *choice = *src;
166     return choice;
167   }
168   // Returns true if *this and other agree on the baseline and x-height
169   // to within some tolerance based on a given estimate of the x-height.
170   bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const;
171 
print(const UNICHARSET * unicharset)172   void print(const UNICHARSET *unicharset) const {
173     tprintf("r%.2f c%.2f x[%g,%g]: %d %s", rating_, certainty_, min_xheight_, max_xheight_,
174             unichar_id_, (unicharset == nullptr) ? "" : unicharset->debug_str(unichar_id_).c_str());
175   }
print_full()176   void print_full() const {
177     print(nullptr);
178     tprintf(" script=%d, font1=%d, font2=%d, yshift=%g, classifier=%d\n", script_id_, fontinfo_id_,
179             fontinfo_id2_, yshift_, classifier_);
180   }
181   // Sort function for sorting BLOB_CHOICEs in increasing order of rating.
SortByRating(const void * p1,const void * p2)182   static int SortByRating(const void *p1, const void *p2) {
183     const BLOB_CHOICE *bc1 = *static_cast<const BLOB_CHOICE *const *>(p1);
184     const BLOB_CHOICE *bc2 = *static_cast<const BLOB_CHOICE *const *>(p2);
185     return (bc1->rating_ < bc2->rating_) ? -1 : 1;
186   }
187 
188 private:
189   // Copy assignment operator.
190   BLOB_CHOICE &operator=(const BLOB_CHOICE &other);
191 
192   UNICHAR_ID unichar_id_; // unichar id
193 #ifndef DISABLED_LEGACY_ENGINE
194   // Fonts and scores. Allowed to be empty.
195   std::vector<ScoredFont> fonts_;
196 #endif                   // ndef DISABLED_LEGACY_ENGINE
197   int16_t fontinfo_id_;  // char font information
198   int16_t fontinfo_id2_; // 2nd choice font information
199   // Rating is the classifier distance weighted by the length of the outline
200   // in the blob. In terms of probability, classifier distance is -klog p such
201   // that the resulting distance is in the range [0, 1] and then
202   // rating = w (-k log p) where w is the weight for the length of the outline.
203   // Sums of ratings may be compared meaningfully for words of different
204   // segmentation.
205   float rating_; // size related
206   // Certainty is a number in [-20, 0] indicating the classifier certainty
207   // of the choice. In terms of probability, certainty = 20 (k log p) where
208   // k is defined as above to normalize -klog p to the range [0, 1].
209   float certainty_; // absolute
210   int script_id_;
211   // Holds the position of this choice in the ratings matrix.
212   // Used to location position in the matrix during path backtracking.
213   MATRIX_COORD matrix_cell_;
214   // X-height range (in image pixels) that this classification supports.
215   float min_xheight_;
216   float max_xheight_;
217   // yshift_ - The vertical distance (in image pixels) the character is
218   //           shifted (up or down) from an acceptable y position.
219   float yshift_;
220   BlobChoiceClassifier classifier_; // What generated *this.
221 };
222 
223 // Make BLOB_CHOICE listable.
224 ELISTIZEH(BLOB_CHOICE)
225 
226 // Return the BLOB_CHOICE in bc_list matching a given unichar_id,
227 // or nullptr if there is no match.
228 BLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list);
229 
230 // Permuter codes used in WERD_CHOICEs.
enum PermuterType {
  NO_PERM = 0,
  PUNC_PERM = 1,
  TOP_CHOICE_PERM = 2,
  LOWER_CASE_PERM = 3,
  UPPER_CASE_PERM = 4,
  NGRAM_PERM = 5,
  NUMBER_PERM = 6,
  USER_PATTERN_PERM = 7,
  SYSTEM_DAWG_PERM = 8,
  DOC_DAWG_PERM = 9,
  USER_DAWG_PERM = 10,
  FREQ_DAWG_PERM = 11,
  COMPOUND_PERM = 12,

  // Number of permuter codes above; must remain the last enumerator.
  NUM_PERMUTER_TYPES
};
248 
249 // ScriptPos tells whether a character is subscript, superscript or normal.
enum ScriptPos {
  SP_NORMAL = 0,      // Normal position.
  SP_SUBSCRIPT = 1,   // Subscript.
  SP_SUPERSCRIPT = 2, // Superscript.
  SP_DROPCAP = 3      // Presumably a drop capital -- TODO confirm against users.
};
251 
252 const char *ScriptPosToString(ScriptPos script_pos);
253 
254 class TESS_API WERD_CHOICE : public ELIST_LINK {
255 public:
256   static const float kBadRating;
257   static const char *permuter_name(uint8_t permuter);
258 
WERD_CHOICE(const UNICHARSET * unicharset)259   WERD_CHOICE(const UNICHARSET *unicharset) : unicharset_(unicharset) {
260     this->init(8);
261   }
WERD_CHOICE(const UNICHARSET * unicharset,int reserved)262   WERD_CHOICE(const UNICHARSET *unicharset, int reserved) : unicharset_(unicharset) {
263     this->init(reserved);
264   }
WERD_CHOICE(const char * src_string,const char * src_lengths,float src_rating,float src_certainty,uint8_t src_permuter,const UNICHARSET & unicharset)265   WERD_CHOICE(const char *src_string, const char *src_lengths, float src_rating,
266               float src_certainty, uint8_t src_permuter, const UNICHARSET &unicharset)
267       : unicharset_(&unicharset) {
268     this->init(src_string, src_lengths, src_rating, src_certainty, src_permuter);
269   }
270   WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset);
WERD_CHOICE(const WERD_CHOICE & word)271   WERD_CHOICE(const WERD_CHOICE &word) : ELIST_LINK(word), unicharset_(word.unicharset_) {
272     this->init(word.length());
273     this->operator=(word);
274   }
275   ~WERD_CHOICE();
276 
unicharset()277   const UNICHARSET *unicharset() const {
278     return unicharset_;
279   }
empty()280   bool empty() const {
281     return length_ == 0;
282   }
length()283   inline unsigned length() const {
284     return length_;
285   }
adjust_factor()286   float adjust_factor() const {
287     return adjust_factor_;
288   }
set_adjust_factor(float factor)289   void set_adjust_factor(float factor) {
290     adjust_factor_ = factor;
291   }
unichar_ids()292   inline const std::vector<UNICHAR_ID> &unichar_ids() const {
293     return unichar_ids_;
294   }
unichar_id(unsigned index)295   inline UNICHAR_ID unichar_id(unsigned index) const {
296     assert(index < length_);
297     return unichar_ids_[index];
298   }
state(unsigned index)299   inline unsigned state(unsigned index) const {
300     return state_[index];
301   }
BlobPosition(unsigned index)302   ScriptPos BlobPosition(unsigned index) const {
303     if (index >= length_) {
304       return SP_NORMAL;
305     }
306     return script_pos_[index];
307   }
rating()308   inline float rating() const {
309     return rating_;
310   }
certainty()311   inline float certainty() const {
312     return certainty_;
313   }
certainty(unsigned index)314   inline float certainty(unsigned index) const {
315     return certainties_[index];
316   }
min_x_height()317   inline float min_x_height() const {
318     return min_x_height_;
319   }
max_x_height()320   inline float max_x_height() const {
321     return max_x_height_;
322   }
set_x_heights(float min_height,float max_height)323   inline void set_x_heights(float min_height, float max_height) {
324     min_x_height_ = min_height;
325     max_x_height_ = max_height;
326   }
permuter()327   inline uint8_t permuter() const {
328     return permuter_;
329   }
330   const char *permuter_name() const;
331   // Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
332   // taken from the appropriate cell in the ratings MATRIX.
333   // Borrowed pointer, so do not delete.
334   BLOB_CHOICE_LIST *blob_choices(unsigned index, MATRIX *ratings) const;
335 
336   // Returns the MATRIX_COORD corresponding to the location in the ratings
337   // MATRIX for the given index into the word.
338   MATRIX_COORD MatrixCoord(unsigned index) const;
339 
set_unichar_id(UNICHAR_ID unichar_id,unsigned index)340   inline void set_unichar_id(UNICHAR_ID unichar_id, unsigned index) {
341     assert(index < length_);
342     unichar_ids_[index] = unichar_id;
343   }
dangerous_ambig_found()344   bool dangerous_ambig_found() const {
345     return dangerous_ambig_found_;
346   }
set_dangerous_ambig_found_(bool value)347   void set_dangerous_ambig_found_(bool value) {
348     dangerous_ambig_found_ = value;
349   }
set_rating(float new_val)350   inline void set_rating(float new_val) {
351     rating_ = new_val;
352   }
set_certainty(float new_val)353   inline void set_certainty(float new_val) {
354     certainty_ = new_val;
355   }
set_permuter(uint8_t perm)356   inline void set_permuter(uint8_t perm) {
357     permuter_ = perm;
358   }
359   // Note: this function should only be used if all the fields
360   // are populated manually with set_* functions (rather than
361   // (copy)constructors and append_* functions).
set_length(unsigned len)362   inline void set_length(unsigned len) {
363     ASSERT_HOST(reserved_ >= len);
364     length_ = len;
365   }
366 
367   /// Make more space in unichar_id_ and fragment_lengths_ arrays.
double_the_size()368   inline void double_the_size() {
369     if (reserved_ > 0) {
370       reserved_ *= 2;
371     } else {
372       reserved_ = 1;
373     }
374     unichar_ids_.resize(reserved_);
375     script_pos_.resize(reserved_);
376     state_.resize(reserved_);
377     certainties_.resize(reserved_);
378   }
379 
380   /// Initializes WERD_CHOICE - reserves length slots in unichar_ids_ and
381   /// fragment_length_ arrays. Sets other values to default (blank) values.
init(unsigned reserved)382   inline void init(unsigned reserved) {
383     reserved_ = reserved;
384     if (reserved > 0) {
385       unichar_ids_.resize(reserved);
386       script_pos_.resize(reserved);
387       state_.resize(reserved);
388       certainties_.resize(reserved);
389     } else {
390       unichar_ids_.clear();
391       script_pos_.clear();
392       state_.clear();
393       certainties_.clear();
394     }
395     length_ = 0;
396     adjust_factor_ = 1.0f;
397     rating_ = 0.0;
398     certainty_ = FLT_MAX;
399     min_x_height_ = 0.0f;
400     max_x_height_ = FLT_MAX;
401     permuter_ = NO_PERM;
402     unichars_in_script_order_ = false; // Tesseract is strict left-to-right.
403     dangerous_ambig_found_ = false;
404   }
405 
406   /// Helper function to build a WERD_CHOICE from the given string,
407   /// fragment lengths, rating, certainty and permuter.
408   /// The function assumes that src_string is not nullptr.
409   /// src_lengths argument could be nullptr, in which case the unichars
410   /// in src_string are assumed to all be of length 1.
411   void init(const char *src_string, const char *src_lengths, float src_rating, float src_certainty,
412             uint8_t src_permuter);
413 
414   /// Set the fields in this choice to be default (bad) values.
make_bad()415   inline void make_bad() {
416     length_ = 0;
417     rating_ = kBadRating;
418     certainty_ = -FLT_MAX;
419   }
420 
421   /// This function assumes that there is enough space reserved
422   /// in the WERD_CHOICE for adding another unichar.
423   /// This is an efficient alternative to append_unichar_id().
append_unichar_id_space_allocated(UNICHAR_ID unichar_id,int blob_count,float rating,float certainty)424   inline void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating,
425                                                 float certainty) {
426     assert(reserved_ > length_);
427     length_++;
428     this->set_unichar_id(unichar_id, blob_count, rating, certainty, length_ - 1);
429   }
430 
431   void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty);
432 
set_unichar_id(UNICHAR_ID unichar_id,int blob_count,float rating,float certainty,unsigned index)433   inline void set_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty,
434                              unsigned index) {
435     assert(index < length_);
436     unichar_ids_[index] = unichar_id;
437     state_[index] = blob_count;
438     certainties_[index] = certainty;
439     script_pos_[index] = SP_NORMAL;
440     rating_ += rating;
441     if (certainty < certainty_) {
442       certainty_ = certainty;
443     }
444   }
445   // Sets the entries for the given index from the BLOB_CHOICE, assuming
446   // unit fragment lengths, but setting the state for this index to blob_count.
447   void set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice);
448 
449   bool contains_unichar_id(UNICHAR_ID unichar_id) const;
450   void remove_unichar_ids(unsigned index, int num);
remove_last_unichar_id()451   inline void remove_last_unichar_id() {
452     --length_;
453   }
remove_unichar_id(unsigned index)454   inline void remove_unichar_id(unsigned index) {
455     this->remove_unichar_ids(index, 1);
456   }
457   bool has_rtl_unichar_id() const;
458   void reverse_and_mirror_unichar_ids();
459 
460   // Returns the half-open interval of unichar_id indices [start, end) which
461   // enclose the core portion of this word -- the part after stripping
462   // punctuation from the left and right.
463   void punct_stripped(unsigned *start_core, unsigned *end_core) const;
464 
465   // Returns the indices [start, end) containing the core of the word, stripped
466   // of any superscript digits on either side. (i.e., the non-footnote part
467   // of the word). There is no guarantee that the output range is non-empty.
468   void GetNonSuperscriptSpan(int *start, int *end) const;
469 
470   // Return a copy of this WERD_CHOICE with the choices [start, end).
471   // The result is useful only for checking against a dictionary.
472   WERD_CHOICE shallow_copy(unsigned start, unsigned end) const;
473 
474   void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const;
debug_string()475   std::string debug_string() const {
476     std::string word_str;
477     for (unsigned i = 0; i < length_; ++i) {
478       word_str += unicharset_->debug_str(unichar_ids_[i]);
479       word_str += " ";
480     }
481     return word_str;
482   }
483   // Returns true if any unichar_id in the word is a non-space-delimited char.
ContainsAnyNonSpaceDelimited()484   bool ContainsAnyNonSpaceDelimited() const {
485     for (unsigned i = 0; i < length_; ++i) {
486       if (!unicharset_->IsSpaceDelimited(unichar_ids_[i])) {
487         return true;
488       }
489     }
490     return false;
491   }
492   // Returns true if the word is all spaces.
IsAllSpaces()493   bool IsAllSpaces() const {
494     for (unsigned i = 0; i < length_; ++i) {
495       if (unichar_ids_[i] != UNICHAR_SPACE) {
496         return false;
497       }
498     }
499     return true;
500   }
501 
502   // Call this to override the default (strict left to right graphemes)
503   // with the fact that some engine produces a "reading order" set of
504   // Graphemes for each word.
set_unichars_in_script_order(bool in_script_order)505   bool set_unichars_in_script_order(bool in_script_order) {
506     return unichars_in_script_order_ = in_script_order;
507   }
508 
unichars_in_script_order()509   bool unichars_in_script_order() const {
510     return unichars_in_script_order_;
511   }
512 
513   // Returns a UTF-8 string equivalent to the current choice
514   // of UNICHAR IDs.
unichar_string()515   std::string &unichar_string() {
516     this->string_and_lengths(&unichar_string_, &unichar_lengths_);
517     return unichar_string_;
518   }
519 
520   // Returns a UTF-8 string equivalent to the current choice
521   // of UNICHAR IDs.
unichar_string()522   const std::string &unichar_string() const {
523     this->string_and_lengths(&unichar_string_, &unichar_lengths_);
524     return unichar_string_;
525   }
526 
527   // Returns the lengths, one byte each, representing the number of bytes
528   // required in the unichar_string for each UNICHAR_ID.
unichar_lengths()529   const std::string &unichar_lengths() const {
530     this->string_and_lengths(&unichar_string_, &unichar_lengths_);
531     return unichar_lengths_;
532   }
533 
534   // Sets up the script_pos_ member using the blobs_list to get the bln
535   // bounding boxes, *this to get the unichars, and this->unicharset
536   // to get the target positions. If small_caps is true, sub/super are not
537   // considered, but dropcaps are.
538   // NOTE: blobs_list should be the chopped_word blobs. (Fully segemented.)
539   void SetScriptPositions(bool small_caps, TWERD *word, int debug = 0);
540   // Sets all the script_pos_ positions to the given position.
541   void SetAllScriptPositions(ScriptPos position);
542 
543   static ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset,
544                                     const TBOX &blob_box, UNICHAR_ID unichar_id);
545 
546   // Returns the "dominant" script ID for the word.  By "dominant", the script
547   // must account for at least half the characters.  Otherwise, it returns 0.
548   // Note that for Japanese, Hiragana and Katakana are simply treated as Han.
549   int GetTopScriptID() const;
550 
551   // Fixes the state_ for a chop at the given blob_posiiton.
552   void UpdateStateForSplit(int blob_position);
553 
554   // Returns the sum of all the state elements, being the total number of blobs.
555   unsigned TotalOfStates() const;
556 
print()557   void print() const {
558     this->print("");
559   }
560   void print(const char *msg) const;
561   // Prints the segmentation state with an introductory message.
562   void print_state(const char *msg) const;
563 
564   // Displays the segmentation state of *this (if not the same as the last
565   // one displayed) and waits for a click in the window.
566   void DisplaySegmentation(TWERD *word);
567 
568   WERD_CHOICE &operator+=(        // concatanate
569       const WERD_CHOICE &second); // second on first
570 
571   WERD_CHOICE &operator=(const WERD_CHOICE &source);
572 
573 private:
574   const UNICHARSET *unicharset_;
575   // TODO(rays) Perhaps replace the multiple arrays with an array of structs?
576   // unichar_ids_ is an array of classifier "results" that make up a word.
577   // For each unichar_ids_[i], script_pos_[i] has the sub/super/normal position
578   // of each unichar_id.
579   // state_[i] indicates the number of blobs in WERD_RES::chopped_word that
580   // were put together to make the classification results in the ith position
581   // in unichar_ids_, and certainties_[i] is the certainty of the choice that
582   // was used in this word.
583   // == Change from before ==
584   // Previously there was fragment_lengths_ that allowed a word to be
585   // artificially composed of multiple fragment results. Since the new
586   // segmentation search doesn't do fragments, treatment of fragments has
587   // been moved to a lower level, augmenting the ratings matrix with the
588   // combined fragments, and allowing the language-model/segmentation-search
589   // to deal with only the combined unichar_ids.
590   std::vector<UNICHAR_ID> unichar_ids_; // unichar ids that represent the text of the word
591   std::vector<ScriptPos> script_pos_;   // Normal/Sub/Superscript of each unichar.
592   std::vector<int> state_;              // Number of blobs in each unichar.
593   std::vector<float> certainties_;      // Certainty of each unichar.
594   unsigned reserved_;            // size of the above arrays
595   unsigned length_;              // word length
596   // Factor that was used to adjust the rating.
597   float adjust_factor_;
598   // Rating is the sum of the ratings of the individual blobs in the word.
599   float rating_; // size related
600   // certainty is the min (worst) certainty of the individual blobs in the word.
601   float certainty_; // absolute
602   // xheight computed from the result, or 0 if inconsistent.
603   float min_x_height_;
604   float max_x_height_;
605   uint8_t permuter_; // permuter code
606 
607   // Normally, the ratings_ matrix represents the recognition results in order
608   // from left-to-right.  However, some engines (say Cube) may return
609   // recognition results in the order of the script's major reading direction
610   // (for Arabic, that is right-to-left).
611   bool unichars_in_script_order_;
612   // True if NoDangerousAmbig found an ambiguity.
613   bool dangerous_ambig_found_;
614 
615   // The following variables are populated and passed by reference any
616   // time unichar_string() or unichar_lengths() are called.
617   mutable std::string unichar_string_;
618   mutable std::string unichar_lengths_;
619 };
620 
621 // Make WERD_CHOICE listable.
622 ELISTIZEH(WERD_CHOICE)
623 using BLOB_CHOICE_LIST_VECTOR = std::vector<BLOB_CHOICE_LIST *>;
624 
625 // Utilities for comparing WERD_CHOICEs
626 
627 bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2);
628 
629 // Utilities for debug printing.
630 void print_ratings_list(const char *msg,                     // intro message
631                         BLOB_CHOICE_LIST *ratings,           // list of results
632                         const UNICHARSET &current_unicharset // unicharset that can be used
633                                                              // for id-to-unichar conversion
634 );
635 
636 } // namespace tesseract
637 
638 #endif
639