1 /********************************************************************** 2 * File: ratngs.h (Formerly ratings.h) 3 * Description: Definition of the WERD_CHOICE and BLOB_CHOICE classes. 4 * Author: Ray Smith 5 * 6 * (C) Copyright 1992, Hewlett-Packard Ltd. 7 ** Licensed under the Apache License, Version 2.0 (the "License"); 8 ** you may not use this file except in compliance with the License. 9 ** You may obtain a copy of the License at 10 ** http://www.apache.org/licenses/LICENSE-2.0 11 ** Unless required by applicable law or agreed to in writing, software 12 ** distributed under the License is distributed on an "AS IS" BASIS, 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 ** See the License for the specific language governing permissions and 15 ** limitations under the License. 16 * 17 **********************************************************************/ 18 19 #ifndef RATNGS_H 20 #define RATNGS_H 21 22 #ifdef HAVE_CONFIG_H 23 # include "config_auto.h" // DISABLED_LEGACY_ENGINE 24 #endif 25 26 #include "clst.h" 27 #include "elst.h" 28 #ifndef DISABLED_LEGACY_ENGINE 29 # include "fontinfo.h" 30 #endif // undef DISABLED_LEGACY_ENGINE 31 #include "matrix.h" 32 #include "unicharset.h" 33 #include "werd.h" 34 35 #include <tesseract/unichar.h> 36 37 #include <cassert> 38 #include <cfloat> // for FLT_MAX 39 40 namespace tesseract { 41 42 class MATRIX; 43 struct TBLOB; 44 struct TWERD; 45 46 // Enum to describe the source of a BLOB_CHOICE to make it possible to determine 47 // whether a blob has been classified by inspecting the BLOB_CHOICEs. 48 enum BlobChoiceClassifier { 49 BCC_STATIC_CLASSIFIER, // From the char_norm classifier. 50 BCC_ADAPTED_CLASSIFIER, // From the adaptive classifier. 51 BCC_SPECKLE_CLASSIFIER, // Backup for failed classification. 52 BCC_AMBIG, // Generated by ambiguity detection. 53 BCC_FAKE, // From some other process. 54 }; 55 56 class BLOB_CHOICE : public ELIST_LINK { 57 public: BLOB_CHOICE()58 BLOB_CHOICE() { 59 unichar_id_ = UNICHAR_SPACE; 60 fontinfo_id_ = -1; 61 fontinfo_id2_ = -1; 62 rating_ = 10.0; 63 certainty_ = -1.0; 64 script_id_ = -1; 65 min_xheight_ = 0.0f; 66 max_xheight_ = 0.0f; 67 yshift_ = 0.0f; 68 classifier_ = BCC_FAKE; 69 } 70 BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id 71 float src_rating, // rating 72 float src_cert, // certainty 73 int script_id, // script 74 float min_xheight, // min xheight in image pixel units 75 float max_xheight, // max xheight allowed by this char 76 float yshift, // the larger of y shift (top or bottom) 77 BlobChoiceClassifier c); // adapted match or other 78 BLOB_CHOICE(const BLOB_CHOICE &other); 79 ~BLOB_CHOICE() = default; 80 unichar_id()81 UNICHAR_ID unichar_id() const { 82 return unichar_id_; 83 } rating()84 float rating() const { 85 return rating_; 86 } certainty()87 float certainty() const { 88 return certainty_; 89 } fontinfo_id()90 int16_t fontinfo_id() const { 91 return fontinfo_id_; 92 } fontinfo_id2()93 int16_t fontinfo_id2() const { 94 return fontinfo_id2_; 95 } 96 #ifndef DISABLED_LEGACY_ENGINE fonts()97 const std::vector<ScoredFont> &fonts() const { 98 return fonts_; 99 } set_fonts(const std::vector<ScoredFont> & fonts)100 void set_fonts(const std::vector<ScoredFont> &fonts) { 101 fonts_ = fonts; 102 int score1 = 0, score2 = 0; 103 fontinfo_id_ = -1; 104 fontinfo_id2_ = -1; 105 for (auto &f : fonts_) { 106 if (f.score > score1) { 107 score2 = score1; 108 fontinfo_id2_ = fontinfo_id_; 109 score1 = f.score; 110 fontinfo_id_ = f.fontinfo_id; 111 } else if (f.score > score2) { 112 score2 = f.score; 113 fontinfo_id2_ = f.fontinfo_id; 114 } 115 } 116 } 117 #endif // ndef DISABLED_LEGACY_ENGINE script_id()118 int script_id() const { 119 return script_id_; 120 } matrix_cell()121 const MATRIX_COORD &matrix_cell() { 122 return matrix_cell_; 123 } min_xheight()124 float min_xheight() const { 125 return min_xheight_; 126 } max_xheight()127 float max_xheight() const { 128 return max_xheight_; 129 } yshift()130 float yshift() const { 131 return yshift_; 132 } classifier()133 BlobChoiceClassifier classifier() const { 134 return classifier_; 135 } IsAdapted()136 bool IsAdapted() const { 137 return classifier_ == BCC_ADAPTED_CLASSIFIER; 138 } IsClassified()139 bool IsClassified() const { 140 return classifier_ == BCC_STATIC_CLASSIFIER || classifier_ == BCC_ADAPTED_CLASSIFIER || 141 classifier_ == BCC_SPECKLE_CLASSIFIER; 142 } 143 set_unichar_id(UNICHAR_ID newunichar_id)144 void set_unichar_id(UNICHAR_ID newunichar_id) { 145 unichar_id_ = newunichar_id; 146 } set_rating(float newrat)147 void set_rating(float newrat) { 148 rating_ = newrat; 149 } set_certainty(float newrat)150 void set_certainty(float newrat) { 151 certainty_ = newrat; 152 } set_script(int newscript_id)153 void set_script(int newscript_id) { 154 script_id_ = newscript_id; 155 } set_matrix_cell(int col,int row)156 void set_matrix_cell(int col, int row) { 157 matrix_cell_.col = col; 158 matrix_cell_.row = row; 159 } set_classifier(BlobChoiceClassifier classifier)160 void set_classifier(BlobChoiceClassifier classifier) { 161 classifier_ = classifier; 162 } deep_copy(const BLOB_CHOICE * src)163 static BLOB_CHOICE *deep_copy(const BLOB_CHOICE *src) { 164 auto *choice = new BLOB_CHOICE; 165 *choice = *src; 166 return choice; 167 } 168 // Returns true if *this and other agree on the baseline and x-height 169 // to within some tolerance based on a given estimate of the x-height. 170 bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const; 171 print(const UNICHARSET * unicharset)172 void print(const UNICHARSET *unicharset) const { 173 tprintf("r%.2f c%.2f x[%g,%g]: %d %s", rating_, certainty_, min_xheight_, max_xheight_, 174 unichar_id_, (unicharset == nullptr) ? "" : unicharset->debug_str(unichar_id_).c_str()); 175 } print_full()176 void print_full() const { 177 print(nullptr); 178 tprintf(" script=%d, font1=%d, font2=%d, yshift=%g, classifier=%d\n", script_id_, fontinfo_id_, 179 fontinfo_id2_, yshift_, classifier_); 180 } 181 // Sort function for sorting BLOB_CHOICEs in increasing order of rating. SortByRating(const void * p1,const void * p2)182 static int SortByRating(const void *p1, const void *p2) { 183 const BLOB_CHOICE *bc1 = *static_cast<const BLOB_CHOICE *const *>(p1); 184 const BLOB_CHOICE *bc2 = *static_cast<const BLOB_CHOICE *const *>(p2); 185 return (bc1->rating_ < bc2->rating_) ? -1 : 1; 186 } 187 188 private: 189 // Copy assignment operator. 190 BLOB_CHOICE &operator=(const BLOB_CHOICE &other); 191 192 UNICHAR_ID unichar_id_; // unichar id 193 #ifndef DISABLED_LEGACY_ENGINE 194 // Fonts and scores. Allowed to be empty. 195 std::vector<ScoredFont> fonts_; 196 #endif // ndef DISABLED_LEGACY_ENGINE 197 int16_t fontinfo_id_; // char font information 198 int16_t fontinfo_id2_; // 2nd choice font information 199 // Rating is the classifier distance weighted by the length of the outline 200 // in the blob. In terms of probability, classifier distance is -klog p such 201 // that the resulting distance is in the range [0, 1] and then 202 // rating = w (-k log p) where w is the weight for the length of the outline. 203 // Sums of ratings may be compared meaningfully for words of different 204 // segmentation. 205 float rating_; // size related 206 // Certainty is a number in [-20, 0] indicating the classifier certainty 207 // of the choice. In terms of probability, certainty = 20 (k log p) where 208 // k is defined as above to normalize -klog p to the range [0, 1]. 209 float certainty_; // absolute 210 int script_id_; 211 // Holds the position of this choice in the ratings matrix. 212 // Used to location position in the matrix during path backtracking. 213 MATRIX_COORD matrix_cell_; 214 // X-height range (in image pixels) that this classification supports. 215 float min_xheight_; 216 float max_xheight_; 217 // yshift_ - The vertical distance (in image pixels) the character is 218 // shifted (up or down) from an acceptable y position. 219 float yshift_; 220 BlobChoiceClassifier classifier_; // What generated *this. 221 }; 222 223 // Make BLOB_CHOICE listable. 224 ELISTIZEH(BLOB_CHOICE) 225 226 // Return the BLOB_CHOICE in bc_list matching a given unichar_id, 227 // or nullptr if there is no match. 228 BLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list); 229 230 // Permuter codes used in WERD_CHOICEs. 231 enum PermuterType { 232 NO_PERM, // 0 233 PUNC_PERM, // 1 234 TOP_CHOICE_PERM, // 2 235 LOWER_CASE_PERM, // 3 236 UPPER_CASE_PERM, // 4 237 NGRAM_PERM, // 5 238 NUMBER_PERM, // 6 239 USER_PATTERN_PERM, // 7 240 SYSTEM_DAWG_PERM, // 8 241 DOC_DAWG_PERM, // 9 242 USER_DAWG_PERM, // 10 243 FREQ_DAWG_PERM, // 11 244 COMPOUND_PERM, // 12 245 246 NUM_PERMUTER_TYPES 247 }; 248 249 // ScriptPos tells whether a character is subscript, superscript or normal. 250 enum ScriptPos { SP_NORMAL, SP_SUBSCRIPT, SP_SUPERSCRIPT, SP_DROPCAP }; 251 252 const char *ScriptPosToString(ScriptPos script_pos); 253 254 class TESS_API WERD_CHOICE : public ELIST_LINK { 255 public: 256 static const float kBadRating; 257 static const char *permuter_name(uint8_t permuter); 258 WERD_CHOICE(const UNICHARSET * unicharset)259 WERD_CHOICE(const UNICHARSET *unicharset) : unicharset_(unicharset) { 260 this->init(8); 261 } WERD_CHOICE(const UNICHARSET * unicharset,int reserved)262 WERD_CHOICE(const UNICHARSET *unicharset, int reserved) : unicharset_(unicharset) { 263 this->init(reserved); 264 } WERD_CHOICE(const char * src_string,const char * src_lengths,float src_rating,float src_certainty,uint8_t src_permuter,const UNICHARSET & unicharset)265 WERD_CHOICE(const char *src_string, const char *src_lengths, float src_rating, 266 float src_certainty, uint8_t src_permuter, const UNICHARSET &unicharset) 267 : unicharset_(&unicharset) { 268 this->init(src_string, src_lengths, src_rating, src_certainty, src_permuter); 269 } 270 WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset); WERD_CHOICE(const WERD_CHOICE & word)271 WERD_CHOICE(const WERD_CHOICE &word) : ELIST_LINK(word), unicharset_(word.unicharset_) { 272 this->init(word.length()); 273 this->operator=(word); 274 } 275 ~WERD_CHOICE(); 276 unicharset()277 const UNICHARSET *unicharset() const { 278 return unicharset_; 279 } empty()280 bool empty() const { 281 return length_ == 0; 282 } length()283 inline unsigned length() const { 284 return length_; 285 } adjust_factor()286 float adjust_factor() const { 287 return adjust_factor_; 288 } set_adjust_factor(float factor)289 void set_adjust_factor(float factor) { 290 adjust_factor_ = factor; 291 } unichar_ids()292 inline const std::vector<UNICHAR_ID> &unichar_ids() const { 293 return unichar_ids_; 294 } unichar_id(unsigned index)295 inline UNICHAR_ID unichar_id(unsigned index) const { 296 assert(index < length_); 297 return unichar_ids_[index]; 298 } state(unsigned index)299 inline unsigned state(unsigned index) const { 300 return state_[index]; 301 } BlobPosition(unsigned index)302 ScriptPos BlobPosition(unsigned index) const { 303 if (index >= length_) { 304 return SP_NORMAL; 305 } 306 return script_pos_[index]; 307 } rating()308 inline float rating() const { 309 return rating_; 310 } certainty()311 inline float certainty() const { 312 return certainty_; 313 } certainty(unsigned index)314 inline float certainty(unsigned index) const { 315 return certainties_[index]; 316 } min_x_height()317 inline float min_x_height() const { 318 return min_x_height_; 319 } max_x_height()320 inline float max_x_height() const { 321 return max_x_height_; 322 } set_x_heights(float min_height,float max_height)323 inline void set_x_heights(float min_height, float max_height) { 324 min_x_height_ = min_height; 325 max_x_height_ = max_height; 326 } permuter()327 inline uint8_t permuter() const { 328 return permuter_; 329 } 330 const char *permuter_name() const; 331 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the word, 332 // taken from the appropriate cell in the ratings MATRIX. 333 // Borrowed pointer, so do not delete. 334 BLOB_CHOICE_LIST *blob_choices(unsigned index, MATRIX *ratings) const; 335 336 // Returns the MATRIX_COORD corresponding to the location in the ratings 337 // MATRIX for the given index into the word. 338 MATRIX_COORD MatrixCoord(unsigned index) const; 339 set_unichar_id(UNICHAR_ID unichar_id,unsigned index)340 inline void set_unichar_id(UNICHAR_ID unichar_id, unsigned index) { 341 assert(index < length_); 342 unichar_ids_[index] = unichar_id; 343 } dangerous_ambig_found()344 bool dangerous_ambig_found() const { 345 return dangerous_ambig_found_; 346 } set_dangerous_ambig_found_(bool value)347 void set_dangerous_ambig_found_(bool value) { 348 dangerous_ambig_found_ = value; 349 } set_rating(float new_val)350 inline void set_rating(float new_val) { 351 rating_ = new_val; 352 } set_certainty(float new_val)353 inline void set_certainty(float new_val) { 354 certainty_ = new_val; 355 } set_permuter(uint8_t perm)356 inline void set_permuter(uint8_t perm) { 357 permuter_ = perm; 358 } 359 // Note: this function should only be used if all the fields 360 // are populated manually with set_* functions (rather than 361 // (copy)constructors and append_* functions). set_length(unsigned len)362 inline void set_length(unsigned len) { 363 ASSERT_HOST(reserved_ >= len); 364 length_ = len; 365 } 366 367 /// Make more space in unichar_id_ and fragment_lengths_ arrays. double_the_size()368 inline void double_the_size() { 369 if (reserved_ > 0) { 370 reserved_ *= 2; 371 } else { 372 reserved_ = 1; 373 } 374 unichar_ids_.resize(reserved_); 375 script_pos_.resize(reserved_); 376 state_.resize(reserved_); 377 certainties_.resize(reserved_); 378 } 379 380 /// Initializes WERD_CHOICE - reserves length slots in unichar_ids_ and 381 /// fragment_length_ arrays. Sets other values to default (blank) values. init(unsigned reserved)382 inline void init(unsigned reserved) { 383 reserved_ = reserved; 384 if (reserved > 0) { 385 unichar_ids_.resize(reserved); 386 script_pos_.resize(reserved); 387 state_.resize(reserved); 388 certainties_.resize(reserved); 389 } else { 390 unichar_ids_.clear(); 391 script_pos_.clear(); 392 state_.clear(); 393 certainties_.clear(); 394 } 395 length_ = 0; 396 adjust_factor_ = 1.0f; 397 rating_ = 0.0; 398 certainty_ = FLT_MAX; 399 min_x_height_ = 0.0f; 400 max_x_height_ = FLT_MAX; 401 permuter_ = NO_PERM; 402 unichars_in_script_order_ = false; // Tesseract is strict left-to-right. 403 dangerous_ambig_found_ = false; 404 } 405 406 /// Helper function to build a WERD_CHOICE from the given string, 407 /// fragment lengths, rating, certainty and permuter. 408 /// The function assumes that src_string is not nullptr. 409 /// src_lengths argument could be nullptr, in which case the unichars 410 /// in src_string are assumed to all be of length 1. 411 void init(const char *src_string, const char *src_lengths, float src_rating, float src_certainty, 412 uint8_t src_permuter); 413 414 /// Set the fields in this choice to be default (bad) values. make_bad()415 inline void make_bad() { 416 length_ = 0; 417 rating_ = kBadRating; 418 certainty_ = -FLT_MAX; 419 } 420 421 /// This function assumes that there is enough space reserved 422 /// in the WERD_CHOICE for adding another unichar. 423 /// This is an efficient alternative to append_unichar_id(). append_unichar_id_space_allocated(UNICHAR_ID unichar_id,int blob_count,float rating,float certainty)424 inline void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, 425 float certainty) { 426 assert(reserved_ > length_); 427 length_++; 428 this->set_unichar_id(unichar_id, blob_count, rating, certainty, length_ - 1); 429 } 430 431 void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty); 432 set_unichar_id(UNICHAR_ID unichar_id,int blob_count,float rating,float certainty,unsigned index)433 inline void set_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty, 434 unsigned index) { 435 assert(index < length_); 436 unichar_ids_[index] = unichar_id; 437 state_[index] = blob_count; 438 certainties_[index] = certainty; 439 script_pos_[index] = SP_NORMAL; 440 rating_ += rating; 441 if (certainty < certainty_) { 442 certainty_ = certainty; 443 } 444 } 445 // Sets the entries for the given index from the BLOB_CHOICE, assuming 446 // unit fragment lengths, but setting the state for this index to blob_count. 447 void set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice); 448 449 bool contains_unichar_id(UNICHAR_ID unichar_id) const; 450 void remove_unichar_ids(unsigned index, int num); remove_last_unichar_id()451 inline void remove_last_unichar_id() { 452 --length_; 453 } remove_unichar_id(unsigned index)454 inline void remove_unichar_id(unsigned index) { 455 this->remove_unichar_ids(index, 1); 456 } 457 bool has_rtl_unichar_id() const; 458 void reverse_and_mirror_unichar_ids(); 459 460 // Returns the half-open interval of unichar_id indices [start, end) which 461 // enclose the core portion of this word -- the part after stripping 462 // punctuation from the left and right. 463 void punct_stripped(unsigned *start_core, unsigned *end_core) const; 464 465 // Returns the indices [start, end) containing the core of the word, stripped 466 // of any superscript digits on either side. (i.e., the non-footnote part 467 // of the word). There is no guarantee that the output range is non-empty. 468 void GetNonSuperscriptSpan(int *start, int *end) const; 469 470 // Return a copy of this WERD_CHOICE with the choices [start, end). 471 // The result is useful only for checking against a dictionary. 472 WERD_CHOICE shallow_copy(unsigned start, unsigned end) const; 473 474 void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const; debug_string()475 std::string debug_string() const { 476 std::string word_str; 477 for (unsigned i = 0; i < length_; ++i) { 478 word_str += unicharset_->debug_str(unichar_ids_[i]); 479 word_str += " "; 480 } 481 return word_str; 482 } 483 // Returns true if any unichar_id in the word is a non-space-delimited char. ContainsAnyNonSpaceDelimited()484 bool ContainsAnyNonSpaceDelimited() const { 485 for (unsigned i = 0; i < length_; ++i) { 486 if (!unicharset_->IsSpaceDelimited(unichar_ids_[i])) { 487 return true; 488 } 489 } 490 return false; 491 } 492 // Returns true if the word is all spaces. IsAllSpaces()493 bool IsAllSpaces() const { 494 for (unsigned i = 0; i < length_; ++i) { 495 if (unichar_ids_[i] != UNICHAR_SPACE) { 496 return false; 497 } 498 } 499 return true; 500 } 501 502 // Call this to override the default (strict left to right graphemes) 503 // with the fact that some engine produces a "reading order" set of 504 // Graphemes for each word. set_unichars_in_script_order(bool in_script_order)505 bool set_unichars_in_script_order(bool in_script_order) { 506 return unichars_in_script_order_ = in_script_order; 507 } 508 unichars_in_script_order()509 bool unichars_in_script_order() const { 510 return unichars_in_script_order_; 511 } 512 513 // Returns a UTF-8 string equivalent to the current choice 514 // of UNICHAR IDs. unichar_string()515 std::string &unichar_string() { 516 this->string_and_lengths(&unichar_string_, &unichar_lengths_); 517 return unichar_string_; 518 } 519 520 // Returns a UTF-8 string equivalent to the current choice 521 // of UNICHAR IDs. unichar_string()522 const std::string &unichar_string() const { 523 this->string_and_lengths(&unichar_string_, &unichar_lengths_); 524 return unichar_string_; 525 } 526 527 // Returns the lengths, one byte each, representing the number of bytes 528 // required in the unichar_string for each UNICHAR_ID. unichar_lengths()529 const std::string &unichar_lengths() const { 530 this->string_and_lengths(&unichar_string_, &unichar_lengths_); 531 return unichar_lengths_; 532 } 533 534 // Sets up the script_pos_ member using the blobs_list to get the bln 535 // bounding boxes, *this to get the unichars, and this->unicharset 536 // to get the target positions. If small_caps is true, sub/super are not 537 // considered, but dropcaps are. 538 // NOTE: blobs_list should be the chopped_word blobs. (Fully segemented.) 539 void SetScriptPositions(bool small_caps, TWERD *word, int debug = 0); 540 // Sets all the script_pos_ positions to the given position. 541 void SetAllScriptPositions(ScriptPos position); 542 543 static ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, 544 const TBOX &blob_box, UNICHAR_ID unichar_id); 545 546 // Returns the "dominant" script ID for the word. By "dominant", the script 547 // must account for at least half the characters. Otherwise, it returns 0. 548 // Note that for Japanese, Hiragana and Katakana are simply treated as Han. 549 int GetTopScriptID() const; 550 551 // Fixes the state_ for a chop at the given blob_posiiton. 552 void UpdateStateForSplit(int blob_position); 553 554 // Returns the sum of all the state elements, being the total number of blobs. 555 unsigned TotalOfStates() const; 556 print()557 void print() const { 558 this->print(""); 559 } 560 void print(const char *msg) const; 561 // Prints the segmentation state with an introductory message. 562 void print_state(const char *msg) const; 563 564 // Displays the segmentation state of *this (if not the same as the last 565 // one displayed) and waits for a click in the window. 566 void DisplaySegmentation(TWERD *word); 567 568 WERD_CHOICE &operator+=( // concatanate 569 const WERD_CHOICE &second); // second on first 570 571 WERD_CHOICE &operator=(const WERD_CHOICE &source); 572 573 private: 574 const UNICHARSET *unicharset_; 575 // TODO(rays) Perhaps replace the multiple arrays with an array of structs? 576 // unichar_ids_ is an array of classifier "results" that make up a word. 577 // For each unichar_ids_[i], script_pos_[i] has the sub/super/normal position 578 // of each unichar_id. 579 // state_[i] indicates the number of blobs in WERD_RES::chopped_word that 580 // were put together to make the classification results in the ith position 581 // in unichar_ids_, and certainties_[i] is the certainty of the choice that 582 // was used in this word. 583 // == Change from before == 584 // Previously there was fragment_lengths_ that allowed a word to be 585 // artificially composed of multiple fragment results. Since the new 586 // segmentation search doesn't do fragments, treatment of fragments has 587 // been moved to a lower level, augmenting the ratings matrix with the 588 // combined fragments, and allowing the language-model/segmentation-search 589 // to deal with only the combined unichar_ids. 590 std::vector<UNICHAR_ID> unichar_ids_; // unichar ids that represent the text of the word 591 std::vector<ScriptPos> script_pos_; // Normal/Sub/Superscript of each unichar. 592 std::vector<int> state_; // Number of blobs in each unichar. 593 std::vector<float> certainties_; // Certainty of each unichar. 594 unsigned reserved_; // size of the above arrays 595 unsigned length_; // word length 596 // Factor that was used to adjust the rating. 597 float adjust_factor_; 598 // Rating is the sum of the ratings of the individual blobs in the word. 599 float rating_; // size related 600 // certainty is the min (worst) certainty of the individual blobs in the word. 601 float certainty_; // absolute 602 // xheight computed from the result, or 0 if inconsistent. 603 float min_x_height_; 604 float max_x_height_; 605 uint8_t permuter_; // permuter code 606 607 // Normally, the ratings_ matrix represents the recognition results in order 608 // from left-to-right. However, some engines (say Cube) may return 609 // recognition results in the order of the script's major reading direction 610 // (for Arabic, that is right-to-left). 611 bool unichars_in_script_order_; 612 // True if NoDangerousAmbig found an ambiguity. 613 bool dangerous_ambig_found_; 614 615 // The following variables are populated and passed by reference any 616 // time unichar_string() or unichar_lengths() are called. 617 mutable std::string unichar_string_; 618 mutable std::string unichar_lengths_; 619 }; 620 621 // Make WERD_CHOICE listable. 622 ELISTIZEH(WERD_CHOICE) 623 using BLOB_CHOICE_LIST_VECTOR = std::vector<BLOB_CHOICE_LIST *>; 624 625 // Utilities for comparing WERD_CHOICEs 626 627 bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2); 628 629 // Utilities for debug printing. 630 void print_ratings_list(const char *msg, // intro message 631 BLOB_CHOICE_LIST *ratings, // list of results 632 const UNICHARSET ¤t_unicharset // unicharset that can be used 633 // for id-to-unichar conversion 634 ); 635 636 } // namespace tesseract 637 638 #endif 639