1 /******************************************************************
2  * File:        control.cpp  (Formerly control.c)
3  * Description: Module-independent matcher controller.
4  * Author:      Ray Smith
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 #  include "config_auto.h"
22 #endif
23 
24 #include <cctype>
25 #include <cmath>
26 #include <cstdint> // for int16_t, int32_t
27 #include <cstdio>  // for fclose, fopen, FILE
28 #include <ctime>   // for clock
29 #include "control.h"
30 #ifndef DISABLED_LEGACY_ENGINE
31 #  include "docqual.h"
32 #  include "drawfx.h"
33 #  include "fixspace.h"
34 #endif
35 #include <tesseract/ocrclass.h>
36 #include "lstmrecognizer.h"
37 #include "output.h"
38 #include "pageres.h" // for WERD_RES, PAGE_RES_IT, PAGE_RES, BLO...
39 #ifndef DISABLED_LEGACY_ENGINE
40 #  include "reject.h"
41 #endif
42 #include "sorthelper.h"
43 #include "tesseractclass.h"
44 #include "tessvars.h"
45 #include "werdit.h"
46 
// Fixed name of the scratch file in which ProcessTargetWord saves the current
// parameter values before a per-word config file is applied.
constexpr const char *kBackUpConfigFile = "tempconfigdata.config";
#ifndef DISABLED_LEGACY_ENGINE
// Smallest believable x-height for any text when refitting, expressed as a
// fraction of the original x-height.
constexpr double kMinRefitXHeightFraction = 0.5;
#endif // ! DISABLED_LEGACY_ENGINE
53 
54 /**
55  * Make a word from the selected blobs and run Tess on them.
56  *
57  * @param page_res recognise blobs
58  * @param selection_box within this box
59  */
60 namespace tesseract {
61 
recog_pseudo_word(PAGE_RES * page_res,TBOX & selection_box)62 void Tesseract::recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box) {
63   PAGE_RES_IT *it = make_pseudo_word(page_res, selection_box);
64   if (it != nullptr) {
65     recog_interactive(it);
66     it->DeleteCurrentWord();
67     delete it;
68   }
69 }
70 
71 /**
72  * Recognize a single word in interactive mode.
73  *
74  * @param pr_it the page results iterator
75  */
recog_interactive(PAGE_RES_IT * pr_it)76 bool Tesseract::recog_interactive(PAGE_RES_IT *pr_it) {
77   WordData word_data(*pr_it);
78   SetupWordPassN(2, &word_data);
79   // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.
80   if (lstm_recognizer_ == nullptr) {
81 #ifndef DISABLED_LEGACY_ENGINE
82     classify_word_and_language(2, pr_it, &word_data);
83 #endif // ndef DISABLED_LEGACY_ENGINE
84   } else {
85     classify_word_and_language(1, pr_it, &word_data);
86   }
87 #ifndef DISABLED_LEGACY_ENGINE
88   if (tessedit_debug_quality_metrics) {
89     int16_t char_qual;
90     int16_t good_char_qual;
91     WERD_RES *word_res = pr_it->word();
92     word_char_quality(word_res, &char_qual, &good_char_qual);
93     tprintf(
94         "\n%d chars;  word_blob_quality: %d;  outline_errs: %d; "
95         "char_quality: %d; good_char_quality: %d\n",
96         word_res->reject_map.length(), word_blob_quality(word_res), word_outline_errs(word_res),
97         char_qual, good_char_qual);
98   }
99 #endif // ndef DISABLED_LEGACY_ENGINE
100   return true;
101 }
102 
103 // Helper function to check for a target word and handle it appropriately.
104 // Inspired by Jetsoft's requirement to process only single words on pass2
105 // and beyond.
106 // If word_config is not null:
107 //   If the word_box and target_word_box overlap, read the word_config file
108 //   else reset to previous config data.
109 //   return true.
110 // else
111 //   If the word_box and target_word_box overlap or pass <= 1, return true.
112 // Note that this function uses a fixed temporary file for storing the previous
113 // configs, so it is neither thread-safe, nor process-safe, but the assumption
114 // is that it will only be used for one debug window at a time.
115 //
116 // Since this function is used for debugging (and not to change OCR results)
117 // set only debug params from the word config file.
ProcessTargetWord(const TBOX & word_box,const TBOX & target_word_box,const char * word_config,int pass)118 bool Tesseract::ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box,
119                                   const char *word_config, int pass) {
120   if (word_config != nullptr) {
121     if (word_box.major_overlap(target_word_box)) {
122       if (backup_config_file_ == nullptr) {
123         backup_config_file_ = kBackUpConfigFile;
124         FILE *config_fp = fopen(backup_config_file_, "wb");
125         if (config_fp == nullptr) {
126           tprintf("Error, failed to open file \"%s\"\n", backup_config_file_);
127         } else {
128           ParamUtils::PrintParams(config_fp, params());
129           fclose(config_fp);
130         }
131         ParamUtils::ReadParamsFile(word_config, SET_PARAM_CONSTRAINT_DEBUG_ONLY, params());
132       }
133     } else {
134       if (backup_config_file_ != nullptr) {
135         ParamUtils::ReadParamsFile(backup_config_file_, SET_PARAM_CONSTRAINT_DEBUG_ONLY, params());
136         backup_config_file_ = nullptr;
137       }
138     }
139   } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
140     return false;
141   }
142   return true;
143 }
144 
145 /** If tesseract is to be run, sets the words up ready for it. */
SetupAllWordsPassN(int pass_n,const TBOX * target_word_box,const char * word_config,PAGE_RES * page_res,std::vector<WordData> * words)146 void Tesseract::SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config,
147                                    PAGE_RES *page_res, std::vector<WordData> *words) {
148   // Prepare all the words.
149   PAGE_RES_IT page_res_it(page_res);
150   for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
151     if (target_word_box == nullptr || ProcessTargetWord(page_res_it.word()->word->bounding_box(),
152                                                         *target_word_box, word_config, 1)) {
153       words->push_back(WordData(page_res_it));
154     }
155   }
156   // Setup all the words for recognition with polygonal approximation.
157   for (unsigned w = 0; w < words->size(); ++w) {
158     SetupWordPassN(pass_n, &(*words)[w]);
159     if (w > 0) {
160       (*words)[w].prev_word = &(*words)[w - 1];
161     }
162   }
163 }
164 
165 // Sets up the single word ready for whichever engine is to be run.
SetupWordPassN(int pass_n,WordData * word)166 void Tesseract::SetupWordPassN(int pass_n, WordData *word) {
167   if (pass_n == 1 || !word->word->done) {
168     if (pass_n == 1) {
169       word->word->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode,
170                                       nullptr, classify_bln_numeric_mode, textord_use_cjk_fp_model,
171                                       poly_allow_detailed_fx, word->row, word->block);
172     } else if (pass_n == 2) {
173       // TODO(rays) Should we do this on pass1 too?
174       word->word->caps_height = 0.0;
175       if (word->word->x_height == 0.0f) {
176         word->word->x_height = word->row->x_height();
177       }
178     }
179     word->lang_words.truncate(0);
180     for (unsigned s = 0; s <= sub_langs_.size(); ++s) {
181       // The sub_langs_.size() entry is for the master language.
182       Tesseract *lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
183       auto *word_res = new WERD_RES;
184       word_res->InitForRetryRecognition(*word->word);
185       word->lang_words.push_back(word_res);
186       // LSTM doesn't get setup for pass2.
187       if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
188         word_res->SetupForRecognition(
189             lang_t->unicharset, lang_t, BestPix(), lang_t->tessedit_ocr_engine_mode, nullptr,
190             lang_t->classify_bln_numeric_mode, lang_t->textord_use_cjk_fp_model,
191             lang_t->poly_allow_detailed_fx, word->row, word->block);
192       }
193     }
194   }
195 }
196 
197 // Runs word recognition on all the words.
RecogAllWordsPassN(int pass_n,ETEXT_DESC * monitor,PAGE_RES_IT * pr_it,std::vector<WordData> * words)198 bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it,
199                                    std::vector<WordData> *words) {
200   // TODO(rays) Before this loop can be parallelized (it would yield a massive
201   // speed-up) all remaining member globals need to be converted to local/heap
202   // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
203   // added. The results will be significantly different with adaption on, and
204   // deterioration will need investigation.
205   pr_it->restart_page();
206   for (unsigned w = 0; w < words->size(); ++w) {
207     WordData *word = &(*words)[w];
208     if (w > 0) {
209       word->prev_word = &(*words)[w - 1];
210     }
211     if (monitor != nullptr) {
212       monitor->ocr_alive = true;
213       if (pass_n == 1) {
214         monitor->progress = 70 * w / words->size();
215       } else {
216         monitor->progress = 70 + 30 * w / words->size();
217       }
218       if (monitor->progress_callback2 != nullptr) {
219         TBOX box = pr_it->word()->word->bounding_box();
220         (*monitor->progress_callback2)(monitor, box.left(), box.right(), box.top(), box.bottom());
221       }
222       if (monitor->deadline_exceeded() ||
223           (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this, words->size()))) {
224         // Timeout. Fake out the rest of the words.
225         for (; w < words->size(); ++w) {
226           (*words)[w].word->SetupFake(unicharset);
227         }
228         return false;
229       }
230     }
231     if (word->word->tess_failed) {
232       unsigned s;
233       for (s = 0; s < word->lang_words.size() && word->lang_words[s]->tess_failed; ++s) {
234       }
235       // If all are failed, skip it. Image words are skipped by this test.
236       if (s > word->lang_words.size()) {
237         continue;
238       }
239     }
240     // Sync pr_it with the WordData.
241     while (pr_it->word() != nullptr && pr_it->word() != word->word) {
242       pr_it->forward();
243     }
244     ASSERT_HOST(pr_it->word() != nullptr);
245     bool make_next_word_fuzzy = false;
246 #ifndef DISABLED_LEGACY_ENGINE
247     if (!AnyLSTMLang() && ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
248       // Needs to be setup again to see the new outlines in the chopped_word.
249       SetupWordPassN(pass_n, word);
250     }
251 #endif // ndef DISABLED_LEGACY_ENGINE
252 
253     classify_word_and_language(pass_n, pr_it, word);
254     if (tessedit_dump_choices || debug_noise_removal) {
255       tprintf("Pass%d: %s [%s]\n", pass_n, word->word->best_choice->unichar_string().c_str(),
256               word->word->best_choice->debug_string().c_str());
257     }
258     pr_it->forward();
259     if (make_next_word_fuzzy && pr_it->word() != nullptr) {
260       pr_it->MakeCurrentWordFuzzy();
261     }
262   }
263   return true;
264 }
265 
266 /**
267  * recog_all_words()
268  *
269  * Walk the page_res, recognizing all the words.
270  * If monitor is not null, it is used as a progress monitor/timeout/cancel.
271  * If dopasses is 0, all recognition passes are run,
272  * 1 just pass 1, 2 passes2 and higher.
273  * If target_word_box is not null, special things are done to words that
274  * overlap the target_word_box:
275  * if word_config is not null, the word config file is read for just the
276  * target word(s), otherwise, on pass 2 and beyond ONLY the target words
277  * are processed (Jetsoft modification.)
278  * Returns false if we cancelled prematurely.
279  *
280  * @param page_res page structure
281  * @param monitor progress monitor
282  * @param word_config word_config file
283  * @param target_word_box specifies just to extract a rectangle
284  * @param dopasses 0 - all, 1 just pass 1, 2 passes 2 and higher
285  */
286 
recog_all_words(PAGE_RES * page_res,ETEXT_DESC * monitor,const TBOX * target_word_box,const char * word_config,int dopasses)287 bool Tesseract::recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor,
288                                 const TBOX *target_word_box, const char *word_config,
289                                 int dopasses) {
290   PAGE_RES_IT page_res_it(page_res);
291 
292   if (tessedit_minimal_rej_pass1) {
293     tessedit_test_adaption.set_value(true);
294     tessedit_minimal_rejection.set_value(true);
295   }
296 
297   if (dopasses == 0 || dopasses == 1) {
298     page_res_it.restart_page();
299     // ****************** Pass 1 *******************
300 
301 #ifndef DISABLED_LEGACY_ENGINE
302     // If the adaptive classifier is full switch to one we prepared earlier,
303     // ie on the previous page. If the current adaptive classifier is non-empty,
304     // prepare a backup starting at this page, in case it fills up. Do all this
305     // independently for each language.
306     if (AdaptiveClassifierIsFull()) {
307       SwitchAdaptiveClassifier();
308     } else if (!AdaptiveClassifierIsEmpty()) {
309       StartBackupAdaptiveClassifier();
310     }
311     // Now check the sub-langs as well.
312     for (auto &lang : sub_langs_) {
313       if (lang->AdaptiveClassifierIsFull()) {
314         lang->SwitchAdaptiveClassifier();
315       } else if (!lang->AdaptiveClassifierIsEmpty()) {
316         lang->StartBackupAdaptiveClassifier();
317       }
318     }
319 
320 #endif // ndef DISABLED_LEGACY_ENGINE
321 
322     // Set up all words ready for recognition, so that if parallelism is on
323     // all the input and output classes are ready to run the classifier.
324     std::vector<WordData> words;
325     SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
326 #ifndef DISABLED_LEGACY_ENGINE
327     if (tessedit_parallelize) {
328       PrerecAllWordsPar(words);
329     }
330 #endif // ndef DISABLED_LEGACY_ENGINE
331 
332     stats_.word_count = words.size();
333 
334     stats_.dict_words = 0;
335     stats_.doc_blob_quality = 0;
336     stats_.doc_outline_errs = 0;
337     stats_.doc_char_quality = 0;
338     stats_.good_char_count = 0;
339     stats_.doc_good_char_quality = 0;
340 
341     most_recently_used_ = this;
342     // Run pass 1 word recognition.
343     if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) {
344       return false;
345     }
346     // Pass 1 post-processing.
347     for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
348       if (page_res_it.word()->word->flag(W_REP_CHAR)) {
349         fix_rep_char(&page_res_it);
350         continue;
351       }
352 
353       // Count dict words.
354       if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) {
355         ++(stats_.dict_words);
356       }
357 
358       // Update misadaption log (we only need to do it on pass 1, since
359       // adaption only happens on this pass).
360       if (page_res_it.word()->blamer_bundle != nullptr &&
361           page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
362         page_res->misadaption_log.push_back(page_res_it.word()->blamer_bundle->misadaption_debug());
363       }
364     }
365   }
366 
367   if (dopasses == 1) {
368     return true;
369   }
370 
371 #ifndef DISABLED_LEGACY_ENGINE
372 
373   // ****************** Pass 2 *******************
374   if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption && AnyTessLang()) {
375     page_res_it.restart_page();
376     std::vector<WordData> words;
377     SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
378     if (tessedit_parallelize) {
379       PrerecAllWordsPar(words);
380     }
381     most_recently_used_ = this;
382     // Run pass 2 word recognition.
383     if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) {
384       return false;
385     }
386   }
387 
388   // The next passes are only required for Tess-only.
389   if (AnyTessLang() && !AnyLSTMLang()) {
390     // ****************** Pass 3 *******************
391     // Fix fuzzy spaces.
392 
393     if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces && !tessedit_word_for_word &&
394         !right_to_left()) {
395       fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
396     }
397 
398     // ****************** Pass 4 *******************
399     if (tessedit_enable_dict_correction) {
400       dictionary_correction_pass(page_res);
401     }
402     if (tessedit_enable_bigram_correction) {
403       bigram_correction_pass(page_res);
404     }
405 
406     // ****************** Pass 5,6 *******************
407     rejection_passes(page_res, monitor, target_word_box, word_config);
408 
409     // ****************** Pass 8 *******************
410     font_recognition_pass(page_res);
411 
412     // ****************** Pass 9 *******************
413     // Check the correctness of the final results.
414     blamer_pass(page_res);
415     script_pos_pass(page_res);
416   }
417 
418 #endif // ndef DISABLED_LEGACY_ENGINE
419 
420   // Write results pass.
421   // This is now redundant, but retained commented so show how to obtain
422   // bounding boxes and style information.
423 
424 #ifndef DISABLED_LEGACY_ENGINE
425   // changed by jetsoft
426   // needed for dll to output memory structure
427   if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) {
428     output_pass(page_res_it, target_word_box);
429   }
430 // end jetsoft
431 #endif // ndef DISABLED_LEGACY_ENGINE
432 
433   const auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode));
434   textord_.CleanupSingleRowResult(pageseg_mode, page_res);
435 
436   // Remove empty words, as these mess up the result iterators.
437   for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
438     const WERD_RES *word = page_res_it.word();
439     const POLY_BLOCK *pb = page_res_it.block()->block != nullptr
440                                ? page_res_it.block()->block->pdblk.poly_block()
441                                : nullptr;
442     if (word->best_choice == nullptr || word->best_choice->empty() ||
443         (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) {
444       page_res_it.DeleteCurrentWord();
445     }
446   }
447 
448   if (monitor != nullptr) {
449     monitor->progress = 100;
450   }
451   return true;
452 }
453 
454 #ifndef DISABLED_LEGACY_ENGINE
455 
bigram_correction_pass(PAGE_RES * page_res)456 void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
457   PAGE_RES_IT word_it(page_res);
458 
459   WERD_RES *w_prev = nullptr;
460   WERD_RES *w = word_it.word();
461   while (true) {
462     w_prev = w;
463     while (word_it.forward() != nullptr && (!word_it.word() || word_it.word()->part_of_combo)) {
464       // advance word_it, skipping over parts of combos
465     }
466     if (!word_it.word()) {
467       break;
468     }
469     w = word_it.word();
470     if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
471       continue;
472     }
473     if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
474       if (tessedit_bigram_debug) {
475         tprintf("Skipping because one of the words is W_REP_CHAR\n");
476       }
477       continue;
478     }
479     // Two words sharing the same language model, excellent!
480     std::vector<WERD_CHOICE *> overrides_word1;
481     std::vector<WERD_CHOICE *> overrides_word2;
482 
483     const auto orig_w1_str = w_prev->best_choice->unichar_string();
484     const auto orig_w2_str = w->best_choice->unichar_string();
485     WERD_CHOICE prev_best(w->uch_set);
486     {
487       int w1start, w1end;
488       w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
489       prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
490     }
491     WERD_CHOICE this_best(w->uch_set);
492     {
493       int w2start, w2end;
494       w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
495       this_best = w->best_choice->shallow_copy(w2start, w2end);
496     }
497 
498     if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
499       if (tessedit_bigram_debug) {
500         tprintf("Top choice \"%s %s\" verified by bigram model.\n", orig_w1_str.c_str(),
501                 orig_w2_str.c_str());
502       }
503       continue;
504     }
505     if (tessedit_bigram_debug > 2) {
506       tprintf("Examining alt choices for \"%s %s\".\n", orig_w1_str.c_str(), orig_w2_str.c_str());
507     }
508     if (tessedit_bigram_debug > 1) {
509       if (!w_prev->best_choices.singleton()) {
510         w_prev->PrintBestChoices();
511       }
512       if (!w->best_choices.singleton()) {
513         w->PrintBestChoices();
514       }
515     }
516     float best_rating = 0.0;
517     int best_idx = 0;
518     WERD_CHOICE_IT prev_it(&w_prev->best_choices);
519     for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
520       WERD_CHOICE *p1 = prev_it.data();
521       WERD_CHOICE strip1(w->uch_set);
522       {
523         int p1start, p1end;
524         p1->GetNonSuperscriptSpan(&p1start, &p1end);
525         strip1 = p1->shallow_copy(p1start, p1end);
526       }
527       WERD_CHOICE_IT w_it(&w->best_choices);
528       for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
529         WERD_CHOICE *p2 = w_it.data();
530         WERD_CHOICE strip2(w->uch_set);
531         {
532           int p2start, p2end;
533           p2->GetNonSuperscriptSpan(&p2start, &p2end);
534           strip2 = p2->shallow_copy(p2start, p2end);
535         }
536         if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
537           overrides_word1.push_back(p1);
538           overrides_word2.push_back(p2);
539           if (overrides_word1.size() == 1 || p1->rating() + p2->rating() < best_rating) {
540             best_rating = p1->rating() + p2->rating();
541             best_idx = overrides_word1.size() - 1;
542           }
543         }
544       }
545     }
546     if (!overrides_word1.empty()) {
547       // Excellent, we have some bigram matches.
548       if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice, *overrides_word1[best_idx]) &&
549           EqualIgnoringCaseAndTerminalPunct(*w->best_choice, *overrides_word2[best_idx])) {
550         if (tessedit_bigram_debug > 1) {
551           tprintf(
552               "Top choice \"%s %s\" verified (sans case) by bigram "
553               "model.\n",
554               orig_w1_str.c_str(), orig_w2_str.c_str());
555         }
556         continue;
557       }
558       const auto new_w1_str = overrides_word1[best_idx]->unichar_string();
559       const auto new_w2_str = overrides_word2[best_idx]->unichar_string();
560       if (new_w1_str != orig_w1_str) {
561         w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
562       }
563       if (new_w2_str != orig_w2_str) {
564         w->ReplaceBestChoice(overrides_word2[best_idx]);
565       }
566       if (tessedit_bigram_debug > 0) {
567         std::string choices_description;
568         int num_bigram_choices = overrides_word1.size() * overrides_word2.size();
569         if (num_bigram_choices == 1) {
570           choices_description = "This was the unique bigram choice.";
571         } else {
572           if (tessedit_bigram_debug > 1) {
573             std::string bigrams_list;
574             const int kMaxChoicesToPrint = 20;
575             for (unsigned i = 0; i < overrides_word1.size() && i < kMaxChoicesToPrint; i++) {
576               if (i > 0) {
577                 bigrams_list += ", ";
578               }
579               WERD_CHOICE *p1 = overrides_word1[i];
580               WERD_CHOICE *p2 = overrides_word2[i];
581               bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
582             }
583             choices_description = "There were many choices: {";
584             choices_description += bigrams_list;
585             choices_description += "}";
586           } else {
587             choices_description += "There were " + std::to_string(num_bigram_choices);
588             choices_description += " compatible bigrams.";
589           }
590         }
591         tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n", orig_w1_str.c_str(),
592                 orig_w2_str.c_str(), new_w1_str.c_str(), new_w2_str.c_str(),
593                 choices_description.c_str());
594       }
595     }
596   }
597 }
598 
rejection_passes(PAGE_RES * page_res,ETEXT_DESC * monitor,const TBOX * target_word_box,const char * word_config)599 void Tesseract::rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor,
600                                  const TBOX *target_word_box, const char *word_config) {
601   PAGE_RES_IT page_res_it(page_res);
602   // ****************** Pass 5 *******************
603   // Gather statistics on rejects.
604   int word_index = 0;
605   while (!tessedit_test_adaption && page_res_it.word() != nullptr) {
606     WERD_RES *word = page_res_it.word();
607     word_index++;
608     if (monitor != nullptr) {
609       monitor->ocr_alive = true;
610       monitor->progress = 95 + 5 * word_index / stats_.word_count;
611     }
612     if (word->rebuild_word == nullptr) {
613       // Word was not processed by tesseract.
614       page_res_it.forward();
615       continue;
616     }
617     check_debug_pt(word, 70);
618 
619     // changed by jetsoft
620     // specific to its needs to extract one word when need
621     if (target_word_box &&
622         !ProcessTargetWord(word->word->bounding_box(), *target_word_box, word_config, 4)) {
623       page_res_it.forward();
624       continue;
625     }
626     // end jetsoft
627 
628     page_res_it.rej_stat_word();
629     const int chars_in_word = word->reject_map.length();
630     const int rejects_in_word = word->reject_map.reject_count();
631 
632     const int blob_quality = word_blob_quality(word);
633     stats_.doc_blob_quality += blob_quality;
634     const int outline_errs = word_outline_errs(word);
635     stats_.doc_outline_errs += outline_errs;
636     int16_t all_char_quality;
637     int16_t accepted_all_char_quality;
638     word_char_quality(word, &all_char_quality, &accepted_all_char_quality);
639     stats_.doc_char_quality += all_char_quality;
640     const uint8_t permuter_type = word->best_choice->permuter();
641     if ((permuter_type == SYSTEM_DAWG_PERM) || (permuter_type == FREQ_DAWG_PERM) ||
642         (permuter_type == USER_DAWG_PERM)) {
643       stats_.good_char_count += chars_in_word - rejects_in_word;
644       stats_.doc_good_char_quality += accepted_all_char_quality;
645     }
646     check_debug_pt(word, 80);
647     if (tessedit_reject_bad_qual_wds && (blob_quality == 0) && (outline_errs >= chars_in_word)) {
648       word->reject_map.rej_word_bad_quality();
649     }
650     check_debug_pt(word, 90);
651     page_res_it.forward();
652   }
653 
654   if (tessedit_debug_quality_metrics) {
655     tprintf(
656         "QUALITY: num_chs= %d  num_rejs= %d %5.3f blob_qual= %d %5.3f"
657         " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
658         page_res->char_count, page_res->rej_count,
659         page_res->rej_count / static_cast<float>(page_res->char_count), stats_.doc_blob_quality,
660         stats_.doc_blob_quality / static_cast<float>(page_res->char_count), stats_.doc_outline_errs,
661         stats_.doc_outline_errs / static_cast<float>(page_res->char_count), stats_.doc_char_quality,
662         stats_.doc_char_quality / static_cast<float>(page_res->char_count),
663         stats_.doc_good_char_quality,
664         (stats_.good_char_count > 0)
665             ? (stats_.doc_good_char_quality / static_cast<float>(stats_.good_char_count))
666             : 0.0);
667   }
668   bool good_quality_doc =
669       ((page_res->rej_count / static_cast<float>(page_res->char_count)) <= quality_rej_pc) &&
670       (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >= quality_blob_pc) &&
671       (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <= quality_outline_pc) &&
672       (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >= quality_char_pc);
673 
674   // ****************** Pass 6 *******************
675   // Do whole document or whole block rejection pass
676   if (!tessedit_test_adaption) {
677     quality_based_rejection(page_res_it, good_quality_doc);
678   }
679 }
680 
681 #endif // ndef DISABLED_LEGACY_ENGINE
682 
blamer_pass(PAGE_RES * page_res)683 void Tesseract::blamer_pass(PAGE_RES *page_res) {
684   if (!wordrec_run_blamer) {
685     return;
686   }
687   PAGE_RES_IT page_res_it(page_res);
688   for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
689     WERD_RES *word = page_res_it.word();
690     BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word);
691     page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason()]++;
692   }
693   tprintf("Blame reasons:\n");
694   for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
695     tprintf("%s %d\n", BlamerBundle::IncorrectReasonName(static_cast<IncorrectResultReason>(bl)),
696             page_res->blame_reasons[bl]);
697   }
698   if (page_res->misadaption_log.size() > 0) {
699     tprintf("Misadaption log:\n");
700     for (auto &log : page_res->misadaption_log) {
701       tprintf("%s\n", log.c_str());
702     }
703   }
704 }
705 
706 // Sets script positions and detects smallcaps on all output words.
script_pos_pass(PAGE_RES * page_res)707 void Tesseract::script_pos_pass(PAGE_RES *page_res) {
708   PAGE_RES_IT page_res_it(page_res);
709   for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
710     WERD_RES *word = page_res_it.word();
711     if (word->word->flag(W_REP_CHAR)) {
712       page_res_it.forward();
713       continue;
714     }
715     const float x_height = page_res_it.block()->block->x_height();
716     float word_x_height = word->x_height;
717     if (word_x_height < word->best_choice->min_x_height() ||
718         word_x_height > word->best_choice->max_x_height()) {
719       word_x_height =
720           (word->best_choice->min_x_height() + word->best_choice->max_x_height()) / 2.0f;
721     }
722     // Test for small caps. Word capheight must be close to block xheight,
723     // and word must contain no lower case letters, and at least one upper case.
724     const double small_cap_xheight = x_height * kXHeightCapRatio;
725     const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
726     if (word->uch_set->script_has_xheight() &&
727         small_cap_xheight - small_cap_delta <= word_x_height &&
728         word_x_height <= small_cap_xheight + small_cap_delta) {
729       // Scan for upper/lower.
730       int num_upper = 0;
731       int num_lower = 0;
732       for (unsigned i = 0; i < word->best_choice->length(); ++i) {
733         if (word->uch_set->get_isupper(word->best_choice->unichar_id(i))) {
734           ++num_upper;
735         } else if (word->uch_set->get_islower(word->best_choice->unichar_id(i))) {
736           ++num_lower;
737         }
738       }
739       if (num_upper > 0 && num_lower == 0) {
740         word->small_caps = true;
741       }
742     }
743     word->SetScriptPositions();
744   }
745 }
746 
747 // Helper finds the gap between the index word and the next.
WordGap(const PointerVector<WERD_RES> & words,unsigned index,int * right,int * next_left)748 static void WordGap(const PointerVector<WERD_RES> &words, unsigned index, int *right, int *next_left) {
749   *right = -INT32_MAX;
750   *next_left = INT32_MAX;
751   if (index < words.size()) {
752     *right = words[index]->word->bounding_box().right();
753     if (index + 1 < words.size()) {
754       *next_left = words[index + 1]->word->bounding_box().left();
755     }
756   }
757 }
758 
759 // Factored helper computes the rating, certainty, badness and validity of
760 // the permuter of the words in [first_index, end_index).
EvaluateWordSpan(const PointerVector<WERD_RES> & words,unsigned first_index,unsigned end_index,float * rating,float * certainty,bool * bad,bool * valid_permuter)761 static void EvaluateWordSpan(const PointerVector<WERD_RES> &words, unsigned first_index, unsigned end_index,
762                              float *rating, float *certainty, bool *bad, bool *valid_permuter) {
763   if (end_index <= first_index) {
764     *bad = true;
765     *valid_permuter = false;
766   }
767   for (unsigned index = first_index; index < end_index && index < words.size(); ++index) {
768     WERD_CHOICE *choice = words[index]->best_choice;
769     if (choice == nullptr) {
770       *bad = true;
771     } else {
772       *rating += choice->rating();
773       *certainty = std::min(*certainty, choice->certainty());
774       if (!Dict::valid_word_permuter(choice->permuter(), false)) {
775         *valid_permuter = false;
776       }
777     }
778   }
779 }
780 
// Helper chooses the best combination of words, transferring good ones from
// new_words to best_words. To win, a new word must have (better rating and
// certainty) or (better permuter status and rating within rating ratio and
// certainty within certainty margin) than current best.
// All the new_words are consumed (moved to best_words or deleted.)
// The return value is the number of new_words used minus the number of
// best_words that remain in the output.
//
// rating_ratio:     factor applied to the best rating in the permuter-based
//                   comparison (n_rating < b_rating * rating_ratio).
// certainty_margin: slack subtracted from the best certainty in the
//                   permuter-based comparison.
// debug:            when true, one tprintf line is emitted per group.
// new_words:        candidate words; entries that are chosen are replaced by
//                   nullptr in this vector.
// best_words:       current best words; cleared and refilled with the
//                   chosen words on return.
static int SelectBestWords(double rating_ratio, double certainty_margin, bool debug,
                           PointerVector<WERD_RES> *new_words,
                           PointerVector<WERD_RES> *best_words) {
  // Process the smallest groups of words that have an overlapping word
  // boundary at the end.
  std::vector<WERD_RES *> out_words;
  // Index into each word vector (best, new).
  unsigned b = 0, n = 0;
  int num_best = 0, num_new = 0;
  while (b < best_words->size() || n < new_words->size()) {
    // Start of the current run in each.
    auto start_b = b, start_n = n;
    while (b < best_words->size() || n < new_words->size()) {
      int b_right = -INT32_MAX;
      int next_b_left = INT32_MAX;
      WordGap(*best_words, b, &b_right, &next_b_left);
      int n_right = -INT32_MAX;
      int next_n_left = INT32_MAX;
      WordGap(*new_words, n, &n_right, &next_n_left);
      // Both current words end before either following word starts.
      if (std::max(b_right, n_right) < std::min(next_b_left, next_n_left)) {
        // The word breaks overlap. [start_b,b] and [start_n, n] match.
        break;
      }
      // Keep searching for the matching word break.
      // Advance whichever list currently ends further left; if new_words is
      // exhausted only b can advance.
      if ((b_right < n_right && b < best_words->size()) || n == new_words->size()) {
        ++b;
      } else {
        ++n;
      }
    }
    // Rating of the current run in each.
    float b_rating = 0.0f, n_rating = 0.0f;
    // Certainty of the current run in each.
    float b_certainty = 0.0f, n_certainty = 0.0f;
    // True if any word is missing its best choice.
    bool b_bad = false, n_bad = false;
    // True if all words have a valid permuter.
    bool b_valid_permuter = true, n_valid_permuter = true;
    // One-past-the-end of each run: b and n are inclusive while they still
    // index a real word, and already equal to size() when the list ran out.
    const int end_b = b < best_words->size() ? b + 1 : b;
    const int end_n = n < new_words->size() ? n + 1 : n;
    EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty, &b_bad,
                     &b_valid_permuter);
    EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty, &n_bad,
                     &n_valid_permuter);
    bool new_better = false;
    if (!n_bad && (b_bad || (n_certainty > b_certainty && n_rating < b_rating) ||
                   (!b_valid_permuter && n_valid_permuter && n_rating < b_rating * rating_ratio &&
                    n_certainty > b_certainty - certainty_margin))) {
      // New is better.
      for (int i = start_n; i < end_n; ++i) {
        out_words.push_back((*new_words)[i]);
        // Null the slot so the pointer is held by out_words only.
        (*new_words)[i] = nullptr;
        ++num_new;
      }
      new_better = true;
    } else if (!b_bad) {
      // Current best is better.
      for (int i = start_b; i < end_b; ++i) {
        out_words.push_back((*best_words)[i]);
        // Null the slot so the pointer is held by out_words only.
        (*best_words)[i] = nullptr;
        ++num_best;
      }
    }
    // If both runs are bad, neither is copied to out_words.
    if (debug) {
      tprintf(
          "%d new words %s than %d old words: r: %g v %g c: %g v %g"
          " valid dict: %d v %d\n",
          end_n - start_n, new_better ? "better" : "worse", end_b - start_b, n_rating, b_rating,
          n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
    }
    // Move on to the next group.
    b = end_b;
    n = end_n;
  }
  // Transfer from out_words to best_words.
  best_words->clear();
  for (auto &out_word : out_words) {
    best_words->push_back(out_word);
  }
  return num_new - num_best;
}
869 
870 // Helper to recognize the word using the given (language-specific) tesseract.
871 // Returns positive if this recognizer found more new best words than the
872 // number kept from best_words.
RetryWithLanguage(const WordData & word_data,WordRecognizer recognizer,bool debug,WERD_RES ** in_word,PointerVector<WERD_RES> * best_words)873 int Tesseract::RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug,
874                                  WERD_RES **in_word, PointerVector<WERD_RES> *best_words) {
875   if (debug) {
876     tprintf("Trying word using lang %s, oem %d\n", lang.c_str(),
877             static_cast<int>(tessedit_ocr_engine_mode));
878   }
879   // Run the recognizer on the word.
880   PointerVector<WERD_RES> new_words;
881   (this->*recognizer)(word_data, in_word, &new_words);
882   if (new_words.empty()) {
883     // Transfer input word to new_words, as the classifier must have put
884     // the result back in the input.
885     new_words.push_back(*in_word);
886     *in_word = nullptr;
887   }
888   if (debug) {
889     for (unsigned i = 0; i < new_words.size(); ++i) {
890       new_words[i]->DebugTopChoice("Lang result");
891     }
892   }
893   // Initial version is a bit of a hack based on better certainty and rating
894   // or a dictionary vs non-dictionary word.
895   return SelectBestWords(classify_max_rating_ratio, classify_max_certainty_margin, debug,
896                          &new_words, best_words);
897 }
898 
899 // Helper returns true if all the words are acceptable.
WordsAcceptable(const PointerVector<WERD_RES> & words)900 static bool WordsAcceptable(const PointerVector<WERD_RES> &words) {
901   for (unsigned w = 0; w < words.size(); ++w) {
902     if (words[w]->tess_failed || !words[w]->tess_accepted) {
903       return false;
904     }
905   }
906   return true;
907 }
908 
909 #ifndef DISABLED_LEGACY_ENGINE
910 
911 // Moves good-looking "noise"/diacritics from the reject list to the main
912 // blob list on the current word. Returns true if anything was done, and
913 // sets make_next_word_fuzzy if blob(s) were added to the end of the word.
ReassignDiacritics(int pass,PAGE_RES_IT * pr_it,bool * make_next_word_fuzzy)914 bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy) {
915   *make_next_word_fuzzy = false;
916   WERD *real_word = pr_it->word()->word;
917   if (real_word->rej_cblob_list()->empty() || real_word->cblob_list()->empty() ||
918       real_word->rej_cblob_list()->length() > noise_maxperword) {
919     return false;
920   }
921   real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
922   // Get the noise outlines into a vector with matching bool map.
923   std::vector<C_OUTLINE *> outlines;
924   real_word->GetNoiseOutlines(&outlines);
925   std::vector<bool> word_wanted;
926   std::vector<bool> overlapped_any_blob;
927   std::vector<C_BLOB *> target_blobs;
928   AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, &word_wanted,
929                                      &overlapped_any_blob, &target_blobs);
930   // Filter the outlines that overlapped any blob and put them into the word
931   // now. This simplifies the remaining task and also makes it more accurate
932   // as it has more completed blobs to work on.
933   std::vector<bool> wanted;
934   std::vector<C_BLOB *> wanted_blobs;
935   std::vector<C_OUTLINE *> wanted_outlines;
936   int num_overlapped = 0;
937   int num_overlapped_used = 0;
938   for (unsigned i = 0; i < overlapped_any_blob.size(); ++i) {
939     if (overlapped_any_blob[i]) {
940       ++num_overlapped;
941       if (word_wanted[i]) {
942         ++num_overlapped_used;
943       }
944       wanted.push_back(word_wanted[i]);
945       wanted_blobs.push_back(target_blobs[i]);
946       wanted_outlines.push_back(outlines[i]);
947       outlines[i] = nullptr;
948     }
949   }
950   real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr);
951   AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted, &target_blobs);
952   int non_overlapped = 0;
953   int non_overlapped_used = 0;
954   for (unsigned i = 0; i < word_wanted.size(); ++i) {
955     if (word_wanted[i]) {
956       ++non_overlapped_used;
957     }
958     if (outlines[i] != nullptr) {
959       ++non_overlapped_used;
960     }
961   }
962   if (debug_noise_removal) {
963     tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:", num_overlapped_used,
964             num_overlapped, non_overlapped_used, non_overlapped);
965     real_word->bounding_box().print();
966   }
967   // Now we have decided which outlines we want, put them into the real_word.
968   if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines, make_next_word_fuzzy)) {
969     pr_it->MakeCurrentWordFuzzy();
970   }
971   // TODO(rays) Parts of combos have a deep copy of the real word, and need
972   // to have their noise outlines moved/assigned in the same way!!
973   return num_overlapped_used != 0 || non_overlapped_used != 0;
974 }
975 
976 // Attempts to put noise/diacritic outlines into the blobs that they overlap.
977 // Input: a set of noisy outlines that probably belong to the real_word.
978 // Output: word_wanted indicates which outlines are to be assigned to a blob,
979 //   target_blobs indicates which to assign to, and overlapped_any_blob is
980 //   true for all outlines that overlapped a blob.
AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE * > & outlines,int pass,WERD * real_word,PAGE_RES_IT * pr_it,std::vector<bool> * word_wanted,std::vector<bool> * overlapped_any_blob,std::vector<C_BLOB * > * target_blobs)981 void Tesseract::AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines,
982                                                    int pass, WERD *real_word, PAGE_RES_IT *pr_it,
983                                                    std::vector<bool> *word_wanted,
984                                                    std::vector<bool> *overlapped_any_blob,
985                                                    std::vector<C_BLOB *> *target_blobs) {
986   std::vector<bool> blob_wanted;
987   word_wanted->clear();
988   word_wanted->resize(outlines.size());
989   overlapped_any_blob->clear();
990   overlapped_any_blob->resize(outlines.size());
991   target_blobs->clear();
992   target_blobs->resize(outlines.size());
993   // For each real blob, find the outlines that seriously overlap it.
994   // A single blob could be several merged characters, so there can be quite
995   // a few outlines overlapping, and the full engine needs to be used to chop
996   // and join to get a sensible result.
997   C_BLOB_IT blob_it(real_word->cblob_list());
998   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
999     C_BLOB *blob = blob_it.data();
1000     const TBOX blob_box = blob->bounding_box();
1001     blob_wanted.clear();
1002     blob_wanted.resize(outlines.size());
1003     int num_blob_outlines = 0;
1004     for (unsigned i = 0; i < outlines.size(); ++i) {
1005       if (blob_box.major_x_overlap(outlines[i]->bounding_box()) && !(*word_wanted)[i]) {
1006         blob_wanted[i] = true;
1007         (*overlapped_any_blob)[i] = true;
1008         ++num_blob_outlines;
1009       }
1010     }
1011     if (debug_noise_removal) {
1012       tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
1013       blob_box.print();
1014     }
1015     // If any outlines overlap the blob, and not too many, classify the blob
1016     // (using the full engine, languages and all), and choose the maximal
1017     // combination of outlines that doesn't hurt the end-result classification
1018     // by too much. Mark them as wanted.
1019     if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
1020       if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob, outlines,
1021                                       num_blob_outlines, &blob_wanted)) {
1022         for (unsigned i = 0; i < blob_wanted.size(); ++i) {
1023           if (blob_wanted[i]) {
1024             // Claim the outline and record where it is going.
1025             (*word_wanted)[i] = true;
1026             (*target_blobs)[i] = blob;
1027           }
1028         }
1029       }
1030     }
1031   }
1032 }
1033 
// Attempts to assign non-overlapping outlines to their nearest blobs or
// make new blobs out of them.
// Input: outlines, where nullptr entries are skipped; each maximal run of
//   consecutive non-null entries is handled as one group.
// Output: word_wanted and target_blobs are reset to outlines.size() entries.
//   word_wanted[j] is set for every outline chosen by
//   SelectGoodDiacriticOutlines, and target_blobs[j] is the blob it goes to
//   (left neighbour, right neighbour, or nullptr when the group was accepted
//   without a target blob).
void Tesseract::AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
                                           WERD *real_word, PAGE_RES_IT *pr_it,
                                           std::vector<bool> *word_wanted,
                                           std::vector<C_BLOB *> *target_blobs) {
  std::vector<bool> blob_wanted;
  word_wanted->clear();
  word_wanted->resize(outlines.size());
  target_blobs->clear();
  target_blobs->resize(outlines.size());
  // Check for outlines that need to be turned into stand-alone blobs.
  for (unsigned i = 0; i < outlines.size(); ++i) {
    if (outlines[i] == nullptr) {
      continue;
    }
    // Get a set of adjacent outlines that don't overlap any existing blob.
    blob_wanted.clear();
    blob_wanted.resize(outlines.size());
    int num_blob_outlines = 0;
    TBOX total_ol_box(outlines[i]->bounding_box());
    // Consume the whole run of non-null outlines. On exit i is either past
    // the end or on a nullptr entry, so the ++i of the outer loop does not
    // skip a usable outline.
    while (i < outlines.size() && outlines[i] != nullptr) {
      blob_wanted[i] = true;
      total_ol_box += outlines[i]->bounding_box();
      ++i;
      ++num_blob_outlines;
    }
    // Find the insertion point.
    // Advance while the following blob starts at or to the left of the left
    // edge of the group.
    C_BLOB_IT blob_it(real_word->cblob_list());
    while (!blob_it.at_last() &&
           blob_it.data_relative(1)->bounding_box().left() <= total_ol_box.left()) {
      blob_it.forward();
    }
    // Choose which combination of them we actually want and where to put
    // them.
    if (debug_noise_removal) {
      tprintf("Num blobless outlines = %d\n", num_blob_outlines);
    }
    C_BLOB *left_blob = blob_it.data();
    TBOX left_box = left_blob->bounding_box();
    // No right neighbour exists when the iterator sits on the last blob.
    C_BLOB *right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
    // First choice: the left blob, tried when the group overlaps it in x,
    // or there is no right blob, or the group does not overlap the right
    // blob in x.
    if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr ||
         !right_blob->bounding_box().x_overlap(total_ol_box)) &&
        SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob, outlines,
                                    num_blob_outlines, &blob_wanted)) {
      if (debug_noise_removal) {
        tprintf("Added to left blob\n");
      }
      for (unsigned j = 0; j < blob_wanted.size(); ++j) {
        if (blob_wanted[j]) {
          (*word_wanted)[j] = true;
          (*target_blobs)[j] = left_blob;
        }
      }
    // Second choice: the right blob, tried when it exists and the group
    // either misses the left blob or overlaps the right blob in x.
    } else if (right_blob != nullptr &&
               (!left_box.x_overlap(total_ol_box) ||
                right_blob->bounding_box().x_overlap(total_ol_box)) &&
               SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, right_blob, outlines,
                                           num_blob_outlines, &blob_wanted)) {
      if (debug_noise_removal) {
        tprintf("Added to right blob\n");
      }
      for (unsigned j = 0; j < blob_wanted.size(); ++j) {
        if (blob_wanted[j]) {
          (*word_wanted)[j] = true;
          (*target_blobs)[j] = right_blob;
        }
      }
    // Last choice: classify the group with no base blob, using the
    // noise_cert_punc threshold; accepted outlines get a nullptr target.
    } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr, outlines,
                                           num_blob_outlines, &blob_wanted)) {
      if (debug_noise_removal) {
        tprintf("Fitted between blobs\n");
      }
      for (unsigned j = 0; j < blob_wanted.size(); ++j) {
        if (blob_wanted[j]) {
          (*word_wanted)[j] = true;
          (*target_blobs)[j] = nullptr;
        }
      }
    }
  }
}
1116 
1117 // Starting with ok_outlines set to indicate which outlines overlap the blob,
1118 // chooses the optimal set (approximately) and returns true if any outlines
1119 // are desired, in which case ok_outlines indicates which ones.
SelectGoodDiacriticOutlines(int pass,float certainty_threshold,PAGE_RES_IT * pr_it,C_BLOB * blob,const std::vector<C_OUTLINE * > & outlines,int num_outlines,std::vector<bool> * ok_outlines)1120 bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it,
1121                                             C_BLOB *blob,
1122                                             const std::vector<C_OUTLINE *> &outlines,
1123                                             int num_outlines, std::vector<bool> *ok_outlines) {
1124   std::string best_str;
1125   float target_cert = certainty_threshold;
1126   if (blob != nullptr) {
1127     float target_c2;
1128     target_cert = ClassifyBlobAsWord(pass, pr_it, blob, best_str, &target_c2);
1129     if (debug_noise_removal) {
1130       tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.c_str(), target_cert,
1131               target_c2);
1132       blob->bounding_box().print();
1133     }
1134     target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
1135   }
1136   std::vector<bool> test_outlines = *ok_outlines;
1137   // Start with all the outlines in.
1138   std::string all_str;
1139   std::vector<bool> best_outlines = *ok_outlines;
1140   float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, all_str);
1141   if (debug_noise_removal) {
1142     TBOX ol_box;
1143     for (unsigned i = 0; i < test_outlines.size(); ++i) {
1144       if (test_outlines[i]) {
1145         ol_box += outlines[i]->bounding_box();
1146       }
1147     }
1148     tprintf("All Noise blob classified as %s=%g, delta=%g at:", all_str.c_str(), best_cert,
1149             best_cert - target_cert);
1150     ol_box.print();
1151   }
1152   // Iteratively zero out the bit that improves the certainty the most, until
1153   // we get past the threshold, have zero bits, or fail to improve.
1154   int best_index = 0; // To zero out.
1155   while (num_outlines > 1 && best_index >= 0 &&
1156          (blob == nullptr || best_cert < target_cert || blob != nullptr)) {
1157     // Find the best bit to zero out.
1158     best_index = -1;
1159     for (unsigned i = 0; i < outlines.size(); ++i) {
1160       if (test_outlines[i]) {
1161         test_outlines[i] = false;
1162         std::string str;
1163         float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, str);
1164         if (debug_noise_removal) {
1165           TBOX ol_box;
1166           for (unsigned j = 0; j < outlines.size(); ++j) {
1167             if (test_outlines[j]) {
1168               ol_box += outlines[j]->bounding_box();
1169             }
1170             tprintf("%c", test_outlines[j] ? 'T' : 'F');
1171           }
1172           tprintf(" blob classified as %s=%g, delta=%g) at:", str.c_str(), cert,
1173                   cert - target_cert);
1174           ol_box.print();
1175         }
1176         if (cert > best_cert) {
1177           best_cert = cert;
1178           best_index = i;
1179           best_outlines = test_outlines;
1180         }
1181         test_outlines[i] = true;
1182       }
1183     }
1184     if (best_index >= 0) {
1185       test_outlines[best_index] = false;
1186       --num_outlines;
1187     }
1188   }
1189   if (best_cert >= target_cert) {
1190     // Save the best combination.
1191     *ok_outlines = best_outlines;
1192     if (debug_noise_removal) {
1193       tprintf("%s noise combination ", blob ? "Adding" : "New");
1194       for (auto best_outline : best_outlines) {
1195         tprintf("%c", best_outline ? 'T' : 'F');
1196       }
1197       tprintf(" yields certainty %g, beating target of %g\n", best_cert, target_cert);
1198     }
1199     return true;
1200   }
1201 
1202   return false;
1203 }
1204 
// Classifies the given blob plus the outlines flagged by ok_outlines, undoes
// the inclusion of the outlines, and returns the certainty of the raw choice.
//
// ok_outlines: flags, parallel to outlines, selecting the outlines to add.
// outlines:    candidate outlines; the flagged ones are linked into the blob
//              for the duration of the classification and unlinked again.
// pass_n, pr_it: forwarded to ClassifyBlobAsWord.
// blob:        blob to extend, or nullptr to build a temporary blob from the
//              flagged outlines.
// best_str:    receives the string produced by ClassifyBlobAsWord.
// Returns the certainty from ClassifyBlobAsWord when blob was supplied, and
// the negated c2 value from ClassifyBlobAsWord when a temporary blob was
// built.
// NOTE(review): if blob is nullptr and no entry of ok_outlines is set, blob
// stays nullptr when passed to ClassifyBlobAsWord - confirm callers never do
// this.
float Tesseract::ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines,
                                          const std::vector<C_OUTLINE *> &outlines, int pass_n,
                                          PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str) {
  C_OUTLINE_IT ol_it;
  // Remembers the outline the iterator was on before anything was added;
  // stays nullptr when the blob is created locally.
  C_OUTLINE *first_to_keep = nullptr;
  C_BLOB *local_blob = nullptr;
  if (blob != nullptr) {
    // Add the required outlines to the blob.
    ol_it.set_to_list(blob->out_list());
    first_to_keep = ol_it.data();
  }
  for (unsigned i = 0; i < ok_outlines.size(); ++i) {
    if (ok_outlines[i]) {
      // This outline is to be added.
      if (blob == nullptr) {
        // The first flagged outline seeds a temporary blob.
        local_blob = new C_BLOB(outlines[i]);
        blob = local_blob;
        ol_it.set_to_list(blob->out_list());
      } else {
        ol_it.add_before_stay_put(outlines[i]);
      }
    }
  }
  float c2;
  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
  ol_it.move_to_first();
  if (first_to_keep == nullptr) {
    // We created blob. Empty its outlines and delete it.
    // The outlines are extracted first so that deleting the temporary blob
    // does not take the caller's outlines with it.
    for (; !ol_it.empty(); ol_it.forward()) {
      ol_it.extract();
    }
    delete local_blob;
    cert = -c2;
  } else {
    // Remove the outlines that we put in.
    // Extraction runs from the front of the list up to, but not including,
    // first_to_keep.
    for (; ol_it.data() != first_to_keep; ol_it.forward()) {
      ol_it.extract();
    }
  }
  return cert;
}
1248 
1249 // Classifies the given blob (part of word_data->word->word) as an individual
1250 // word, using languages, chopper etc, returning only the certainty of the
1251 // best raw choice, and undoing all the work done to fake out the word.
ClassifyBlobAsWord(int pass_n,PAGE_RES_IT * pr_it,C_BLOB * blob,std::string & best_str,float * c2)1252 float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str,
1253                                     float *c2) {
1254   WERD *real_word = pr_it->word()->word;
1255   WERD *word = real_word->ConstructFromSingleBlob(real_word->flag(W_BOL), real_word->flag(W_EOL),
1256                                                   C_BLOB::deep_copy(blob));
1257   WERD_RES *word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
1258   // Get a new iterator that points to the new word.
1259   PAGE_RES_IT it(pr_it->page_res);
1260   while (it.word() != word_res && it.word() != nullptr) {
1261     it.forward();
1262   }
1263   ASSERT_HOST(it.word() == word_res);
1264   WordData wd(it);
1265   // Force full initialization.
1266   SetupWordPassN(1, &wd);
1267   classify_word_and_language(pass_n, &it, &wd);
1268   if (debug_noise_removal) {
1269     if (wd.word->raw_choice != nullptr) {
1270       tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height, wd.row->x_height(),
1271               wd.word->raw_choice->min_x_height(), wd.word->raw_choice->max_x_height());
1272     } else {
1273       tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height,
1274               wd.row->x_height());
1275     }
1276   }
1277   float cert = 0.0f;
1278   if (wd.word->raw_choice != nullptr) { // This probably shouldn't happen, but...
1279     cert = wd.word->raw_choice->certainty();
1280     float rat = wd.word->raw_choice->rating();
1281     *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1282     best_str = wd.word->raw_choice->unichar_string();
1283   } else {
1284     *c2 = 0.0f;
1285     best_str.clear();
1286   }
1287   it.DeleteCurrentWord();
1288   pr_it->ResetWordIterator();
1289   return cert;
1290 }
1291 
1292 #endif // ndef DISABLED_LEGACY_ENGINE
1293 
1294 // Generic function for classifying a word. Can be used either for pass1 or
1295 // pass2 according to the function passed to recognizer.
1296 // word_data holds the word to be recognized, and its block and row, and
1297 // pr_it points to the word as well, in case we are running LSTM and it wants
1298 // to output multiple words.
1299 // Recognizes in the current language, and if successful that is all.
1300 // If recognition was not successful, tries all available languages until
1301 // it gets a successful result or runs out of languages. Keeps the best result.
classify_word_and_language(int pass_n,PAGE_RES_IT * pr_it,WordData * word_data)1302 void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data) {
1303 #ifdef DISABLED_LEGACY_ENGINE
1304   WordRecognizer recognizer = &Tesseract::classify_word_pass1;
1305 #else
1306   WordRecognizer recognizer =
1307       pass_n == 1 ? &Tesseract::classify_word_pass1 : &Tesseract::classify_word_pass2;
1308 #endif // def DISABLED_LEGACY_ENGINE
1309 
1310   // Best result so far.
1311   PointerVector<WERD_RES> best_words;
1312   // Points to the best result. May be word or in lang_words.
1313   const WERD_RES *word = word_data->word;
1314   clock_t start_t = clock();
1315   const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
1316   if (debug) {
1317     tprintf("%s word with lang %s at:", word->done ? "Already done" : "Processing",
1318             most_recently_used_->lang.c_str());
1319     word->word->bounding_box().print();
1320   }
1321   if (word->done) {
1322     // If done on pass1, leave it as-is.
1323     if (!word->tess_failed) {
1324       most_recently_used_ = word->tesseract;
1325     }
1326     return;
1327   }
1328   auto sub = sub_langs_.size();
1329   if (most_recently_used_ != this) {
1330     // Get the index of the most_recently_used_.
1331     for (sub = 0; sub < sub_langs_.size() && most_recently_used_ != sub_langs_[sub]; ++sub) {
1332     }
1333   }
1334   most_recently_used_->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[sub],
1335                                          &best_words);
1336   Tesseract *best_lang_tess = most_recently_used_;
1337   if (!WordsAcceptable(best_words)) {
1338     // Try all the other languages to see if they are any better.
1339     if (most_recently_used_ != this &&
1340         this->RetryWithLanguage(*word_data, recognizer, debug,
1341                                 &word_data->lang_words[sub_langs_.size()], &best_words) > 0) {
1342       best_lang_tess = this;
1343     }
1344     for (unsigned i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size(); ++i) {
1345       if (most_recently_used_ != sub_langs_[i] &&
1346           sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[i],
1347                                            &best_words) > 0) {
1348         best_lang_tess = sub_langs_[i];
1349       }
1350     }
1351   }
1352   most_recently_used_ = best_lang_tess;
1353   if (!best_words.empty()) {
1354     if (best_words.size() == 1 && !best_words[0]->combination) {
1355       // Move the best single result to the main word.
1356       word_data->word->ConsumeWordResults(best_words[0]);
1357     } else {
1358       // Words came from LSTM, and must be moved to the PAGE_RES properly.
1359       word_data->word = best_words.back();
1360       pr_it->ReplaceCurrentWord(&best_words);
1361     }
1362     ASSERT_HOST(word_data->word->box_word != nullptr);
1363   } else {
1364     tprintf("no best words!!\n");
1365   }
1366   clock_t ocr_t = clock();
1367   if (tessedit_timing_debug) {
1368     tprintf("%s (ocr took %.2f sec)\n", word_data->word->best_choice->unichar_string().c_str(),
1369             static_cast<double>(ocr_t - start_t) / CLOCKS_PER_SEC);
1370   }
1371 }
1372 
/**
 * classify_word_pass1
 *
 * Baseline normalize the word and pass it to Tess.
 *
 * When the engine mode selects LSTM (OEM_LSTM_ONLY, or additionally
 * OEM_TESSERACT_LSTM_COMBINED in builds with the legacy engine), the word is
 * first given to LSTMRecognizeWord, and a non-empty out_words ends the call.
 * In OEM_LSTM_ONLY mode a word with no LSTM output gets a fake result via
 * SetupFake. Otherwise, in builds with the legacy engine, the word is set up
 * again for OEM_TESSERACT_ONLY and run through match_word_pass_n(1, ...),
 * after which it may be passed to LearnWord and tess_add_doc_word.
 *
 * @param word_data supplies the row, the block and the previous word.
 * @param in_word   the word to recognize; the legacy path works on *in_word
 *                  directly.
 * @param out_words receives the words produced by LSTMRecognizeWord.
 */

void Tesseract::classify_word_pass1(const WordData &word_data, WERD_RES **in_word,
                                    PointerVector<WERD_RES> *out_words) {
  ROW *row = word_data.row;
  BLOCK *block = word_data.block;
  // Record the previous word's best choice (or nullptr if there is no
  // previous word) in prev_word_best_choice_.
  prev_word_best_choice_ =
      word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr;
#ifdef DISABLED_LEGACY_ENGINE
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
#else
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
      tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
#endif // def DISABLED_LEGACY_ENGINE
    // Odd-sized words skip LSTM unless LSTM is the only engine allowed.
    if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
      LSTMRecognizeWord(*block, row, *in_word, out_words);
      if (!out_words->empty()) {
        return; // Successful lstm recognition.
      }
    }
    if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
      // No fallback allowed, so use a fake.
      (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
      return;
    }

#ifndef DISABLED_LEGACY_ENGINE
    // Fall back to tesseract for failed words or odd words.
    (*in_word)->SetupForRecognition(unicharset, this, BestPix(), OEM_TESSERACT_ONLY, nullptr,
                                    classify_bln_numeric_mode, textord_use_cjk_fp_model,
                                    poly_allow_detailed_fx, row, block);
#endif // ndef DISABLED_LEGACY_ENGINE
  }

#ifndef DISABLED_LEGACY_ENGINE
  WERD_RES *word = *in_word;
  match_word_pass_n(1, word, row, block);
  // Adaption and document-dictionary updates are skipped for failed words
  // and for repeated-character words.
  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
    word->tess_would_adapt = AdaptableWord(word);
    bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);

    if (adapt_ok) {
      // Send word to adaptive classifier for training.
      word->BestChoiceToCorrectText();
      LearnWord(nullptr, word);
      // Mark misadaptions if running blamer.
      if (word->blamer_bundle != nullptr) {
        word->blamer_bundle->SetMisAdaptionDebug(word->best_choice, wordrec_debug_blamer);
      }
    }

    if (tessedit_enable_doc_dict && !word->IsAmbiguous()) {
      tess_add_doc_word(word->best_choice);
    }
  }
#endif // ndef DISABLED_LEGACY_ENGINE
}
1434 
1435 // Helper to report the result of the xheight fix.
1436 void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word,
1437                                    WERD_RES *new_word) {
1438   tprintf("New XHT Match:%s = %s ", word->best_choice->unichar_string().c_str(),
1439           word->best_choice->debug_string().c_str());
1440   word->reject_map.print(debug_fp);
1441   tprintf(" -> %s = %s ", new_word->best_choice->unichar_string().c_str(),
1442           new_word->best_choice->debug_string().c_str());
1443   new_word->reject_map.print(debug_fp);
1444   tprintf(" %s->%s %s %s\n", word->guessed_x_ht ? "GUESS" : "CERT",
1445           new_word->guessed_x_ht ? "GUESS" : "CERT", new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
1446           accept_new_word ? "ACCEPTED" : "");
1447 }
1448 
1449 #ifndef DISABLED_LEGACY_ENGINE
1450 
1451 // Run the x-height fix-up, based on min/max top/bottom information in
1452 // unicharset.
1453 // Returns true if the word was changed.
1454 // See the comment in fixxht.cpp for a description of the overall process.
1455 bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row) {
1456   int original_misfits = CountMisfitTops(word);
1457   if (original_misfits == 0) {
1458     return false;
1459   }
1460   float baseline_shift = 0.0f;
1461   float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
1462   if (baseline_shift != 0.0f) {
1463     // Try the shift on its own first.
1464     if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height, word, block, row)) {
1465       return false;
1466     }
1467     original_misfits = CountMisfitTops(word);
1468     if (original_misfits > 0) {
1469       float new_baseline_shift;
1470       // Now recompute the new x_height.
1471       new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
1472       if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1473         // No test of return value here, as we are definitely making a change
1474         // to the word by shifting the baseline.
1475         TestNewNormalization(original_misfits, baseline_shift, new_x_ht, word, block, row);
1476       }
1477     }
1478     return true;
1479   } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1480     return TestNewNormalization(original_misfits, 0.0f, new_x_ht, word, block, row);
1481   } else {
1482     return false;
1483   }
1484 }
1485 
1486 // Runs recognition with the test baseline shift and x-height and returns true
1487 // if there was an improvement in recognition result.
1488 bool Tesseract::TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht,
1489                                      WERD_RES *word, BLOCK *block, ROW *row) {
1490   bool accept_new_x_ht = false;
1491   WERD_RES new_x_ht_word(word->word);
1492   if (word->blamer_bundle != nullptr) {
1493     new_x_ht_word.blamer_bundle = new BlamerBundle();
1494     new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
1495   }
1496   new_x_ht_word.x_height = new_x_ht;
1497   new_x_ht_word.baseline_shift = baseline_shift;
1498   new_x_ht_word.caps_height = 0.0;
1499   new_x_ht_word.SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
1500                                     classify_bln_numeric_mode, textord_use_cjk_fp_model,
1501                                     poly_allow_detailed_fx, row, block);
1502   match_word_pass_n(2, &new_x_ht_word, row, block);
1503   if (!new_x_ht_word.tess_failed) {
1504     int new_misfits = CountMisfitTops(&new_x_ht_word);
1505     if (debug_x_ht_level >= 1) {
1506       tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n", original_misfits,
1507               word->x_height, new_misfits, new_x_ht);
1508       tprintf("Old rating= %f, certainty=%f, new=%f, %f\n", word->best_choice->rating(),
1509               word->best_choice->certainty(), new_x_ht_word.best_choice->rating(),
1510               new_x_ht_word.best_choice->certainty());
1511     }
1512     // The misfits must improve and either the rating or certainty.
1513     accept_new_x_ht = new_misfits < original_misfits &&
1514                       (new_x_ht_word.best_choice->certainty() > word->best_choice->certainty() ||
1515                        new_x_ht_word.best_choice->rating() < word->best_choice->rating());
1516     if (debug_x_ht_level >= 1) {
1517       ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1518     }
1519   }
1520   if (accept_new_x_ht) {
1521     word->ConsumeWordResults(&new_x_ht_word);
1522     return true;
1523   }
1524   return false;
1525 }
1526 
1527 #endif // ndef DISABLED_LEGACY_ENGINE
1528 
1529 /**
1530  * classify_word_pass2
1531  *
1532  * Control what to do with the word in pass 2
1533  */
1534 
1535 void Tesseract::classify_word_pass2(const WordData &word_data, WERD_RES **in_word,
1536                                     PointerVector<WERD_RES> *out_words) {
1537   // Return if we do not want to run Tesseract.
1538   if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1539     return;
1540   }
1541 #ifndef DISABLED_LEGACY_ENGINE
1542   ROW *row = word_data.row;
1543   BLOCK *block = word_data.block;
1544   WERD_RES *word = *in_word;
1545   prev_word_best_choice_ =
1546       word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr;
1547 
1548   check_debug_pt(word, 30);
1549   if (!word->done) {
1550     word->caps_height = 0.0;
1551     if (word->x_height == 0.0f) {
1552       word->x_height = row->x_height();
1553     }
1554     match_word_pass_n(2, word, row, block);
1555     check_debug_pt(word, 40);
1556   }
1557 
1558   SubAndSuperscriptFix(word);
1559 
1560   if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1561     if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() &&
1562         block->classify_rotation().y() == 0.0f) {
1563       // Use the tops and bottoms since they are available.
1564       TrainedXheightFix(word, block, row);
1565     }
1566   }
1567 #  ifndef GRAPHICS_DISABLED
1568   if (tessedit_display_outwords) {
1569     if (fx_win == nullptr) {
1570       create_fx_win();
1571     }
1572     clear_fx_win();
1573     word->rebuild_word->plot(fx_win);
1574     TBOX wbox = word->rebuild_word->bounding_box();
1575     fx_win->ZoomToRectangle(wbox.left(), wbox.top(), wbox.right(), wbox.bottom());
1576     ScrollView::Update();
1577   }
1578 #  endif
1579   check_debug_pt(word, 50);
1580 #endif // ndef DISABLED_LEGACY_ENGINE
1581 }
1582 
1583 #ifndef DISABLED_LEGACY_ENGINE
1584 /**
1585  * match_word_pass2
1586  *
1587  * Baseline normalize the word and pass it to Tess.
1588  */
1589 void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block) {
1590   if (word->tess_failed) {
1591     return;
1592   }
1593   tess_segment_pass_n(pass_n, word);
1594 
1595   if (!word->tess_failed) {
1596     if (!word->word->flag(W_REP_CHAR)) {
1597       word->fix_quotes();
1598       if (tessedit_fix_hyphens) {
1599         word->fix_hyphens();
1600       }
1601       /* Don't trust fix_quotes! - though I think I've fixed the bug */
1602       if (static_cast<unsigned>(word->best_choice->length()) != word->box_word->length()) {
1603         tprintf(
1604             "POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1605             " #Blobs=%u\n",
1606             word->best_choice->debug_string().c_str(), word->best_choice->length(),
1607             word->box_word->length());
1608       }
1609       word->tess_accepted = tess_acceptable_word(word);
1610 
1611       // Also sets word->done flag
1612       make_reject_map(word, row, pass_n);
1613     }
1614   }
1615   set_word_fonts(word);
1616 
1617   ASSERT_HOST(word->raw_choice != nullptr);
1618 }
1619 #endif // ndef DISABLED_LEGACY_ENGINE
1620 
1621 // Helper to return the best rated BLOB_CHOICE in the whole word that matches
1622 // the given char_id, or nullptr if none can be found.
1623 static BLOB_CHOICE *FindBestMatchingChoice(UNICHAR_ID char_id, WERD_RES *word_res) {
1624   // Find the corresponding best BLOB_CHOICE from any position in the word_res.
1625   BLOB_CHOICE *best_choice = nullptr;
1626   for (unsigned i = 0; i < word_res->best_choice->length(); ++i) {
1627     BLOB_CHOICE *choice = FindMatchingChoice(char_id, word_res->GetBlobChoices(i));
1628     if (choice != nullptr) {
1629       if (best_choice == nullptr || choice->rating() < best_choice->rating()) {
1630         best_choice = choice;
1631       }
1632     }
1633   }
1634   return best_choice;
1635 }
1636 
1637 // Helper to insert blob_choice in each location in the leader word if there is
1638 // no matching BLOB_CHOICE there already, and correct any incorrect results
1639 // in the best_choice.
1640 static void CorrectRepcharChoices(BLOB_CHOICE *blob_choice, WERD_RES *word_res) {
1641   WERD_CHOICE *word = word_res->best_choice;
1642   for (unsigned i = 0; i < word_res->best_choice->length(); ++i) {
1643     BLOB_CHOICE *choice =
1644         FindMatchingChoice(blob_choice->unichar_id(), word_res->GetBlobChoices(i));
1645     if (choice == nullptr) {
1646       BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i));
1647       choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
1648     }
1649   }
1650   // Correct any incorrect results in word.
1651   for (unsigned i = 0; i < word->length(); ++i) {
1652     if (word->unichar_id(i) != blob_choice->unichar_id()) {
1653       word->set_unichar_id(blob_choice->unichar_id(), i);
1654     }
1655   }
1656 }
1657 
1658 /**
1659  * fix_rep_char()
1660  * The word is a repeated char. (Leader.) Find the repeated char character.
1661  * Create the appropriate single-word or multi-word sequence according to
1662  * the size of spaces in between blobs, and correct the classifications
1663  * where some of the characters disagree with the majority.
1664  */
1665 void Tesseract::fix_rep_char(PAGE_RES_IT *page_res_it) {
1666   WERD_RES *word_res = page_res_it->word();
1667   const WERD_CHOICE &word = *(word_res->best_choice);
1668 
1669   // Find the frequency of each unique character in the word.
1670   SortHelper<UNICHAR_ID> rep_ch(word.length());
1671   for (unsigned i = 0; i < word.length(); ++i) {
1672     rep_ch.Add(word.unichar_id(i), 1);
1673   }
1674 
1675   // Find the most frequent result.
1676   UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1677   int max_count = rep_ch.MaxCount(&maxch_id);
1678   // Find the best exemplar of a classifier result for maxch_id.
1679   BLOB_CHOICE *best_choice = FindBestMatchingChoice(maxch_id, word_res);
1680   if (best_choice == nullptr) {
1681     tprintf("Failed to find a choice for %s, occurring %d times\n",
1682             word_res->uch_set->debug_str(maxch_id).c_str(), max_count);
1683     return;
1684   }
1685   word_res->done = true;
1686 
1687   // Measure the mean space.
1688   int gap_count = 0;
1689   WERD *werd = word_res->word;
1690   C_BLOB_IT blob_it(werd->cblob_list());
1691   C_BLOB *prev_blob = blob_it.data();
1692   for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1693     C_BLOB *blob = blob_it.data();
1694     int gap = blob->bounding_box().left();
1695     gap -= prev_blob->bounding_box().right();
1696     ++gap_count;
1697     prev_blob = blob;
1698   }
1699   // Just correct existing classification.
1700   CorrectRepcharChoices(best_choice, word_res);
1701   word_res->reject_map.initialise(word.length());
1702 }
1703 
// Classifies the string s as an acceptable word form, or AC_UNACCEPTABLE.
//
// s is walked one character at a time; lengths[i] is used as the number of
// bytes that the i-th character occupies in s (offset advances by lengths[i]).
// lengths is measured with strlen, so it is expected to be NUL-terminated.
//
// Accepted shapes, in the order they are tested:
//   [leading punct] UPPER UPPER...            -> AC_UPPER_CASE
//   [leading punct] [Upper] lower...[-lower...] or ...'s
//                                             -> AC_INITIAL_CAP / AC_LOWER_CASE
//   each optionally followed by up to two trailing punctuation characters.
// If that fails, the string is re-examined as an abbreviation made of
// letter + '.' pairs                          -> AC_UC_ABBREV / AC_LC_ABBREV
// Any unconsumed remainder makes the result AC_UNACCEPTABLE.
ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const UNICHARSET &char_set, const char *s,
                                                       const char *lengths) {
  int i = 0;      // index of the current character (into lengths)
  int offset = 0; // byte offset of the current character (into s)
  int leading_punct_count;
  int upper_count = 0;
  int hyphen_pos = -1;
  ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;

  // Words of more than 20 characters are never accepted.
  if (strlen(lengths) > 20) {
    return word_type;
  }

  /* Single Leading punctuation char*/

  if (s[offset] != '\0' && chs_leading_punct.contains(s[offset])) {
    offset += lengths[i++];
  }
  leading_punct_count = i;

  /* Initial cap */
  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
    offset += lengths[i++];
    upper_count++;
  }
  if (upper_count > 1) {
    word_type = AC_UPPER_CASE;
  } else {
    /* Lower case word, possibly with an initial cap */
    while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
      offset += lengths[i++];
    }
    // Too few leading alphas: word_type is still AC_UNACCEPTABLE here, so the
    // jump goes straight to the abbreviation check.
    if (i - leading_punct_count < quality_min_initial_alphas_reqd) {
      goto not_a_word;
    }
    /*
     * Allow a single hyphen in a lower case word
     * - don't trust upper case - I've seen several cases of "H" -> "I-I"
     */
    if (lengths[i] == 1 && s[offset] == '-') {
      hyphen_pos = i;
      offset += lengths[i++];
      if (s[offset] != '\0') {
        while ((s[offset] != '\0') && char_set.get_islower(s + offset, lengths[i])) {
          offset += lengths[i++];
        }
        // If anything follows the hyphen, it must start with at least two
        // lower-case characters.
        if (i < hyphen_pos + 3) {
          goto not_a_word;
        }
      }
    } else {
      /* Allow "'s" in NON hyphenated lower case words */
      if (lengths[i] == 1 && (s[offset] == '\'') && lengths[i + 1] == 1 &&
          (s[offset + lengths[i]] == 's')) {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    }
    if (upper_count > 0) {
      word_type = AC_INITIAL_CAP;
    } else {
      word_type = AC_LOWER_CASE;
    }
  }

  /* Up to two different, constrained trailing punctuation chars */
  if (lengths[i] == 1 && s[offset] != '\0' && chs_trailing_punct1.contains(s[offset])) {
    offset += lengths[i++];
  }
  // The second trailing character must differ from the character before it.
  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 && s[offset - lengths[i - 1]] != s[offset] &&
      chs_trailing_punct2.contains(s[offset])) {
    offset += lengths[i++];
  }

  // Anything left over means the string did not match a word shape.
  if (s[offset] != '\0') {
    word_type = AC_UNACCEPTABLE;
  }

not_a_word:

  if (word_type == AC_UNACCEPTABLE) {
    /* Look for abbreviation string */
    // Restart from the beginning and consume (letter, '.') pairs whose
    // letters all share the case of the first character.
    i = 0;
    offset = 0;
    if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
      word_type = AC_UC_ABBREV;
      while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i]) &&
             lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    } else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
      word_type = AC_LC_ABBREV;
      while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i]) &&
             lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    }
    // The whole string must have been consumed by the pairs.
    if (s[offset] != '\0') {
      word_type = AC_UNACCEPTABLE;
    }
  }

  return word_type;
}
1810 
1811 bool Tesseract::check_debug_pt(WERD_RES *word, int location) {
1812   bool show_map_detail = false;
1813   int16_t i;
1814 
1815   if (!test_pt) {
1816     return false;
1817   }
1818 
1819   tessedit_rejection_debug.set_value(false);
1820   debug_x_ht_level.set_value(0);
1821 
1822   if (word->word->bounding_box().contains(FCOORD(test_pt_x, test_pt_y))) {
1823     if (location < 0) {
1824       return true; // For breakpoint use
1825     }
1826     tessedit_rejection_debug.set_value(true);
1827     debug_x_ht_level.set_value(2);
1828     tprintf("\n\nTESTWD::");
1829     switch (location) {
1830       case 0:
1831         tprintf("classify_word_pass1 start\n");
1832         word->word->print();
1833         break;
1834       case 10:
1835         tprintf("make_reject_map: initial map");
1836         break;
1837       case 20:
1838         tprintf("make_reject_map: after NN");
1839         break;
1840       case 30:
1841         tprintf("classify_word_pass2 - START");
1842         break;
1843       case 40:
1844         tprintf("classify_word_pass2 - Pre Xht");
1845         break;
1846       case 50:
1847         tprintf("classify_word_pass2 - END");
1848         show_map_detail = true;
1849         break;
1850       case 60:
1851         tprintf("fixspace");
1852         break;
1853       case 70:
1854         tprintf("MM pass START");
1855         break;
1856       case 80:
1857         tprintf("MM pass END");
1858         break;
1859       case 90:
1860         tprintf("After Poor quality rejection");
1861         break;
1862       case 100:
1863         tprintf("unrej_good_quality_words - START");
1864         break;
1865       case 110:
1866         tprintf("unrej_good_quality_words - END");
1867         break;
1868       case 120:
1869         tprintf("Write results pass");
1870         show_map_detail = true;
1871         break;
1872     }
1873     if (word->best_choice != nullptr) {
1874       tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
1875       word->reject_map.print(debug_fp);
1876       tprintf("\n");
1877       if (show_map_detail) {
1878         tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str());
1879         for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1880           tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1881           word->reject_map[i].full_print(debug_fp);
1882         }
1883       }
1884     } else {
1885       tprintf("null best choice\n");
1886     }
1887     tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1888     tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1889     return true;
1890   } else {
1891     return false;
1892   }
1893 }
1894 
1895 /**
1896  * find_modal_font
1897  *
1898  * Find the modal font and remove from the stats.
1899  */
1900 #ifndef DISABLED_LEGACY_ENGINE
1901 static void find_modal_font( // good chars in word
1902     STATS *fonts,            // font stats
1903     int16_t *font_out,       // output font
1904     int8_t *font_count       // output count
1905 ) {
1906   int16_t font;  // font index
1907   int32_t count; // pile count
1908 
1909   if (fonts->get_total() > 0) {
1910     font = static_cast<int16_t>(fonts->mode());
1911     *font_out = font;
1912     count = fonts->pile_count(font);
1913     *font_count = count < INT8_MAX ? count : INT8_MAX;
1914     fonts->add(font, -*font_count);
1915   } else {
1916     *font_out = -1;
1917     *font_count = 0;
1918   }
1919 }
1920 #endif // ! DISABLED_LEGACY_ENGINE
1921 
1922 /**
1923  * set_word_fonts
1924  *
1925  * Get the fonts for the word.
1926  */
1927 void Tesseract::set_word_fonts(WERD_RES *word) {
1928   // Don't try to set the word fonts for an lstm word, as the configs
1929   // will be meaningless.
1930   if (word->chopped_word == nullptr) {
1931     return;
1932   }
1933   ASSERT_HOST(word->best_choice != nullptr);
1934 
1935 #ifndef DISABLED_LEGACY_ENGINE
1936   const int fontinfo_size = fontinfo_table_.size();
1937   if (fontinfo_size == 0) {
1938     return;
1939   }
1940   if (tessedit_font_id > 0) {
1941     if (tessedit_font_id >= fontinfo_size) {
1942       tprintf("Error, invalid font ID provided: must be below %d.\n"
1943               "Falling back to font auto-detection.\n", fontinfo_size);
1944     } else {
1945       word->fontinfo = &fontinfo_table_.at(tessedit_font_id);
1946       word->fontinfo2 = nullptr;
1947       word->fontinfo_id_count = INT8_MAX;
1948       word->fontinfo_id2_count = 0;
1949       return;
1950     }
1951   }
1952   std::vector<int> font_total_score(fontinfo_size);
1953 
1954   // Compute the font scores for the word
1955   if (tessedit_debug_fonts) {
1956     tprintf("Examining fonts in %s\n", word->best_choice->debug_string().c_str());
1957   }
1958   for (unsigned b = 0; b < word->best_choice->length(); ++b) {
1959     const BLOB_CHOICE *choice = word->GetBlobChoice(b);
1960     if (choice == nullptr) {
1961       continue;
1962     }
1963     auto &fonts = choice->fonts();
1964     for (auto &f : fonts) {
1965       const int fontinfo_id = f.fontinfo_id;
1966       if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
1967         font_total_score[fontinfo_id] += f.score;
1968       }
1969     }
1970   }
1971   // Find the top and 2nd choice for the word.
1972   int score1 = 0, score2 = 0;
1973   int16_t font_id1 = -1, font_id2 = -1;
1974   for (int f = 0; f < fontinfo_size; ++f) {
1975     if (tessedit_debug_fonts && font_total_score[f] > 0) {
1976       tprintf("Font %s, total score = %d\n", fontinfo_table_.at(f).name, font_total_score[f]);
1977     }
1978     if (font_total_score[f] > score1) {
1979       score2 = score1;
1980       font_id2 = font_id1;
1981       score1 = font_total_score[f];
1982       font_id1 = f;
1983     } else if (font_total_score[f] > score2) {
1984       score2 = font_total_score[f];
1985       font_id2 = f;
1986     }
1987   }
1988   word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.at(font_id1) : nullptr;
1989   word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.at(font_id2) : nullptr;
1990   // Each score has a limit of UINT16_MAX, so divide by that to get the number
1991   // of "votes" for that font, ie number of perfect scores.
1992   word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX);
1993   word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX);
1994   if (score1 > 0) {
1995     const FontInfo fi = fontinfo_table_.at(font_id1);
1996     if (tessedit_debug_fonts) {
1997       if (word->fontinfo_id2_count > 0 && font_id2 >= 0) {
1998         tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n", fi.name,
1999                 word->fontinfo_id_count, fontinfo_table_.at(font_id2).name,
2000                 word->fontinfo_id2_count);
2001       } else {
2002         tprintf("Word modal font=%s, score=%d. No 2nd choice\n", fi.name, word->fontinfo_id_count);
2003       }
2004     }
2005   }
2006 #endif // ndef DISABLED_LEGACY_ENGINE
2007 }
2008 
2009 #ifndef DISABLED_LEGACY_ENGINE
2010 /**
2011  * font_recognition_pass
2012  *
2013  * Smooth the fonts for the document.
2014  */
2015 void Tesseract::font_recognition_pass(PAGE_RES *page_res) {
2016   PAGE_RES_IT page_res_it(page_res);
2017   WERD_RES *word;                       // current word
2018   STATS doc_fonts(0, font_table_size_); // font counters
2019 
2020   // Gather font id statistics.
2021   for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2022     word = page_res_it.word();
2023     if (word->fontinfo != nullptr) {
2024       doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
2025     }
2026     if (word->fontinfo2 != nullptr) {
2027       doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
2028     }
2029   }
2030   int16_t doc_font;      // modal font
2031   int8_t doc_font_count; // modal font
2032   find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
2033   if (doc_font_count == 0) {
2034     return;
2035   }
2036   // Get the modal font pointer.
2037   const FontInfo *modal_font = nullptr;
2038   for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2039     word = page_res_it.word();
2040     if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) {
2041       modal_font = word->fontinfo;
2042       break;
2043     }
2044     if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) {
2045       modal_font = word->fontinfo2;
2046       break;
2047     }
2048   }
2049   ASSERT_HOST(modal_font != nullptr);
2050 
2051   // Assign modal font to weak words.
2052   for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2053     word = page_res_it.word();
2054     const int length = word->best_choice->length();
2055 
2056     const int count = word->fontinfo_id_count;
2057     if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
2058       word->fontinfo = modal_font;
2059       // Counts only get 1 as it came from the doc.
2060       word->fontinfo_id_count = 1;
2061     }
2062   }
2063 }
2064 #endif // ndef DISABLED_LEGACY_ENGINE
2065 
2066 // If a word has multiple alternates check if the best choice is in the
2067 // dictionary. If not, replace it with an alternate that exists in the
2068 // dictionary.
2069 void Tesseract::dictionary_correction_pass(PAGE_RES *page_res) {
2070   PAGE_RES_IT word_it(page_res);
2071   for (WERD_RES *word = word_it.word(); word != nullptr; word = word_it.forward()) {
2072     if (word->best_choices.singleton()) {
2073       continue; // There are no alternates.
2074     }
2075 
2076     const WERD_CHOICE *best = word->best_choice;
2077     if (word->tesseract->getDict().valid_word(*best) != 0) {
2078       continue; // The best choice is in the dictionary.
2079     }
2080 
2081     WERD_CHOICE_IT choice_it(&word->best_choices);
2082     for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
2083       WERD_CHOICE *alternate = choice_it.data();
2084       if (word->tesseract->getDict().valid_word(*alternate)) {
2085         // The alternate choice is in the dictionary.
2086         if (tessedit_bigram_debug) {
2087           tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
2088                   best->unichar_string().c_str(), alternate->unichar_string().c_str());
2089         }
2090         // Replace the 'best' choice with a better choice.
2091         word->ReplaceBestChoice(alternate);
2092         break;
2093       }
2094     }
2095   }
2096 }
2097 
2098 } // namespace tesseract
2099