1 ///////////////////////////////////////////////////////////////////////
2 // File:        blamer.h
3 // Description: Module allowing precise error causes to be allocated.
4 // Author:      Rike Antonova
5 // Refactored:  Ray Smith
6 //
7 // (C) Copyright 2013, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ///////////////////////////////////////////////////////////////////////
19 
20 #ifndef TESSERACT_CCSTRUCT_BLAMER_H_
21 #define TESSERACT_CCSTRUCT_BLAMER_H_
22 
23 #ifdef HAVE_CONFIG_H
24 #  include "config_auto.h" // DISABLED_LEGACY_ENGINE
25 #endif
26 #include "boxword.h" // for BoxWord
27 #ifndef DISABLED_LEGACY_ENGINE
28 #  include "params_training_featdef.h" // for ParamsTrainingBundle, ParamsTra...
29 #endif                                 //  ndef DISABLED_LEGACY_ENGINE
30 #include "ratngs.h"                    // for BLOB_CHOICE_LIST (ptr only)
31 #include "rect.h"                      // for TBOX
32 #include "tprintf.h"                   // for tprintf
33 
34 #include <tesseract/unichar.h> // for UNICHAR_ID
35 
36 #include <cstdint> // for int16_t
37 #include <cstring> // for memcpy
38 #include <vector>  // for std::vector
39 
40 namespace tesseract {
41 
// Forward declarations for types named in the BlamerBundle interface below.
class DENORM;
class MATRIX;
class UNICHARSET;
class WERD_RES;

struct MATRIX_COORD;
struct TWERD;

class LMPainPoints;
51 
// NOTE(review): presumably the tolerance applied when comparing blamer
// bounding boxes; its uses are outside this header -- confirm.
static constexpr int16_t kBlamerBoxTolerance = 5;
53 
// Identifies the component that is blamed for an incorrect OCR result.
// Note: kIncorrectResultReasonNames must be updated whenever this enum
// is modified.
enum IncorrectResultReason {
  // The best choice text is equal to the truth text.
  IRR_CORRECT = 0,

  // Blame the classifier. Either the top choice is wrong yet is a dictionary
  // word (the language model is unlikely to correct such an error), or the
  // correct unichar was missing from the classifier's shortlist altogether.
  IRR_CLASSIFIER,

  // The chopper failed to find one or more of the splits corresponding to
  // the correct character bounding boxes in BlamerBundle::truth_word.
  IRR_CHOPPER,

  // Classifier/language-model tradeoff error. The classifier did offer the
  // correct unichar for every blob of the correct segmentation, but either
  // its rating was too bad for the language model to pull out the correct
  // choice, or the language model was too weak to favor the correct answer.
  IRR_CLASS_LM_TRADEOFF,

  // Page layout did not produce the correct bounding box. Used when no truth
  // was found for the word, which implies that the word's bounding box was
  // wrong (no truth word had a similar bounding box).
  IRR_PAGE_LAYOUT,

  // A SegSearch heuristic prevented one or more blobs of the correct
  // segmentation state from being classified (e.g. the blob was too wide).
  IRR_SEGSEARCH_HEUR,

  // Poor SegSearch pain point prioritization kept the correct segmentation
  // state from being explored. Used when the best rating of a choice built
  // from the correct segmentation is better than that of the best choice
  // (i.e. had the correct segmentation state been explored, the language
  // model would have picked the correct choice).
  IRR_SEGSEARCH_PP,

  // Like IRR_CLASS_LM_TRADEOFF, but for the case where only the chopper was
  // run on the word, so the old language model (permuters) was used.
  // TODO(antonova): integrate the new language model with chopper
  IRR_CLASS_OLD_LM_TRADEOFF,

  // Adaption error: an incorrect adaptive template match scored better than
  // a correct one (either pre-trained or adapted).
  IRR_ADAPTION,

  // split_and_recog_word() could not find a suitable split in the truth.
  IRR_NO_TRUTH_SPLIT,

  // No truth is available for this word (e.g. when words in the corrected
  // content file are turned into ~~~~ because an appropriate alignment was
  // not found).
  IRR_NO_TRUTH,

  // The best choice text differs from the truth text, but none of the
  // reasons above has been set.
  IRR_UNKNOWN,

  // Number of reasons; not a reason itself.
  IRR_NUM_REASONS
};
105 
106 // Blamer-related information to determine the source of errors.
107 struct BlamerBundle {
108   static const char *IncorrectReasonName(IncorrectResultReason irr);
BlamerBundleBlamerBundle109   BlamerBundle()
110       : truth_has_char_boxes_(false)
111       , incorrect_result_reason_(IRR_CORRECT)
112       , lattice_data_(nullptr) {
113     ClearResults();
114   }
BlamerBundleBlamerBundle115   BlamerBundle(const BlamerBundle &other) {
116     this->CopyTruth(other);
117     this->CopyResults(other);
118   }
~BlamerBundleBlamerBundle119   ~BlamerBundle() {
120     delete[] lattice_data_;
121   }
122 
123   // Accessors.
TruthStringBlamerBundle124   std::string TruthString() const {
125     std::string truth_str;
126     for (auto &text : truth_text_) {
127       truth_str += text;
128     }
129     return truth_str;
130   }
incorrect_result_reasonBlamerBundle131   IncorrectResultReason incorrect_result_reason() const {
132     return incorrect_result_reason_;
133   }
NoTruthBlamerBundle134   bool NoTruth() const {
135     return incorrect_result_reason_ == IRR_NO_TRUTH || incorrect_result_reason_ == IRR_PAGE_LAYOUT;
136   }
HasDebugInfoBlamerBundle137   bool HasDebugInfo() const {
138     return debug_.length() > 0 || misadaption_debug_.length() > 0;
139   }
debugBlamerBundle140   const std::string &debug() const {
141     return debug_;
142   }
misadaption_debugBlamerBundle143   const std::string &misadaption_debug() const {
144     return misadaption_debug_;
145   }
UpdateBestRatingBlamerBundle146   void UpdateBestRating(float rating) {
147     if (rating < best_correctly_segmented_rating_) {
148       best_correctly_segmented_rating_ = rating;
149     }
150   }
correct_segmentation_lengthBlamerBundle151   int correct_segmentation_length() const {
152     return correct_segmentation_cols_.size();
153   }
154   // Returns true if the given ratings matrix col,row position is included
155   // in the correct segmentation path at the given index.
MatrixPositionCorrectBlamerBundle156   bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord) {
157     return correct_segmentation_cols_[index] == coord.col &&
158            correct_segmentation_rows_[index] == coord.row;
159   }
set_best_choice_is_dict_and_top_choiceBlamerBundle160   void set_best_choice_is_dict_and_top_choice(bool value) {
161     best_choice_is_dict_and_top_choice_ = value;
162   }
lattice_dataBlamerBundle163   const char *lattice_data() const {
164     return lattice_data_;
165   }
lattice_sizeBlamerBundle166   int lattice_size() const {
167     return lattice_size_; // size of lattice_data in bytes
168   }
set_lattice_dataBlamerBundle169   void set_lattice_data(const char *data, int size) {
170     lattice_size_ = size;
171     delete[] lattice_data_;
172     lattice_data_ = new char[lattice_size_];
173     memcpy(lattice_data_, data, lattice_size_);
174   }
175 #ifndef DISABLED_LEGACY_ENGINE
params_training_bundleBlamerBundle176   const tesseract::ParamsTrainingBundle &params_training_bundle() const {
177     return params_training_bundle_;
178   }
179   // Adds a new ParamsTrainingHypothesis to the current hypothesis list.
AddHypothesisBlamerBundle180   void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo) {
181     params_training_bundle_.AddHypothesis(hypo);
182   }
183 #endif // ndef DISABLED_LEGACY_ENGINE
184 
185   // Functions to setup the blamer.
186   // Whole word string, whole word bounding box.
187   void SetWordTruth(const UNICHARSET &unicharset, const char *truth_str, const TBOX &word_box);
188   // Single "character" string, "character" bounding box.
189   // May be called multiple times to indicate the characters in a word.
190   void SetSymbolTruth(const UNICHARSET &unicharset, const char *char_str, const TBOX &char_box);
191   // Marks that there is something wrong with the truth text, like it contains
192   // reject characters.
193   void SetRejectedTruth();
194 
195   // Returns true if the provided word_choice is correct.
196   bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const;
197 
ClearResultsBlamerBundle198   void ClearResults() {
199     norm_truth_word_.DeleteAllBoxes();
200     norm_box_tolerance_ = 0;
201     if (!NoTruth()) {
202       incorrect_result_reason_ = IRR_CORRECT;
203     }
204     debug_ = "";
205     segsearch_is_looking_for_blame_ = false;
206     best_correctly_segmented_rating_ = WERD_CHOICE::kBadRating;
207     correct_segmentation_cols_.clear();
208     correct_segmentation_rows_.clear();
209     best_choice_is_dict_and_top_choice_ = false;
210     delete[] lattice_data_;
211     lattice_data_ = nullptr;
212     lattice_size_ = 0;
213   }
CopyTruthBlamerBundle214   void CopyTruth(const BlamerBundle &other) {
215     truth_has_char_boxes_ = other.truth_has_char_boxes_;
216     truth_word_ = other.truth_word_;
217     truth_text_ = other.truth_text_;
218     incorrect_result_reason_ = (other.NoTruth() ? other.incorrect_result_reason_ : IRR_CORRECT);
219   }
CopyResultsBlamerBundle220   void CopyResults(const BlamerBundle &other) {
221     norm_truth_word_ = other.norm_truth_word_;
222     norm_box_tolerance_ = other.norm_box_tolerance_;
223     incorrect_result_reason_ = other.incorrect_result_reason_;
224     segsearch_is_looking_for_blame_ = other.segsearch_is_looking_for_blame_;
225     best_correctly_segmented_rating_ = other.best_correctly_segmented_rating_;
226     correct_segmentation_cols_ = other.correct_segmentation_cols_;
227     correct_segmentation_rows_ = other.correct_segmentation_rows_;
228     best_choice_is_dict_and_top_choice_ = other.best_choice_is_dict_and_top_choice_;
229     if (other.lattice_data_ != nullptr) {
230       lattice_data_ = new char[other.lattice_size_];
231       memcpy(lattice_data_, other.lattice_data_, other.lattice_size_);
232       lattice_size_ = other.lattice_size_;
233     } else {
234       lattice_data_ = nullptr;
235     }
236   }
237   const char *IncorrectReason() const;
238 
239   // Appends choice and truth details to the given debug string.
240   void FillDebugString(const std::string &msg, const WERD_CHOICE *choice, std::string &debug);
241 
242   // Sets up the norm_truth_word from truth_word using the given DENORM.
243   void SetupNormTruthWord(const DENORM &denorm);
244 
245   // Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
246   // bundles) where the right edge/ of the left-hand word is word1_right,
247   // and the left edge of the right-hand word is word2_left.
248   void SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1,
249                    BlamerBundle *bundle2) const;
250   // "Joins" the blames from bundle1 and bundle2 into *this.
251   void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug);
252 
253   // If a blob with the same bounding box as one of the truth character
254   // bounding boxes is not classified as the corresponding truth character
255   // blames character classifier for incorrect answer.
256   void BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box,
257                        const BLOB_CHOICE_LIST &choices, bool debug);
258 
259   // Checks whether chops were made at all the character bounding box
260   // boundaries in word->truth_word. If not - blames the chopper for an
261   // incorrect answer.
262   void SetChopperBlame(const WERD_RES *word, bool debug);
263   // Blames the classifier or the language model if, after running only the
264   // chopper, best_choice is incorrect and no blame has been yet set.
265   // Blames the classifier if best_choice is classifier's top choice and is a
266   // dictionary word (i.e. language model could not have helped).
267   // Otherwise, blames the language model (formerly permuter word adjustment).
268   void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset,
269                                   bool valid_permuter, bool debug);
270   // Sets up the correct_segmentation_* to mark the correct bounding boxes.
271   void SetupCorrectSegmentation(const TWERD *word, bool debug);
272 
273   // Returns true if a guided segmentation search is needed.
274   bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const;
275   // Setup ready to guide the segmentation search to the correct segmentation.
276   void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id,
277                         bool debug, std::string &debug_str, tesseract::LMPainPoints *pain_points,
278                         double max_char_wh_ratio, WERD_RES *word_res);
279   // Returns true if the guided segsearch is in progress.
280   bool GuidedSegsearchStillGoing() const;
281   // The segmentation search has ended. Sets the blame appropriately.
282   void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, std::string &debug_str);
283 
284   // If the bundle is null or still does not indicate the correct result,
285   // fix it and use some backup reason for the blame.
286   static void LastChanceBlame(bool debug, WERD_RES *word);
287 
288   // Sets the misadaption debug if this word is incorrect, as this word is
289   // being adapted to.
290   void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug);
291 
292 private:
293   // Copy assignment operator (currently unused, therefore private).
294   BlamerBundle &operator=(const BlamerBundle &other) = delete;
SetBlameBlamerBundle295   void SetBlame(IncorrectResultReason irr, const std::string &msg, const WERD_CHOICE *choice,
296                 bool debug) {
297     incorrect_result_reason_ = irr;
298     debug_ = IncorrectReason();
299     debug_ += " to blame: ";
300     FillDebugString(msg, choice, debug_);
301     if (debug) {
302       tprintf("SetBlame(): %s", debug_.c_str());
303     }
304   }
305 
306 private:
307   // Set to true when bounding boxes for individual unichars are recorded.
308   bool truth_has_char_boxes_;
309   // Variables used by the segmentation search when looking for the blame.
310   // Set to true while segmentation search is continued after the usual
311   // termination condition in order to look for the blame.
312   bool segsearch_is_looking_for_blame_;
313   // Set to true if best choice is a dictionary word and
314   // classifier's top choice.
315   bool best_choice_is_dict_and_top_choice_;
316   // Tolerance for bounding box comparisons in normalized space.
317   int norm_box_tolerance_;
318   // The true_word (in the original image coordinate space) contains ground
319   // truth bounding boxes for this WERD_RES.
320   tesseract::BoxWord truth_word_;
321   // Same as above, but in normalized coordinates
322   // (filled in by WERD_RES::SetupForRecognition()).
323   tesseract::BoxWord norm_truth_word_;
324   // Contains ground truth unichar for each of the bounding boxes in truth_word.
325   std::vector<std::string> truth_text_;
326   // The reason for incorrect OCR result.
327   IncorrectResultReason incorrect_result_reason_;
328   // Debug text associated with the blame.
329   std::string debug_;
330   // Misadaption debug information (filled in if this word was misadapted to).
331   std::string misadaption_debug_;
332   // Vectors populated by SegSearch to indicate column and row indices that
333   // correspond to blobs with correct bounding boxes.
334   std::vector<int> correct_segmentation_cols_;
335   std::vector<int> correct_segmentation_rows_;
336   // Best rating for correctly segmented path
337   // (set and used by SegSearch when looking for blame).
338   float best_correctly_segmented_rating_;
339   int lattice_size_; // size of lattice_data in bytes
340   // Serialized segmentation search lattice.
341   char *lattice_data_;
342   // Information about hypotheses (paths) explored by the segmentation search.
343 #ifndef DISABLED_LEGACY_ENGINE
344   tesseract::ParamsTrainingBundle params_training_bundle_;
345 #endif // ndef DISABLED_LEGACY_ENGINE
346 };
347 
348 } // namespace tesseract
349 
350 #endif // TESSERACT_CCSTRUCT_BLAMER_H_
351