1 /******************************************************************************
2  ** Filename:    adaptmatch.cpp
3  ** Purpose:     High level adaptive matcher.
4  ** Author:      Dan Johnson
5  **
6  ** (c) Copyright Hewlett-Packard Company, 1988.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  ******************************************************************************/
17 
18 /*-----------------------------------------------------------------------------
19           Include Files and Type Defines
20 -----------------------------------------------------------------------------*/
21 #ifdef HAVE_CONFIG_H
22 #  include "config_auto.h"
23 #endif
24 
25 #include "adaptive.h"        // for ADAPT_CLASS
26 #include "ambigs.h"          // for UnicharIdVector, UnicharAmbigs
27 #include "bitvec.h"          // for FreeBitVector, NewBitVector, BIT_VECTOR
28 #include "blobs.h"           // for TBLOB, TWERD
29 #include "classify.h"        // for Classify, CST_FRAGMENT, CST_WHOLE
30 #include "dict.h"            // for Dict
31 #include "errcode.h"         // for ASSERT_HOST
32 #include "featdefs.h"        // for CharNormDesc
33 #include "float2int.h"       // for BASELINE_Y_SHIFT
34 #include "fontinfo.h"        // for ScoredFont, FontSet
35 #include "intfx.h"           // for BlobToTrainingSample, INT_FX_RESULT_S...
36 #include "intmatcher.h"      // for CP_RESULT_STRUCT, IntegerMatcher
37 #include "intproto.h"        // for INT_FEATURE_STRUCT, (anonymous), Clas...
38 #include "matchdefs.h"       // for CLASS_ID, FEATURE_ID, PROTO_ID, NO_PROTO
39 #include "mfoutline.h"       // for baseline, character, MF_SCALE_FACTOR
40 #include "normalis.h"        // for DENORM, kBlnBaselineOffset, kBlnXHeight
41 #include "normfeat.h"        // for ActualOutlineLength, CharNormLength
42 #include "ocrfeatures.h"     // for FEATURE_STRUCT, FEATURE
43 #include "oldlist.h"         // for push, delete_d
44 #include "outfeat.h"         // for OutlineFeatDir, OutlineFeatLength
45 #include "pageres.h"         // for WERD_RES
46 #include "params.h"          // for IntParam, BoolParam, DoubleParam, Str...
47 #include "picofeat.h"        // for PicoFeatDir, PicoFeatX, PicoFeatY
48 #include "protos.h"          // for PROTO_STRUCT, FillABC
49 #include "ratngs.h"          // for BLOB_CHOICE_IT, BLOB_CHOICE_LIST, BLO...
50 #include "rect.h"            // for TBOX
51 #include "scrollview.h"      // for ScrollView, ScrollView::BROWN, Scroll...
52 #include "seam.h"            // for SEAM
53 #include "shapeclassifier.h" // for ShapeClassifier
54 #include "shapetable.h"      // for UnicharRating, ShapeTable, Shape, Uni...
55 #include "tessclassifier.h"  // for TessClassifier
56 #include "tessdatamanager.h" // for TessdataManager, TESSDATA_INTTEMP
57 #include "tprintf.h"         // for tprintf
58 #include "trainingsample.h"  // for TrainingSample
59 #include "unicharset.h"      // for UNICHARSET, CHAR_FRAGMENT, UNICHAR_SPACE
60 #include "unicity_table.h"   // for UnicityTable
61 
62 #include <tesseract/unichar.h> // for UNICHAR_ID, INVALID_UNICHAR_ID
63 #include "helpers.h"           // for IntCastRounded, ClipToRange
64 #include "serialis.h"          // for TFile
65 
66 #include <algorithm> // for max, min
67 #include <cassert>   // for assert
68 #include <cmath>     // for fabs
69 #include <cstdint>   // for INT32_MAX, UINT8_MAX
70 #include <cstdio>    // for fflush, fclose, fopen, stdout, FILE
71 #include <cstring>   // for strstr, memset, strcmp
72 
73 namespace tesseract {
74 
75 // TODO: The parameter classify_enable_adaptive_matcher can cause
76 // a segmentation fault if it is set to false (issue #256),
77 // so override it here.
78 #define classify_enable_adaptive_matcher true
79 
80 #define ADAPT_TEMPLATE_SUFFIX ".a"
81 
82 #define MAX_MATCHES 10
83 #define UNLIKELY_NUM_FEAT 200
84 #define NO_DEBUG 0
85 #define MAX_ADAPTABLE_WERD_SIZE 40
86 
87 #define ADAPTABLE_WERD_ADJUSTMENT (0.05)
88 
89 #define Y_DIM_OFFSET (Y_SHIFT - BASELINE_Y_SHIFT)
90 
91 #define WORST_POSSIBLE_RATING (0.0f)
92 
// Accumulator for the adaptive matcher: one UnicharRating per candidate
// class, plus the raw class pruner results and the best match found so far.
struct ADAPT_RESULTS {
  int32_t BlobLength;
  bool HasNonfragment;
  UNICHAR_ID best_unichar_id;
  int best_match_index;
  float best_rating;
  std::vector<UnicharRating> match;
  std::vector<CP_RESULT_STRUCT> CPResults;

  /// Initializes data members to the default values. Sets the initial
  /// best rating to the worst possible rating (#WORST_POSSIBLE_RATING,
  /// 0.0f — in this scheme higher ratings are better).
  inline void Initialize() {
    BlobLength = INT32_MAX;
    HasNonfragment = false;
    ComputeBest();
  }
  // Computes best_unichar_id, best_match_index and best_rating from the
  // current contents of match. With an empty match vector this leaves
  // best_unichar_id == INVALID_UNICHAR_ID and best_match_index == -1.
  void ComputeBest() {
    best_unichar_id = INVALID_UNICHAR_ID;
    best_match_index = -1;
    best_rating = WORST_POSSIBLE_RATING;
    for (unsigned i = 0; i < match.size(); ++i) {
      // Strict > means the first of several equal ratings wins.
      if (match[i].rating > best_rating) {
        best_rating = match[i].rating;
        best_unichar_id = match[i].unichar_id;
        best_match_index = i;
      }
    }
  }
};
123 
// Identifies one config within one class of a set of adapted templates.
// NOTE(review): presumably the payload passed through the old-list callback
// MakeTempProtoPerm when promoting temporary protos — confirm at its
// definition.
struct PROTO_KEY {
  ADAPT_TEMPLATES_STRUCT *Templates;
  CLASS_ID ClassId;
  int ConfigId;
};
129 
130 // Sort function to sort ratings appropriately by descending rating.
SortDescendingRating(const UnicharRating & a,const UnicharRating & b)131 static bool SortDescendingRating(const UnicharRating &a, const UnicharRating &b) {
132   if (a.rating != b.rating) {
133     return a.rating > b.rating;
134   } else {
135     return a.unichar_id < b.unichar_id;
136   }
137 }
138 
139 /*-----------------------------------------------------------------------------
140           Private Macros
141 -----------------------------------------------------------------------------*/
// A match is "marginal" when its shortfall from a perfect confidence of 1.0
// exceeds the great-match threshold.
inline bool MarginalMatch(float confidence, float matcher_great_threshold) {
  const float shortfall = 1.0f - confidence;
  return shortfall > matcher_great_threshold;
}
145 
146 /*-----------------------------------------------------------------------------
147           Private Function Prototypes
148 -----------------------------------------------------------------------------*/
149 // Returns the index of the given id in results, if present, or the size of the
150 // vector (index it will go at) if not present.
FindScoredUnichar(UNICHAR_ID id,const ADAPT_RESULTS & results)151 static unsigned FindScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS &results) {
152   for (unsigned i = 0; i < results.match.size(); i++) {
153     if (results.match[i].unichar_id == id) {
154       return i;
155     }
156   }
157   return results.match.size();
158 }
159 
160 // Returns the current rating for a unichar id if we have rated it, defaulting
161 // to WORST_POSSIBLE_RATING.
ScoredUnichar(UNICHAR_ID id,const ADAPT_RESULTS & results)162 static float ScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS &results) {
163   unsigned index = FindScoredUnichar(id, results);
164   if (index >= results.match.size()) {
165     return WORST_POSSIBLE_RATING;
166   }
167   return results.match[index].rating;
168 }
169 
// Forward declarations for helpers defined later in this file.
void InitMatcherRatings(float *Rating);

// Old-list comparison callback; signature dictated by the oldlist API.
int MakeTempProtoPerm(void *item1, void *item2);

void SetAdaptiveThreshold(float Threshold);
175 
176 /*-----------------------------------------------------------------------------
177               Public Code
178 -----------------------------------------------------------------------------*/
179 /**
180  * This routine calls the adaptive matcher
181  * which returns (in an array) the class id of each
182  * class matched.
183  *
184  * It also returns the number of classes matched.
185  * For each class matched it places the best rating
186  * found for that class into the Ratings array.
187  *
188  * Bad matches are then removed so that they don't
189  * need to be sorted.  The remaining good matches are
190  * then sorted and converted to choices.
191  *
192  * This routine also performs some simple speckle
193  * filtering.
194  *
195  * @param Blob    blob to be classified
196  * @param[out] Choices    List of choices found by adaptive matcher.
197  * filled on return with the choices found by the
198  * class pruner and the ratings there from. Also
199  * contains the detailed results of the integer matcher.
200  *
201  */
AdaptiveClassifier(TBLOB * Blob,BLOB_CHOICE_LIST * Choices)202 void Classify::AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices) {
203   assert(Choices != nullptr);
204   auto *Results = new ADAPT_RESULTS;
205   Results->Initialize();
206 
207   ASSERT_HOST(AdaptedTemplates != nullptr);
208 
209   DoAdaptiveMatch(Blob, Results);
210 
211   RemoveBadMatches(Results);
212   std::sort(Results->match.begin(), Results->match.end(), SortDescendingRating);
213   RemoveExtraPuncs(Results);
214   Results->ComputeBest();
215   ConvertMatchesToChoices(Blob->denorm(), Blob->bounding_box(), Results, Choices);
216 
217   // TODO(rays) Move to before ConvertMatchesToChoices!
218   if (LargeSpeckle(*Blob) || Choices->empty()) {
219     AddLargeSpeckleTo(Results->BlobLength, Choices);
220   }
221 
222   if (matcher_debug_level >= 1) {
223     tprintf("AD Matches =  ");
224     PrintAdaptiveMatchResults(*Results);
225   }
226 
227 #ifndef GRAPHICS_DISABLED
228   if (classify_enable_adaptive_debugger) {
229     DebugAdaptiveClassifier(Blob, Results);
230   }
231 #endif
232 
233   delete Results;
234 } /* AdaptiveClassifier */
235 
236 #ifndef GRAPHICS_DISABLED
237 
238 // If *win is nullptr, sets it to a new ScrollView() object with title msg.
239 // Clears the window and draws baselines.
RefreshDebugWindow(ScrollView ** win,const char * msg,int y_offset,const TBOX & wbox)240 void Classify::RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset,
241                                   const TBOX &wbox) {
242   const int kSampleSpaceWidth = 500;
243   if (*win == nullptr) {
244     *win = new ScrollView(msg, 100, y_offset, kSampleSpaceWidth * 2, 200, kSampleSpaceWidth * 2,
245                           200, true);
246   }
247   (*win)->Clear();
248   (*win)->Pen(64, 64, 64);
249   (*win)->Line(-kSampleSpaceWidth, kBlnBaselineOffset, kSampleSpaceWidth, kBlnBaselineOffset);
250   (*win)->Line(-kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset, kSampleSpaceWidth,
251                kBlnXHeight + kBlnBaselineOffset);
252   (*win)->ZoomToRectangle(wbox.left(), wbox.top(), wbox.right(), wbox.bottom());
253 }
254 
255 #endif // !GRAPHICS_DISABLED
256 
257 // Learns the given word using its chopped_word, seam_array, denorm,
258 // box_word, best_state, and correct_text to learn both correctly and
259 // incorrectly segmented blobs. If fontname is not nullptr, then LearnBlob
260 // is called and the data will be saved in an internal buffer.
261 // Otherwise AdaptToBlob is called for adaption within a document.
LearnWord(const char * fontname,WERD_RES * word)262 void Classify::LearnWord(const char *fontname, WERD_RES *word) {
263   int word_len = word->correct_text.size();
264   if (word_len == 0) {
265     return;
266   }
267 
268   float *thresholds = nullptr;
269   if (fontname == nullptr) {
270     // Adaption mode.
271     if (!EnableLearning || word->best_choice == nullptr) {
272       return; // Can't or won't adapt.
273     }
274 
275     if (classify_learning_debug_level >= 1) {
276       tprintf("\n\nAdapting to word = %s\n", word->best_choice->debug_string().c_str());
277     }
278     thresholds = new float[word_len];
279     word->ComputeAdaptionThresholds(certainty_scale, matcher_perfect_threshold,
280                                     matcher_good_threshold, matcher_rating_margin, thresholds);
281   }
282   int start_blob = 0;
283 
284 #ifndef GRAPHICS_DISABLED
285   if (classify_debug_character_fragments) {
286     if (learn_fragmented_word_debug_win_ != nullptr) {
287       learn_fragmented_word_debug_win_->Wait();
288     }
289     RefreshDebugWindow(&learn_fragments_debug_win_, "LearnPieces", 400,
290                        word->chopped_word->bounding_box());
291     RefreshDebugWindow(&learn_fragmented_word_debug_win_, "LearnWord", 200,
292                        word->chopped_word->bounding_box());
293     word->chopped_word->plot(learn_fragmented_word_debug_win_);
294     ScrollView::Update();
295   }
296 #endif // !GRAPHICS_DISABLED
297 
298   for (int ch = 0; ch < word_len; ++ch) {
299     if (classify_debug_character_fragments) {
300       tprintf("\nLearning %s\n", word->correct_text[ch].c_str());
301     }
302     if (word->correct_text[ch].length() > 0) {
303       float threshold = thresholds != nullptr ? thresholds[ch] : 0.0f;
304 
305       LearnPieces(fontname, start_blob, word->best_state[ch], threshold, CST_WHOLE,
306                   word->correct_text[ch].c_str(), word);
307 
308       if (word->best_state[ch] > 1 && !disable_character_fragments) {
309         // Check that the character breaks into meaningful fragments
310         // that each match a whole character with at least
311         // classify_character_fragments_garbage_certainty_threshold
312         bool garbage = false;
313         int frag;
314         for (frag = 0; frag < word->best_state[ch]; ++frag) {
315           TBLOB *frag_blob = word->chopped_word->blobs[start_blob + frag];
316           if (classify_character_fragments_garbage_certainty_threshold < 0) {
317             garbage |= LooksLikeGarbage(frag_blob);
318           }
319         }
320         // Learn the fragments.
321         if (!garbage) {
322           bool pieces_all_natural = word->PiecesAllNatural(start_blob, word->best_state[ch]);
323           if (pieces_all_natural || !prioritize_division) {
324             for (frag = 0; frag < word->best_state[ch]; ++frag) {
325               std::vector<std::string> tokens = split(word->correct_text[ch], ' ');
326 
327               tokens[0] = CHAR_FRAGMENT::to_string(tokens[0].c_str(), frag, word->best_state[ch],
328                                                    pieces_all_natural);
329 
330 	      std::string full_string;
331               for (unsigned i = 0; i < tokens.size(); i++) {
332                 full_string += tokens[i];
333                 if (i != tokens.size() - 1) {
334                   full_string += ' ';
335                 }
336               }
337               LearnPieces(fontname, start_blob + frag, 1, threshold, CST_FRAGMENT,
338                           full_string.c_str(), word);
339             }
340           }
341         }
342       }
343 
344       // TODO(rays): re-enable this part of the code when we switch to the
345       // new classifier that needs to see examples of garbage.
346       /*
347 if (word->best_state[ch] > 1) {
348   // If the next blob is good, make junk with the rightmost fragment.
349   if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
350     LearnPieces(fontname, start_blob + word->best_state[ch] - 1,
351                 word->best_state[ch + 1] + 1,
352                 threshold, CST_IMPROPER, INVALID_UNICHAR, word);
353   }
354   // If the previous blob is good, make junk with the leftmost fragment.
355   if (ch > 0 && word->correct_text[ch - 1].length() > 0) {
356     LearnPieces(fontname, start_blob - word->best_state[ch - 1],
357                 word->best_state[ch - 1] + 1,
358                 threshold, CST_IMPROPER, INVALID_UNICHAR, word);
359   }
360 }
361 // If the next blob is good, make a join with it.
362 if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
363   std::string joined_text = word->correct_text[ch];
364   joined_text += word->correct_text[ch + 1];
365   LearnPieces(fontname, start_blob,
366               word->best_state[ch] + word->best_state[ch + 1],
367               threshold, CST_NGRAM, joined_text.c_str(), word);
368 }
369 */
370     }
371     start_blob += word->best_state[ch];
372   }
373   delete[] thresholds;
374 } // LearnWord.
375 
376 // Builds a blob of length fragments, from the word, starting at start,
377 // and then learns it, as having the given correct_text.
378 // If fontname is not nullptr, then LearnBlob is called and the data will be
379 // saved in an internal buffer for static training.
380 // Otherwise AdaptToBlob is called for adaption within a document.
381 // threshold is a magic number required by AdaptToChar and generated by
382 // ComputeAdaptionThresholds.
383 // Although it can be partly inferred from the string, segmentation is
384 // provided to explicitly clarify the character segmentation.
void Classify::LearnPieces(const char *fontname, int start, int length, float threshold,
                           CharSegmentationType segmentation, const char *correct_text,
                           WERD_RES *word) {
  // TODO(daria) Remove/modify this if/when we want
  // to train and/or adapt to n-grams.
  if (segmentation != CST_WHOLE && (segmentation != CST_FRAGMENT || disable_character_fragments)) {
    return;
  }

  // Temporarily merge blobs [start, start+length) into a single blob so it
  // can be learned as one character; BreakPieces at the end undoes this.
  if (length > 1) {
    SEAM::JoinPieces(word->seam_array, word->chopped_word->blobs, start, start + length - 1);
  }
  TBLOB *blob = word->chopped_word->blobs[start];
  // Rotate the blob if needed for classification.
  TBLOB *rotated_blob = blob->ClassifyNormalizeIfNeeded();
  if (rotated_blob == nullptr) {
    // No rotation required; learn from the original blob.
    rotated_blob = blob;
  }

#ifndef GRAPHICS_DISABLED
  // Draw debug windows showing the blob that is being learned if needed.
  if (strcmp(classify_learn_debug_str.c_str(), correct_text) == 0) {
    RefreshDebugWindow(&learn_debug_win_, "LearnPieces", 600, word->chopped_word->bounding_box());
    rotated_blob->plot(learn_debug_win_, ScrollView::GREEN, ScrollView::BROWN);
    learn_debug_win_->Update();
    learn_debug_win_->Wait();
  }
  if (classify_debug_character_fragments && segmentation == CST_FRAGMENT) {
    ASSERT_HOST(learn_fragments_debug_win_ != nullptr); // set up in LearnWord
    blob->plot(learn_fragments_debug_win_, ScrollView::BLUE, ScrollView::BROWN);
    learn_fragments_debug_win_->Update();
  }
#endif // !GRAPHICS_DISABLED

  if (fontname != nullptr) {
    // Static training path: extract features and buffer them via LearnBlob.
    classify_norm_method.set_value(character); // force char norm spc 30/11/93
    tess_bn_matching.set_value(false);         // turn it off
    tess_cn_matching.set_value(false);
    DENORM bl_denorm, cn_denorm;
    INT_FX_RESULT_STRUCT fx_info;
    SetupBLCNDenorms(*rotated_blob, classify_nonlinear_norm, &bl_denorm, &cn_denorm, &fx_info);
    LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text);
  } else if (unicharset.contains_unichar(correct_text)) {
    // Adaption path: update the adapted templates for this character.
    UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
    int font_id = word->fontinfo != nullptr ? fontinfo_table_.get_index(*word->fontinfo) : 0;
    if (classify_learning_debug_level >= 1) {
      tprintf("Adapting to char = %s, thr= %g font_id= %d\n", unicharset.id_to_unichar(class_id),
              threshold, font_id);
    }
    // If filename is not nullptr we are doing recognition
    // (as opposed to training), so we must have already set word fonts.
    AdaptToChar(rotated_blob, class_id, font_id, threshold, AdaptedTemplates);
    if (BackupAdaptedTemplates != nullptr) {
      // Adapt the backup templates too. They will be used if the primary gets
      // too full.
      AdaptToChar(rotated_blob, class_id, font_id, threshold, BackupAdaptedTemplates);
    }
  } else if (classify_debug_level >= 1) {
    tprintf("Can't adapt to %s not in unicharset\n", correct_text);
  }
  // ClassifyNormalizeIfNeeded returned a freshly allocated blob; free it.
  if (rotated_blob != blob) {
    delete rotated_blob;
  }

  // Undo the JoinPieces merge performed above.
  SEAM::BreakPieces(word->seam_array, word->chopped_word->blobs, start, start + length - 1);
} // LearnPieces.
451 
452 /*---------------------------------------------------------------------------*/
453 /**
454  * This routine performs cleanup operations
455  * on the adaptive classifier.  It should be called
456  * before the program is terminated.  Its main function
457  * is to save the adapted templates to a file.
458  *
459  * Globals:
460  * - #AdaptedTemplates current set of adapted templates
461  * - #classify_save_adapted_templates true if templates should be saved
462  * - #classify_enable_adaptive_matcher true if adaptive matcher is enabled
463  */
EndAdaptiveClassifier()464 void Classify::EndAdaptiveClassifier() {
465   std::string Filename;
466   FILE *File;
467 
468   if (AdaptedTemplates != nullptr && classify_enable_adaptive_matcher &&
469       classify_save_adapted_templates) {
470     Filename = imagefile + ADAPT_TEMPLATE_SUFFIX;
471     File = fopen(Filename.c_str(), "wb");
472     if (File == nullptr) {
473       tprintf("Unable to save adapted templates to %s!\n", Filename.c_str());
474     } else {
475       tprintf("\nSaving adapted templates to %s ...", Filename.c_str());
476       fflush(stdout);
477       WriteAdaptedTemplates(File, AdaptedTemplates);
478       tprintf("\n");
479       fclose(File);
480     }
481   }
482 
483   delete AdaptedTemplates;
484   AdaptedTemplates = nullptr;
485   delete BackupAdaptedTemplates;
486   BackupAdaptedTemplates = nullptr;
487 
488   if (PreTrainedTemplates != nullptr) {
489     delete PreTrainedTemplates;
490     PreTrainedTemplates = nullptr;
491   }
492   getDict().EndDangerousAmbigs();
493   FreeNormProtos();
494   if (AllProtosOn != nullptr) {
495     FreeBitVector(AllProtosOn);
496     FreeBitVector(AllConfigsOn);
497     FreeBitVector(AllConfigsOff);
498     FreeBitVector(TempProtoMask);
499     AllProtosOn = nullptr;
500     AllConfigsOn = nullptr;
501     AllConfigsOff = nullptr;
502     TempProtoMask = nullptr;
503   }
504   delete shape_table_;
505   shape_table_ = nullptr;
506   delete static_classifier_;
507   static_classifier_ = nullptr;
508 } /* EndAdaptiveClassifier */
509 
510 /*---------------------------------------------------------------------------*/
511 /**
512  * This routine reads in the training
513  * information needed by the adaptive classifier
514  * and saves it into global variables.
515  *  Parameters:
516  *      load_pre_trained_templates  Indicates whether the pre-trained
517  *                     templates (inttemp, normproto and pffmtable components)
518  *                     should be loaded. Should only be set to true if the
519  *                     necessary classifier components are present in the
520  *                     [lang].traineddata file.
521  *  Globals:
522  *      BuiltInTemplatesFile  file to get built-in temps from
523  *      BuiltInCutoffsFile    file to get avg. feat per class from
524  *      classify_use_pre_adapted_templates
525  *                            enables use of pre-adapted templates
526  */
void Classify::InitAdaptiveClassifier(TessdataManager *mgr) {
  if (!classify_enable_adaptive_matcher) {
    return;
  }
  if (AllProtosOn != nullptr) {
    EndAdaptiveClassifier(); // Don't leak with multiple inits.
  }

  // If there is no language_data_path_prefix, the classifier will be
  // adaptive only.
  if (language_data_path_prefix.length() > 0 && mgr != nullptr) {
    TFile fp;
    ASSERT_HOST(mgr->GetComponent(TESSDATA_INTTEMP, &fp));
    PreTrainedTemplates = ReadIntTemplates(&fp);

    // The shape table is optional: on a failed deserialization we continue
    // without one rather than aborting.
    if (mgr->GetComponent(TESSDATA_SHAPE_TABLE, &fp)) {
      shape_table_ = new ShapeTable(unicharset);
      if (!shape_table_->DeSerialize(&fp)) {
        tprintf("Error loading shape table!\n");
        delete shape_table_;
        shape_table_ = nullptr;
      }
    }

    ASSERT_HOST(mgr->GetComponent(TESSDATA_PFFMTABLE, &fp));
    ReadNewCutoffs(&fp, CharNormCutoffs);

    ASSERT_HOST(mgr->GetComponent(TESSDATA_NORMPROTO, &fp));
    NormProtos = ReadNormProtos(&fp);
    static_classifier_ = new TessClassifier(false, this);
  }

  InitIntegerFX();

  // Allocate the shared all-on/all-off proto and config masks used
  // throughout adaption.
  AllProtosOn = NewBitVector(MAX_NUM_PROTOS);
  AllConfigsOn = NewBitVector(MAX_NUM_CONFIGS);
  AllConfigsOff = NewBitVector(MAX_NUM_CONFIGS);
  TempProtoMask = NewBitVector(MAX_NUM_PROTOS);
  set_all_bits(AllProtosOn, WordsInVectorOfSize(MAX_NUM_PROTOS));
  set_all_bits(AllConfigsOn, WordsInVectorOfSize(MAX_NUM_CONFIGS));
  zero_all_bits(AllConfigsOff, WordsInVectorOfSize(MAX_NUM_CONFIGS));

  for (uint16_t &BaselineCutoff : BaselineCutoffs) {
    BaselineCutoff = 0;
  }

  if (classify_use_pre_adapted_templates) {
    TFile fp;
    std::string Filename = imagefile;
    Filename += ADAPT_TEMPLATE_SUFFIX;
    // NOTE(review): both branches below assign AdaptedTemplates without
    // deleting a previous value; presumably it is always nullptr here
    // (EndAdaptiveClassifier above frees it when AllProtosOn was set) —
    // confirm.
    if (!fp.Open(Filename.c_str(), nullptr)) {
      AdaptedTemplates = new ADAPT_TEMPLATES_STRUCT(unicharset);
    } else {
      tprintf("\nReading pre-adapted templates from %s ...\n", Filename.c_str());
      fflush(stdout);
      AdaptedTemplates = ReadAdaptedTemplates(&fp);
      tprintf("\n");
      PrintAdaptedTemplates(stdout, AdaptedTemplates);

      // Seed the baseline cutoffs from the char-norm cutoffs for every
      // class present in the pre-adapted templates.
      for (unsigned i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {
        BaselineCutoffs[i] = CharNormCutoffs[i];
      }
    }
  } else {
    delete AdaptedTemplates;
    AdaptedTemplates = new ADAPT_TEMPLATES_STRUCT(unicharset);
  }
} /* InitAdaptiveClassifier */
595 
ResetAdaptiveClassifierInternal()596 void Classify::ResetAdaptiveClassifierInternal() {
597   if (classify_learning_debug_level > 0) {
598     tprintf("Resetting adaptive classifier (NumAdaptationsFailed=%d)\n", NumAdaptationsFailed);
599   }
600   delete AdaptedTemplates;
601   AdaptedTemplates = new ADAPT_TEMPLATES_STRUCT(unicharset);
602   delete BackupAdaptedTemplates;
603   BackupAdaptedTemplates = nullptr;
604   NumAdaptationsFailed = 0;
605 }
606 
607 // If there are backup adapted templates, switches to those, otherwise resets
608 // the main adaptive classifier (because it is full.)
SwitchAdaptiveClassifier()609 void Classify::SwitchAdaptiveClassifier() {
610   if (BackupAdaptedTemplates == nullptr) {
611     ResetAdaptiveClassifierInternal();
612     return;
613   }
614   if (classify_learning_debug_level > 0) {
615     tprintf("Switch to backup adaptive classifier (NumAdaptationsFailed=%d)\n",
616             NumAdaptationsFailed);
617   }
618   delete AdaptedTemplates;
619   AdaptedTemplates = BackupAdaptedTemplates;
620   BackupAdaptedTemplates = nullptr;
621   NumAdaptationsFailed = 0;
622 }
623 
624 // Resets the backup adaptive classifier to empty.
// Resets the backup adaptive classifier to an empty template set (any
// previous backup is freed first).
void Classify::StartBackupAdaptiveClassifier() {
  delete BackupAdaptedTemplates;
  BackupAdaptedTemplates = new ADAPT_TEMPLATES_STRUCT(unicharset);
}
629 
630 /*---------------------------------------------------------------------------*/
631 /**
632  * This routine prepares the adaptive
633  * matcher for the start
634  * of the first pass.  Learning is enabled (unless it
635  * is disabled for the whole program).
636  *
637  * @note this is somewhat redundant, it simply says that if learning is
638  * enabled then it will remain enabled on the first pass.  If it is
639  * disabled, then it will remain disabled.  This is only put here to
640  * make it very clear that learning is controlled directly by the global
641  * setting of EnableLearning.
642  *
643  * Globals:
644  * - #EnableLearning
645  * set to true by this routine
646  */
void Classify::SettupPass1() {
  // Learning on pass 1 follows the global classify_enable_learning setting.
  EnableLearning = classify_enable_learning;

  getDict().SettupStopperPass1();

} /* SettupPass1 */
653 
654 /*---------------------------------------------------------------------------*/
655 /**
656  * This routine prepares the adaptive
657  * matcher for the start of the second pass.  Further
658  * learning is disabled.
659  *
660  * Globals:
661  * - #EnableLearning set to false by this routine
662  */
void Classify::SettupPass2() {
  // No further adaption on the second pass.
  EnableLearning = false;
  getDict().SettupStopperPass2();

} /* SettupPass2 */
668 
669 /*---------------------------------------------------------------------------*/
670 /**
671  * This routine creates a new adapted
672  * class and uses Blob as the model for the first
673  * config in that class.
674  *
675  * @param Blob blob to model new class after
676  * @param ClassId id of the class to be initialized
677  * @param FontinfoId font information inferred from pre-trained templates
678  * @param Class adapted class to be initialized
679  * @param Templates adapted templates to add new class to
680  *
681  * Globals:
682  * - #AllProtosOn dummy mask with all 1's
683  * - BaselineCutoffs kludge needed to get cutoffs
684  * - #PreTrainedTemplates kludge needed to get cutoffs
685  */
void Classify::InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS_STRUCT *Class,
                                ADAPT_TEMPLATES_STRUCT *Templates) {
  FEATURE_SET Features;
  int Fid, Pid;
  FEATURE Feature;
  int NumFeatures;
  PROTO_STRUCT *Proto;
  INT_CLASS_STRUCT *IClass;
  TEMP_CONFIG_STRUCT *Config;

  // Extract baseline-normalized outline features from the model blob.
  classify_norm_method.set_value(baseline);
  Features = ExtractOutlineFeatures(Blob);
  NumFeatures = Features->NumFeatures;
  // Reject degenerate or absurdly complex blobs.
  if (NumFeatures > UNLIKELY_NUM_FEAT || NumFeatures <= 0) {
    delete Features;
    return;
  }

  // The new class starts with a single temporary config.
  Config = new TEMP_CONFIG_STRUCT(NumFeatures - 1, FontinfoId);
  TempConfigFor(Class, 0) = Config;

  /* this is a kludge to construct cutoffs for adapted templates */
  if (Templates == AdaptedTemplates) {
    BaselineCutoffs[ClassId] = CharNormCutoffs[ClassId];
  }

  IClass = ClassForClassId(Templates->Templates, ClassId);

  // Turn each outline feature into one temporary proto of the new config.
  for (Fid = 0; Fid < Features->NumFeatures; Fid++) {
    Pid = AddIntProto(IClass);
    assert(Pid != NO_PROTO);

    Feature = Features->Features[Fid];
    auto TempProto = new TEMP_PROTO_STRUCT;
    Proto = &(TempProto->Proto);

    /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
   ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
   instead of the -0.25 to 0.75 used in baseline normalization */
    Proto->Angle = Feature->Params[OutlineFeatDir];
    Proto->X = Feature->Params[OutlineFeatX];
    Proto->Y = Feature->Params[OutlineFeatY] - Y_DIM_OFFSET;
    Proto->Length = Feature->Params[OutlineFeatLength];
    FillABC(Proto);

    TempProto->ProtoId = Pid;
    SET_BIT(Config->Protos, Pid);

    ConvertProto(Proto, Pid, IClass);
    AddProtoToProtoPruner(Proto, Pid, IClass, classify_learning_debug_level >= 2);

    Class->TempProtos = push(Class->TempProtos, TempProto);
  }
  delete Features;

  // Create one integer config that uses every proto just added.
  AddIntConfig(IClass);
  ConvertConfig(AllProtosOn, 0, IClass);

  if (classify_learning_debug_level >= 1) {
    tprintf("Added new class '%s' with class id %d and %d protos.\n",
            unicharset.id_to_unichar(ClassId), ClassId, NumFeatures);
#ifndef GRAPHICS_DISABLED
    if (classify_learning_debug_level > 1) {
      DisplayAdaptedChar(Blob, IClass);
    }
#endif
  }

  // NOTE(review): presumably IsEmptyAdaptedClass still reports the class as
  // empty here (its permanent/temp counters are untouched above), so this
  // counts the class that is about to become non-empty — confirm against
  // IsEmptyAdaptedClass's definition.
  if (IsEmptyAdaptedClass(Class)) {
    (Templates->NumNonEmptyClasses)++;
  }
} /* InitAdaptedClass */
758 
759 /*---------------------------------------------------------------------------*/
760 /**
761  * This routine sets up the feature
762  * extractor to extract baseline normalized
763  * pico-features.
764  *
765  * The extracted pico-features are converted
766  * to integer form and placed in IntFeatures. The
767  * original floating-pt. features are returned in
768  * FloatFeatures.
769  *
770  * Globals: none
771  * @param Blob blob to extract features from
772  * @param[out] IntFeatures array to fill with integer features
773  * @param[out] FloatFeatures place to return actual floating-pt features
774  *
775  * @return Number of pico-features returned (0 if
776  * an error occurred)
777  */
GetAdaptiveFeatures(TBLOB * Blob,INT_FEATURE_ARRAY IntFeatures,FEATURE_SET * FloatFeatures)778 int Classify::GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures,
779                                   FEATURE_SET *FloatFeatures) {
780   FEATURE_SET Features;
781   int NumFeatures;
782 
783   classify_norm_method.set_value(baseline);
784   Features = ExtractPicoFeatures(Blob);
785 
786   NumFeatures = Features->NumFeatures;
787   if (NumFeatures == 0 || NumFeatures > UNLIKELY_NUM_FEAT) {
788     delete Features;
789     return 0;
790   }
791 
792   ComputeIntFeatures(Features, IntFeatures);
793   *FloatFeatures = Features;
794 
795   return NumFeatures;
796 } /* GetAdaptiveFeatures */
797 
798 /*-----------------------------------------------------------------------------
799               Private Code
800 -----------------------------------------------------------------------------*/
801 /*---------------------------------------------------------------------------*/
802 /**
803  * Return true if the specified word is acceptable for adaptation.
804  *
805  * Globals: none
806  *
807  * @param word current word
808  *
809  * @return true or false
810  */
AdaptableWord(WERD_RES * word)811 bool Classify::AdaptableWord(WERD_RES *word) {
812   if (word->best_choice == nullptr) {
813     return false;
814   }
815   auto BestChoiceLength = word->best_choice->length();
816   float adaptable_score = getDict().segment_penalty_dict_case_ok + ADAPTABLE_WERD_ADJUSTMENT;
817   return // rules that apply in general - simplest to compute first
818       BestChoiceLength > 0 && BestChoiceLength == word->rebuild_word->NumBlobs() &&
819       BestChoiceLength <= MAX_ADAPTABLE_WERD_SIZE &&
820       // This basically ensures that the word is at least a dictionary match
821       // (freq word, user word, system dawg word, etc).
822       // Since all the other adjustments will make adjust factor higher
823       // than higher than adaptable_score=1.1+0.05=1.15
824       // Since these are other flags that ensure that the word is dict word,
825       // this check could be at times redundant.
826       word->best_choice->adjust_factor() <= adaptable_score &&
827       // Make sure that alternative choices are not dictionary words.
828       word->AlternativeChoiceAdjustmentsWorseThan(adaptable_score);
829 }
830 
831 /*---------------------------------------------------------------------------*/
832 /**
833  * @param Blob blob to add to templates for ClassId
834  * @param ClassId class to add blob to
835  * @param FontinfoId font information from pre-trained templates
836  * @param Threshold minimum match rating to existing template
837  * @param adaptive_templates current set of adapted templates
838  *
839  * Globals:
840  * - AllProtosOn dummy mask to match against all protos
841  * - AllConfigsOn dummy mask to match against all configs
842  */
void Classify::AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold,
                           ADAPT_TEMPLATES_STRUCT *adaptive_templates) {
  int NumFeatures;
  INT_FEATURE_ARRAY IntFeatures;
  UnicharRating int_result;
  INT_CLASS_STRUCT *IClass;
  ADAPT_CLASS_STRUCT *Class;
  TEMP_CONFIG_STRUCT *TempConfig;
  FEATURE_SET FloatFeatures;
  int NewTempConfigId;

  if (!LegalClassId(ClassId)) {
    return;
  }

  int_result.unichar_id = ClassId;
  Class = adaptive_templates->Class[ClassId];
  assert(Class != nullptr);
  if (IsEmptyAdaptedClass(Class)) {
    // First sample ever seen for this class: seed the adapted class
    // directly from this blob instead of matching against it.
    InitAdaptedClass(Blob, ClassId, FontinfoId, Class, adaptive_templates);
  } else {
    IClass = ClassForClassId(adaptive_templates->Templates, ClassId);

    NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures);
    if (NumFeatures <= 0) {
      return; // Features already freed by GetAdaptiveFeatures.
    }

    // Only match configs with the matching font.
    BIT_VECTOR MatchingFontConfigs = NewBitVector(MAX_NUM_PROTOS);
    for (int cfg = 0; cfg < IClass->NumConfigs; ++cfg) {
      if (GetFontinfoId(Class, cfg) == FontinfoId) {
        SET_BIT(MatchingFontConfigs, cfg);
      } else {
        reset_bit(MatchingFontConfigs, cfg);
      }
    }
    im_.Match(IClass, AllProtosOn, MatchingFontConfigs, NumFeatures, IntFeatures, &int_result,
              classify_adapt_feature_threshold, NO_DEBUG, matcher_debug_separate_windows);
    FreeBitVector(MatchingFontConfigs);

    SetAdaptiveThreshold(Threshold);

    // int_result.rating is a similarity in [0, 1]; 1 - rating is the
    // distance, so this branch is the "good match" case.
    if (1.0f - int_result.rating <= Threshold) {
      if (ConfigIsPermanent(Class, int_result.config)) {
        // Matched a permanent config: nothing to learn, just clean up.
        if (classify_learning_debug_level >= 1) {
          tprintf("Found good match to perm config %d = %4.1f%%.\n", int_result.config,
                  int_result.rating * 100.0);
        }
        delete FloatFeatures;
        return;
      }

      // Matched a temporary config: bump its confidence and track the
      // class-wide maximum seen count.
      TempConfig = TempConfigFor(Class, int_result.config);
      IncreaseConfidence(TempConfig);
      if (TempConfig->NumTimesSeen > Class->MaxNumTimesSeen) {
        Class->MaxNumTimesSeen = TempConfig->NumTimesSeen;
      }
      if (classify_learning_debug_level >= 1) {
        tprintf("Increasing reliability of temp config %d to %d.\n", int_result.config,
                TempConfig->NumTimesSeen);
      }

      // Promote the temp config to permanent once it has been seen
      // often enough to be trusted.
      if (TempConfigReliable(ClassId, TempConfig)) {
        MakePermanent(adaptive_templates, ClassId, int_result.config, Blob);
        UpdateAmbigsGroup(ClassId, Blob);
      }
    } else {
      // Poor match: create a brand-new temporary config from this blob.
      if (classify_learning_debug_level >= 1) {
        tprintf("Found poor match to temp config %d = %4.1f%%.\n", int_result.config,
                int_result.rating * 100.0);
#ifndef GRAPHICS_DISABLED
        if (classify_learning_debug_level > 2) {
          DisplayAdaptedChar(Blob, IClass);
        }
#endif
      }
      NewTempConfigId = MakeNewTemporaryConfig(adaptive_templates, ClassId, FontinfoId, NumFeatures,
                                               IntFeatures, FloatFeatures);
      // A new config can be immediately reliable (e.g. if the reliability
      // threshold is already met); promote it right away in that case.
      if (NewTempConfigId >= 0 &&
          TempConfigReliable(ClassId, TempConfigFor(Class, NewTempConfigId))) {
        MakePermanent(adaptive_templates, ClassId, NewTempConfigId, Blob);
        UpdateAmbigsGroup(ClassId, Blob);
      }

#ifndef GRAPHICS_DISABLED
      if (classify_learning_debug_level > 1) {
        DisplayAdaptedChar(Blob, IClass);
      }
#endif
    }
    // FloatFeatures was returned by GetAdaptiveFeatures and is owned here.
    delete FloatFeatures;
  }
} /* AdaptToChar */
937 
938 #ifndef GRAPHICS_DISABLED
939 
/**
 * Debugging aid: rematches the blob against the given adapted class with
 * all protos and configs enabled, prints the best config's rating, and at
 * classify_learning_debug_level >= 2 renders the match in the match
 * display window.
 *
 * @param blob blob to match and display
 * @param int_class integer class templates to match against
 */
void Classify::DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class) {
  INT_FX_RESULT_STRUCT fx_info;
  std::vector<INT_FEATURE_STRUCT> bl_features;
  TrainingSample *sample =
      BlobToTrainingSample(*blob, classify_nonlinear_norm, &fx_info, &bl_features);
  if (sample == nullptr) {
    // Feature extraction failed; nothing to display.
    return;
  }

  UnicharRating int_result;
  im_.Match(int_class, AllProtosOn, AllConfigsOn, bl_features.size(), &bl_features[0], &int_result,
            classify_adapt_feature_threshold, NO_DEBUG, matcher_debug_separate_windows);
  tprintf("Best match to temp config %d = %4.1f%%.\n", int_result.config,
          int_result.rating * 100.0);
  if (classify_learning_debug_level >= 2) {
    uint32_t ConfigMask;
    // Enable only the best config for the display rematch.
    ConfigMask = 1 << int_result.config;
    ShowMatchDisplay();
    // NOTE(review): 6 | 0x19 looks like a combination of IntegerMatcher
    // debug-flag bits — confirm against the debug flags in intmatcher.h.
    im_.Match(int_class, AllProtosOn, static_cast<BIT_VECTOR>(&ConfigMask), bl_features.size(),
              &bl_features[0], &int_result, classify_adapt_feature_threshold, 6 | 0x19,
              matcher_debug_separate_windows);
    UpdateMatchDisplay();
  }

  delete sample;
}
966 
967 #endif
968 
969 /**
970  * This routine adds the result of a classification into
971  * Results.  If the new rating is much worse than the current
972  * best rating, it is not entered into results because it
973  * would end up being stripped later anyway.  If the new rating
974  * is better than the old rating for the class, it replaces the
975  * old rating.  If this is the first rating for the class, the
976  * class is added to the list of matched classes in Results.
977  * If the new rating is better than the best so far, it
978  * becomes the best so far.
979  *
980  * Globals:
981  * - #matcher_bad_match_pad defines limits of an acceptable match
982  *
983  * @param new_result new result to add
984  * @param[out] results results to add new result to
985  */
AddNewResult(const UnicharRating & new_result,ADAPT_RESULTS * results)986 void Classify::AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results) {
987   auto old_match = FindScoredUnichar(new_result.unichar_id, *results);
988 
989   if (new_result.rating + matcher_bad_match_pad < results->best_rating ||
990       (old_match < results->match.size() &&
991        new_result.rating <= results->match[old_match].rating)) {
992     return; // New one not good enough.
993   }
994 
995   if (!unicharset.get_fragment(new_result.unichar_id)) {
996     results->HasNonfragment = true;
997   }
998 
999   if (old_match < results->match.size()) {
1000     results->match[old_match].rating = new_result.rating;
1001   } else {
1002     results->match.push_back(new_result);
1003   }
1004 
1005   if (new_result.rating > results->best_rating &&
1006       // Ensure that fragments do not affect best rating, class and config.
1007       // This is needed so that at least one non-fragmented character is
1008       // always present in the results.
1009       // TODO(daria): verify that this helps accuracy and does not
1010       // hurt performance.
1011       !unicharset.get_fragment(new_result.unichar_id)) {
1012     results->best_match_index = old_match;
1013     results->best_rating = new_result.rating;
1014     results->best_unichar_id = new_result.unichar_id;
1015   }
1016 } /* AddNewResult */
1017 
1018 /*---------------------------------------------------------------------------*/
1019 /**
1020  * This routine is identical to CharNormClassifier()
1021  * except that it does no class pruning.  It simply matches
1022  * the unknown blob against the classes listed in
1023  * Ambiguities.
1024  *
1025  * Globals:
1026  * - #AllProtosOn mask that enables all protos
1027  * - #AllConfigsOn mask that enables all configs
1028  *
1029  * @param blob blob to be classified
1030  * @param templates built-in templates to classify against
1031  * @param classes adapted class templates
1032  * @param ambiguities array of unichar id's to match against
1033  * @param[out] results place to put match results
1034  * @param int_features
1035  * @param fx_info
1036  */
AmbigClassifier(const std::vector<INT_FEATURE_STRUCT> & int_features,const INT_FX_RESULT_STRUCT & fx_info,const TBLOB * blob,INT_TEMPLATES_STRUCT * templates,ADAPT_CLASS_STRUCT ** classes,UNICHAR_ID * ambiguities,ADAPT_RESULTS * results)1037 void Classify::AmbigClassifier(const std::vector<INT_FEATURE_STRUCT> &int_features,
1038                                const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob,
1039                                INT_TEMPLATES_STRUCT *templates, ADAPT_CLASS_STRUCT **classes,
1040                                UNICHAR_ID *ambiguities, ADAPT_RESULTS *results) {
1041   if (int_features.empty()) {
1042     return;
1043   }
1044   auto *CharNormArray = new uint8_t[unicharset.size()];
1045   UnicharRating int_result;
1046 
1047   results->BlobLength = GetCharNormFeature(fx_info, templates, nullptr, CharNormArray);
1048   bool debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1049   if (debug) {
1050     tprintf("AM Matches =  ");
1051   }
1052 
1053   int top = blob->bounding_box().top();
1054   int bottom = blob->bounding_box().bottom();
1055   while (*ambiguities >= 0) {
1056     CLASS_ID class_id = *ambiguities;
1057 
1058     int_result.unichar_id = class_id;
1059     im_.Match(ClassForClassId(templates, class_id), AllProtosOn, AllConfigsOn, int_features.size(),
1060               &int_features[0], &int_result, classify_adapt_feature_threshold, NO_DEBUG,
1061               matcher_debug_separate_windows);
1062 
1063     ExpandShapesAndApplyCorrections(nullptr, debug, class_id, bottom, top, 0, results->BlobLength,
1064                                     classify_integer_matcher_multiplier, CharNormArray, &int_result,
1065                                     results);
1066     ambiguities++;
1067   }
1068   delete[] CharNormArray;
1069 } /* AmbigClassifier */
1070 
1071 /*---------------------------------------------------------------------------*/
1072 /// Factored-out calls to IntegerMatcher based on class pruner results.
1073 /// Returns integer matcher results inside CLASS_PRUNER_RESULTS structure.
MasterMatcher(INT_TEMPLATES_STRUCT * templates,int16_t num_features,const INT_FEATURE_STRUCT * features,const uint8_t * norm_factors,ADAPT_CLASS_STRUCT ** classes,int debug,int matcher_multiplier,const TBOX & blob_box,const std::vector<CP_RESULT_STRUCT> & results,ADAPT_RESULTS * final_results)1074 void Classify::MasterMatcher(INT_TEMPLATES_STRUCT *templates, int16_t num_features,
1075                              const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors,
1076                              ADAPT_CLASS_STRUCT **classes, int debug, int matcher_multiplier,
1077                              const TBOX &blob_box, const std::vector<CP_RESULT_STRUCT> &results,
1078                              ADAPT_RESULTS *final_results) {
1079   int top = blob_box.top();
1080   int bottom = blob_box.bottom();
1081   UnicharRating int_result;
1082   for (auto result : results) {
1083     CLASS_ID class_id = result.Class;
1084     BIT_VECTOR protos = classes != nullptr ? classes[class_id]->PermProtos : AllProtosOn;
1085     BIT_VECTOR configs = classes != nullptr ? classes[class_id]->PermConfigs : AllConfigsOn;
1086 
1087     int_result.unichar_id = class_id;
1088     im_.Match(ClassForClassId(templates, class_id), protos, configs, num_features, features,
1089               &int_result, classify_adapt_feature_threshold, debug, matcher_debug_separate_windows);
1090     bool is_debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1091     ExpandShapesAndApplyCorrections(classes, is_debug, class_id, bottom, top, result.Rating,
1092                                     final_results->BlobLength, matcher_multiplier, norm_factors,
1093                                     &int_result, final_results);
1094   }
1095 }
1096 
1097 // Converts configs to fonts, and if the result is not adapted, and a
1098 // shape_table_ is present, the shape is expanded to include all
1099 // unichar_ids represented, before applying a set of corrections to the
1100 // distance rating in int_result, (see ComputeCorrectedRating.)
1101 // The results are added to the final_results output.
void Classify::ExpandShapesAndApplyCorrections(ADAPT_CLASS_STRUCT **classes, bool debug, int class_id,
                                               int bottom, int top, float cp_rating,
                                               int blob_length, int matcher_multiplier,
                                               const uint8_t *cn_factors, UnicharRating *int_result,
                                               ADAPT_RESULTS *final_results) {
  if (classes != nullptr) {
    // Adapted result. Convert configs to fontinfo_ids.
    int_result->adapted = true;
    for (auto &font : int_result->fonts) {
      font.fontinfo_id = GetFontinfoId(classes[class_id], font.fontinfo_id);
    }
  } else {
    // Pre-trained result. Map fonts using font_sets_.
    int_result->adapted = false;
    for (auto &font : int_result->fonts) {
      font.fontinfo_id = ClassAndConfigIDToFontOrShapeID(class_id, font.fontinfo_id);
    }
    if (shape_table_ != nullptr) {
      // Two possible cases:
      // 1. Flat shapetable. All unichar-ids of the shapes referenced by
      // int_result->fonts are the same. In this case build a new vector of
      // mapped fonts and replace the fonts in int_result.
      // 2. Multi-unichar shapetable. Variable unichars in the shapes referenced
      // by int_result. In this case, build a vector of UnicharRating to
      // gather together different font-ids for each unichar. Also covers case1.
      std::vector<UnicharRating> mapped_results;
      for (auto &f : int_result->fonts) {
        // After the mapping above, fontinfo_id here is actually a shape id.
        int shape_id = f.fontinfo_id;
        const Shape &shape = shape_table_->GetShape(shape_id);
        for (int c = 0; c < shape.size(); ++c) {
          int unichar_id = shape[c].unichar_id;
          if (!unicharset.get_enabled(unichar_id)) {
            continue;
          }
          // Find the mapped_result for unichar_id (linear scan; the list
          // of distinct unichars per blob is expected to be short).
          unsigned r = 0;
          for (r = 0; r < mapped_results.size() && mapped_results[r].unichar_id != unichar_id;
               ++r) {
          }
          if (r == mapped_results.size()) {
            // First time this unichar is seen: clone the rating and start
            // a fresh font list for it.
            mapped_results.push_back(*int_result);
            mapped_results[r].unichar_id = unichar_id;
            mapped_results[r].fonts.clear();
          }
          for (int font_id : shape[c].font_ids) {
            mapped_results[r].fonts.emplace_back(font_id, f.score);
          }
        }
      }
      // Correct each per-unichar rating and accumulate into final_results.
      for (auto &m : mapped_results) {
        m.rating = ComputeCorrectedRating(
            debug, m.unichar_id, cp_rating, int_result->rating,
            int_result->feature_misses, bottom, top, blob_length, matcher_multiplier, cn_factors);
        AddNewResult(m, final_results);
      }
      return;
    }
  }
  // No shape table (or adapted result): correct and add the single result.
  if (unicharset.get_enabled(class_id)) {
    int_result->rating = ComputeCorrectedRating(debug, class_id, cp_rating, int_result->rating,
                                                int_result->feature_misses, bottom, top,
                                                blob_length, matcher_multiplier, cn_factors);
    AddNewResult(*int_result, final_results);
  }
}
1167 
1168 // Applies a set of corrections to the confidence im_rating,
1169 // including the cn_correction, miss penalty and additional penalty
1170 // for non-alnums being vertical misfits. Returns the corrected confidence.
ComputeCorrectedRating(bool debug,int unichar_id,double cp_rating,double im_rating,int feature_misses,int bottom,int top,int blob_length,int matcher_multiplier,const uint8_t * cn_factors)1171 double Classify::ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating,
1172                                         double im_rating, int feature_misses, int bottom, int top,
1173                                         int blob_length, int matcher_multiplier,
1174                                         const uint8_t *cn_factors) {
1175   // Compute class feature corrections.
1176   double cn_corrected = im_.ApplyCNCorrection(1.0 - im_rating, blob_length, cn_factors[unichar_id],
1177                                               matcher_multiplier);
1178   double miss_penalty = tessedit_class_miss_scale * feature_misses;
1179   double vertical_penalty = 0.0;
1180   // Penalize non-alnums for being vertical misfits.
1181   if (!unicharset.get_isalpha(unichar_id) && !unicharset.get_isdigit(unichar_id) &&
1182       cn_factors[unichar_id] != 0 && classify_misfit_junk_penalty > 0.0) {
1183     int min_bottom, max_bottom, min_top, max_top;
1184     unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom, &min_top, &max_top);
1185     if (debug) {
1186       tprintf("top=%d, vs [%d, %d], bottom=%d, vs [%d, %d]\n", top, min_top, max_top, bottom,
1187               min_bottom, max_bottom);
1188     }
1189     if (top < min_top || top > max_top || bottom < min_bottom || bottom > max_bottom) {
1190       vertical_penalty = classify_misfit_junk_penalty;
1191     }
1192   }
1193   double result = 1.0 - (cn_corrected + miss_penalty + vertical_penalty);
1194   if (result < WORST_POSSIBLE_RATING) {
1195     result = WORST_POSSIBLE_RATING;
1196   }
1197   if (debug) {
1198     tprintf("%s: %2.1f%%(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n",
1199             unicharset.id_to_unichar(unichar_id), result * 100.0, cp_rating * 100.0,
1200             (1.0 - im_rating) * 100.0, (cn_corrected - (1.0 - im_rating)) * 100.0,
1201             cn_factors[unichar_id], miss_penalty * 100.0, vertical_penalty * 100.0);
1202   }
1203   return result;
1204 }
1205 
1206 /*---------------------------------------------------------------------------*/
1207 /**
1208  * This routine extracts baseline normalized features
1209  * from the unknown character and matches them against the
1210  * specified set of templates.  The classes which match
1211  * are added to Results.
1212  *
1213  * Globals:
1214  * - BaselineCutoffs expected num features for each class
1215  *
1216  * @param Blob blob to be classified
1217  * @param Templates current set of adapted templates
1218  * @param Results place to put match results
1219  * @param int_features
1220  * @param fx_info
1221  *
1222  * @return Array of possible ambiguous chars that should be checked.
1223  */
BaselineClassifier(TBLOB * Blob,const std::vector<INT_FEATURE_STRUCT> & int_features,const INT_FX_RESULT_STRUCT & fx_info,ADAPT_TEMPLATES_STRUCT * Templates,ADAPT_RESULTS * Results)1224 UNICHAR_ID *Classify::BaselineClassifier(TBLOB *Blob,
1225                                          const std::vector<INT_FEATURE_STRUCT> &int_features,
1226                                          const INT_FX_RESULT_STRUCT &fx_info,
1227                                          ADAPT_TEMPLATES_STRUCT *Templates, ADAPT_RESULTS *Results) {
1228   if (int_features.empty()) {
1229     return nullptr;
1230   }
1231   auto *CharNormArray = new uint8_t[unicharset.size()];
1232   ClearCharNormArray(CharNormArray);
1233 
1234   Results->BlobLength = IntCastRounded(fx_info.Length / kStandardFeatureLength);
1235   PruneClasses(Templates->Templates, int_features.size(), -1, &int_features[0], CharNormArray,
1236                BaselineCutoffs, &Results->CPResults);
1237 
1238   if (matcher_debug_level >= 2 || classify_debug_level > 1) {
1239     tprintf("BL Matches =  ");
1240   }
1241 
1242   MasterMatcher(Templates->Templates, int_features.size(), &int_features[0], CharNormArray,
1243                 Templates->Class, matcher_debug_flags, 0, Blob->bounding_box(), Results->CPResults,
1244                 Results);
1245 
1246   delete[] CharNormArray;
1247   CLASS_ID ClassId = Results->best_unichar_id;
1248   if (ClassId == INVALID_UNICHAR_ID || Results->best_match_index < 0) {
1249     return nullptr;
1250   }
1251 
1252   return Templates->Class[ClassId]
1253       ->Config[Results->match[Results->best_match_index].config]
1254       .Perm->Ambigs;
1255 } /* BaselineClassifier */
1256 
1257 /*---------------------------------------------------------------------------*/
1258 /**
1259  * This routine extracts character normalized features
1260  * from the unknown character and matches them against the
1261  * specified set of templates.  The classes which match
1262  * are added to Results.
1263  *
1264  * @param blob blob to be classified
1265  * @param sample templates to classify unknown against
1266  * @param adapt_results place to put match results
1267  *
1268  * Globals:
1269  * - CharNormCutoffs expected num features for each class
1270  * - AllProtosOn mask that enables all protos
1271  * - AllConfigsOn mask that enables all configs
1272  */
CharNormClassifier(TBLOB * blob,const TrainingSample & sample,ADAPT_RESULTS * adapt_results)1273 int Classify::CharNormClassifier(TBLOB *blob, const TrainingSample &sample,
1274                                  ADAPT_RESULTS *adapt_results) {
1275   // This is the length that is used for scaling ratings vs certainty.
1276   adapt_results->BlobLength = IntCastRounded(sample.outline_length() / kStandardFeatureLength);
1277   std::vector<UnicharRating> unichar_results;
1278   static_classifier_->UnicharClassifySample(sample, blob->denorm().pix(), 0, -1, &unichar_results);
1279   // Convert results to the format used internally by AdaptiveClassifier.
1280   for (auto &r : unichar_results) {
1281     AddNewResult(r, adapt_results);
1282   }
1283   return sample.num_features();
1284 } /* CharNormClassifier */
1285 
1286 // As CharNormClassifier, but operates on a TrainingSample and outputs to
1287 // a vector of ShapeRating without conversion to classes.
int Classify::CharNormTrainingSample(bool pruner_only, int keep_this, const TrainingSample &sample,
                                     std::vector<UnicharRating> *results) {
  results->clear();
  std::unique_ptr<ADAPT_RESULTS> adapt_results(new ADAPT_RESULTS());
  adapt_results->Initialize();
  // Compute the bounding box of the features.
  uint32_t num_features = sample.num_features();
  // Only the top and bottom of the blob_box are used by MasterMatcher, so
  // fabricate right and left using top and bottom.
  TBOX blob_box(sample.geo_feature(GeoBottom), sample.geo_feature(GeoBottom),
                sample.geo_feature(GeoTop), sample.geo_feature(GeoTop));
  // Compute the char_norm_array from the saved cn_feature.
  FEATURE norm_feature = sample.GetCNFeature();
  std::vector<uint8_t> char_norm_array(unicharset.size());
  // The pruner array must be large enough for either indexing scheme
  // (unichar ids or template class ids).
  auto num_pruner_classes = std::max(static_cast<unsigned>(unicharset.size()), PreTrainedTemplates->NumClasses);
  std::vector<uint8_t> pruner_norm_array(num_pruner_classes);
  adapt_results->BlobLength = static_cast<int>(ActualOutlineLength(norm_feature) * 20 + 0.5f);
  ComputeCharNormArrays(norm_feature, PreTrainedTemplates, &char_norm_array[0], &pruner_norm_array[0]);

  // Shape-table cutoffs take precedence over per-class cutoffs when a
  // shape table is loaded.
  PruneClasses(PreTrainedTemplates, num_features, keep_this, sample.features(), &pruner_norm_array[0],
               shape_table_ != nullptr ? &shapetable_cutoffs_[0] : CharNormCutoffs,
               &adapt_results->CPResults);
  if (keep_this >= 0) {
    // Caller requested a single class: discard everything else.
    adapt_results->CPResults[0].Class = keep_this;
    adapt_results->CPResults.resize(1);
  }
  if (pruner_only) {
    // Convert pruner results to output format.
    for (auto &it : adapt_results->CPResults) {
      int class_id = it.Class;
      results->push_back(UnicharRating(class_id, 1.0f - it.Rating));
    }
  } else {
    MasterMatcher(PreTrainedTemplates, num_features, sample.features(), &char_norm_array[0], nullptr,
                  matcher_debug_flags, classify_integer_matcher_multiplier, blob_box,
                  adapt_results->CPResults, adapt_results.get());
    // Convert master matcher results to output format.
    for (auto &i : adapt_results->match) {
      results->push_back(i);
    }
    if (results->size() > 1) {
      std::sort(results->begin(), results->end(), SortDescendingRating);
    }
  }
  return num_features;
} /* CharNormTrainingSample */
1334 
1335 /*---------------------------------------------------------------------------*/
1336 /**
1337  * This routine computes a rating which reflects the
1338  * likelihood that the blob being classified is a noise
1339  * blob.  NOTE: assumes that the blob length has already been
1340  * computed and placed into Results.
1341  *
1342  * @param results results to add noise classification to
1343  *
1344  * Globals:
1345  * - matcher_avg_noise_size avg. length of a noise blob
1346  */
ClassifyAsNoise(ADAPT_RESULTS * results)1347 void Classify::ClassifyAsNoise(ADAPT_RESULTS *results) {
1348   float rating = results->BlobLength / matcher_avg_noise_size;
1349   rating *= rating;
1350   rating /= 1 + rating;
1351 
1352   AddNewResult(UnicharRating(UNICHAR_SPACE, 1.0f - rating), results);
1353 } /* ClassifyAsNoise */
1354 
1355 /// The function converts the given match ratings to the list of blob
1356 /// choices with ratings and certainties (used by the context checkers).
1357 /// If character fragments are present in the results, this function also makes
1358 /// sure that there is at least one non-fragmented classification included.
1359 /// For each classification result check the unicharset for "definite"
1360 /// ambiguities and modify the resulting Choices accordingly.
void Classify::ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box,
                                       ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices) {
  assert(Choices != nullptr);
  float Rating;
  float Certainty;
  BLOB_CHOICE_IT temp_it;
  bool contains_nonfrag = false;
  temp_it.set_to_list(Choices);
  int choices_length = 0;
  // With no shape_table_ maintain the previous MAX_MATCHES as the maximum
  // number of returned results, but with a shape_table_ we want to have room
  // for at least the biggest shape (which might contain hundreds of Indic
  // grapheme fragments) and more, so use double the size of the biggest shape
  // if that is more than the default.
  int max_matches = MAX_MATCHES;
  if (shape_table_ != nullptr) {
    max_matches = shape_table_->MaxNumUnichars() * 2;
    if (max_matches < MAX_MATCHES) {
      max_matches = MAX_MATCHES;
    }
  }

  float best_certainty = -FLT_MAX;
  for (auto &it : Results->match) {
    const UnicharRating &result = it;
    bool adapted = result.adapted;
    bool current_is_frag = (unicharset.get_fragment(result.unichar_id) != nullptr);
    if (temp_it.length() + 1 == max_matches && !contains_nonfrag && current_is_frag) {
      continue; // look for a non-fragmented character to fill the
                // last spot in Choices if only fragments are present
    }
    // BlobLength can never be legally 0, this means recognition failed.
    // But we must return a classification result because some invoking
    // functions (chopper/permuter) do not anticipate a null blob choice.
    // So we need to assign a poor, but not infinitely bad score.
    if (Results->BlobLength == 0) {
      Certainty = -20;
      Rating = 100; // should be -certainty * real_blob_length
    } else {
      // Both derive from 1 - rating (the match distance), then get scaled
      // independently for the rating and certainty conventions.
      Rating = Certainty = (1.0f - result.rating);
      Rating *= rating_scale * Results->BlobLength;
      Certainty *= -(getDict().certainty_scale);
    }
    // Adapted results, by their very nature, should have good certainty.
    // Those that don't are at best misleading, and often lead to errors,
    // so don't accept adapted results that are too far behind the best result,
    // whether adapted or static.
    // TODO(rays) find some way of automatically tuning these constants.
    if (Certainty > best_certainty) {
      best_certainty = std::min(Certainty, static_cast<float>(classify_adapted_pruning_threshold));
    } else if (adapted && Certainty / classify_adapted_pruning_factor < best_certainty) {
      continue; // Don't accept bad adapted results.
    }

    float min_xheight, max_xheight, yshift;
    denorm.XHeightRange(result.unichar_id, unicharset, box, &min_xheight, &max_xheight, &yshift);
    auto *choice = new BLOB_CHOICE(
        result.unichar_id, Rating, Certainty, unicharset.get_script(result.unichar_id), min_xheight,
        max_xheight, yshift, adapted ? BCC_ADAPTED_CLASSIFIER : BCC_STATIC_CLASSIFIER);
    choice->set_fonts(result.fonts);
    temp_it.add_to_end(choice);
    contains_nonfrag |= !current_is_frag; // update contains_nonfrag
    choices_length++;
    if (choices_length >= max_matches) {
      break;
    }
  }
  // Trim the match list to exactly the choices that were emitted.
  Results->match.resize(choices_length);
} // ConvertMatchesToChoices
1430 
1431 /*---------------------------------------------------------------------------*/
1432 #ifndef GRAPHICS_DISABLED
1433 /**
1434  *
1435  * @param blob blob whose classification is being debugged
1436  * @param Results results of match being debugged
1437  *
1438  * Globals: none
1439  */
DebugAdaptiveClassifier(TBLOB * blob,ADAPT_RESULTS * Results)1440 void Classify::DebugAdaptiveClassifier(TBLOB *blob, ADAPT_RESULTS *Results) {
1441   if (static_classifier_ == nullptr) {
1442     return;
1443   }
1444   INT_FX_RESULT_STRUCT fx_info;
1445   std::vector<INT_FEATURE_STRUCT> bl_features;
1446   TrainingSample *sample = BlobToTrainingSample(*blob, false, &fx_info, &bl_features);
1447   if (sample == nullptr) {
1448     return;
1449   }
1450   static_classifier_->DebugDisplay(*sample, blob->denorm().pix(), Results->best_unichar_id);
1451 } /* DebugAdaptiveClassifier */
1452 #endif
1453 
1454 /*---------------------------------------------------------------------------*/
1455 /**
1456  * This routine performs an adaptive classification.
1457  * If we have not yet adapted to enough classes, a simple
1458  * classification to the pre-trained templates is performed.
1459  * Otherwise, we match the blob against the adapted templates.
1460  * If the adapted templates do not match well, we try a
1461  * match against the pre-trained templates.  If an adapted
1462  * template match is found, we do a match to any pre-trained
1463  * templates which could be ambiguous.  The results from all
1464  * of these classifications are merged together into Results.
1465  *
1466  * @param Blob blob to be classified
1467  * @param Results place to put match results
1468  *
1469  * Globals:
1470  * - PreTrainedTemplates built-in training templates
1471  * - AdaptedTemplates templates adapted for this page
1472  * - matcher_reliable_adaptive_result rating limit for a great match
1473  */
void Classify::DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results) {
  UNICHAR_ID *Ambiguities;

  INT_FX_RESULT_STRUCT fx_info;
  std::vector<INT_FEATURE_STRUCT> bl_features;
  // Extract baseline features (and summary stats) once; reused by all
  // classifiers below. Ownership of sample is ours; deleted before return.
  TrainingSample *sample =
      BlobToTrainingSample(*Blob, classify_nonlinear_norm, &fx_info, &bl_features);
  if (sample == nullptr) {
    return;
  }

  // TODO: With LSTM, static_classifier_ is nullptr.
  // Return to avoid crash in CharNormClassifier.
  if (static_classifier_ == nullptr) {
    delete sample;
    return;
  }

  if (AdaptedTemplates->NumPermClasses < matcher_permanent_classes_min || tess_cn_matching) {
    // Too few permanently adapted classes (or cn-matching forced):
    // classify against the pre-trained templates only.
    CharNormClassifier(Blob, *sample, Results);
  } else {
    // Try the adapted templates first.
    Ambiguities = BaselineClassifier(Blob, bl_features, fx_info, AdaptedTemplates, Results);
    if ((!Results->match.empty() &&
         MarginalMatch(Results->best_rating, matcher_reliable_adaptive_result) &&
         !tess_bn_matching) ||
        Results->match.empty()) {
      // Adapted match was marginal or absent: fall back to the
      // pre-trained templates.
      CharNormClassifier(Blob, *sample, Results);
    } else if (Ambiguities && *Ambiguities >= 0 && !tess_bn_matching) {
      // Good adapted match: also match the pre-trained classes known to be
      // ambiguous with it, so they can compete in the results.
      AmbigClassifier(bl_features, fx_info, Blob, PreTrainedTemplates, AdaptedTemplates->Class,
                      Ambiguities, Results);
    }
  }

  // Force the blob to be classified as noise
  // if the results contain only fragments.
  // TODO(daria): verify that this is better than
  // just adding a nullptr classification.
  if (!Results->HasNonfragment || Results->match.empty()) {
    ClassifyAsNoise(Results);
  }
  delete sample;
} /* DoAdaptiveMatch */
1516 
1517 /*---------------------------------------------------------------------------*/
1518 /**
1519  * This routine matches blob to the built-in templates
1520  * to find out if there are any classes other than the correct
1521  * class which are potential ambiguities.
1522  *
1523  * @param Blob blob to get classification ambiguities for
1524  * @param CorrectClass correct class for Blob
1525  *
1526  * Globals:
1527  * - CurrentRatings used by qsort compare routine
1528  * - PreTrainedTemplates built-in templates
1529  *
1530  * @return String containing all possible ambiguous classes.
1531  */
GetAmbiguities(TBLOB * Blob,CLASS_ID CorrectClass)1532 UNICHAR_ID *Classify::GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass) {
1533   auto *Results = new ADAPT_RESULTS();
1534   UNICHAR_ID *Ambiguities;
1535 
1536   Results->Initialize();
1537   INT_FX_RESULT_STRUCT fx_info;
1538   std::vector<INT_FEATURE_STRUCT> bl_features;
1539   TrainingSample *sample =
1540       BlobToTrainingSample(*Blob, classify_nonlinear_norm, &fx_info, &bl_features);
1541   if (sample == nullptr) {
1542     delete Results;
1543     return nullptr;
1544   }
1545 
1546   CharNormClassifier(Blob, *sample, Results);
1547   delete sample;
1548   RemoveBadMatches(Results);
1549   std::sort(Results->match.begin(), Results->match.end(), SortDescendingRating);
1550 
1551   /* copy the class id's into an string of ambiguities - don't copy if
1552    the correct class is the only class id matched */
1553   Ambiguities = new UNICHAR_ID[Results->match.size() + 1];
1554   if (Results->match.size() > 1 ||
1555       (Results->match.size() == 1 && Results->match[0].unichar_id != CorrectClass)) {
1556     unsigned i;
1557     for (i = 0; i < Results->match.size(); i++) {
1558       Ambiguities[i] = Results->match[i].unichar_id;
1559     }
1560     Ambiguities[i] = -1;
1561   } else {
1562     Ambiguities[0] = -1;
1563   }
1564 
1565   delete Results;
1566   return Ambiguities;
1567 } /* GetAmbiguities */
1568 
1569 // Returns true if the given blob looks too dissimilar to any character
1570 // present in the classifier templates.
LooksLikeGarbage(TBLOB * blob)1571 bool Classify::LooksLikeGarbage(TBLOB *blob) {
1572   auto *ratings = new BLOB_CHOICE_LIST();
1573   AdaptiveClassifier(blob, ratings);
1574   BLOB_CHOICE_IT ratings_it(ratings);
1575   const UNICHARSET &unicharset = getDict().getUnicharset();
1576   if (classify_debug_character_fragments) {
1577     print_ratings_list("======================\nLooksLikeGarbage() got ", ratings, unicharset);
1578   }
1579   for (ratings_it.mark_cycle_pt(); !ratings_it.cycled_list(); ratings_it.forward()) {
1580     if (unicharset.get_fragment(ratings_it.data()->unichar_id()) != nullptr) {
1581       continue;
1582     }
1583     float certainty = ratings_it.data()->certainty();
1584     delete ratings;
1585     return certainty < classify_character_fragments_garbage_certainty_threshold;
1586   }
1587   delete ratings;
1588   return true; // no whole characters in ratings
1589 }
1590 
1591 /*---------------------------------------------------------------------------*/
1592 /**
1593  * This routine calls the integer (Hardware) feature
1594  * extractor if it has not been called before for this blob.
1595  *
1596  * The results from the feature extractor are placed into
1597  * globals so that they can be used in other routines without
1598  * re-extracting the features.
1599  *
1600  * It then copies the char norm features into the IntFeatures
1601  * array provided by the caller.
1602  *
1603  * @param templates used to compute char norm adjustments
1604  * @param pruner_norm_array Array of factors from blob normalization
1605  *        process
1606  * @param char_norm_array array to fill with dummy char norm adjustments
1607  * @param fx_info
1608  *
1609  * Globals:
1610  *
1611  * @return Number of features extracted or 0 if an error occurred.
1612  */
GetCharNormFeature(const INT_FX_RESULT_STRUCT & fx_info,INT_TEMPLATES_STRUCT * templates,uint8_t * pruner_norm_array,uint8_t * char_norm_array)1613 int Classify::GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES_STRUCT *templates,
1614                                  uint8_t *pruner_norm_array, uint8_t *char_norm_array) {
1615   auto norm_feature = new FEATURE_STRUCT(&CharNormDesc);
1616   float baseline = kBlnBaselineOffset;
1617   float scale = MF_SCALE_FACTOR;
1618   norm_feature->Params[CharNormY] = (fx_info.Ymean - baseline) * scale;
1619   norm_feature->Params[CharNormLength] = fx_info.Length * scale / LENGTH_COMPRESSION;
1620   norm_feature->Params[CharNormRx] = fx_info.Rx * scale;
1621   norm_feature->Params[CharNormRy] = fx_info.Ry * scale;
1622   // Deletes norm_feature.
1623   ComputeCharNormArrays(norm_feature, templates, char_norm_array, pruner_norm_array);
1624   return IntCastRounded(fx_info.Length / kStandardFeatureLength);
1625 } /* GetCharNormFeature */
1626 
1627 // Computes the char_norm_array for the unicharset and, if not nullptr, the
1628 // pruner_array as appropriate according to the existence of the shape_table.
ComputeCharNormArrays(FEATURE_STRUCT * norm_feature,INT_TEMPLATES_STRUCT * templates,uint8_t * char_norm_array,uint8_t * pruner_array)1629 void Classify::ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates,
1630                                      uint8_t *char_norm_array, uint8_t *pruner_array) {
1631   ComputeIntCharNormArray(*norm_feature, char_norm_array);
1632   //if (pruner_array != nullptr) {
1633     if (shape_table_ == nullptr) {
1634       ComputeIntCharNormArray(*norm_feature, pruner_array);
1635     } else {
1636       memset(&pruner_array[0], UINT8_MAX, templates->NumClasses * sizeof(pruner_array[0]));
1637       // Each entry in the pruner norm array is the MIN of all the entries of
1638       // the corresponding unichars in the CharNormArray.
1639       for (unsigned id = 0; id < templates->NumClasses; ++id) {
1640         int font_set_id = templates->Class[id]->font_set_id;
1641         const FontSet &fs = fontset_table_.at(font_set_id);
1642         for (auto f : fs) {
1643           const Shape &shape = shape_table_->GetShape(f);
1644           for (int c = 0; c < shape.size(); ++c) {
1645             if (char_norm_array[shape[c].unichar_id] < pruner_array[id]) {
1646               pruner_array[id] = char_norm_array[shape[c].unichar_id];
1647             }
1648           }
1649         }
1650       }
1651     }
1652   //}
1653   delete norm_feature;
1654 }
1655 
1656 /*---------------------------------------------------------------------------*/
1657 /**
1658  *
1659  * @param Templates adapted templates to add new config to
1660  * @param ClassId class id to associate with new config
1661  * @param FontinfoId font information inferred from pre-trained templates
1662  * @param NumFeatures number of features in IntFeatures
1663  * @param Features features describing model for new config
1664  * @param FloatFeatures floating-pt representation of features
1665  *
1666  * @return The id of the new config created, a negative integer in
1667  * case of error.
1668  */
MakeNewTemporaryConfig(ADAPT_TEMPLATES_STRUCT * Templates,CLASS_ID ClassId,int FontinfoId,int NumFeatures,INT_FEATURE_ARRAY Features,FEATURE_SET FloatFeatures)1669 int Classify::MakeNewTemporaryConfig(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int FontinfoId,
1670                                      int NumFeatures, INT_FEATURE_ARRAY Features,
1671                                      FEATURE_SET FloatFeatures) {
1672   INT_CLASS_STRUCT *IClass;
1673   ADAPT_CLASS_STRUCT *Class;
1674   PROTO_ID OldProtos[MAX_NUM_PROTOS];
1675   FEATURE_ID BadFeatures[MAX_NUM_INT_FEATURES];
1676   int NumOldProtos;
1677   int NumBadFeatures;
1678   int MaxProtoId, OldMaxProtoId;
1679   int MaskSize;
1680   int ConfigId;
1681   int i;
1682   int debug_level = NO_DEBUG;
1683 
1684   if (classify_learning_debug_level >= 3) {
1685     debug_level = PRINT_MATCH_SUMMARY | PRINT_FEATURE_MATCHES | PRINT_PROTO_MATCHES;
1686   }
1687 
1688   IClass = ClassForClassId(Templates->Templates, ClassId);
1689   Class = Templates->Class[ClassId];
1690 
1691   if (IClass->NumConfigs >= MAX_NUM_CONFIGS) {
1692     ++NumAdaptationsFailed;
1693     if (classify_learning_debug_level >= 1) {
1694       tprintf("Cannot make new temporary config: maximum number exceeded.\n");
1695     }
1696     return -1;
1697   }
1698 
1699   OldMaxProtoId = IClass->NumProtos - 1;
1700 
1701   NumOldProtos = im_.FindGoodProtos(IClass, AllProtosOn, AllConfigsOff, NumFeatures, Features,
1702                                     OldProtos, classify_adapt_proto_threshold, debug_level);
1703 
1704   MaskSize = WordsInVectorOfSize(MAX_NUM_PROTOS);
1705   zero_all_bits(TempProtoMask, MaskSize);
1706   for (i = 0; i < NumOldProtos; i++) {
1707     SET_BIT(TempProtoMask, OldProtos[i]);
1708   }
1709 
1710   NumBadFeatures = im_.FindBadFeatures(IClass, TempProtoMask, AllConfigsOn, NumFeatures, Features,
1711                                        BadFeatures, classify_adapt_feature_threshold, debug_level);
1712 
1713   MaxProtoId =
1714       MakeNewTempProtos(FloatFeatures, NumBadFeatures, BadFeatures, IClass, Class, TempProtoMask);
1715   if (MaxProtoId == NO_PROTO) {
1716     ++NumAdaptationsFailed;
1717     if (classify_learning_debug_level >= 1) {
1718       tprintf("Cannot make new temp protos: maximum number exceeded.\n");
1719     }
1720     return -1;
1721   }
1722 
1723   ConfigId = AddIntConfig(IClass);
1724   ConvertConfig(TempProtoMask, ConfigId, IClass);
1725   auto Config = new TEMP_CONFIG_STRUCT(MaxProtoId, FontinfoId);
1726   TempConfigFor(Class, ConfigId) = Config;
1727   copy_all_bits(TempProtoMask, Config->Protos, Config->ProtoVectorSize);
1728 
1729   if (classify_learning_debug_level >= 1) {
1730     tprintf(
1731         "Making new temp config %d fontinfo id %d"
1732         " using %d old and %d new protos.\n",
1733         ConfigId, Config->FontinfoId, NumOldProtos, MaxProtoId - OldMaxProtoId);
1734   }
1735 
1736   return ConfigId;
1737 } /* MakeNewTemporaryConfig */
1738 
1739 /*---------------------------------------------------------------------------*/
1740 /**
1741  * This routine finds sets of sequential bad features
1742  * that all have the same angle and converts each set into
1743  * a new temporary proto.  The temp proto is added to the
1744  * proto pruner for IClass, pushed onto the list of temp
1745  * protos in Class, and added to TempProtoMask.
1746  *
1747  * @param Features floating-pt features describing new character
1748  * @param NumBadFeat number of bad features to turn into protos
1749  * @param BadFeat feature id's of bad features
1750  * @param IClass integer class templates to add new protos to
1751  * @param Class adapted class templates to add new protos to
1752  * @param TempProtoMask proto mask to add new protos to
1753  *
1754  * Globals: none
1755  *
1756  * @return Max proto id in class after all protos have been added.
1757  */
PROTO_ID Classify::MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[],
                                     INT_CLASS_STRUCT *IClass, ADAPT_CLASS_STRUCT *Class,
                                     BIT_VECTOR TempProtoMask) {
  FEATURE_ID *ProtoStart;
  FEATURE_ID *ProtoEnd;
  FEATURE_ID *LastBad;
  PROTO_STRUCT *Proto;
  FEATURE F1, F2;
  float X1, X2, Y1, Y2;
  float A1, A2, AngleDelta;
  float SegmentLength;
  PROTO_ID Pid;

  // Walk runs of sequential bad features; each run whose features share
  // nearly the same direction and stay within the growing segment length
  // becomes one new temporary proto.
  for (ProtoStart = BadFeat, LastBad = ProtoStart + NumBadFeat; ProtoStart < LastBad;
       ProtoStart = ProtoEnd) {
    F1 = Features->Features[*ProtoStart];
    X1 = F1->Params[PicoFeatX];
    Y1 = F1->Params[PicoFeatY];
    A1 = F1->Params[PicoFeatDir];

    // Extend the run one feature at a time, growing SegmentLength by one
    // pico-feature length per included feature.
    for (ProtoEnd = ProtoStart + 1, SegmentLength = GetPicoFeatureLength(); ProtoEnd < LastBad;
         ProtoEnd++, SegmentLength += GetPicoFeatureLength()) {
      F2 = Features->Features[*ProtoEnd];
      X2 = F2->Params[PicoFeatX];
      Y2 = F2->Params[PicoFeatY];
      A2 = F2->Params[PicoFeatDir];

      // Directions wrap at 1.0, so the angular distance over 0.5 is taken
      // the short way around.
      AngleDelta = std::fabs(A1 - A2);
      if (AngleDelta > 0.5f) {
        AngleDelta = 1 - AngleDelta;
      }

      // Stop the run when the angle diverges or the endpoint drifts farther
      // than the segment length from the run's start.
      if (AngleDelta > matcher_clustering_max_angle_delta || std::fabs(X1 - X2) > SegmentLength ||
          std::fabs(Y1 - Y2) > SegmentLength) {
        break;
      }
    }

    // ProtoEnd is one past the last included feature; use that last feature
    // as the segment's far endpoint.
    F2 = Features->Features[*(ProtoEnd - 1)];
    X2 = F2->Params[PicoFeatX];
    Y2 = F2->Params[PicoFeatY];
    A2 = F2->Params[PicoFeatDir];

    Pid = AddIntProto(IClass);
    if (Pid == NO_PROTO) {
      return (NO_PROTO);
    }

    auto TempProto = new TEMP_PROTO_STRUCT;
    Proto = &(TempProto->Proto);

    /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
   ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
   instead of the -0.25 to 0.75 used in baseline normalization */
    Proto->Length = SegmentLength;
    Proto->Angle = A1;
    Proto->X = (X1 + X2) / 2;
    Proto->Y = (Y1 + Y2) / 2 - Y_DIM_OFFSET;
    FillABC(Proto);

    TempProto->ProtoId = Pid;
    SET_BIT(TempProtoMask, Pid);

    ConvertProto(Proto, Pid, IClass);
    AddProtoToProtoPruner(Proto, Pid, IClass, classify_learning_debug_level >= 2);

    // TempProto is now owned by Class->TempProtos.
    Class->TempProtos = push(Class->TempProtos, TempProto);
  }
  return IClass->NumProtos - 1;
} /* MakeNewTempProtos */
1828 
1829 /*---------------------------------------------------------------------------*/
1830 /**
1831  *
1832  * @param Templates current set of adaptive templates
1833  * @param ClassId class containing config to be made permanent
1834  * @param ConfigId config to be made permanent
1835  * @param Blob current blob being adapted to
1836  *
1837  * Globals: none
1838  */
MakePermanent(ADAPT_TEMPLATES_STRUCT * Templates,CLASS_ID ClassId,int ConfigId,TBLOB * Blob)1839 void Classify::MakePermanent(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int ConfigId,
1840                              TBLOB *Blob) {
1841   UNICHAR_ID *Ambigs;
1842   PROTO_KEY ProtoKey;
1843 
1844   auto Class = Templates->Class[ClassId];
1845   auto Config = TempConfigFor(Class, ConfigId);
1846 
1847   MakeConfigPermanent(Class, ConfigId);
1848   if (Class->NumPermConfigs == 0) {
1849     Templates->NumPermClasses++;
1850   }
1851   Class->NumPermConfigs++;
1852 
1853   // Initialize permanent config.
1854   Ambigs = GetAmbiguities(Blob, ClassId);
1855   auto Perm = new PERM_CONFIG_STRUCT;
1856   Perm->Ambigs = Ambigs;
1857   Perm->FontinfoId = Config->FontinfoId;
1858 
1859   // Free memory associated with temporary config (since ADAPTED_CONFIG
1860   // is a union we need to clean up before we record permanent config).
1861   ProtoKey.Templates = Templates;
1862   ProtoKey.ClassId = ClassId;
1863   ProtoKey.ConfigId = ConfigId;
1864   Class->TempProtos = delete_d(Class->TempProtos, &ProtoKey, MakeTempProtoPerm);
1865   delete Config;
1866 
1867   // Record permanent config.
1868   PermConfigFor(Class, ConfigId) = Perm;
1869 
1870   if (classify_learning_debug_level >= 1) {
1871     tprintf(
1872         "Making config %d for %s (ClassId %d) permanent:"
1873         " fontinfo id %d, ambiguities '",
1874         ConfigId, getDict().getUnicharset().debug_str(ClassId).c_str(), ClassId,
1875         PermConfigFor(Class, ConfigId)->FontinfoId);
1876     for (UNICHAR_ID *AmbigsPointer = Ambigs; *AmbigsPointer >= 0; ++AmbigsPointer) {
1877       tprintf("%s", unicharset.id_to_unichar(*AmbigsPointer));
1878     }
1879     tprintf("'.\n");
1880   }
1881 } /* MakePermanent */
1882 
1883 /*---------------------------------------------------------------------------*/
1884 /**
1885  * This routine converts TempProto to be permanent if
1886  * its proto id is used by the configuration specified in
1887  * ProtoKey.
1888  *
1889  * @param item1 (TEMP_PROTO) temporary proto to compare to key
1890  * @param item2 (PROTO_KEY) defines which protos to make permanent
1891  *
1892  * Globals: none
1893  *
1894  * @return true if TempProto is converted, false otherwise
1895  */
MakeTempProtoPerm(void * item1,void * item2)1896 int MakeTempProtoPerm(void *item1, void *item2) {
1897   auto TempProto = static_cast<TEMP_PROTO_STRUCT *>(item1);
1898   auto ProtoKey = static_cast<PROTO_KEY *>(item2);
1899 
1900   auto Class = ProtoKey->Templates->Class[ProtoKey->ClassId];
1901   auto Config = TempConfigFor(Class, ProtoKey->ConfigId);
1902 
1903   if (TempProto->ProtoId > Config->MaxProtoId || !test_bit(Config->Protos, TempProto->ProtoId)) {
1904     return false;
1905   }
1906 
1907   MakeProtoPermanent(Class, TempProto->ProtoId);
1908   AddProtoToClassPruner(&(TempProto->Proto), ProtoKey->ClassId, ProtoKey->Templates->Templates);
1909   delete TempProto;
1910 
1911   return true;
1912 } /* MakeTempProtoPerm */
1913 
1914 /*---------------------------------------------------------------------------*/
1915 /**
1916  * This routine writes the matches in Results to File.
1917  *
1918  * @param results match results to write to File
1919  *
1920  * Globals: none
1921  */
PrintAdaptiveMatchResults(const ADAPT_RESULTS & results)1922 void Classify::PrintAdaptiveMatchResults(const ADAPT_RESULTS &results) {
1923   for (auto &it : results.match) {
1924     tprintf("%s  ", unicharset.debug_str(it.unichar_id).c_str());
1925     it.Print();
1926   }
1927 } /* PrintAdaptiveMatchResults */
1928 
1929 /*---------------------------------------------------------------------------*/
1930 /**
1931  * This routine steps through each matching class in Results
1932  * and removes it from the match list if its rating
1933  * is worse than the BestRating plus a pad.  In other words,
1934  * all good matches get moved to the front of the classes
1935  * array.
1936  *
1937  * @param Results contains matches to be filtered
1938  *
1939  * Globals:
1940  * - matcher_bad_match_pad defines a "bad match"
1941  */
RemoveBadMatches(ADAPT_RESULTS * Results)1942 void Classify::RemoveBadMatches(ADAPT_RESULTS *Results) {
1943   unsigned Next, NextGood;
1944   float BadMatchThreshold;
1945   static const char *romans = "i v x I V X";
1946   BadMatchThreshold = Results->best_rating - matcher_bad_match_pad;
1947 
1948   if (classify_bln_numeric_mode) {
1949     UNICHAR_ID unichar_id_one =
1950         unicharset.contains_unichar("1") ? unicharset.unichar_to_id("1") : -1;
1951     UNICHAR_ID unichar_id_zero =
1952         unicharset.contains_unichar("0") ? unicharset.unichar_to_id("0") : -1;
1953     float scored_one = ScoredUnichar(unichar_id_one, *Results);
1954     float scored_zero = ScoredUnichar(unichar_id_zero, *Results);
1955 
1956     for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
1957       const UnicharRating &match = Results->match[Next];
1958       if (match.rating >= BadMatchThreshold) {
1959         if (!unicharset.get_isalpha(match.unichar_id) ||
1960             strstr(romans, unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
1961         } else if (unicharset.eq(match.unichar_id, "l") && scored_one < BadMatchThreshold) {
1962           Results->match[Next].unichar_id = unichar_id_one;
1963         } else if (unicharset.eq(match.unichar_id, "O") && scored_zero < BadMatchThreshold) {
1964           Results->match[Next].unichar_id = unichar_id_zero;
1965         } else {
1966           Results->match[Next].unichar_id = INVALID_UNICHAR_ID; // Don't copy.
1967         }
1968         if (Results->match[Next].unichar_id != INVALID_UNICHAR_ID) {
1969           if (NextGood == Next) {
1970             ++NextGood;
1971           } else {
1972             Results->match[NextGood++] = Results->match[Next];
1973           }
1974         }
1975       }
1976     }
1977   } else {
1978     for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
1979       if (Results->match[Next].rating >= BadMatchThreshold) {
1980         if (NextGood == Next) {
1981           ++NextGood;
1982         } else {
1983           Results->match[NextGood++] = Results->match[Next];
1984         }
1985       }
1986     }
1987   }
1988   Results->match.resize(NextGood);
1989 } /* RemoveBadMatches */
1990 
1991 /*----------------------------------------------------------------------------*/
1992 /**
1993  * This routine discards extra digits or punctuation from the results.
1994  * We keep only the top 2 punctuation answers and the top 1 digit answer if
1995  * present.
1996  *
1997  * @param Results contains matches to be filtered
1998  */
RemoveExtraPuncs(ADAPT_RESULTS * Results)1999 void Classify::RemoveExtraPuncs(ADAPT_RESULTS *Results) {
2000   unsigned Next, NextGood;
2001   int punc_count; /*no of garbage characters */
2002   int digit_count;
2003   /*garbage characters */
2004   static char punc_chars[] = ". , ; : / ` ~ ' - = \\ | \" ! _ ^";
2005   static char digit_chars[] = "0 1 2 3 4 5 6 7 8 9";
2006 
2007   punc_count = 0;
2008   digit_count = 0;
2009   for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2010     const UnicharRating &match = Results->match[Next];
2011     bool keep = true;
2012     if (strstr(punc_chars, unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2013       if (punc_count >= 2) {
2014         keep = false;
2015       }
2016       punc_count++;
2017     } else {
2018       if (strstr(digit_chars, unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2019         if (digit_count >= 1) {
2020           keep = false;
2021         }
2022         digit_count++;
2023       }
2024     }
2025     if (keep) {
2026       if (NextGood == Next) {
2027         ++NextGood;
2028       } else {
2029         Results->match[NextGood++] = match;
2030       }
2031     }
2032   }
2033   Results->match.resize(NextGood);
2034 } /* RemoveExtraPuncs */
2035 
2036 /*---------------------------------------------------------------------------*/
2037 /**
2038  * This routine resets the internal thresholds inside
2039  * the integer matcher to correspond to the specified
2040  * threshold.
2041  *
2042  * @param Threshold threshold for creating new templates
2043  *
2044  * Globals:
2045  * - matcher_good_threshold default good match rating
2046  */
SetAdaptiveThreshold(float Threshold)2047 void Classify::SetAdaptiveThreshold(float Threshold) {
2048   Threshold = (Threshold == matcher_good_threshold) ? 0.9f : (1 - Threshold);
2049   classify_adapt_proto_threshold.set_value(ClipToRange<int>(255 * Threshold, 0, 255));
2050   classify_adapt_feature_threshold.set_value(ClipToRange<int>(255 * Threshold, 0, 255));
2051 } /* SetAdaptiveThreshold */
2052 
2053 #ifndef GRAPHICS_DISABLED
2054 
2055 /*---------------------------------------------------------------------------*/
2056 /**
2057  * This routine displays debug information for the best config
2058  * of the given shape_id for the given set of features.
2059  *
2060  * @param shape_id classifier id to work with
2061  * @param features features of the unknown character
2062  * @param num_features Number of features in the features array.
2063  */
2064 
ShowBestMatchFor(int shape_id,const INT_FEATURE_STRUCT * features,int num_features)2065 void Classify::ShowBestMatchFor(int shape_id, const INT_FEATURE_STRUCT *features,
2066                                 int num_features) {
2067   uint32_t config_mask;
2068   if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) {
2069     tprintf("No built-in templates for class/shape %d\n", shape_id);
2070     return;
2071   }
2072   if (num_features <= 0) {
2073     tprintf("Illegal blob (char norm features)!\n");
2074     return;
2075   }
2076   UnicharRating cn_result;
2077   classify_norm_method.set_value(character);
2078   im_.Match(ClassForClassId(PreTrainedTemplates, shape_id), AllProtosOn, AllConfigsOn, num_features,
2079             features, &cn_result, classify_adapt_feature_threshold, NO_DEBUG,
2080             matcher_debug_separate_windows);
2081   tprintf("\n");
2082   config_mask = 1 << cn_result.config;
2083 
2084   tprintf("Static Shape ID: %d\n", shape_id);
2085   ShowMatchDisplay();
2086   im_.Match(ClassForClassId(PreTrainedTemplates, shape_id), AllProtosOn, &config_mask, num_features,
2087             features, &cn_result, classify_adapt_feature_threshold, matcher_debug_flags,
2088             matcher_debug_separate_windows);
2089   UpdateMatchDisplay();
2090 } /* ShowBestMatchFor */
2091 
2092 #endif // !GRAPHICS_DISABLED
2093 
2094 // Returns a string for the classifier class_id: either the corresponding
2095 // unicharset debug_str or the shape_table_ debug str.
ClassIDToDebugStr(const INT_TEMPLATES_STRUCT * templates,int class_id,int config_id) const2096 std::string Classify::ClassIDToDebugStr(const INT_TEMPLATES_STRUCT *templates, int class_id,
2097                                         int config_id) const {
2098   std::string class_string;
2099   if (templates == PreTrainedTemplates && shape_table_ != nullptr) {
2100     int shape_id = ClassAndConfigIDToFontOrShapeID(class_id, config_id);
2101     class_string = shape_table_->DebugStr(shape_id);
2102   } else {
2103     class_string = unicharset.debug_str(class_id);
2104   }
2105   return class_string;
2106 }
2107 
2108 // Converts a classifier class_id index to a shape_table_ index
ClassAndConfigIDToFontOrShapeID(int class_id,int int_result_config) const2109 int Classify::ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const {
2110   int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
2111   // Older inttemps have no font_ids.
2112   if (font_set_id < 0) {
2113     return kBlankFontinfoId;
2114   }
2115   const FontSet &fs = fontset_table_.at(font_set_id);
2116   return fs.at(int_result_config);
2117 }
2118 
2119 // Converts a shape_table_ index to a classifier class_id index (not a
2120 // unichar-id!). Uses a search, so not fast.
ShapeIDToClassID(int shape_id) const2121 int Classify::ShapeIDToClassID(int shape_id) const {
2122   for (unsigned id = 0; id < PreTrainedTemplates->NumClasses; ++id) {
2123     int font_set_id = PreTrainedTemplates->Class[id]->font_set_id;
2124     ASSERT_HOST(font_set_id >= 0);
2125     const FontSet &fs = fontset_table_.at(font_set_id);
2126     for (auto f : fs) {
2127       if (f == shape_id) {
2128         return id;
2129       }
2130     }
2131   }
2132   tprintf("Shape %d not found\n", shape_id);
2133   return -1;
2134 }
2135 
2136 // Returns true if the given TEMP_CONFIG_STRUCT is good enough to make it
2137 // a permanent config.
TempConfigReliable(CLASS_ID class_id,const TEMP_CONFIG_STRUCT * config)2138 bool Classify::TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG_STRUCT *config) {
2139   if (classify_learning_debug_level >= 1) {
2140     tprintf("NumTimesSeen for config of %s is %d\n",
2141             getDict().getUnicharset().debug_str(class_id).c_str(), config->NumTimesSeen);
2142   }
2143   if (config->NumTimesSeen >= matcher_sufficient_examples_for_prototyping) {
2144     return true;
2145   } else if (config->NumTimesSeen < matcher_min_examples_for_prototyping) {
2146     return false;
2147   } else if (use_ambigs_for_adaption) {
2148     // Go through the ambigs vector and see whether we have already seen
2149     // enough times all the characters represented by the ambigs vector.
2150     const UnicharIdVector *ambigs = getDict().getUnicharAmbigs().AmbigsForAdaption(class_id);
2151     int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
2152     for (int ambig = 0; ambig < ambigs_size; ++ambig) {
2153       ADAPT_CLASS_STRUCT *ambig_class = AdaptedTemplates->Class[(*ambigs)[ambig]];
2154       assert(ambig_class != nullptr);
2155       if (ambig_class->NumPermConfigs == 0 &&
2156           ambig_class->MaxNumTimesSeen < matcher_min_examples_for_prototyping) {
2157         if (classify_learning_debug_level >= 1) {
2158           tprintf(
2159               "Ambig %s has not been seen enough times,"
2160               " not making config for %s permanent\n",
2161               getDict().getUnicharset().debug_str((*ambigs)[ambig]).c_str(),
2162               getDict().getUnicharset().debug_str(class_id).c_str());
2163         }
2164         return false;
2165       }
2166     }
2167   }
2168   return true;
2169 }
2170 
UpdateAmbigsGroup(CLASS_ID class_id,TBLOB * Blob)2171 void Classify::UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob) {
2172   const UnicharIdVector *ambigs = getDict().getUnicharAmbigs().ReverseAmbigsForAdaption(class_id);
2173   int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
2174   if (classify_learning_debug_level >= 1) {
2175     tprintf("Running UpdateAmbigsGroup for %s class_id=%d\n",
2176             getDict().getUnicharset().debug_str(class_id).c_str(), class_id);
2177   }
2178   for (int ambig = 0; ambig < ambigs_size; ++ambig) {
2179     CLASS_ID ambig_class_id = (*ambigs)[ambig];
2180     const ADAPT_CLASS_STRUCT *ambigs_class = AdaptedTemplates->Class[ambig_class_id];
2181     for (int cfg = 0; cfg < MAX_NUM_CONFIGS; ++cfg) {
2182       if (ConfigIsPermanent(ambigs_class, cfg)) {
2183         continue;
2184       }
2185       const TEMP_CONFIG_STRUCT *config = TempConfigFor(AdaptedTemplates->Class[ambig_class_id], cfg);
2186       if (config != nullptr && TempConfigReliable(ambig_class_id, config)) {
2187         if (classify_learning_debug_level >= 1) {
2188           tprintf("Making config %d of %s permanent\n", cfg,
2189                   getDict().getUnicharset().debug_str(ambig_class_id).c_str());
2190         }
2191         MakePermanent(AdaptedTemplates, ambig_class_id, cfg, Blob);
2192       }
2193     }
2194   }
2195 }
2196 
2197 } // namespace tesseract
2198