1 /**********************************************************************
2  * File:        tessedit.cpp  (Formerly tessedit.c)
3  * Description: (Previously) Main program for merge of tess and editor.
4  *              Now just code to load the language model and various
5  *              engine-specific data files.
6  * Author:      Ray Smith
7  *
8  * (C) Copyright 1992, Hewlett-Packard Ltd.
9  ** Licensed under the Apache License, Version 2.0 (the "License");
10  ** you may not use this file except in compliance with the License.
11  ** You may obtain a copy of the License at
12  ** http://www.apache.org/licenses/LICENSE-2.0
13  ** Unless required by applicable law or agreed to in writing, software
14  ** distributed under the License is distributed on an "AS IS" BASIS,
15  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  ** See the License for the specific language governing permissions and
17  ** limitations under the License.
18  *
19  **********************************************************************/
20 
21 // Include automatically generated configuration file if running autoconf.
22 #ifdef HAVE_CONFIG_H
23 #  include "config_auto.h"
24 #endif
25 
26 #include <regex> // for std::regex_match
27 
28 #include "control.h"
29 #include "matchdefs.h"
30 #include "pageres.h"
31 #include "params.h"
32 #include "stopper.h"
33 #include "tesseractclass.h"
34 #include "tessvars.h"
35 #include "tprintf.h"
36 #ifndef DISABLED_LEGACY_ENGINE
37 #  include "chop.h"
38 #  include "intmatcher.h"
39 #  include "reject.h"
40 #endif
41 #include "lstmrecognizer.h"
42 
43 namespace tesseract {
44 
45 // Read a "config" file containing a set of variable, value pairs.
46 // Searches the standard places: tessdata/configs, tessdata/tessconfigs
47 // and also accepts a relative or absolute path name.
read_config_file(const char * filename,SetParamConstraint constraint)48 void Tesseract::read_config_file(const char *filename, SetParamConstraint constraint) {
49   std::string path = datadir;
50   path += "configs/";
51   path += filename;
52   FILE *fp;
53   if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
54     fclose(fp);
55   } else {
56     path = datadir;
57     path += "tessconfigs/";
58     path += filename;
59     if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
60       fclose(fp);
61     } else {
62       path = filename;
63     }
64   }
65   ParamUtils::ReadParamsFile(path.c_str(), constraint, this->params());
66 }
67 
68 // Returns false if a unicharset file for the specified language was not found
69 // or was invalid.
70 // This function initializes TessdataManager. After TessdataManager is
71 // no longer needed, TessdataManager::End() should be called.
72 //
73 // This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
74 // it is OEM_DEFAULT, in which case the value of the variable will be obtained
75 // from the language-specific config file (stored in [lang].traineddata), from
76 // the config files specified on the command line or left as the default
77 // OEM_TESSERACT_ONLY if none of the configs specify this variable.
init_tesseract_lang_data(const std::string & arg0,const std::string & language,OcrEngineMode oem,char ** configs,int configs_size,const std::vector<std::string> * vars_vec,const std::vector<std::string> * vars_values,bool set_only_non_debug_params,TessdataManager * mgr)78 bool Tesseract::init_tesseract_lang_data(const std::string &arg0,
79                                          const std::string &language, OcrEngineMode oem,
80                                          char **configs, int configs_size,
81                                          const std::vector<std::string> *vars_vec,
82                                          const std::vector<std::string> *vars_values,
83                                          bool set_only_non_debug_params, TessdataManager *mgr) {
84   // Set the language data path prefix
85   lang = !language.empty() ? language : "eng";
86   language_data_path_prefix = datadir;
87   language_data_path_prefix += lang;
88   language_data_path_prefix += ".";
89 
90   // Initialize TessdataManager.
91   std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
92   if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) {
93     tprintf("Error opening data file %s\n", tessdata_path.c_str());
94     tprintf(
95         "Please make sure the TESSDATA_PREFIX environment variable is set"
96         " to your \"tessdata\" directory.\n");
97     return false;
98   }
99 #ifdef DISABLED_LEGACY_ENGINE
100   tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
101 #else
102   if (oem == OEM_DEFAULT) {
103     // Set the engine mode from availability, which can then be overridden by
104     // the config file when we read it below.
105     if (!mgr->IsLSTMAvailable()) {
106       tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
107     } else if (!mgr->IsBaseAvailable()) {
108       tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
109     } else {
110       tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);
111     }
112   }
113 #endif // ndef DISABLED_LEGACY_ENGINE
114 
115   // If a language specific config file (lang.config) exists, load it in.
116   TFile fp;
117   if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
118     ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp, this->params());
119   }
120 
121   SetParamConstraint set_params_constraint =
122       set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;
123   // Load tesseract variables from config files. This is done after loading
124   // language-specific variables from [lang].traineddata file, so that custom
125   // config files can override values in [lang].traineddata file.
126   for (int i = 0; i < configs_size; ++i) {
127     read_config_file(configs[i], set_params_constraint);
128   }
129 
130   // Set params specified in vars_vec (done after setting params from config
131   // files, so that params in vars_vec can override those from files).
132   if (vars_vec != nullptr && vars_values != nullptr) {
133     for (unsigned i = 0; i < vars_vec->size(); ++i) {
134       if (!ParamUtils::SetParam((*vars_vec)[i].c_str(), (*vars_values)[i].c_str(),
135                                 set_params_constraint, this->params())) {
136         tprintf("Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].c_str());
137       }
138     }
139   }
140 
141   if (!tessedit_write_params_to_file.empty()) {
142     FILE *params_file = fopen(tessedit_write_params_to_file.c_str(), "wb");
143     if (params_file != nullptr) {
144       ParamUtils::PrintParams(params_file, this->params());
145       fclose(params_file);
146     } else {
147       tprintf("Failed to open %s for writing params.\n", tessedit_write_params_to_file.c_str());
148     }
149   }
150 
151 #ifndef DISABLED_LEGACY_ENGINE
152   // Determine which ocr engine(s) should be loaded and used for recognition.
153   if (oem != OEM_DEFAULT) {
154     tessedit_ocr_engine_mode.set_value(oem);
155   }
156 #endif
157 
158   // If we are only loading the config file (and so not planning on doing any
159   // recognition) then there's nothing else do here.
160   if (tessedit_init_config_only) {
161     return true;
162   }
163 
164 // The various OcrEngineMode settings (see tesseract/publictypes.h) determine
165 // which engine-specific data files need to be loaded. If LSTM_ONLY is
166 // requested, the base Tesseract files are *Not* required.
167 #ifdef DISABLED_LEGACY_ENGINE
168   if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
169 #else
170   if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
171       tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
172 #endif // ndef DISABLED_LEGACY_ENGINE
173     if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
174       lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix.c_str());
175       ASSERT_HOST(lstm_recognizer_->Load(this->params(), lstm_use_matrix ? language : "", mgr));
176     } else {
177       tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
178       tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
179     }
180   }
181 
182   // Load the unicharset
183   if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
184     // Avoid requiring a unicharset when we aren't running base tesseract.
185     unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
186   }
187 #ifndef DISABLED_LEGACY_ENGINE
188   else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) {
189     tprintf(
190         "Error: Tesseract (legacy) engine requested, but components are "
191         "not present in %s!!\n",
192         tessdata_path.c_str());
193     return false;
194   }
195 #endif // ndef DISABLED_LEGACY_ENGINE
196   if (unicharset.size() > MAX_NUM_CLASSES) {
197     tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
198     return false;
199   }
200   right_to_left_ = unicharset.major_right_to_left();
201 
202 #ifndef DISABLED_LEGACY_ENGINE
203 
204   // Setup initial unichar ambigs table and read universal ambigs.
205   UNICHARSET encoder_unicharset;
206   encoder_unicharset.CopyFrom(unicharset);
207   unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
208   unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
209 
210   if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
211     unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp, ambigs_debug_level,
212                                      use_ambigs_for_adaption, &unicharset);
213   }
214 
215   // Init ParamsModel.
216   // Load pass1 and pass2 weights (for now these two sets are the same, but in
217   // the future separate sets of weights can be generated).
218   for (int p = ParamsModel::PTRAIN_PASS1; p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
219     language_model_->getParamsModel().SetPass(static_cast<ParamsModel::PassEnum>(p));
220     if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
221       if (!language_model_->getParamsModel().LoadFromFp(lang.c_str(), &fp)) {
222         return false;
223       }
224     }
225   }
226 #endif // ndef DISABLED_LEGACY_ENGINE
227 
228   return true;
229 }
230 
// Helper: reports whether str_list contains an element equal to str.
// The comparison is an exact std::string equality; an empty list yields false.
static bool IsStrInList(const std::string &str, const std::vector<std::string> &str_list) {
  auto candidate = str_list.begin();
  const auto list_end = str_list.end();
  // Advance to the first matching element, or to the end if there is none.
  while (candidate != list_end && !(*candidate == str)) {
    ++candidate;
  }
  return candidate != list_end;
}
240 
241 // Parse a string of the form [~]<lang>[+[~]<lang>]*.
242 // Langs with no prefix get appended to to_load, provided they
243 // are not in there already.
244 // Langs with ~ prefix get appended to not_to_load, provided they are not in
245 // there already.
246 void Tesseract::ParseLanguageString(const std::string &lang_str, std::vector<std::string> *to_load,
247                                     std::vector<std::string> *not_to_load) {
248   std::string remains(lang_str);
249   // Look whether the model file uses a prefix which must be applied to
250   // included model files as well.
251   std::regex e("(.*)/[^/]*");
252   std::cmatch cm;
253   std::string prefix;
254   if (std::regex_match(lang.c_str(), cm, e, std::regex_constants::match_default)) {
255     // A prefix was found.
256     prefix = cm[1].str() + "/";
257   }
258   while (!remains.empty()) {
259     // Find the start of the lang code and which vector to add to.
260     const char *start = remains.c_str();
261     while (*start == '+') {
262       ++start;
263     }
264     std::vector<std::string> *target = to_load;
265     if (*start == '~') {
266       target = not_to_load;
267       ++start;
268     }
269     // Find the index of the end of the lang code in string start.
270     int end = strlen(start);
271     const char *plus = strchr(start, '+');
272     if (plus != nullptr && plus - start < end) {
273       end = plus - start;
274     }
275     std::string lang_code(start);
276     lang_code.resize(end);
277     std::string next(start + end);
278     remains = next;
279     lang_code = prefix + lang_code;
280     // Check whether lang_code is already in the target vector and add.
281     if (!IsStrInList(lang_code, *target)) {
282       target->push_back(lang_code);
283     }
284   }
285 }
286 
287 // Initialize for potentially a set of languages defined by the language
288 // string and recursively any additional languages required by any language
289 // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
290 // See init_tesseract_internal for args.
291 int Tesseract::init_tesseract(const std::string &arg0, const std::string &textbase,
292                               const std::string &language, OcrEngineMode oem, char **configs,
293                               int configs_size, const std::vector<std::string> *vars_vec,
294                               const std::vector<std::string> *vars_values,
295                               bool set_only_non_debug_params, TessdataManager *mgr) {
296   std::vector<std::string> langs_to_load;
297   std::vector<std::string> langs_not_to_load;
298   ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
299 
300   for (auto *lang : sub_langs_) {
301     delete lang;
302   }
303 
304   // Set the basename, compute the data directory.
305   main_setup(arg0, textbase);
306 
307   sub_langs_.clear();
308   // Find the first loadable lang and load into this.
309   // Add any languages that this language requires
310   bool loaded_primary = false;
311   // Load the rest into sub_langs_.
312   // A range based for loop does not work here because langs_to_load
313   // might be changed in the loop when a new submodel is found.
314   for (auto &lang_to_load : langs_to_load) {
315     if (!IsStrInList(lang_to_load, langs_not_to_load)) {
316       const char *lang_str = lang_to_load.c_str();
317       Tesseract *tess_to_init;
318       if (!loaded_primary) {
319         tess_to_init = this;
320       } else {
321         tess_to_init = new Tesseract;
322         tess_to_init->main_setup(arg0, textbase);
323       }
324 
325       int result = tess_to_init->init_tesseract_internal(arg0, textbase, lang_str, oem, configs,
326                                                          configs_size, vars_vec, vars_values,
327                                                          set_only_non_debug_params, mgr);
328       // Forget that language, but keep any reader we were given.
329       mgr->Clear();
330 
331       if (!loaded_primary) {
332         if (result < 0) {
333           tprintf("Failed loading language '%s'\n", lang_str);
334         } else {
335           ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,
336                               &langs_not_to_load);
337           loaded_primary = true;
338         }
339       } else {
340         if (result < 0) {
341           tprintf("Failed loading language '%s'\n", lang_str);
342           delete tess_to_init;
343         } else {
344           sub_langs_.push_back(tess_to_init);
345           // Add any languages that this language requires
346           ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,
347                               &langs_not_to_load);
348         }
349       }
350     }
351   }
352   if (!loaded_primary && !langs_to_load.empty()) {
353     tprintf("Tesseract couldn't load any languages!\n");
354     return -1; // Couldn't load any language!
355   }
356 #ifndef DISABLED_LEGACY_ENGINE
357   if (!sub_langs_.empty()) {
358     // In multilingual mode word ratings have to be directly comparable,
359     // so use the same language model weights for all languages:
360     // use the primary language's params model if
361     // tessedit_use_primary_params_model is set,
362     // otherwise use default language model weights.
363     if (tessedit_use_primary_params_model) {
364       for (auto &sub_lang : sub_langs_) {
365         sub_lang->language_model_->getParamsModel().Copy(this->language_model_->getParamsModel());
366       }
367       tprintf("Using params model of the primary language\n");
368     } else {
369       this->language_model_->getParamsModel().Clear();
370       for (auto &sub_lang : sub_langs_) {
371         sub_lang->language_model_->getParamsModel().Clear();
372       }
373     }
374   }
375 
376   SetupUniversalFontIds();
377 #endif // ndef DISABLED_LEGACY_ENGINE
378   return 0;
379 }
380 
381 // Common initialization for a single language.
382 // arg0 is the datapath for the tessdata directory, which could be the
383 // path of the tessdata directory with no trailing /, or (if tessdata
384 // lives in the same directory as the executable, the path of the executable,
385 // hence the name arg0.
386 // textbase is an optional output file basename (used only for training)
387 // language is the language code to load.
388 // oem controls which engine(s) will operate on the image
389 // configs (argv) is an array of config filenames to load variables from.
390 // May be nullptr.
391 // configs_size (argc) is the number of elements in configs.
392 // vars_vec is an optional vector of variables to set.
393 // vars_values is an optional corresponding vector of values for the variables
394 // in vars_vec.
395 // If set_only_non_debug_params is true, only params that do not contain
396 // "debug" in the name will be set.
397 int Tesseract::init_tesseract_internal(const std::string &arg0, const std::string &textbase,
398                                        const std::string &language, OcrEngineMode oem,
399                                        char **configs, int configs_size,
400                                        const std::vector<std::string> *vars_vec,
401                                        const std::vector<std::string> *vars_values,
402                                        bool set_only_non_debug_params, TessdataManager *mgr) {
403   if (!init_tesseract_lang_data(arg0, language, oem, configs, configs_size, vars_vec,
404                                 vars_values, set_only_non_debug_params, mgr)) {
405     return -1;
406   }
407   if (tessedit_init_config_only) {
408     return 0;
409   }
410   // If only LSTM will be used, skip loading Tesseract classifier's
411   // pre-trained templates and dictionary.
412   bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY;
413   program_editup(textbase, init_tesseract ? mgr : nullptr, init_tesseract ? mgr : nullptr);
414   return 0; // Normal exit
415 }
416 
417 #ifndef DISABLED_LEGACY_ENGINE
418 
419 // Helper builds the all_fonts table by adding new fonts from new_fonts.
420 static void CollectFonts(const UnicityTable<FontInfo> &new_fonts,
421                          UnicityTable<FontInfo> *all_fonts) {
422   for (int i = 0; i < new_fonts.size(); ++i) {
423     // UnicityTable uniques as we go.
424     all_fonts->push_back(new_fonts.at(i));
425   }
426 }
427 
428 // Helper assigns an id to lang_fonts using the index in all_fonts table.
429 static void AssignIds(const UnicityTable<FontInfo> &all_fonts, UnicityTable<FontInfo> *lang_fonts) {
430   for (int i = 0; i < lang_fonts->size(); ++i) {
431     auto index = all_fonts.get_index(lang_fonts->at(i));
432     lang_fonts->at(i).universal_id = index;
433   }
434 }
435 
436 // Set the universal_id member of each font to be unique among all
437 // instances of the same font loaded.
438 void Tesseract::SetupUniversalFontIds() {
439   // Note that we can get away with bitwise copying FontInfo in
440   // all_fonts, as it is a temporary structure and we avoid setting the
441   // delete callback.
442   UnicityTable<FontInfo> all_fonts;
443 
444   // Create the universal ID table.
445   CollectFonts(get_fontinfo_table(), &all_fonts);
446   for (auto &sub_lang : sub_langs_) {
447     CollectFonts(sub_lang->get_fontinfo_table(), &all_fonts);
448   }
449   // Assign ids from the table to each font table.
450   AssignIds(all_fonts, &get_fontinfo_table());
451   for (auto &sub_lang : sub_langs_) {
452     AssignIds(all_fonts, &sub_lang->get_fontinfo_table());
453   }
454   font_table_size_ = all_fonts.size();
455 }
456 
457 #endif // ndef DISABLED_LEGACY_ENGINE
458 
459 void Tesseract::end_tesseract() {
460   end_recog();
461 }
462 
/* Define command type identifiers */
// Unscoped enum, so the enumerators are visible directly in namespace
// tesseract. No explicit values are given, so they take the implicit values
// 0 to 3 in declaration order.
// NOTE(review): none of these enumerators is referenced in the visible part
// of this file - confirm whether they are still used elsewhere.
enum CMD_EVENTS { ACTION_1_CMD_EVENT, RECOG_WERDS, RECOG_PSEUDO, ACTION_2_CMD_EVENT };
466 } // namespace tesseract
467