1 /**********************************************************************
2 * File: tessedit.cpp (Formerly tessedit.c)
3 * Description: (Previously) Main program for merge of tess and editor.
4 * Now just code to load the language model and various
5 * engine-specific data files.
6 * Author: Ray Smith
7 *
8 * (C) Copyright 1992, Hewlett-Packard Ltd.
9 ** Licensed under the Apache License, Version 2.0 (the "License");
10 ** you may not use this file except in compliance with the License.
11 ** You may obtain a copy of the License at
12 ** http://www.apache.org/licenses/LICENSE-2.0
13 ** Unless required by applicable law or agreed to in writing, software
14 ** distributed under the License is distributed on an "AS IS" BASIS,
15 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 ** See the License for the specific language governing permissions and
17 ** limitations under the License.
18 *
19 **********************************************************************/
20
21 // Include automatically generated configuration file if running autoconf.
22 #ifdef HAVE_CONFIG_H
23 # include "config_auto.h"
24 #endif
25
26 #include <regex> // for std::regex_match
27
28 #include "control.h"
29 #include "matchdefs.h"
30 #include "pageres.h"
31 #include "params.h"
32 #include "stopper.h"
33 #include "tesseractclass.h"
34 #include "tessvars.h"
35 #include "tprintf.h"
36 #ifndef DISABLED_LEGACY_ENGINE
37 # include "chop.h"
38 # include "intmatcher.h"
39 # include "reject.h"
40 #endif
41 #include "lstmrecognizer.h"
42
43 namespace tesseract {
44
45 // Read a "config" file containing a set of variable, value pairs.
46 // Searches the standard places: tessdata/configs, tessdata/tessconfigs
47 // and also accepts a relative or absolute path name.
read_config_file(const char * filename,SetParamConstraint constraint)48 void Tesseract::read_config_file(const char *filename, SetParamConstraint constraint) {
49 std::string path = datadir;
50 path += "configs/";
51 path += filename;
52 FILE *fp;
53 if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
54 fclose(fp);
55 } else {
56 path = datadir;
57 path += "tessconfigs/";
58 path += filename;
59 if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
60 fclose(fp);
61 } else {
62 path = filename;
63 }
64 }
65 ParamUtils::ReadParamsFile(path.c_str(), constraint, this->params());
66 }
67
68 // Returns false if a unicharset file for the specified language was not found
69 // or was invalid.
70 // This function initializes TessdataManager. After TessdataManager is
71 // no longer needed, TessdataManager::End() should be called.
72 //
73 // This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
74 // it is OEM_DEFAULT, in which case the value of the variable will be obtained
75 // from the language-specific config file (stored in [lang].traineddata), from
76 // the config files specified on the command line or left as the default
77 // OEM_TESSERACT_ONLY if none of the configs specify this variable.
init_tesseract_lang_data(const std::string & arg0,const std::string & language,OcrEngineMode oem,char ** configs,int configs_size,const std::vector<std::string> * vars_vec,const std::vector<std::string> * vars_values,bool set_only_non_debug_params,TessdataManager * mgr)78 bool Tesseract::init_tesseract_lang_data(const std::string &arg0,
79 const std::string &language, OcrEngineMode oem,
80 char **configs, int configs_size,
81 const std::vector<std::string> *vars_vec,
82 const std::vector<std::string> *vars_values,
83 bool set_only_non_debug_params, TessdataManager *mgr) {
84 // Set the language data path prefix
85 lang = !language.empty() ? language : "eng";
86 language_data_path_prefix = datadir;
87 language_data_path_prefix += lang;
88 language_data_path_prefix += ".";
89
90 // Initialize TessdataManager.
91 std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
92 if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) {
93 tprintf("Error opening data file %s\n", tessdata_path.c_str());
94 tprintf(
95 "Please make sure the TESSDATA_PREFIX environment variable is set"
96 " to your \"tessdata\" directory.\n");
97 return false;
98 }
99 #ifdef DISABLED_LEGACY_ENGINE
100 tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
101 #else
102 if (oem == OEM_DEFAULT) {
103 // Set the engine mode from availability, which can then be overridden by
104 // the config file when we read it below.
105 if (!mgr->IsLSTMAvailable()) {
106 tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
107 } else if (!mgr->IsBaseAvailable()) {
108 tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
109 } else {
110 tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);
111 }
112 }
113 #endif // ndef DISABLED_LEGACY_ENGINE
114
115 // If a language specific config file (lang.config) exists, load it in.
116 TFile fp;
117 if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
118 ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp, this->params());
119 }
120
121 SetParamConstraint set_params_constraint =
122 set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;
123 // Load tesseract variables from config files. This is done after loading
124 // language-specific variables from [lang].traineddata file, so that custom
125 // config files can override values in [lang].traineddata file.
126 for (int i = 0; i < configs_size; ++i) {
127 read_config_file(configs[i], set_params_constraint);
128 }
129
130 // Set params specified in vars_vec (done after setting params from config
131 // files, so that params in vars_vec can override those from files).
132 if (vars_vec != nullptr && vars_values != nullptr) {
133 for (unsigned i = 0; i < vars_vec->size(); ++i) {
134 if (!ParamUtils::SetParam((*vars_vec)[i].c_str(), (*vars_values)[i].c_str(),
135 set_params_constraint, this->params())) {
136 tprintf("Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].c_str());
137 }
138 }
139 }
140
141 if (!tessedit_write_params_to_file.empty()) {
142 FILE *params_file = fopen(tessedit_write_params_to_file.c_str(), "wb");
143 if (params_file != nullptr) {
144 ParamUtils::PrintParams(params_file, this->params());
145 fclose(params_file);
146 } else {
147 tprintf("Failed to open %s for writing params.\n", tessedit_write_params_to_file.c_str());
148 }
149 }
150
151 #ifndef DISABLED_LEGACY_ENGINE
152 // Determine which ocr engine(s) should be loaded and used for recognition.
153 if (oem != OEM_DEFAULT) {
154 tessedit_ocr_engine_mode.set_value(oem);
155 }
156 #endif
157
158 // If we are only loading the config file (and so not planning on doing any
159 // recognition) then there's nothing else do here.
160 if (tessedit_init_config_only) {
161 return true;
162 }
163
164 // The various OcrEngineMode settings (see tesseract/publictypes.h) determine
165 // which engine-specific data files need to be loaded. If LSTM_ONLY is
166 // requested, the base Tesseract files are *Not* required.
167 #ifdef DISABLED_LEGACY_ENGINE
168 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
169 #else
170 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
171 tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
172 #endif // ndef DISABLED_LEGACY_ENGINE
173 if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
174 lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix.c_str());
175 ASSERT_HOST(lstm_recognizer_->Load(this->params(), lstm_use_matrix ? language : "", mgr));
176 } else {
177 tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
178 tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
179 }
180 }
181
182 // Load the unicharset
183 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
184 // Avoid requiring a unicharset when we aren't running base tesseract.
185 unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
186 }
187 #ifndef DISABLED_LEGACY_ENGINE
188 else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) {
189 tprintf(
190 "Error: Tesseract (legacy) engine requested, but components are "
191 "not present in %s!!\n",
192 tessdata_path.c_str());
193 return false;
194 }
195 #endif // ndef DISABLED_LEGACY_ENGINE
196 if (unicharset.size() > MAX_NUM_CLASSES) {
197 tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
198 return false;
199 }
200 right_to_left_ = unicharset.major_right_to_left();
201
202 #ifndef DISABLED_LEGACY_ENGINE
203
204 // Setup initial unichar ambigs table and read universal ambigs.
205 UNICHARSET encoder_unicharset;
206 encoder_unicharset.CopyFrom(unicharset);
207 unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
208 unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
209
210 if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
211 unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp, ambigs_debug_level,
212 use_ambigs_for_adaption, &unicharset);
213 }
214
215 // Init ParamsModel.
216 // Load pass1 and pass2 weights (for now these two sets are the same, but in
217 // the future separate sets of weights can be generated).
218 for (int p = ParamsModel::PTRAIN_PASS1; p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
219 language_model_->getParamsModel().SetPass(static_cast<ParamsModel::PassEnum>(p));
220 if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
221 if (!language_model_->getParamsModel().LoadFromFp(lang.c_str(), &fp)) {
222 return false;
223 }
224 }
225 }
226 #endif // ndef DISABLED_LEGACY_ENGINE
227
228 return true;
229 }
230
// Helper reports whether str_list contains an element equal to str.
static bool IsStrInList(const std::string &str, const std::vector<std::string> &str_list) {
  auto candidate = str_list.begin();
  // Advance until a match is found or the list is exhausted.
  while (candidate != str_list.end() && *candidate != str) {
    ++candidate;
  }
  return candidate != str_list.end();
}
240
241 // Parse a string of the form [~]<lang>[+[~]<lang>]*.
242 // Langs with no prefix get appended to to_load, provided they
243 // are not in there already.
244 // Langs with ~ prefix get appended to not_to_load, provided they are not in
245 // there already.
246 void Tesseract::ParseLanguageString(const std::string &lang_str, std::vector<std::string> *to_load,
247 std::vector<std::string> *not_to_load) {
248 std::string remains(lang_str);
249 // Look whether the model file uses a prefix which must be applied to
250 // included model files as well.
251 std::regex e("(.*)/[^/]*");
252 std::cmatch cm;
253 std::string prefix;
254 if (std::regex_match(lang.c_str(), cm, e, std::regex_constants::match_default)) {
255 // A prefix was found.
256 prefix = cm[1].str() + "/";
257 }
258 while (!remains.empty()) {
259 // Find the start of the lang code and which vector to add to.
260 const char *start = remains.c_str();
261 while (*start == '+') {
262 ++start;
263 }
264 std::vector<std::string> *target = to_load;
265 if (*start == '~') {
266 target = not_to_load;
267 ++start;
268 }
269 // Find the index of the end of the lang code in string start.
270 int end = strlen(start);
271 const char *plus = strchr(start, '+');
272 if (plus != nullptr && plus - start < end) {
273 end = plus - start;
274 }
275 std::string lang_code(start);
276 lang_code.resize(end);
277 std::string next(start + end);
278 remains = next;
279 lang_code = prefix + lang_code;
280 // Check whether lang_code is already in the target vector and add.
281 if (!IsStrInList(lang_code, *target)) {
282 target->push_back(lang_code);
283 }
284 }
285 }
286
287 // Initialize for potentially a set of languages defined by the language
288 // string and recursively any additional languages required by any language
289 // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
290 // See init_tesseract_internal for args.
291 int Tesseract::init_tesseract(const std::string &arg0, const std::string &textbase,
292 const std::string &language, OcrEngineMode oem, char **configs,
293 int configs_size, const std::vector<std::string> *vars_vec,
294 const std::vector<std::string> *vars_values,
295 bool set_only_non_debug_params, TessdataManager *mgr) {
296 std::vector<std::string> langs_to_load;
297 std::vector<std::string> langs_not_to_load;
298 ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
299
300 for (auto *lang : sub_langs_) {
301 delete lang;
302 }
303
304 // Set the basename, compute the data directory.
305 main_setup(arg0, textbase);
306
307 sub_langs_.clear();
308 // Find the first loadable lang and load into this.
309 // Add any languages that this language requires
310 bool loaded_primary = false;
311 // Load the rest into sub_langs_.
312 // A range based for loop does not work here because langs_to_load
313 // might be changed in the loop when a new submodel is found.
314 for (auto &lang_to_load : langs_to_load) {
315 if (!IsStrInList(lang_to_load, langs_not_to_load)) {
316 const char *lang_str = lang_to_load.c_str();
317 Tesseract *tess_to_init;
318 if (!loaded_primary) {
319 tess_to_init = this;
320 } else {
321 tess_to_init = new Tesseract;
322 tess_to_init->main_setup(arg0, textbase);
323 }
324
325 int result = tess_to_init->init_tesseract_internal(arg0, textbase, lang_str, oem, configs,
326 configs_size, vars_vec, vars_values,
327 set_only_non_debug_params, mgr);
328 // Forget that language, but keep any reader we were given.
329 mgr->Clear();
330
331 if (!loaded_primary) {
332 if (result < 0) {
333 tprintf("Failed loading language '%s'\n", lang_str);
334 } else {
335 ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,
336 &langs_not_to_load);
337 loaded_primary = true;
338 }
339 } else {
340 if (result < 0) {
341 tprintf("Failed loading language '%s'\n", lang_str);
342 delete tess_to_init;
343 } else {
344 sub_langs_.push_back(tess_to_init);
345 // Add any languages that this language requires
346 ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,
347 &langs_not_to_load);
348 }
349 }
350 }
351 }
352 if (!loaded_primary && !langs_to_load.empty()) {
353 tprintf("Tesseract couldn't load any languages!\n");
354 return -1; // Couldn't load any language!
355 }
356 #ifndef DISABLED_LEGACY_ENGINE
357 if (!sub_langs_.empty()) {
358 // In multilingual mode word ratings have to be directly comparable,
359 // so use the same language model weights for all languages:
360 // use the primary language's params model if
361 // tessedit_use_primary_params_model is set,
362 // otherwise use default language model weights.
363 if (tessedit_use_primary_params_model) {
364 for (auto &sub_lang : sub_langs_) {
365 sub_lang->language_model_->getParamsModel().Copy(this->language_model_->getParamsModel());
366 }
367 tprintf("Using params model of the primary language\n");
368 } else {
369 this->language_model_->getParamsModel().Clear();
370 for (auto &sub_lang : sub_langs_) {
371 sub_lang->language_model_->getParamsModel().Clear();
372 }
373 }
374 }
375
376 SetupUniversalFontIds();
377 #endif // ndef DISABLED_LEGACY_ENGINE
378 return 0;
379 }
380
381 // Common initialization for a single language.
382 // arg0 is the datapath for the tessdata directory, which could be the
383 // path of the tessdata directory with no trailing /, or (if tessdata
384 // lives in the same directory as the executable, the path of the executable,
385 // hence the name arg0.
386 // textbase is an optional output file basename (used only for training)
387 // language is the language code to load.
388 // oem controls which engine(s) will operate on the image
389 // configs (argv) is an array of config filenames to load variables from.
390 // May be nullptr.
391 // configs_size (argc) is the number of elements in configs.
392 // vars_vec is an optional vector of variables to set.
393 // vars_values is an optional corresponding vector of values for the variables
394 // in vars_vec.
395 // If set_only_non_debug_params is true, only params that do not contain
396 // "debug" in the name will be set.
397 int Tesseract::init_tesseract_internal(const std::string &arg0, const std::string &textbase,
398 const std::string &language, OcrEngineMode oem,
399 char **configs, int configs_size,
400 const std::vector<std::string> *vars_vec,
401 const std::vector<std::string> *vars_values,
402 bool set_only_non_debug_params, TessdataManager *mgr) {
403 if (!init_tesseract_lang_data(arg0, language, oem, configs, configs_size, vars_vec,
404 vars_values, set_only_non_debug_params, mgr)) {
405 return -1;
406 }
407 if (tessedit_init_config_only) {
408 return 0;
409 }
410 // If only LSTM will be used, skip loading Tesseract classifier's
411 // pre-trained templates and dictionary.
412 bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY;
413 program_editup(textbase, init_tesseract ? mgr : nullptr, init_tesseract ? mgr : nullptr);
414 return 0; // Normal exit
415 }
416
417 #ifndef DISABLED_LEGACY_ENGINE
418
419 // Helper builds the all_fonts table by adding new fonts from new_fonts.
420 static void CollectFonts(const UnicityTable<FontInfo> &new_fonts,
421 UnicityTable<FontInfo> *all_fonts) {
422 for (int i = 0; i < new_fonts.size(); ++i) {
423 // UnicityTable uniques as we go.
424 all_fonts->push_back(new_fonts.at(i));
425 }
426 }
427
428 // Helper assigns an id to lang_fonts using the index in all_fonts table.
429 static void AssignIds(const UnicityTable<FontInfo> &all_fonts, UnicityTable<FontInfo> *lang_fonts) {
430 for (int i = 0; i < lang_fonts->size(); ++i) {
431 auto index = all_fonts.get_index(lang_fonts->at(i));
432 lang_fonts->at(i).universal_id = index;
433 }
434 }
435
436 // Set the universal_id member of each font to be unique among all
437 // instances of the same font loaded.
438 void Tesseract::SetupUniversalFontIds() {
439 // Note that we can get away with bitwise copying FontInfo in
440 // all_fonts, as it is a temporary structure and we avoid setting the
441 // delete callback.
442 UnicityTable<FontInfo> all_fonts;
443
444 // Create the universal ID table.
445 CollectFonts(get_fontinfo_table(), &all_fonts);
446 for (auto &sub_lang : sub_langs_) {
447 CollectFonts(sub_lang->get_fontinfo_table(), &all_fonts);
448 }
449 // Assign ids from the table to each font table.
450 AssignIds(all_fonts, &get_fontinfo_table());
451 for (auto &sub_lang : sub_langs_) {
452 AssignIds(all_fonts, &sub_lang->get_fontinfo_table());
453 }
454 font_table_size_ = all_fonts.size();
455 }
456
457 #endif // ndef DISABLED_LEGACY_ENGINE
458
459 void Tesseract::end_tesseract() {
460 end_recog();
461 }
462
/* Command type identifiers; values are implicit and count up from 0. */

enum CMD_EVENTS {
  ACTION_1_CMD_EVENT,
  RECOG_WERDS,
  RECOG_PSEUDO,
  ACTION_2_CMD_EVENT
};
466 } // namespace tesseract
467