1 /**********************************************************************
2  * File:        baseapi.cpp
3  * Description: Simple API for calling tesseract.
4  * Author:      Ray Smith
5  *
6  * (C) Copyright 2006, Google Inc.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #define _USE_MATH_DEFINES // for M_PI
20 
21 // Include automatically generated configuration file if running autoconf.
22 #ifdef HAVE_CONFIG_H
23 #  include "config_auto.h"
24 #endif
25 
26 #include "boxword.h"    // for BoxWord
27 #include "coutln.h"     // for C_OUTLINE_IT, C_OUTLINE_LIST
28 #include "dawg_cache.h" // for DawgCache
29 #include "dict.h"       // for Dict
30 #include "elst.h"       // for ELIST_ITERATOR, ELISTIZE, ELISTIZEH
31 #include "environ.h"    // for l_uint8
32 #ifndef DISABLED_LEGACY_ENGINE
33 #include "equationdetect.h" // for EquationDetect, destructor of equ_detect_
34 #endif // ndef DISABLED_LEGACY_ENGINE
35 #include "errcode.h" // for ASSERT_HOST
36 #include "helpers.h" // for IntCastRounded, chomp_string
37 #include "host.h"    // for MAX_PATH
38 #include "imageio.h" // for IFF_TIFF_G4, IFF_TIFF, IFF_TIFF_G3, ...
39 #ifndef DISABLED_LEGACY_ENGINE
40 #  include "intfx.h" // for INT_FX_RESULT_STRUCT
41 #endif
42 #include "mutableiterator.h" // for MutableIterator
43 #include "normalis.h"        // for kBlnBaselineOffset, kBlnXHeight
44 #if defined(USE_OPENCL)
45 #  include "openclwrapper.h" // for OpenclDevice
46 #endif
47 #include "pageres.h"         // for PAGE_RES_IT, WERD_RES, PAGE_RES, CR_DE...
48 #include "paragraphs.h"      // for DetectParagraphs
49 #include "params.h"          // for BoolParam, IntParam, DoubleParam, Stri...
50 #include "pdblock.h"         // for PDBLK
51 #include "points.h"          // for FCOORD
52 #include "polyblk.h"         // for POLY_BLOCK
53 #include "rect.h"            // for TBOX
54 #include "stepblob.h"        // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
55 #include "tessdatamanager.h" // for TessdataManager, kTrainedDataSuffix
56 #include "tesseractclass.h"  // for Tesseract
57 #include "tprintf.h"         // for tprintf
58 #include "werd.h"            // for WERD, WERD_IT, W_FUZZY_NON, W_FUZZY_SP
59 #include "thresholder.h"     // for ImageThresholder
60 
61 #include <tesseract/baseapi.h>
62 #include <tesseract/ocrclass.h>       // for ETEXT_DESC
63 #include <tesseract/osdetect.h>       // for OSResults, OSBestResult, OrientationId...
64 #include <tesseract/renderer.h>       // for TessResultRenderer
65 #include <tesseract/resultiterator.h> // for ResultIterator
66 
67 #include <cmath>    // for round, M_PI
68 #include <cstdint>  // for int32_t
69 #include <cstring>  // for strcmp, strcpy
70 #include <fstream>  // for size_t
71 #include <iostream> // for std::cin
72 #include <locale>   // for std::locale::classic
73 #include <memory>   // for std::unique_ptr
74 #include <set>      // for std::pair
75 #include <sstream>  // for std::stringstream
76 #include <vector>   // for std::vector
77 
78 #include <allheaders.h> // for pixDestroy, boxCreate, boxaAddBox, box...
79 #ifdef HAVE_LIBCURL
80 #  include <curl/curl.h>
81 #endif
82 
83 #ifdef __linux__
84 #  include <csignal> // for sigaction, SA_RESETHAND, SIGBUS, SIGFPE
85 #endif
86 
87 #if defined(_WIN32)
88 #  include <fcntl.h>
89 #  include <io.h>
90 #else
91 #  include <dirent.h> // for closedir, opendir, readdir, DIR, dirent
92 #  include <libgen.h>
93 #  include <sys/stat.h> // for stat, S_IFDIR
94 #  include <sys/types.h>
95 #  include <unistd.h>
96 #endif // _WIN32
97 
98 namespace tesseract {
99 
// File-local boolean parameter, default false. Per its description string it
// selects streaming of a file list from stdin.
static BOOL_VAR(stream_filelist, false, "Stream a filelist from stdin");
// File-local string parameter, default empty. Per its description string it
// holds the title written into hOCR and PDF output documents.
static STRING_VAR(document_title, "", "Title of output document (used for hOCR and PDF output)");
102 
/** Smallest width/height (in pixels) for which running tesseract is worthwhile. */
constexpr int kMinRectSize = 10;
/** Character emitted when Tesseract could not recognize a symbol at all. */
constexpr char kTesseractReject = '~';
/** Reject character expected by the UNLV error counter. */
constexpr char kUNLVReject = '~';
/** Suspect marker character expected by UNLV. */
constexpr char kUNLVSuspect = '^';
/**
 * Name of the temp file that holds the current parameters while retry values
 * are being applied.
 */
static const char *kOldVarsFile = "failed_vars.txt";
115 
116 #ifndef DISABLED_LEGACY_ENGINE
/**
 * Filename used for input image file, from which to derive a name to search
 * for a possible UNLV zone file, if none is specified by SetInputName.
 */
static const char *kInputFile = "noname.tif";
// Sentinel font name: ExtractFontName treats classify_font_name being equal to
// this value as "not set" and derives the font name from the filename instead.
static const char kUnknownFontName[] = "UnknownFont";

// File-local string parameter whose default is the sentinel above.
static STRING_VAR(classify_font_name, kUnknownFontName,
                  "Default font name to be used in training");
126 
127 // Finds the name of the training font and returns it in fontname, by cutting
128 // it out based on the expectation that the filename is of the form:
129 // /path/to/dir/[lang].[fontname].exp[num]
130 // The [lang], [fontname] and [num] fields should not have '.' characters.
131 // If the global parameter classify_font_name is set, its value is used instead.
ExtractFontName(const char * filename,std::string * fontname)132 static void ExtractFontName(const char* filename, std::string* fontname) {
133   *fontname = classify_font_name;
134   if (*fontname == kUnknownFontName) {
135     // filename is expected to be of the form [lang].[fontname].exp[num]
136     // The [lang], [fontname] and [num] fields should not have '.' characters.
137     const char *basename = strrchr(filename, '/');
138     const char *firstdot = strchr(basename ? basename : filename, '.');
139     const char *lastdot  = strrchr(filename, '.');
140     if (firstdot != lastdot && firstdot != nullptr && lastdot != nullptr) {
141       ++firstdot;
142       *fontname = firstdot;
143       fontname->resize(lastdot - firstdot);
144     }
145   }
146 }
147 #endif
148 
149 /* Add all available languages recursively.
150  */
addAvailableLanguages(const std::string & datadir,const std::string & base,std::vector<std::string> * langs)151 static void addAvailableLanguages(const std::string &datadir, const std::string &base,
152                                   std::vector<std::string> *langs) {
153   auto base2 = base;
154   if (!base2.empty()) {
155     base2 += "/";
156   }
157   const size_t extlen = sizeof(kTrainedDataSuffix);
158 #ifdef _WIN32
159   WIN32_FIND_DATA data;
160   HANDLE handle = FindFirstFile((datadir + base2 + "*").c_str(), &data);
161   if (handle != INVALID_HANDLE_VALUE) {
162     BOOL result = TRUE;
163     for (; result;) {
164       char *name = data.cFileName;
165       // Skip '.', '..', and hidden files
166       if (name[0] != '.') {
167         if ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) == FILE_ATTRIBUTE_DIRECTORY) {
168           addAvailableLanguages(datadir, base2 + name, langs);
169         } else {
170           size_t len = strlen(name);
171           if (len > extlen && name[len - extlen] == '.' &&
172               strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
173             name[len - extlen] = '\0';
174             langs->push_back(base2 + name);
175           }
176         }
177       }
178       result = FindNextFile(handle, &data);
179     }
180     FindClose(handle);
181   }
182 #else // _WIN32
183   DIR *dir = opendir((datadir + base).c_str());
184   if (dir != nullptr) {
185     dirent *de;
186     while ((de = readdir(dir))) {
187       char *name = de->d_name;
188       // Skip '.', '..', and hidden files
189       if (name[0] != '.') {
190         struct stat st;
191         if (stat((datadir + base2 + name).c_str(), &st) == 0 && (st.st_mode & S_IFDIR) == S_IFDIR) {
192           addAvailableLanguages(datadir, base2 + name, langs);
193         } else {
194           size_t len = strlen(name);
195           if (len > extlen && name[len - extlen] == '.' &&
196               strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
197             name[len - extlen] = '\0';
198             langs->push_back(base2 + name);
199           }
200         }
201       }
202     }
203     closedir(dir);
204   }
205 #endif
206 }
207 
TessBaseAPI()208 TessBaseAPI::TessBaseAPI()
209     : tesseract_(nullptr)
210     , osd_tesseract_(nullptr)
211     , equ_detect_(nullptr)
212     , reader_(nullptr)
213     ,
214     // thresholder_ is initialized to nullptr here, but will be set before use
215     // by: A constructor of a derived API or created
216     // implicitly when used in InternalSetImage.
217     thresholder_(nullptr)
218     , paragraph_models_(nullptr)
219     , block_list_(nullptr)
220     , page_res_(nullptr)
221     , last_oem_requested_(OEM_DEFAULT)
222     , recognition_done_(false)
223     , rect_left_(0)
224     , rect_top_(0)
225     , rect_width_(0)
226     , rect_height_(0)
227     , image_width_(0)
228     , image_height_(0) {
229 }
230 
~TessBaseAPI()231 TessBaseAPI::~TessBaseAPI() {
232   End();
233 }
234 
235 /**
236  * Returns the version identifier as a static string. Do not delete.
237  */
Version()238 const char *TessBaseAPI::Version() {
239   return TESSERACT_VERSION_STR;
240 }
241 
242 /**
243  * If compiled with OpenCL AND an available OpenCL
244  * device is deemed faster than serial code, then
245  * "device" is populated with the cl_device_id
246  * and returns sizeof(cl_device_id)
247  * otherwise *device=nullptr and returns 0.
248  */
getOpenCLDevice(void ** data)249 size_t TessBaseAPI::getOpenCLDevice(void **data) {
250 #ifdef USE_OPENCL
251   ds_device device = OpenclDevice::getDeviceSelection();
252   if (device.type == DS_DEVICE_OPENCL_DEVICE) {
253     *data = new cl_device_id;
254     memcpy(*data, &device.oclDeviceID, sizeof(cl_device_id));
255     return sizeof(cl_device_id);
256   }
257 #endif
258 
259   *data = nullptr;
260   return 0;
261 }
262 
263 /**
264  * Set the name of the input file. Needed only for training and
265  * loading a UNLV zone file.
266  */
SetInputName(const char * name)267 void TessBaseAPI::SetInputName(const char *name) {
268   input_file_ = name ? name : "";
269 }
270 
271 /** Set the name of the output files. Needed only for debugging. */
SetOutputName(const char * name)272 void TessBaseAPI::SetOutputName(const char *name) {
273   output_file_ = name ? name : "";
274 }
275 
SetVariable(const char * name,const char * value)276 bool TessBaseAPI::SetVariable(const char *name, const char *value) {
277   if (tesseract_ == nullptr) {
278     tesseract_ = new Tesseract;
279   }
280   return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_NON_INIT_ONLY,
281                               tesseract_->params());
282 }
283 
SetDebugVariable(const char * name,const char * value)284 bool TessBaseAPI::SetDebugVariable(const char *name, const char *value) {
285   if (tesseract_ == nullptr) {
286     tesseract_ = new Tesseract;
287   }
288   return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_DEBUG_ONLY, tesseract_->params());
289 }
290 
GetIntVariable(const char * name,int * value) const291 bool TessBaseAPI::GetIntVariable(const char *name, int *value) const {
292   auto *p = ParamUtils::FindParam<IntParam>(name, GlobalParams()->int_params,
293                                             tesseract_->params()->int_params);
294   if (p == nullptr) {
295     return false;
296   }
297   *value = (int32_t)(*p);
298   return true;
299 }
300 
GetBoolVariable(const char * name,bool * value) const301 bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const {
302   auto *p = ParamUtils::FindParam<BoolParam>(name, GlobalParams()->bool_params,
303                                              tesseract_->params()->bool_params);
304   if (p == nullptr) {
305     return false;
306   }
307   *value = bool(*p);
308   return true;
309 }
310 
GetStringVariable(const char * name) const311 const char *TessBaseAPI::GetStringVariable(const char *name) const {
312   auto *p = ParamUtils::FindParam<StringParam>(name, GlobalParams()->string_params,
313                                                tesseract_->params()->string_params);
314   return (p != nullptr) ? p->c_str() : nullptr;
315 }
316 
GetDoubleVariable(const char * name,double * value) const317 bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const {
318   auto *p = ParamUtils::FindParam<DoubleParam>(name, GlobalParams()->double_params,
319                                                tesseract_->params()->double_params);
320   if (p == nullptr) {
321     return false;
322   }
323   *value = (double)(*p);
324   return true;
325 }
326 
327 /** Get value of named variable as a string, if it exists. */
GetVariableAsString(const char * name,std::string * val) const328 bool TessBaseAPI::GetVariableAsString(const char *name, std::string *val) const {
329   return ParamUtils::GetParamAsString(name, tesseract_->params(), val);
330 }
331 
332 #ifndef DISABLED_LEGACY_ENGINE
333 
334 /** Print Tesseract fonts table to the given file. */
PrintFontsTable(FILE * fp) const335 void TessBaseAPI::PrintFontsTable(FILE *fp) const {
336   const int fontinfo_size = tesseract_->get_fontinfo_table().size();
337   for (int font_index = 1; font_index < fontinfo_size; ++font_index) {
338     FontInfo font = tesseract_->get_fontinfo_table().at(font_index);
339     fprintf(fp, "ID=%3d: %s is_italic=%s is_bold=%s"
340                 " is_fixed_pitch=%s is_serif=%s is_fraktur=%s\n",
341                 font_index, font.name,
342                 font.is_italic() ? "true" : "false",
343                 font.is_bold() ? "true" : "false",
344                 font.is_fixed_pitch() ? "true" : "false",
345                 font.is_serif() ? "true" : "false",
346                 font.is_fraktur() ? "true" : "false");
347   }
348 }
349 
350 #endif
351 
352 /** Print Tesseract parameters to the given file. */
PrintVariables(FILE * fp) const353 void TessBaseAPI::PrintVariables(FILE *fp) const {
354   ParamUtils::PrintParams(fp, tesseract_->params());
355 }
356 
357 /**
358  * The datapath must be the name of the data directory or
359  * some other file in which the data directory resides (for instance argv[0].)
360  * The language is (usually) an ISO 639-3 string or nullptr will default to eng.
361  * If numeric_mode is true, then only digits and Roman numerals will
362  * be returned.
363  * @return: 0 on success and -1 on initialization failure.
364  */
Init(const char * datapath,const char * language,OcrEngineMode oem,char ** configs,int configs_size,const std::vector<std::string> * vars_vec,const std::vector<std::string> * vars_values,bool set_only_non_debug_params)365 int TessBaseAPI::Init(const char *datapath, const char *language, OcrEngineMode oem, char **configs,
366                       int configs_size, const std::vector<std::string> *vars_vec,
367                       const std::vector<std::string> *vars_values, bool set_only_non_debug_params) {
368   return Init(datapath, 0, language, oem, configs, configs_size, vars_vec, vars_values,
369               set_only_non_debug_params, nullptr);
370 }
371 
372 // In-memory version reads the traineddata file directly from the given
373 // data[data_size] array. Also implements the version with a datapath in data,
374 // flagged by data_size = 0.
Init(const char * data,int data_size,const char * language,OcrEngineMode oem,char ** configs,int configs_size,const std::vector<std::string> * vars_vec,const std::vector<std::string> * vars_values,bool set_only_non_debug_params,FileReader reader)375 int TessBaseAPI::Init(const char *data, int data_size, const char *language, OcrEngineMode oem,
376                       char **configs, int configs_size, const std::vector<std::string> *vars_vec,
377                       const std::vector<std::string> *vars_values, bool set_only_non_debug_params,
378                       FileReader reader) {
379   if (language == nullptr) {
380     language = "";
381   }
382   if (data == nullptr) {
383     data = "";
384   }
385   std::string datapath = data_size == 0 ? data : language;
386   // If the datapath, OcrEngineMode or the language have changed - start again.
387   // Note that the language_ field stores the last requested language that was
388   // initialized successfully, while tesseract_->lang stores the language
389   // actually used. They differ only if the requested language was nullptr, in
390   // which case tesseract_->lang is set to the Tesseract default ("eng").
391   if (tesseract_ != nullptr &&
392       (datapath_.empty() || language_.empty() || datapath_ != datapath ||
393        last_oem_requested_ != oem || (language_ != language && tesseract_->lang != language))) {
394     delete tesseract_;
395     tesseract_ = nullptr;
396   }
397 #ifdef USE_OPENCL
398   OpenclDevice od;
399   od.InitEnv();
400 #endif
401   bool reset_classifier = true;
402   if (tesseract_ == nullptr) {
403     reset_classifier = false;
404     tesseract_ = new Tesseract;
405     if (reader != nullptr) {
406       reader_ = reader;
407     }
408     TessdataManager mgr(reader_);
409     if (data_size != 0) {
410       mgr.LoadMemBuffer(language, data, data_size);
411     }
412     if (tesseract_->init_tesseract(datapath.c_str(), output_file_.c_str(), language, oem, configs,
413                                    configs_size, vars_vec, vars_values, set_only_non_debug_params,
414                                    &mgr) != 0) {
415       return -1;
416     }
417   }
418 
419   // Update datapath and language requested for the last valid initialization.
420   datapath_ = datapath;
421   if (datapath_.empty() && !tesseract_->datadir.empty()) {
422     datapath_ = tesseract_->datadir;
423   }
424 
425   language_ = language;
426   last_oem_requested_ = oem;
427 
428 #ifndef DISABLED_LEGACY_ENGINE
429   // For same language and datapath, just reset the adaptive classifier.
430   if (reset_classifier) {
431     tesseract_->ResetAdaptiveClassifier();
432   }
433 #endif // ndef DISABLED_LEGACY_ENGINE
434   return 0;
435 }
436 
437 /**
438  * Returns the languages string used in the last valid initialization.
439  * If the last initialization specified "deu+hin" then that will be
440  * returned. If hin loaded eng automatically as well, then that will
441  * not be included in this list. To find the languages actually
442  * loaded use GetLoadedLanguagesAsVector.
443  * The returned string should NOT be deleted.
444  */
GetInitLanguagesAsString() const445 const char *TessBaseAPI::GetInitLanguagesAsString() const {
446   return language_.c_str();
447 }
448 
449 /**
450  * Returns the loaded languages in the vector of std::string.
451  * Includes all languages loaded by the last Init, including those loaded
452  * as dependencies of other loaded languages.
453  */
GetLoadedLanguagesAsVector(std::vector<std::string> * langs) const454 void TessBaseAPI::GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const {
455   langs->clear();
456   if (tesseract_ != nullptr) {
457     langs->push_back(tesseract_->lang);
458     int num_subs = tesseract_->num_sub_langs();
459     for (int i = 0; i < num_subs; ++i) {
460       langs->push_back(tesseract_->get_sub_lang(i)->lang);
461     }
462   }
463 }
464 
465 /**
466  * Returns the available languages in the sorted vector of std::string.
467  */
GetAvailableLanguagesAsVector(std::vector<std::string> * langs) const468 void TessBaseAPI::GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const {
469   langs->clear();
470   if (tesseract_ != nullptr) {
471     addAvailableLanguages(tesseract_->datadir, "", langs);
472     std::sort(langs->begin(), langs->end());
473   }
474 }
475 
476 /**
477  * Init only for page layout analysis. Use only for calls to SetImage and
478  * AnalysePage. Calls that attempt recognition will generate an error.
479  */
InitForAnalysePage()480 void TessBaseAPI::InitForAnalysePage() {
481   if (tesseract_ == nullptr) {
482     tesseract_ = new Tesseract;
483 #ifndef DISABLED_LEGACY_ENGINE
484     tesseract_->InitAdaptiveClassifier(nullptr);
485 #endif
486   }
487 }
488 
489 /**
490  * Read a "config" file containing a set of parameter name, value pairs.
491  * Searches the standard places: tessdata/configs, tessdata/tessconfigs
492  * and also accepts a relative or absolute path name.
493  */
ReadConfigFile(const char * filename)494 void TessBaseAPI::ReadConfigFile(const char *filename) {
495   tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_NON_INIT_ONLY);
496 }
497 
498 /** Same as above, but only set debug params from the given config file. */
ReadDebugConfigFile(const char * filename)499 void TessBaseAPI::ReadDebugConfigFile(const char *filename) {
500   tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_DEBUG_ONLY);
501 }
502 
503 /**
504  * Set the current page segmentation mode. Defaults to PSM_AUTO.
505  * The mode is stored as an IntParam so it can also be modified by
506  * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
507  */
SetPageSegMode(PageSegMode mode)508 void TessBaseAPI::SetPageSegMode(PageSegMode mode) {
509   if (tesseract_ == nullptr) {
510     tesseract_ = new Tesseract;
511   }
512   tesseract_->tessedit_pageseg_mode.set_value(mode);
513 }
514 
515 /** Return the current page segmentation mode. */
GetPageSegMode() const516 PageSegMode TessBaseAPI::GetPageSegMode() const {
517   if (tesseract_ == nullptr) {
518     return PSM_SINGLE_BLOCK;
519   }
520   return static_cast<PageSegMode>(static_cast<int>(tesseract_->tessedit_pageseg_mode));
521 }
522 
523 /**
524  * Recognize a rectangle from an image and return the result as a string.
525  * May be called many times for a single Init.
526  * Currently has no error checking.
527  * Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
528  * Palette color images will not work properly and must be converted to
529  * 24 bit.
530  * Binary images of 1 bit per pixel may also be given but they must be
531  * byte packed with the MSB of the first byte being the first pixel, and a
532  * one pixel is WHITE. For binary images set bytes_per_pixel=0.
533  * The recognized text is returned as a char* which is coded
534  * as UTF8 and must be freed with the delete [] operator.
535  */
TesseractRect(const unsigned char * imagedata,int bytes_per_pixel,int bytes_per_line,int left,int top,int width,int height)536 char *TessBaseAPI::TesseractRect(const unsigned char *imagedata, int bytes_per_pixel,
537                                  int bytes_per_line, int left, int top, int width, int height) {
538   if (tesseract_ == nullptr || width < kMinRectSize || height < kMinRectSize) {
539     return nullptr; // Nothing worth doing.
540   }
541 
542   // Since this original api didn't give the exact size of the image,
543   // we have to invent a reasonable value.
544   int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8;
545   SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top, bytes_per_pixel,
546            bytes_per_line);
547   SetRectangle(left, top, width, height);
548 
549   return GetUTF8Text();
550 }
551 
552 #ifndef DISABLED_LEGACY_ENGINE
553 /**
554  * Call between pages or documents etc to free up memory and forget
555  * adaptive data.
556  */
ClearAdaptiveClassifier()557 void TessBaseAPI::ClearAdaptiveClassifier() {
558   if (tesseract_ == nullptr) {
559     return;
560   }
561   tesseract_->ResetAdaptiveClassifier();
562   tesseract_->ResetDocumentDictionary();
563 }
564 #endif // ndef DISABLED_LEGACY_ENGINE
565 
566 /**
567  * Provide an image for Tesseract to recognize. Format is as
568  * TesseractRect above. Copies the image buffer and converts to Pix.
569  * SetImage clears all recognition results, and sets the rectangle to the
570  * full image, so it may be followed immediately by a GetUTF8Text, and it
571  * will automatically perform recognition.
572  */
SetImage(const unsigned char * imagedata,int width,int height,int bytes_per_pixel,int bytes_per_line)573 void TessBaseAPI::SetImage(const unsigned char *imagedata, int width, int height,
574                            int bytes_per_pixel, int bytes_per_line) {
575   if (InternalSetImage()) {
576     thresholder_->SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line);
577     SetInputImage(thresholder_->GetPixRect());
578   }
579 }
580 
SetSourceResolution(int ppi)581 void TessBaseAPI::SetSourceResolution(int ppi) {
582   if (thresholder_) {
583     thresholder_->SetSourceYResolution(ppi);
584   } else {
585     tprintf("Please call SetImage before SetSourceResolution.\n");
586   }
587 }
588 
589 /**
590  * Provide an image for Tesseract to recognize. As with SetImage above,
591  * Tesseract takes its own copy of the image, so it need not persist until
592  * after Recognize.
593  * Pix vs raw, which to use?
594  * Use Pix where possible. Tesseract uses Pix as its internal representation
595  * and it is therefore more efficient to provide a Pix directly.
596  */
SetImage(Pix * pix)597 void TessBaseAPI::SetImage(Pix *pix) {
598   if (InternalSetImage()) {
599     if (pixGetSpp(pix) == 4 && pixGetInputFormat(pix) == IFF_PNG) {
600       // remove alpha channel from png
601       Pix *p1 = pixRemoveAlpha(pix);
602       pixSetSpp(p1, 3);
603       (void)pixCopy(pix, p1);
604       pixDestroy(&p1);
605     }
606     thresholder_->SetImage(pix);
607     SetInputImage(thresholder_->GetPixRect());
608   }
609 }
610 
611 /**
612  * Restrict recognition to a sub-rectangle of the image. Call after SetImage.
613  * Each SetRectangle clears the recogntion results so multiple rectangles
614  * can be recognized with the same image.
615  */
SetRectangle(int left,int top,int width,int height)616 void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
617   if (thresholder_ == nullptr) {
618     return;
619   }
620   thresholder_->SetRectangle(left, top, width, height);
621   ClearResults();
622 }
623 
624 /**
625  * ONLY available after SetImage if you have Leptonica installed.
626  * Get a copy of the internal thresholded image from Tesseract.
627  */
GetThresholdedImage()628 Pix *TessBaseAPI::GetThresholdedImage() {
629   if (tesseract_ == nullptr || thresholder_ == nullptr) {
630     return nullptr;
631   }
632   if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
633     return nullptr;
634   }
635   return tesseract_->pix_binary().clone();
636 }
637 
638 /**
639  * Get the result of page layout analysis as a leptonica-style
640  * Boxa, Pixa pair, in reading order.
641  * Can be called before or after Recognize.
642  */
GetRegions(Pixa ** pixa)643 Boxa *TessBaseAPI::GetRegions(Pixa **pixa) {
644   return GetComponentImages(RIL_BLOCK, false, pixa, nullptr);
645 }
646 
647 /**
648  * Get the textlines as a leptonica-style Boxa, Pixa pair, in reading order.
649  * Can be called before or after Recognize.
650  * If blockids is not nullptr, the block-id of each line is also returned as an
651  * array of one element per line. delete [] after use.
652  * If paraids is not nullptr, the paragraph-id of each line within its block is
653  * also returned as an array of one element per line. delete [] after use.
654  */
GetTextlines(const bool raw_image,const int raw_padding,Pixa ** pixa,int ** blockids,int ** paraids)655 Boxa *TessBaseAPI::GetTextlines(const bool raw_image, const int raw_padding, Pixa **pixa,
656                                 int **blockids, int **paraids) {
657   return GetComponentImages(RIL_TEXTLINE, true, raw_image, raw_padding, pixa, blockids, paraids);
658 }
659 
660 /**
661  * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa
662  * pair, in reading order. Enables downstream handling of non-rectangular
663  * regions.
664  * Can be called before or after Recognize.
665  * If blockids is not nullptr, the block-id of each line is also returned as an
666  * array of one element per line. delete [] after use.
667  */
GetStrips(Pixa ** pixa,int ** blockids)668 Boxa *TessBaseAPI::GetStrips(Pixa **pixa, int **blockids) {
669   return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids);
670 }
671 
672 /**
673  * Get the words as a leptonica-style
674  * Boxa, Pixa pair, in reading order.
675  * Can be called before or after Recognize.
676  */
GetWords(Pixa ** pixa)677 Boxa *TessBaseAPI::GetWords(Pixa **pixa) {
678   return GetComponentImages(RIL_WORD, true, pixa, nullptr);
679 }
680 
681 /**
682  * Gets the individual connected (text) components (created
683  * after pages segmentation step, but before recognition)
684  * as a leptonica-style Boxa, Pixa pair, in reading order.
685  * Can be called before or after Recognize.
686  */
GetConnectedComponents(Pixa ** pixa)687 Boxa *TessBaseAPI::GetConnectedComponents(Pixa **pixa) {
688   return GetComponentImages(RIL_SYMBOL, true, pixa, nullptr);
689 }
690 
691 /**
692  * Get the given level kind of components (block, textline, word etc.) as a
693  * leptonica-style Boxa, Pixa pair, in reading order.
694  * Can be called before or after Recognize.
695  * If blockids is not nullptr, the block-id of each component is also returned
696  * as an array of one element per component. delete [] after use.
697  * If text_only is true, then only text components are returned.
698  */
GetComponentImages(PageIteratorLevel level,bool text_only,bool raw_image,const int raw_padding,Pixa ** pixa,int ** blockids,int ** paraids)699 Boxa *TessBaseAPI::GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image,
700                                       const int raw_padding, Pixa **pixa, int **blockids,
701                                       int **paraids) {
702   /*non-const*/ std::unique_ptr</*non-const*/ PageIterator> page_it(GetIterator());
703   if (page_it == nullptr) {
704     page_it.reset(AnalyseLayout());
705   }
706   if (page_it == nullptr) {
707     return nullptr; // Failed.
708   }
709 
710   // Count the components to get a size for the arrays.
711   int component_count = 0;
712   int left, top, right, bottom;
713 
714   if (raw_image) {
715     // Get bounding box in original raw image with padding.
716     do {
717       if (page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom) &&
718           (!text_only || PTIsTextType(page_it->BlockType()))) {
719         ++component_count;
720       }
721     } while (page_it->Next(level));
722   } else {
723     // Get bounding box from binarized imaged. Note that this could be
724     // differently scaled from the original image.
725     do {
726       if (page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom) &&
727           (!text_only || PTIsTextType(page_it->BlockType()))) {
728         ++component_count;
729       }
730     } while (page_it->Next(level));
731   }
732 
733   Boxa *boxa = boxaCreate(component_count);
734   if (pixa != nullptr) {
735     *pixa = pixaCreate(component_count);
736   }
737   if (blockids != nullptr) {
738     *blockids = new int[component_count];
739   }
740   if (paraids != nullptr) {
741     *paraids = new int[component_count];
742   }
743 
744   int blockid = 0;
745   int paraid = 0;
746   int component_index = 0;
747   page_it->Begin();
748   do {
749     bool got_bounding_box;
750     if (raw_image) {
751       got_bounding_box = page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom);
752     } else {
753       got_bounding_box = page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom);
754     }
755     if (got_bounding_box && (!text_only || PTIsTextType(page_it->BlockType()))) {
756       Box *lbox = boxCreate(left, top, right - left, bottom - top);
757       boxaAddBox(boxa, lbox, L_INSERT);
758       if (pixa != nullptr) {
759         Pix *pix = nullptr;
760         if (raw_image) {
761           pix = page_it->GetImage(level, raw_padding, GetInputImage(), &left, &top);
762         } else {
763           pix = page_it->GetBinaryImage(level);
764         }
765         pixaAddPix(*pixa, pix, L_INSERT);
766         pixaAddBox(*pixa, lbox, L_CLONE);
767       }
768       if (paraids != nullptr) {
769         (*paraids)[component_index] = paraid;
770         if (page_it->IsAtFinalElement(RIL_PARA, level)) {
771           ++paraid;
772         }
773       }
774       if (blockids != nullptr) {
775         (*blockids)[component_index] = blockid;
776         if (page_it->IsAtFinalElement(RIL_BLOCK, level)) {
777           ++blockid;
778           paraid = 0;
779         }
780       }
781       ++component_index;
782     }
783   } while (page_it->Next(level));
784   return boxa;
785 }
786 
GetThresholdedImageScaleFactor() const787 int TessBaseAPI::GetThresholdedImageScaleFactor() const {
788   if (thresholder_ == nullptr) {
789     return 0;
790   }
791   return thresholder_->GetScaleFactor();
792 }
793 
794 /**
795  * Runs page layout analysis in the mode set by SetPageSegMode.
796  * May optionally be called prior to Recognize to get access to just
797  * the page layout results. Returns an iterator to the results.
798  * If merge_similar_words is true, words are combined where suitable for use
799  * with a line recognizer. Use if you want to use AnalyseLayout to find the
800  * textlines, and then want to process textline fragments with an external
801  * line recognizer.
802  * Returns nullptr on error or an empty page.
803  * The returned iterator must be deleted after use.
804  * WARNING! This class points to data held within the TessBaseAPI class, and
805  * therefore can only be used while the TessBaseAPI class still exists and
806  * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
807  * DetectOS, or anything else that changes the internal PAGE_RES.
808  */
AnalyseLayout()809 PageIterator *TessBaseAPI::AnalyseLayout() {
810   return AnalyseLayout(false);
811 }
812 
AnalyseLayout(bool merge_similar_words)813 PageIterator *TessBaseAPI::AnalyseLayout(bool merge_similar_words) {
814   if (FindLines() == 0) {
815     if (block_list_->empty()) {
816       return nullptr; // The page was empty.
817     }
818     page_res_ = new PAGE_RES(merge_similar_words, block_list_, nullptr);
819     DetectParagraphs(false);
820     return new PageIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(),
821                             thresholder_->GetScaledYResolution(), rect_left_, rect_top_,
822                             rect_width_, rect_height_);
823   }
824   return nullptr;
825 }
826 
827 /**
828  * Recognize the tesseract global image and return the result as Tesseract
829  * internal structures.
830  */
Recognize(ETEXT_DESC * monitor)831 int TessBaseAPI::Recognize(ETEXT_DESC *monitor) {
832   if (tesseract_ == nullptr) {
833     return -1;
834   }
835   if (FindLines() != 0) {
836     return -1;
837   }
838   delete page_res_;
839   if (block_list_->empty()) {
840     page_res_ = new PAGE_RES(false, block_list_, &tesseract_->prev_word_best_choice_);
841     return 0; // Empty page.
842   }
843 
844   tesseract_->SetBlackAndWhitelist();
845   recognition_done_ = true;
846 #ifndef DISABLED_LEGACY_ENGINE
847   if (tesseract_->tessedit_resegment_from_line_boxes) {
848     page_res_ = tesseract_->ApplyBoxes(input_file_.c_str(), true, block_list_);
849   } else if (tesseract_->tessedit_resegment_from_boxes) {
850     page_res_ = tesseract_->ApplyBoxes(input_file_.c_str(), false, block_list_);
851   } else
852 #endif // ndef DISABLED_LEGACY_ENGINE
853   {
854     page_res_ =
855         new PAGE_RES(tesseract_->AnyLSTMLang(), block_list_, &tesseract_->prev_word_best_choice_);
856   }
857 
858   if (page_res_ == nullptr) {
859     return -1;
860   }
861 
862   if (tesseract_->tessedit_train_line_recognizer) {
863     if (!tesseract_->TrainLineRecognizer(input_file_.c_str(), output_file_, block_list_)) {
864       return -1;
865     }
866     tesseract_->CorrectClassifyWords(page_res_);
867     return 0;
868   }
869 #ifndef DISABLED_LEGACY_ENGINE
870   if (tesseract_->tessedit_make_boxes_from_boxes) {
871     tesseract_->CorrectClassifyWords(page_res_);
872     return 0;
873   }
874 #endif // ndef DISABLED_LEGACY_ENGINE
875 
876   int result = 0;
877   if (tesseract_->interactive_display_mode) {
878 #ifndef GRAPHICS_DISABLED
879     tesseract_->pgeditor_main(rect_width_, rect_height_, page_res_);
880 #endif // !GRAPHICS_DISABLED
881        // The page_res is invalid after an interactive session, so cleanup
882        // in a way that lets us continue to the next page without crashing.
883     delete page_res_;
884     page_res_ = nullptr;
885     return -1;
886 #ifndef DISABLED_LEGACY_ENGINE
887   } else if (tesseract_->tessedit_train_from_boxes) {
888     std::string fontname;
889     ExtractFontName(output_file_.c_str(), &fontname);
890     tesseract_->ApplyBoxTraining(fontname, page_res_);
891   } else if (tesseract_->tessedit_ambigs_training) {
892     FILE *training_output_file = tesseract_->init_recog_training(input_file_.c_str());
893     // OCR the page segmented into words by tesseract.
894     tesseract_->recog_training_segmented(input_file_.c_str(), page_res_, monitor,
895                                          training_output_file);
896     fclose(training_output_file);
897 #endif // ndef DISABLED_LEGACY_ENGINE
898   } else {
899     // Now run the main recognition.
900     bool wait_for_text = true;
901     GetBoolVariable("paragraph_text_based", &wait_for_text);
902     if (!wait_for_text) {
903       DetectParagraphs(false);
904     }
905     if (tesseract_->recog_all_words(page_res_, monitor, nullptr, nullptr, 0)) {
906       if (wait_for_text) {
907         DetectParagraphs(true);
908       }
909     } else {
910       result = -1;
911     }
912   }
913   return result;
914 }
915 
916 // Takes ownership of the input pix.
SetInputImage(Pix * pix)917 void TessBaseAPI::SetInputImage(Pix *pix) {
918   tesseract_->set_pix_original(pix);
919 }
920 
GetInputImage()921 Pix *TessBaseAPI::GetInputImage() {
922   return tesseract_->pix_original();
923 }
924 
GetInputName()925 const char *TessBaseAPI::GetInputName() {
926   if (!input_file_.empty()) {
927     return input_file_.c_str();
928   }
929   return nullptr;
930 }
931 
GetDatapath()932 const char *TessBaseAPI::GetDatapath() {
933   return tesseract_->datadir.c_str();
934 }
935 
GetSourceYResolution()936 int TessBaseAPI::GetSourceYResolution() {
937   if (thresholder_ == nullptr)
938     return -1;
939   return thresholder_->GetSourceYResolution();
940 }
941 
942 // If flist exists, get data from there. Otherwise get data from buf.
943 // Seems convoluted, but is the easiest way I know of to meet multiple
944 // goals. Support streaming from stdin, and also work on platforms
945 // lacking fmemopen.
946 // TODO: check different logic for flist/buf and simplify.
ProcessPagesFileList(FILE * flist,std::string * buf,const char * retry_config,int timeout_millisec,TessResultRenderer * renderer,int tessedit_page_number)947 bool TessBaseAPI::ProcessPagesFileList(FILE *flist, std::string *buf, const char *retry_config,
948                                        int timeout_millisec, TessResultRenderer *renderer,
949                                        int tessedit_page_number) {
950   if (!flist && !buf) {
951     return false;
952   }
953   unsigned page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
954   char pagename[MAX_PATH];
955 
956   std::vector<std::string> lines;
957   if (!flist) {
958     std::string line;
959     for (const auto ch : *buf) {
960       if (ch == '\n') {
961         lines.push_back(line);
962         line.clear();
963       } else {
964         line.push_back(ch);
965       }
966     }
967     if (!line.empty()) {
968       // Add last line without terminating LF.
969       lines.push_back(line);
970     }
971     if (lines.empty()) {
972       return false;
973     }
974   }
975 
976   // Skip to the requested page number.
977   for (unsigned i = 0; i < page; i++) {
978     if (flist) {
979       if (fgets(pagename, sizeof(pagename), flist) == nullptr) {
980         break;
981       }
982     }
983   }
984 
985   // Begin producing output
986   if (renderer && !renderer->BeginDocument(document_title.c_str())) {
987     return false;
988   }
989 
990   // Loop over all pages - or just the requested one
991   while (true) {
992     if (flist) {
993       if (fgets(pagename, sizeof(pagename), flist) == nullptr) {
994         break;
995       }
996     } else {
997       if (page >= lines.size()) {
998         break;
999       }
1000       snprintf(pagename, sizeof(pagename), "%s", lines[page].c_str());
1001     }
1002     chomp_string(pagename);
1003     Pix *pix = pixRead(pagename);
1004     if (pix == nullptr) {
1005       tprintf("Image file %s cannot be read!\n", pagename);
1006       return false;
1007     }
1008     tprintf("Page %u : %s\n", page, pagename);
1009     bool r = ProcessPage(pix, page, pagename, retry_config, timeout_millisec, renderer);
1010     pixDestroy(&pix);
1011     if (!r) {
1012       return false;
1013     }
1014     if (tessedit_page_number >= 0) {
1015       break;
1016     }
1017     ++page;
1018   }
1019 
1020   // Finish producing output
1021   if (renderer && !renderer->EndDocument()) {
1022     return false;
1023   }
1024   return true;
1025 }
1026 
ProcessPagesMultipageTiff(const l_uint8 * data,size_t size,const char * filename,const char * retry_config,int timeout_millisec,TessResultRenderer * renderer,int tessedit_page_number)1027 bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, size_t size, const char *filename,
1028                                             const char *retry_config, int timeout_millisec,
1029                                             TessResultRenderer *renderer,
1030                                             int tessedit_page_number) {
1031   Pix *pix = nullptr;
1032   int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
1033   size_t offset = 0;
1034   for (;; ++page) {
1035     if (tessedit_page_number >= 0) {
1036       page = tessedit_page_number;
1037       pix = (data) ? pixReadMemTiff(data, size, page) : pixReadTiff(filename, page);
1038     } else {
1039       pix = (data) ? pixReadMemFromMultipageTiff(data, size, &offset)
1040                    : pixReadFromMultipageTiff(filename, &offset);
1041     }
1042     if (pix == nullptr) {
1043       break;
1044     }
1045     if (offset || page > 0) {
1046       // Only print page number for multipage TIFF file.
1047       tprintf("Page %d\n", page + 1);
1048     }
1049     auto page_string = std::to_string(page);
1050     SetVariable("applybox_page", page_string.c_str());
1051     bool r = ProcessPage(pix, page, filename, retry_config, timeout_millisec, renderer);
1052     pixDestroy(&pix);
1053     if (!r) {
1054       return false;
1055     }
1056     if (tessedit_page_number >= 0) {
1057       break;
1058     }
1059     if (!offset) {
1060       break;
1061     }
1062   }
1063   return true;
1064 }
1065 
1066 // Master ProcessPages calls ProcessPagesInternal and then does any post-
1067 // processing required due to being in a training mode.
ProcessPages(const char * filename,const char * retry_config,int timeout_millisec,TessResultRenderer * renderer)1068 bool TessBaseAPI::ProcessPages(const char *filename, const char *retry_config, int timeout_millisec,
1069                                TessResultRenderer *renderer) {
1070   bool result = ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer);
1071 #ifndef DISABLED_LEGACY_ENGINE
1072   if (result) {
1073     if (tesseract_->tessedit_train_from_boxes && !tesseract_->WriteTRFile(output_file_.c_str())) {
1074       tprintf("Write of TR file failed: %s\n", output_file_.c_str());
1075       return false;
1076     }
1077   }
1078 #endif // ndef DISABLED_LEGACY_ENGINE
1079   return result;
1080 }
1081 
1082 #ifdef HAVE_LIBCURL
// Write callback handed to libcurl: appends the size * nmemb bytes found at
// contents to the std::string that userp points to and returns the number of
// bytes consumed.
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
  const size_t byte_count = size * nmemb;
  auto *sink = static_cast<std::string *>(userp);
  sink->append(static_cast<const char *>(contents), byte_count);
  return byte_count;
}
1089 #endif
1090 
1091 // In the ideal scenario, Tesseract will start working on data as soon
1092 // as it can. For example, if you stream a filelist through stdin, we
1093 // should start the OCR process as soon as the first filename is
1094 // available. This is particularly useful when hooking Tesseract up to
1095 // slow hardware such as a book scanning machine.
1096 //
1097 // Unfortunately there are tradeoffs. You can't seek on stdin. That
1098 // makes automatic detection of datatype (TIFF? filelist? PNG?)
1099 // impractical.  So we support a command line flag to explicitly
1100 // identify the scenario that really matters: filelists on
1101 // stdin. We'll still do our best if the user likes pipes.
ProcessPagesInternal(const char * filename,const char * retry_config,int timeout_millisec,TessResultRenderer * renderer)1102 bool TessBaseAPI::ProcessPagesInternal(const char *filename, const char *retry_config,
1103                                        int timeout_millisec, TessResultRenderer *renderer) {
1104   bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
1105   if (stdInput) {
1106 #ifdef WIN32
1107     if (_setmode(_fileno(stdin), _O_BINARY) == -1)
1108       tprintf("ERROR: cin to binary: %s", strerror(errno));
1109 #endif // WIN32
1110   }
1111 
1112   if (stream_filelist) {
1113     return ProcessPagesFileList(stdin, nullptr, retry_config, timeout_millisec, renderer,
1114                                 tesseract_->tessedit_page_number);
1115   }
1116 
1117   // At this point we are officially in autodection territory.
1118   // That means any data in stdin must be buffered, to make it
1119   // seekable.
1120   std::string buf;
1121   const l_uint8 *data = nullptr;
1122   if (stdInput) {
1123     buf.assign((std::istreambuf_iterator<char>(std::cin)), (std::istreambuf_iterator<char>()));
1124     data = reinterpret_cast<const l_uint8 *>(buf.data());
1125   } else if (strstr(filename, "://") != nullptr) {
1126     // Get image or image list by URL.
1127 #ifdef HAVE_LIBCURL
1128     CURL *curl = curl_easy_init();
1129     if (curl == nullptr) {
1130       fprintf(stderr, "Error, curl_easy_init failed\n");
1131       return false;
1132     } else {
1133       CURLcode curlcode;
1134       auto error = [curl, &curlcode](const char *function) {
1135         fprintf(stderr, "Error, %s failed with error %s\n", function, curl_easy_strerror(curlcode));
1136         curl_easy_cleanup(curl);
1137         return false;
1138       };
1139       curlcode = curl_easy_setopt(curl, CURLOPT_URL, filename);
1140       if (curlcode != CURLE_OK) {
1141         return error("curl_easy_setopt");
1142       }
1143       curlcode = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
1144       if (curlcode != CURLE_OK) {
1145         return error("curl_easy_setopt");
1146       }
1147       curlcode = curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf);
1148       if (curlcode != CURLE_OK) {
1149         return error("curl_easy_setopt");
1150       }
1151       curlcode = curl_easy_perform(curl);
1152       if (curlcode != CURLE_OK) {
1153         return error("curl_easy_perform");
1154       }
1155       curl_easy_cleanup(curl);
1156       data = reinterpret_cast<const l_uint8 *>(buf.data());
1157     }
1158 #else
1159     fprintf(stderr, "Error, this tesseract has no URL support\n");
1160     return false;
1161 #endif
1162   } else {
1163     // Check whether the input file can be read.
1164     if (FILE *file = fopen(filename, "rb")) {
1165       fclose(file);
1166     } else {
1167       fprintf(stderr, "Error, cannot read input file %s: %s\n", filename, strerror(errno));
1168       return false;
1169     }
1170   }
1171 
1172   // Here is our autodetection
1173   int format;
1174   int r =
1175       (data != nullptr) ? findFileFormatBuffer(data, &format) : findFileFormat(filename, &format);
1176 
1177   // Maybe we have a filelist
1178   if (r != 0 || format == IFF_UNKNOWN) {
1179     std::string s;
1180     if (data != nullptr) {
1181       s = buf.c_str();
1182     } else {
1183       std::ifstream t(filename);
1184       std::string u((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>());
1185       s = u.c_str();
1186     }
1187     return ProcessPagesFileList(nullptr, &s, retry_config, timeout_millisec, renderer,
1188                                 tesseract_->tessedit_page_number);
1189   }
1190 
1191   // Maybe we have a TIFF which is potentially multipage
1192   bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS || format == IFF_TIFF_RLE ||
1193                format == IFF_TIFF_G3 || format == IFF_TIFF_G4 || format == IFF_TIFF_LZW ||
1194 #if LIBLEPT_MAJOR_VERSION > 1 || LIBLEPT_MINOR_VERSION > 76
1195                format == IFF_TIFF_JPEG ||
1196 #endif
1197                format == IFF_TIFF_ZIP);
1198 
1199   // Fail early if we can, before producing any output
1200   Pix *pix = nullptr;
1201   if (!tiff) {
1202     pix = (data != nullptr) ? pixReadMem(data, buf.size()) : pixRead(filename);
1203     if (pix == nullptr) {
1204       return false;
1205     }
1206   }
1207 
1208   // Begin the output
1209   if (renderer && !renderer->BeginDocument(document_title.c_str())) {
1210     pixDestroy(&pix);
1211     return false;
1212   }
1213 
1214   // Produce output
1215   r = (tiff) ? ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config, timeout_millisec,
1216                                          renderer, tesseract_->tessedit_page_number)
1217              : ProcessPage(pix, 0, filename, retry_config, timeout_millisec, renderer);
1218 
1219   // Clean up memory as needed
1220   pixDestroy(&pix);
1221 
1222   // End the output
1223   if (!r || (renderer && !renderer->EndDocument())) {
1224     return false;
1225   }
1226   return true;
1227 }
1228 
ProcessPage(Pix * pix,int page_index,const char * filename,const char * retry_config,int timeout_millisec,TessResultRenderer * renderer)1229 bool TessBaseAPI::ProcessPage(Pix *pix, int page_index, const char *filename,
1230                               const char *retry_config, int timeout_millisec,
1231                               TessResultRenderer *renderer) {
1232   SetInputName(filename);
1233   SetImage(pix);
1234   bool failed = false;
1235 
1236   if (tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) {
1237     // Disabled character recognition
1238     if (! std::unique_ptr<const PageIterator>(AnalyseLayout())) {
1239       failed = true;
1240     }
1241   } else if (tesseract_->tessedit_pageseg_mode == PSM_OSD_ONLY) {
1242     failed = FindLines() != 0;
1243   } else if (timeout_millisec > 0) {
1244     // Running with a timeout.
1245     ETEXT_DESC monitor;
1246     monitor.cancel = nullptr;
1247     monitor.cancel_this = nullptr;
1248     monitor.set_deadline_msecs(timeout_millisec);
1249 
1250     // Now run the main recognition.
1251     failed = Recognize(&monitor) < 0;
1252   } else {
1253     // Normal layout and character recognition with no timeout.
1254     failed = Recognize(nullptr) < 0;
1255   }
1256 
1257   if (tesseract_->tessedit_write_images) {
1258     Pix *page_pix = GetThresholdedImage();
1259     std::string output_filename = output_file_ + ".processed";
1260     if (page_index > 0) {
1261       output_filename += std::to_string(page_index);
1262     }
1263     output_filename += ".tif";
1264     pixWrite(output_filename.c_str(), page_pix, IFF_TIFF_G4);
1265     pixDestroy(&page_pix);
1266   }
1267 
1268   if (failed && retry_config != nullptr && retry_config[0] != '\0') {
1269     // Save current config variables before switching modes.
1270     FILE *fp = fopen(kOldVarsFile, "wb");
1271     if (fp == nullptr) {
1272       tprintf("Error, failed to open file \"%s\"\n", kOldVarsFile);
1273     } else {
1274       PrintVariables(fp);
1275       fclose(fp);
1276     }
1277     // Switch to alternate mode for retry.
1278     ReadConfigFile(retry_config);
1279     SetImage(pix);
1280     Recognize(nullptr);
1281     // Restore saved config variables.
1282     ReadConfigFile(kOldVarsFile);
1283   }
1284 
1285   if (renderer && !failed) {
1286     failed = !renderer->AddImage(this);
1287   }
1288 
1289   return !failed;
1290 }
1291 
1292 /**
1293  * Get a left-to-right iterator to the results of LayoutAnalysis and/or
1294  * Recognize. The returned iterator must be deleted after use.
1295  */
GetLTRIterator()1296 LTRResultIterator *TessBaseAPI::GetLTRIterator() {
1297   if (tesseract_ == nullptr || page_res_ == nullptr) {
1298     return nullptr;
1299   }
1300   return new LTRResultIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(),
1301                                thresholder_->GetScaledYResolution(), rect_left_, rect_top_,
1302                                rect_width_, rect_height_);
1303 }
1304 
1305 /**
1306  * Get a reading-order iterator to the results of LayoutAnalysis and/or
1307  * Recognize. The returned iterator must be deleted after use.
1308  * WARNING! This class points to data held within the TessBaseAPI class, and
1309  * therefore can only be used while the TessBaseAPI class still exists and
1310  * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
1311  * DetectOS, or anything else that changes the internal PAGE_RES.
1312  */
GetIterator()1313 ResultIterator *TessBaseAPI::GetIterator() {
1314   if (tesseract_ == nullptr || page_res_ == nullptr) {
1315     return nullptr;
1316   }
1317   return ResultIterator::StartOfParagraph(LTRResultIterator(
1318       page_res_, tesseract_, thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(),
1319       rect_left_, rect_top_, rect_width_, rect_height_));
1320 }
1321 
1322 /**
1323  * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.
1324  * The returned iterator must be deleted after use.
1325  * WARNING! This class points to data held within the TessBaseAPI class, and
1326  * therefore can only be used while the TessBaseAPI class still exists and
1327  * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
1328  * DetectOS, or anything else that changes the internal PAGE_RES.
1329  */
GetMutableIterator()1330 MutableIterator *TessBaseAPI::GetMutableIterator() {
1331   if (tesseract_ == nullptr || page_res_ == nullptr) {
1332     return nullptr;
1333   }
1334   return new MutableIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(),
1335                              thresholder_->GetScaledYResolution(), rect_left_, rect_top_,
1336                              rect_width_, rect_height_);
1337 }
1338 
1339 /** Make a text string from the internal data structures. */
GetUTF8Text()1340 char *TessBaseAPI::GetUTF8Text() {
1341   if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1342     return nullptr;
1343   }
1344   std::string text("");
1345   const std::unique_ptr</*non-const*/ ResultIterator> it(GetIterator());
1346   do {
1347     if (it->Empty(RIL_PARA)) {
1348       continue;
1349     }
1350     const std::unique_ptr<const char[]> para_text(it->GetUTF8Text(RIL_PARA));
1351     text += para_text.get();
1352   } while (it->Next(RIL_PARA));
1353   char *result = new char[text.length() + 1];
1354   strncpy(result, text.c_str(), text.length() + 1);
1355   return result;
1356 }
1357 
AddBoxToTSV(const PageIterator * it,PageIteratorLevel level,std::string & text)1358 static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::string &text) {
1359   int left, top, right, bottom;
1360   it->BoundingBox(level, &left, &top, &right, &bottom);
1361   text += "\t" + std::to_string(left);
1362   text += "\t" + std::to_string(top);
1363   text += "\t" + std::to_string(right - left);
1364   text += "\t" + std::to_string(bottom - top);
1365 }
1366 
1367 /**
1368  * Make a TSV-formatted string from the internal data structures.
1369  * page_number is 0-based but will appear in the output as 1-based.
1370  * Returned string must be freed with the delete [] operator.
1371  */
GetTSVText(int page_number)1372 char *TessBaseAPI::GetTSVText(int page_number) {
1373   if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
1374     return nullptr;
1375   }
1376 
1377   int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
1378   int page_id = page_number + 1; // we use 1-based page numbers.
1379 
1380   int page_num = page_id;
1381   int block_num = 0;
1382   int par_num = 0;
1383   int line_num = 0;
1384   int word_num = 0;
1385 
1386   std::string tsv_str;
1387   tsv_str += "1\t" + std::to_string(page_num); // level 1 - page
1388   tsv_str += "\t" + std::to_string(block_num);
1389   tsv_str += "\t" + std::to_string(par_num);
1390   tsv_str += "\t" + std::to_string(line_num);
1391   tsv_str += "\t" + std::to_string(word_num);
1392   tsv_str += "\t" + std::to_string(rect_left_);
1393   tsv_str += "\t" + std::to_string(rect_top_);
1394   tsv_str += "\t" + std::to_string(rect_width_);
1395   tsv_str += "\t" + std::to_string(rect_height_);
1396   tsv_str += "\t-1\t\n";
1397 
1398   const std::unique_ptr</*non-const*/ ResultIterator> res_it(GetIterator());
1399   while (!res_it->Empty(RIL_BLOCK)) {
1400     if (res_it->Empty(RIL_WORD)) {
1401       res_it->Next(RIL_WORD);
1402       continue;
1403     }
1404 
1405     // Add rows for any new block/paragraph/textline.
1406     if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
1407       block_num++;
1408       par_num = 0;
1409       line_num = 0;
1410       word_num = 0;
1411       tsv_str += "2\t" + std::to_string(page_num); // level 2 - block
1412       tsv_str += "\t" + std::to_string(block_num);
1413       tsv_str += "\t" + std::to_string(par_num);
1414       tsv_str += "\t" + std::to_string(line_num);
1415       tsv_str += "\t" + std::to_string(word_num);
1416       AddBoxToTSV(res_it.get(), RIL_BLOCK, tsv_str);
1417       tsv_str += "\t-1\t\n"; // end of row for block
1418     }
1419     if (res_it->IsAtBeginningOf(RIL_PARA)) {
1420       par_num++;
1421       line_num = 0;
1422       word_num = 0;
1423       tsv_str += "3\t" + std::to_string(page_num); // level 3 - paragraph
1424       tsv_str += "\t" + std::to_string(block_num);
1425       tsv_str += "\t" + std::to_string(par_num);
1426       tsv_str += "\t" + std::to_string(line_num);
1427       tsv_str += "\t" + std::to_string(word_num);
1428       AddBoxToTSV(res_it.get(), RIL_PARA, tsv_str);
1429       tsv_str += "\t-1\t\n"; // end of row for para
1430     }
1431     if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
1432       line_num++;
1433       word_num = 0;
1434       tsv_str += "4\t" + std::to_string(page_num); // level 4 - line
1435       tsv_str += "\t" + std::to_string(block_num);
1436       tsv_str += "\t" + std::to_string(par_num);
1437       tsv_str += "\t" + std::to_string(line_num);
1438       tsv_str += "\t" + std::to_string(word_num);
1439       AddBoxToTSV(res_it.get(), RIL_TEXTLINE, tsv_str);
1440       tsv_str += "\t-1\t\n"; // end of row for line
1441     }
1442 
1443     // Now, process the word...
1444     int left, top, right, bottom;
1445     res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
1446     word_num++;
1447     tsv_str += "5\t" + std::to_string(page_num); // level 5 - word
1448     tsv_str += "\t" + std::to_string(block_num);
1449     tsv_str += "\t" + std::to_string(par_num);
1450     tsv_str += "\t" + std::to_string(line_num);
1451     tsv_str += "\t" + std::to_string(word_num);
1452     tsv_str += "\t" + std::to_string(left);
1453     tsv_str += "\t" + std::to_string(top);
1454     tsv_str += "\t" + std::to_string(right - left);
1455     tsv_str += "\t" + std::to_string(bottom - top);
1456     tsv_str += "\t" + std::to_string(res_it->Confidence(RIL_WORD));
1457     tsv_str += "\t";
1458 
1459     // Increment counts if at end of block/paragraph/textline.
1460     if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) {
1461       lcnt++;
1462     }
1463     if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) {
1464       pcnt++;
1465     }
1466     if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) {
1467       bcnt++;
1468     }
1469 
1470     do {
1471       tsv_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
1472       res_it->Next(RIL_SYMBOL);
1473     } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
1474     tsv_str += "\n"; // end of row
1475     wcnt++;
1476   }
1477 
1478   char *ret = new char[tsv_str.length() + 1];
1479   strcpy(ret, tsv_str.c_str());
1480   return ret;
1481 }
1482 
/** Count of numbers written for each box: the usual 4 plus a page number. */
constexpr int kNumbersPerBlob = 5;
/**
 * Bytes reserved for each of those numbers. Since we use int16_t for ICOORD,
 * assume only 5 digits max.
 */
constexpr int kBytesPerNumber = 5;
/**
 * Multiplier for the max expected text length: (kBytesPerNumber + space)
 * * kNumbersPerBlob plus the newline. Add to this the
 * original UTF8 characters, and one kMaxBytesPerLine for safety.
 */
constexpr int kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1;
/** Max bytes in the decimal representation of int64_t. */
constexpr int kBytesPer64BitNumber = 20;
1498 /**
1499  * A maximal single box could occupy kNumbersPerBlob numbers at
1500  * kBytesPer64BitNumber digits (if someone sneaks in a 64 bit value) and a
1501  * space plus the newline and the maximum length of a UNICHAR.
1502  * Test against this on each iteration for safety.
1503  */
1504 const int kMaxBytesPerLine = kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 + UNICHAR_LEN;
1505 
1506 /**
1507  * The recognized text is returned as a char* which is coded
1508  * as a UTF8 box file.
1509  * page_number is a 0-base page index that will appear in the box file.
1510  * Returned string must be freed with the delete [] operator.
1511  */
GetBoxText(int page_number)1512 char *TessBaseAPI::GetBoxText(int page_number) {
1513   if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1514     return nullptr;
1515   }
1516   int blob_count;
1517   int utf8_length = TextLength(&blob_count);
1518   int total_length = blob_count * kBytesPerBoxFileLine + utf8_length + kMaxBytesPerLine;
1519   char *result = new char[total_length];
1520   result[0] = '\0';
1521   int output_length = 0;
1522   LTRResultIterator *it = GetLTRIterator();
1523   do {
1524     int left, top, right, bottom;
1525     if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {
1526       const std::unique_ptr</*non-const*/ char[]> text(it->GetUTF8Text(RIL_SYMBOL));
1527       // Tesseract uses space for recognition failure. Fix to a reject
1528       // character, kTesseractReject so we don't create illegal box files.
1529       for (int i = 0; text[i] != '\0'; ++i) {
1530         if (text[i] == ' ') {
1531           text[i] = kTesseractReject;
1532         }
1533       }
1534       snprintf(result + output_length, total_length - output_length, "%s %d %d %d %d %d\n",
1535                text.get(), left, image_height_ - bottom, right, image_height_ - top, page_number);
1536       output_length += strlen(result + output_length);
1537       // Just in case...
1538       if (output_length + kMaxBytesPerLine > total_length) {
1539         break;
1540       }
1541     }
1542   } while (it->Next(RIL_SYMBOL));
1543   delete it;
1544   return result;
1545 }
1546 
/**
 * Conversion table for non-latin characters.
 * Maps characters out of the latin set into the latin set: entry i of
 * kUniChs is replaced by entry i of kLatinChs when producing UNLV output.
 * The table is scanned until the terminating 0, so both arrays must stay
 * 0-terminated and in matching order.
 * TODO(rays) incorporate this translation into unicharset.
 */
const int kUniChs[] = {0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0};
/** Latin chars corresponding to the unicode chars above. */
const int kLatinChs[] = {0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0};
1555 
/**
 * The recognized text is returned as a char* which is coded
 * as UNLV format Latin-1 with specific reject and suspect codes.
 * Runs recognition first if it has not been done yet; returns nullptr if
 * the API is not initialized or recognition fails.
 * Every output character is a single byte: characters listed in kUniChs are
 * mapped through kLatinChs, and anything still above 0xff is written as
 * kUNLVReject. Characters flagged in the word's reject map are preceded by
 * kUNLVSuspect.
 * Returned string must be freed with the delete [] operator.
 */
char *TessBaseAPI::GetUNLVText() {
  if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
    return nullptr;
  }
  // Set once a reject char has been written for a crunched word; cleared by
  // a normally processed word or by an end of line.
  bool tilde_crunch_written = false;
  // Starts true so that the first word written gets no leading space.
  bool last_char_was_newline = true;
  // Set when kUNLVReject is written; cleared when a normal character, the
  // separating space before a crunched word, or a newline is written.
  bool last_char_was_tilde = false;

  // NOTE(review): the buffer is sized by TextLength(), which budgets the
  // bytes of each best_choice plus 2 per word, 1 per rejected character and
  // 2 overall. Presumably that always covers what is written below (at most
  // one byte per unichar) - confirm for words without a best_choice.
  int total_length = TextLength(nullptr);
  PAGE_RES_IT page_res_it(page_res_);
  char *result = new char[total_length];
  char *ptr = result;
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
    WERD_RES *word = page_res_it.word();
    // Process the current word.
    if (word->unlv_crunch_mode != CR_NONE) {
      // Crunched word: emit at most a single reject char for it, and only if
      // none has been written for the current run of crunched words or the
      // word is a CR_KEEP_SPACE word preceded by a definite space.
      if (word->unlv_crunch_mode != CR_DELETE &&
          (!tilde_crunch_written ||
           (word->unlv_crunch_mode == CR_KEEP_SPACE && word->word->space() > 0 &&
            !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {
        if (!word->word->flag(W_BOL) && word->word->space() > 0 && !word->word->flag(W_FUZZY_NON) &&
            !word->word->flag(W_FUZZY_SP)) {
          /* Write a space to separate from preceding good text */
          *ptr++ = ' ';
          last_char_was_tilde = false;
        }
        if (!last_char_was_tilde) {
          // Write a reject char.
          last_char_was_tilde = true;
          *ptr++ = kUNLVReject;
          tilde_crunch_written = true;
          last_char_was_newline = false;
        }
      }
    } else {
      // NORMAL PROCESSING of non tilde crunched words.
      tilde_crunch_written = false;
      tesseract_->set_unlv_suspects(word);
      const char *wordstr = word->best_choice->unichar_string().c_str();
      // lengths[i] is the number of bytes of wordstr taken by unichar i.
      const auto &lengths = word->best_choice->unichar_lengths();
      int length = lengths.length();
      int i = 0;      // Index of the current unichar.
      int offset = 0; // Byte offset of the current unichar in wordstr.

      if (last_char_was_tilde && word->word->space() == 0 && wordstr[offset] == ' ') {
        // Prevent adjacent tilde across words - we know that adjacent tildes
        // within words have been removed.
        // Skip the first character.
        offset = lengths[i++];
      }
      if (i < length && wordstr[offset] != 0) {
        // Separate from the previous word unless this one starts a line.
        if (!last_char_was_newline) {
          *ptr++ = ' ';
        } else {
          last_char_was_newline = false;
        }
        // Walk the word one unichar at a time, advancing offset by the byte
        // length of each unichar.
        for (; i < length; offset += lengths[i++]) {
          if (wordstr[offset] == ' ' || wordstr[offset] == kTesseractReject) {
            // Spaces and Tesseract rejects both become the UNLV reject code.
            *ptr++ = kUNLVReject;
            last_char_was_tilde = true;
          } else {
            if (word->reject_map[i].rejected()) {
              // Mark the following character as suspect.
              *ptr++ = kUNLVSuspect;
            }
            UNICHAR ch(wordstr + offset, lengths[i]);
            int uni_ch = ch.first_uni();
            // Translate via the kUniChs -> kLatinChs table where listed.
            for (int j = 0; kUniChs[j] != 0; ++j) {
              if (kUniChs[j] == uni_ch) {
                uni_ch = kLatinChs[j];
                break;
              }
            }
            if (uni_ch <= 0xff) {
              *ptr++ = static_cast<char>(uni_ch);
              last_char_was_tilde = false;
            } else {
              // Does not fit in a single byte: output a reject instead.
              *ptr++ = kUNLVReject;
              last_char_was_tilde = true;
            }
          }
        }
      }
    }
    if (word->word->flag(W_EOL) && !last_char_was_newline) {
      /* Add a new line output */
      *ptr++ = '\n';
      tilde_crunch_written = false;
      last_char_was_newline = true;
      last_char_was_tilde = false;
    }
  }
  // Always finish with a newline and the NUL terminator.
  *ptr++ = '\n';
  *ptr = '\0';
  return result;
}
1656 
1657 #ifndef DISABLED_LEGACY_ENGINE
1658 
1659 /**
1660  * Detect the orientation of the input image and apparent script (alphabet).
1661  * orient_deg is the detected clockwise rotation of the input image in degrees
1662  * (0, 90, 180, 270)
1663  * orient_conf is the confidence (15.0 is reasonably confident)
1664  * script_name is an ASCII string, the name of the script, e.g. "Latin"
1665  * script_conf is confidence level in the script
1666  * Returns true on success and writes values to each parameter as an output
1667  */
DetectOrientationScript(int * orient_deg,float * orient_conf,const char ** script_name,float * script_conf)1668 bool TessBaseAPI::DetectOrientationScript(int *orient_deg, float *orient_conf,
1669                                           const char **script_name, float *script_conf) {
1670   OSResults osr;
1671 
1672   bool osd = DetectOS(&osr);
1673   if (!osd) {
1674     return false;
1675   }
1676 
1677   int orient_id = osr.best_result.orientation_id;
1678   int script_id = osr.get_best_script(orient_id);
1679   if (orient_conf) {
1680     *orient_conf = osr.best_result.oconfidence;
1681   }
1682   if (orient_deg) {
1683     *orient_deg = orient_id * 90; // convert quadrant to degrees
1684   }
1685 
1686   if (script_name) {
1687     const char *script = osr.unicharset->get_script_from_script_id(script_id);
1688 
1689     *script_name = script;
1690   }
1691 
1692   if (script_conf) {
1693     *script_conf = osr.best_result.sconfidence;
1694   }
1695 
1696   return true;
1697 }
1698 
1699 /**
1700  * The recognized text is returned as a char* which is coded
1701  * as UTF8 and must be freed with the delete [] operator.
1702  * page_number is a 0-based page index that will appear in the osd file.
1703  */
GetOsdText(int page_number)1704 char *TessBaseAPI::GetOsdText(int page_number) {
1705   int orient_deg;
1706   float orient_conf;
1707   const char *script_name;
1708   float script_conf;
1709 
1710   if (!DetectOrientationScript(&orient_deg, &orient_conf, &script_name, &script_conf)) {
1711     return nullptr;
1712   }
1713 
1714   // clockwise rotation needed to make the page upright
1715   int rotate = OrientationIdToValue(orient_deg / 90);
1716 
1717   std::stringstream stream;
1718   // Use "C" locale (needed for float values orient_conf and script_conf).
1719   stream.imbue(std::locale::classic());
1720   // Use fixed notation with 2 digits after the decimal point for float values.
1721   stream.precision(2);
1722   stream << std::fixed << "Page number: " << page_number << "\n"
1723          << "Orientation in degrees: " << orient_deg << "\n"
1724          << "Rotate: " << rotate << "\n"
1725          << "Orientation confidence: " << orient_conf << "\n"
1726          << "Script: " << script_name << "\n"
1727          << "Script confidence: " << script_conf << "\n";
1728   const std::string &text = stream.str();
1729   char *result = new char[text.length() + 1];
1730   strcpy(result, text.c_str());
1731   return result;
1732 }
1733 
1734 #endif // ndef DISABLED_LEGACY_ENGINE
1735 
1736 /** Returns the average word confidence for Tesseract page result. */
MeanTextConf()1737 int TessBaseAPI::MeanTextConf() {
1738   int *conf = AllWordConfidences();
1739   if (!conf) {
1740     return 0;
1741   }
1742   int sum = 0;
1743   int *pt = conf;
1744   while (*pt >= 0) {
1745     sum += *pt++;
1746   }
1747   if (pt != conf) {
1748     sum /= pt - conf;
1749   }
1750   delete[] conf;
1751   return sum;
1752 }
1753 
1754 /** Returns an array of all word confidences, terminated by -1. */
AllWordConfidences()1755 int *TessBaseAPI::AllWordConfidences() {
1756   if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1757     return nullptr;
1758   }
1759   int n_word = 0;
1760   PAGE_RES_IT res_it(page_res_);
1761   for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) {
1762     n_word++;
1763   }
1764 
1765   int *conf = new int[n_word + 1];
1766   n_word = 0;
1767   for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) {
1768     WERD_RES *word = res_it.word();
1769     WERD_CHOICE *choice = word->best_choice;
1770     int w_conf = static_cast<int>(100 + 5 * choice->certainty());
1771     // This is the eq for converting Tesseract confidence to 1..100
1772     if (w_conf < 0) {
1773       w_conf = 0;
1774     }
1775     if (w_conf > 100) {
1776       w_conf = 100;
1777     }
1778     conf[n_word++] = w_conf;
1779   }
1780   conf[n_word] = -1;
1781   return conf;
1782 }
1783 
1784 #ifndef DISABLED_LEGACY_ENGINE
1785 /**
1786  * Applies the given word to the adaptive classifier if possible.
1787  * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can
1788  * tell the boundaries of the graphemes.
1789  * Assumes that SetImage/SetRectangle have been used to set the image
1790  * to the given word. The mode arg should be PSM_SINGLE_WORD or
1791  * PSM_CIRCLE_WORD, as that will be used to control layout analysis.
1792  * The currently set PageSegMode is preserved.
1793  * Returns false if adaption was not possible for some reason.
1794  */
AdaptToWordStr(PageSegMode mode,const char * wordstr)1795 bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char *wordstr) {
1796   int debug = 0;
1797   GetIntVariable("applybox_debug", &debug);
1798   bool success = true;
1799   PageSegMode current_psm = GetPageSegMode();
1800   SetPageSegMode(mode);
1801   SetVariable("classify_enable_learning", "0");
1802   const std::unique_ptr<const char[]> text(GetUTF8Text());
1803   if (debug) {
1804     tprintf("Trying to adapt \"%s\" to \"%s\"\n", text.get(), wordstr);
1805   }
1806   if (text != nullptr) {
1807     PAGE_RES_IT it(page_res_);
1808     WERD_RES *word_res = it.word();
1809     if (word_res != nullptr) {
1810       word_res->word->set_text(wordstr);
1811       // Check to see if text matches wordstr.
1812       int w = 0;
1813       int t;
1814       for (t = 0; text[t] != '\0'; ++t) {
1815         if (text[t] == '\n' || text[t] == ' ') {
1816           continue;
1817         }
1818         while (wordstr[w] == ' ') {
1819           ++w;
1820         }
1821         if (text[t] != wordstr[w]) {
1822           break;
1823         }
1824         ++w;
1825       }
1826       if (text[t] != '\0' || wordstr[w] != '\0') {
1827         // No match.
1828         delete page_res_;
1829         std::vector<TBOX> boxes;
1830         page_res_ = tesseract_->SetupApplyBoxes(boxes, block_list_);
1831         tesseract_->ReSegmentByClassification(page_res_);
1832         tesseract_->TidyUp(page_res_);
1833         PAGE_RES_IT pr_it(page_res_);
1834         if (pr_it.word() == nullptr) {
1835           success = false;
1836         } else {
1837           word_res = pr_it.word();
1838         }
1839       } else {
1840         word_res->BestChoiceToCorrectText();
1841       }
1842       if (success) {
1843         tesseract_->EnableLearning = true;
1844         tesseract_->LearnWord(nullptr, word_res);
1845       }
1846     } else {
1847       success = false;
1848     }
1849   } else {
1850     success = false;
1851   }
1852   SetPageSegMode(current_psm);
1853   return success;
1854 }
1855 #endif // ndef DISABLED_LEGACY_ENGINE
1856 
1857 /**
1858  * Free up recognition results and any stored image data, without actually
1859  * freeing any recognition data that would be time-consuming to reload.
1860  * Afterwards, you must call SetImage or TesseractRect before doing
1861  * any Recognize or Get* operation.
1862  */
Clear()1863 void TessBaseAPI::Clear() {
1864   if (thresholder_ != nullptr) {
1865     thresholder_->Clear();
1866   }
1867   ClearResults();
1868   if (tesseract_ != nullptr) {
1869     SetInputImage(nullptr);
1870   }
1871 }
1872 
1873 /**
1874  * Close down tesseract and free up all memory. End() is equivalent to
1875  * destructing and reconstructing your TessBaseAPI.
1876  * Once End() has been used, none of the other API functions may be used
1877  * other than Init and anything declared above it in the class definition.
1878  */
End()1879 void TessBaseAPI::End() {
1880   Clear();
1881   delete thresholder_;
1882   thresholder_ = nullptr;
1883   delete page_res_;
1884   page_res_ = nullptr;
1885   delete block_list_;
1886   block_list_ = nullptr;
1887   if (paragraph_models_ != nullptr) {
1888     for (auto model : *paragraph_models_) {
1889       delete model;
1890     }
1891     delete paragraph_models_;
1892     paragraph_models_ = nullptr;
1893   }
1894 #ifndef DISABLED_LEGACY_ENGINE
1895   if (osd_tesseract_ == tesseract_) {
1896     osd_tesseract_ = nullptr;
1897   }
1898   delete osd_tesseract_;
1899   osd_tesseract_ = nullptr;
1900   delete equ_detect_;
1901   equ_detect_ = nullptr;
1902 #endif // ndef DISABLED_LEGACY_ENGINE
1903   delete tesseract_;
1904   tesseract_ = nullptr;
1905   input_file_.clear();
1906   output_file_.clear();
1907   datapath_.clear();
1908   language_.clear();
1909 }
1910 
1911 // Clear any library-level memory caches.
1912 // There are a variety of expensive-to-load constant data structures (mostly
1913 // language dictionaries) that are cached globally -- surviving the Init()
1914 // and End() of individual TessBaseAPI's.  This function allows the clearing
1915 // of these caches.
ClearPersistentCache()1916 void TessBaseAPI::ClearPersistentCache() {
1917   Dict::GlobalDawgCache()->DeleteUnusedDawgs();
1918 }
1919 
1920 /**
1921  * Check whether a word is valid according to Tesseract's language model
1922  * returns 0 if the word is invalid, non-zero if valid
1923  */
IsValidWord(const char * word) const1924 int TessBaseAPI::IsValidWord(const char *word) const {
1925   return tesseract_->getDict().valid_word(word);
1926 }
1927 // Returns true if utf8_character is defined in the UniCharset.
IsValidCharacter(const char * utf8_character) const1928 bool TessBaseAPI::IsValidCharacter(const char *utf8_character) const {
1929   return tesseract_->unicharset.contains_unichar(utf8_character);
1930 }
1931 
1932 // TODO(rays) Obsolete this function and replace with a more aptly named
1933 // function that returns image coordinates rather than tesseract coordinates.
GetTextDirection(int * out_offset,float * out_slope)1934 bool TessBaseAPI::GetTextDirection(int *out_offset, float *out_slope) {
1935   const std::unique_ptr<const PageIterator> it(AnalyseLayout());
1936   if (it == nullptr) {
1937     return false;
1938   }
1939   int x1, x2, y1, y2;
1940   it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
1941   // Calculate offset and slope (NOTE: Kind of ugly)
1942   if (x2 <= x1) {
1943     x2 = x1 + 1;
1944   }
1945   // Convert the point pair to slope/offset of the baseline (in image coords.)
1946   *out_slope = static_cast<float>(y2 - y1) / (x2 - x1);
1947   *out_offset = static_cast<int>(y1 - *out_slope * x1);
1948   // Get the y-coord of the baseline at the left and right edges of the
1949   // textline's bounding box.
1950   int left, top, right, bottom;
1951   if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom)) {
1952     return false;
1953   }
1954   int left_y = IntCastRounded(*out_slope * left + *out_offset);
1955   int right_y = IntCastRounded(*out_slope * right + *out_offset);
1956   // Shift the baseline down so it passes through the nearest bottom-corner
1957   // of the textline's bounding box. This is the difference between the y
1958   // at the lowest (max) edge of the box and the actual box bottom.
1959   *out_offset += bottom - std::max(left_y, right_y);
1960   // Switch back to bottom-up tesseract coordinates. Requires negation of
1961   // the slope and height - offset for the offset.
1962   *out_slope = -*out_slope;
1963   *out_offset = rect_height_ - *out_offset;
1964 
1965   return true;
1966 }
1967 
1968 /** Sets Dict::letter_is_okay_ function to point to the given function. */
SetDictFunc(DictFunc f)1969 void TessBaseAPI::SetDictFunc(DictFunc f) {
1970   if (tesseract_ != nullptr) {
1971     tesseract_->getDict().letter_is_okay_ = f;
1972   }
1973 }
1974 
1975 /**
1976  * Sets Dict::probability_in_context_ function to point to the given
1977  * function.
1978  *
1979  * @param f A single function that returns the probability of the current
1980  * "character" (in general a utf-8 string), given the context of a previous
1981  * utf-8 string.
1982  */
SetProbabilityInContextFunc(ProbabilityInContextFunc f)1983 void TessBaseAPI::SetProbabilityInContextFunc(ProbabilityInContextFunc f) {
1984   if (tesseract_ != nullptr) {
1985     tesseract_->getDict().probability_in_context_ = f;
1986     // Set it for the sublangs too.
1987     int num_subs = tesseract_->num_sub_langs();
1988     for (int i = 0; i < num_subs; ++i) {
1989       tesseract_->get_sub_lang(i)->getDict().probability_in_context_ = f;
1990     }
1991   }
1992 }
1993 
1994 /** Common code for setting the image. */
InternalSetImage()1995 bool TessBaseAPI::InternalSetImage() {
1996   if (tesseract_ == nullptr) {
1997     tprintf("Please call Init before attempting to set an image.\n");
1998     return false;
1999   }
2000   if (thresholder_ == nullptr) {
2001     thresholder_ = new ImageThresholder;
2002   }
2003   ClearResults();
2004   return true;
2005 }
2006 
2007 /**
2008  * Run the thresholder to make the thresholded image, returned in pix,
2009  * which must not be nullptr. *pix must be initialized to nullptr, or point
2010  * to an existing pixDestroyable Pix.
2011  * The usual argument to Threshold is Tesseract::mutable_pix_binary().
2012  */
Threshold(Pix ** pix)2013 bool TessBaseAPI::Threshold(Pix **pix) {
2014   ASSERT_HOST(pix != nullptr);
2015   if (*pix != nullptr) {
2016     pixDestroy(pix);
2017   }
2018   // Zero resolution messes up the algorithms, so make sure it is credible.
2019   int user_dpi = 0;
2020   GetIntVariable("user_defined_dpi", &user_dpi);
2021   int y_res = thresholder_->GetScaledYResolution();
2022   if (user_dpi && (user_dpi < kMinCredibleResolution || user_dpi > kMaxCredibleResolution)) {
2023     tprintf(
2024         "Warning: User defined image dpi is outside of expected range "
2025         "(%d - %d)!\n",
2026         kMinCredibleResolution, kMaxCredibleResolution);
2027   }
2028   // Always use user defined dpi
2029   if (user_dpi) {
2030     thresholder_->SetSourceYResolution(user_dpi);
2031   } else if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) {
2032     if (y_res != 0) {
2033       // Show warning only if a resolution was given.
2034       tprintf("Warning: Invalid resolution %d dpi. Using %d instead.\n",
2035               y_res, kMinCredibleResolution);
2036     }
2037     thresholder_->SetSourceYResolution(kMinCredibleResolution);
2038   }
2039 
2040   auto thresholding_method = static_cast<ThresholdMethod>(static_cast<int>(tesseract_->thresholding_method));
2041 
2042   if (thresholding_method == ThresholdMethod::Otsu) {
2043     Image pix_binary(*pix);
2044     if (!thresholder_->ThresholdToPix(&pix_binary)) {
2045       return false;
2046     }
2047     *pix = pix_binary;
2048 
2049     if (!thresholder_->IsBinary()) {
2050       tesseract_->set_pix_thresholds(thresholder_->GetPixRectThresholds());
2051       tesseract_->set_pix_grey(thresholder_->GetPixRectGrey());
2052     } else {
2053       tesseract_->set_pix_thresholds(nullptr);
2054       tesseract_->set_pix_grey(nullptr);
2055     }
2056   } else {
2057     auto [ok, pix_grey, pix_binary, pix_thresholds] = thresholder_->Threshold(this, thresholding_method);
2058 
2059     if (!ok) {
2060       return false;
2061     }
2062     *pix = pix_binary;
2063 
2064     tesseract_->set_pix_thresholds(pix_thresholds);
2065     tesseract_->set_pix_grey(pix_grey);
2066   }
2067 
2068   thresholder_->GetImageSizes(&rect_left_, &rect_top_, &rect_width_, &rect_height_, &image_width_,
2069                               &image_height_);
2070 
2071   // Set the internal resolution that is used for layout parameters from the
2072   // estimated resolution, rather than the image resolution, which may be
2073   // fabricated, but we will use the image resolution, if there is one, to
2074   // report output point sizes.
2075   int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(),
2076                                   kMinCredibleResolution, kMaxCredibleResolution);
2077   if (estimated_res != thresholder_->GetScaledEstimatedResolution()) {
2078     tprintf(
2079         "Estimated internal resolution %d out of range! "
2080         "Corrected to %d.\n",
2081         thresholder_->GetScaledEstimatedResolution(), estimated_res);
2082   }
2083   tesseract_->set_source_resolution(estimated_res);
2084   return true;
2085 }
2086 
/**
 * Find lines from the image making the BLOCK_LIST.
 * Thresholds the image if there is no binary image yet, sets up the
 * optional equation detector and orientation/script detection engine
 * (legacy engine builds only), then segments the page into block_list_.
 * Returns 0 on success, or immediately if block_list_ already has content,
 * and -1 if no image has been set, thresholding fails or page segmentation
 * fails.
 */
int TessBaseAPI::FindLines() {
  if (thresholder_ == nullptr || thresholder_->IsEmpty()) {
    tprintf("Please call SetImage before attempting recognition.\n");
    return -1;
  }
  // Results of a finished recognition are stale; start from scratch.
  if (recognition_done_) {
    ClearResults();
  }
  // Blocks are already available, so there is nothing left to do.
  if (!block_list_->empty()) {
    return 0;
  }
  // Create an engine on demand if none exists yet.
  if (tesseract_ == nullptr) {
    tesseract_ = new Tesseract;
#ifndef DISABLED_LEGACY_ENGINE
    tesseract_->InitAdaptiveClassifier(nullptr);
#endif
  }
  // Only threshold if tesseract_ has no binary image yet.
  if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
    return -1;
  }

  tesseract_->PrepareForPageseg();

#ifndef DISABLED_LEGACY_ENGINE
  if (tesseract_->textord_equation_detect) {
    // The equation detector is created once and needs a data path.
    if (equ_detect_ == nullptr && !datapath_.empty()) {
      equ_detect_ = new EquationDetect(datapath_.c_str(), nullptr);
    }
    if (equ_detect_ == nullptr) {
      tprintf("Warning: Could not set equation detector\n");
    } else {
      tesseract_->SetEquationDetect(equ_detect_);
    }
  }
#endif // ndef DISABLED_LEGACY_ENGINE

  // Engine used for orientation and script detection; stays nullptr when
  // OSD is not enabled or could not be set up.
  Tesseract *osd_tess = osd_tesseract_;
  OSResults osr;
#ifndef DISABLED_LEGACY_ENGINE
  if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == nullptr) {
    if (strcmp(language_.c_str(), "osd") == 0) {
      // The main engine was loaded with the osd language, so reuse it.
      osd_tess = tesseract_;
    } else {
      // Load a separate engine with the "osd" language. On any failure it
      // is deleted again and osd_tess stays nullptr.
      osd_tesseract_ = new Tesseract;
      TessdataManager mgr(reader_);
      if (datapath_.empty()) {
        tprintf(
            "Warning: Auto orientation and script detection requested,"
            " but data path is undefined\n");
        delete osd_tesseract_;
        osd_tesseract_ = nullptr;
      } else if (osd_tesseract_->init_tesseract(datapath_.c_str(), "", "osd", OEM_TESSERACT_ONLY,
                                                nullptr, 0, nullptr, nullptr, false, &mgr) == 0) {
        osd_tess = osd_tesseract_;
        osd_tesseract_->set_source_resolution(thresholder_->GetSourceYResolution());
      } else {
        tprintf(
            "Warning: Auto orientation and script detection requested,"
            " but osd language failed to load\n");
        delete osd_tesseract_;
        osd_tesseract_ = nullptr;
      }
    }
  }
#endif // ndef DISABLED_LEGACY_ENGINE

  if (tesseract_->SegmentPage(input_file_.c_str(), block_list_, osd_tess, &osr) < 0) {
    return -1;
  }

  // If Devanagari is being recognized, we use different images for page seg
  // and for OCR.
  tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr);
  return 0;
}
2163 
2164 /** Delete the pageres and clear the block list ready for a new page. */
ClearResults()2165 void TessBaseAPI::ClearResults() {
2166   if (tesseract_ != nullptr) {
2167     tesseract_->Clear();
2168   }
2169   delete page_res_;
2170   page_res_ = nullptr;
2171   recognition_done_ = false;
2172   if (block_list_ == nullptr) {
2173     block_list_ = new BLOCK_LIST;
2174   } else {
2175     block_list_->clear();
2176   }
2177   if (paragraph_models_ != nullptr) {
2178     for (auto model : *paragraph_models_) {
2179       delete model;
2180     }
2181     delete paragraph_models_;
2182     paragraph_models_ = nullptr;
2183   }
2184 }
2185 
2186 /**
2187  * Return the length of the output text string, as UTF8, assuming
2188  * liberally two spacing marks after each word (as paragraphs end with two
2189  * newlines), and assuming a single character reject marker for each rejected
2190  * character.
2191  * Also return the number of recognized blobs in blob_count.
2192  */
TextLength(int * blob_count) const2193 int TessBaseAPI::TextLength(int *blob_count) const {
2194   if (tesseract_ == nullptr || page_res_ == nullptr) {
2195     return 0;
2196   }
2197 
2198   PAGE_RES_IT page_res_it(page_res_);
2199   int total_length = 2;
2200   int total_blobs = 0;
2201   // Iterate over the data structures to extract the recognition result.
2202   for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2203     WERD_RES *word = page_res_it.word();
2204     WERD_CHOICE *choice = word->best_choice;
2205     if (choice != nullptr) {
2206       total_blobs += choice->length() + 2;
2207       total_length += choice->unichar_string().length() + 2;
2208       for (int i = 0; i < word->reject_map.length(); ++i) {
2209         if (word->reject_map[i].rejected()) {
2210           ++total_length;
2211         }
2212       }
2213     }
2214   }
2215   if (blob_count != nullptr) {
2216     *blob_count = total_blobs;
2217   }
2218   return total_length;
2219 }
2220 
2221 #ifndef DISABLED_LEGACY_ENGINE
2222 /**
2223  * Estimates the Orientation And Script of the image.
2224  * Returns true if the image was processed successfully.
2225  */
DetectOS(OSResults * osr)2226 bool TessBaseAPI::DetectOS(OSResults *osr) {
2227   if (tesseract_ == nullptr) {
2228     return false;
2229   }
2230   ClearResults();
2231   if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
2232     return false;
2233   }
2234 
2235   if (input_file_.empty()) {
2236     input_file_ = kInputFile;
2237   }
2238   return orientation_and_script_detection(input_file_.c_str(), osr, tesseract_) > 0;
2239 }
2240 #endif // #ifndef DISABLED_LEGACY_ENGINE
2241 
set_min_orientation_margin(double margin)2242 void TessBaseAPI::set_min_orientation_margin(double margin) {
2243   tesseract_->min_orientation_margin.set_value(margin);
2244 }
2245 
2246 /**
2247  * Return text orientation of each block as determined in an earlier page layout
2248  * analysis operation. Orientation is returned as the number of ccw 90-degree
2249  * rotations (in [0..3]) required to make the text in the block upright
2250  * (readable). Note that this may not necessary be the block orientation
2251  * preferred for recognition (such as the case of vertical CJK text).
2252  *
2253  * Also returns whether the text in the block is believed to have vertical
2254  * writing direction (when in an upright page orientation).
2255  *
2256  * The returned array is of length equal to the number of text blocks, which may
2257  * be less than the total number of blocks. The ordering is intended to be
2258  * consistent with GetTextLines().
2259  */
GetBlockTextOrientations(int ** block_orientation,bool ** vertical_writing)2260 void TessBaseAPI::GetBlockTextOrientations(int **block_orientation, bool **vertical_writing) {
2261   delete[] * block_orientation;
2262   *block_orientation = nullptr;
2263   delete[] * vertical_writing;
2264   *vertical_writing = nullptr;
2265   BLOCK_IT block_it(block_list_);
2266 
2267   block_it.move_to_first();
2268   int num_blocks = 0;
2269   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
2270     if (!block_it.data()->pdblk.poly_block()->IsText()) {
2271       continue;
2272     }
2273     ++num_blocks;
2274   }
2275   if (!num_blocks) {
2276     tprintf("WARNING: Found no blocks\n");
2277     return;
2278   }
2279   *block_orientation = new int[num_blocks];
2280   *vertical_writing = new bool[num_blocks];
2281   block_it.move_to_first();
2282   int i = 0;
2283   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
2284     if (!block_it.data()->pdblk.poly_block()->IsText()) {
2285       continue;
2286     }
2287     FCOORD re_rotation = block_it.data()->re_rotation();
2288     float re_theta = re_rotation.angle();
2289     FCOORD classify_rotation = block_it.data()->classify_rotation();
2290     float classify_theta = classify_rotation.angle();
2291     double rot_theta = -(re_theta - classify_theta) * 2.0 / M_PI;
2292     if (rot_theta < 0) {
2293       rot_theta += 4;
2294     }
2295     int num_rotations = static_cast<int>(rot_theta + 0.5);
2296     (*block_orientation)[i] = num_rotations;
2297     // The classify_rotation is non-zero only if the text has vertical
2298     // writing direction.
2299     (*vertical_writing)[i] = classify_rotation.y() != 0.0f;
2300     ++i;
2301   }
2302 }
2303 
DetectParagraphs(bool after_text_recognition)2304 void TessBaseAPI::DetectParagraphs(bool after_text_recognition) {
2305   int debug_level = 0;
2306   GetIntVariable("paragraph_debug_level", &debug_level);
2307   if (paragraph_models_ == nullptr) {
2308     paragraph_models_ = new std::vector<ParagraphModel *>;
2309   }
2310   MutableIterator *result_it = GetMutableIterator();
2311   do { // Detect paragraphs for this block
2312     std::vector<ParagraphModel *> models;
2313     ::tesseract::DetectParagraphs(debug_level, after_text_recognition, result_it, &models);
2314     paragraph_models_->insert(paragraph_models_->end(), models.begin(), models.end());
2315   } while (result_it->Next(RIL_BLOCK));
2316   delete result_it;
2317 }
2318 
2319 /** This method returns the string form of the specified unichar. */
GetUnichar(int unichar_id) const2320 const char *TessBaseAPI::GetUnichar(int unichar_id) const {
2321   return tesseract_->unicharset.id_to_unichar(unichar_id);
2322 }
2323 
2324 /** Return the pointer to the i-th dawg loaded into tesseract_ object. */
GetDawg(int i) const2325 const Dawg *TessBaseAPI::GetDawg(int i) const {
2326   if (tesseract_ == nullptr || i >= NumDawgs()) {
2327     return nullptr;
2328   }
2329   return tesseract_->getDict().GetDawg(i);
2330 }
2331 
2332 /** Return the number of dawgs loaded into tesseract_ object. */
NumDawgs() const2333 int TessBaseAPI::NumDawgs() const {
2334   return tesseract_ == nullptr ? 0 : tesseract_->getDict().NumDawgs();
2335 }
2336 
/**
 * Escape a char string - replace <>&"' with HTML codes.
 *
 * @param text NUL-terminated string to escape; nullptr is treated as empty.
 * @return a copy of text with each of the five characters replaced by its
 *         HTML entity and all other bytes copied unchanged.
 */
std::string HOcrEscape(const char *text) {
  std::string ret;
  if (text == nullptr) {
    return ret;
  }
  for (const char *ptr = text; *ptr != '\0'; ++ptr) {
    switch (*ptr) {
      case '<':
        ret += "&lt;";
        break;
      case '>':
        ret += "&gt;";
        break;
      case '&':
        ret += "&amp;";
        break;
      case '"':
        ret += "&quot;";
        break;
      case '\'':
        ret += "&#39;";
        break;
      default:
        ret += *ptr;
        break;
    }
  }
  return ret;
}
2364 
2365 } // namespace tesseract
2366