1 /**********************************************************************
2 * File: baseapi.cpp
3 * Description: Simple API for calling tesseract.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 2006, Google Inc.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 #define _USE_MATH_DEFINES // for M_PI
20
21 // Include automatically generated configuration file if running autoconf.
22 #ifdef HAVE_CONFIG_H
23 # include "config_auto.h"
24 #endif
25
26 #include "boxword.h" // for BoxWord
27 #include "coutln.h" // for C_OUTLINE_IT, C_OUTLINE_LIST
28 #include "dawg_cache.h" // for DawgCache
29 #include "dict.h" // for Dict
30 #include "elst.h" // for ELIST_ITERATOR, ELISTIZE, ELISTIZEH
31 #include "environ.h" // for l_uint8
32 #ifndef DISABLED_LEGACY_ENGINE
33 #include "equationdetect.h" // for EquationDetect, destructor of equ_detect_
34 #endif // ndef DISABLED_LEGACY_ENGINE
35 #include "errcode.h" // for ASSERT_HOST
36 #include "helpers.h" // for IntCastRounded, chomp_string
37 #include "host.h" // for MAX_PATH
38 #include "imageio.h" // for IFF_TIFF_G4, IFF_TIFF, IFF_TIFF_G3, ...
39 #ifndef DISABLED_LEGACY_ENGINE
40 # include "intfx.h" // for INT_FX_RESULT_STRUCT
41 #endif
42 #include "mutableiterator.h" // for MutableIterator
43 #include "normalis.h" // for kBlnBaselineOffset, kBlnXHeight
44 #if defined(USE_OPENCL)
45 # include "openclwrapper.h" // for OpenclDevice
46 #endif
47 #include "pageres.h" // for PAGE_RES_IT, WERD_RES, PAGE_RES, CR_DE...
48 #include "paragraphs.h" // for DetectParagraphs
49 #include "params.h" // for BoolParam, IntParam, DoubleParam, Stri...
50 #include "pdblock.h" // for PDBLK
51 #include "points.h" // for FCOORD
52 #include "polyblk.h" // for POLY_BLOCK
53 #include "rect.h" // for TBOX
54 #include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
55 #include "tessdatamanager.h" // for TessdataManager, kTrainedDataSuffix
56 #include "tesseractclass.h" // for Tesseract
57 #include "tprintf.h" // for tprintf
58 #include "werd.h" // for WERD, WERD_IT, W_FUZZY_NON, W_FUZZY_SP
59 #include "thresholder.h" // for ImageThresholder
60
61 #include <tesseract/baseapi.h>
62 #include <tesseract/ocrclass.h> // for ETEXT_DESC
63 #include <tesseract/osdetect.h> // for OSResults, OSBestResult, OrientationId...
64 #include <tesseract/renderer.h> // for TessResultRenderer
65 #include <tesseract/resultiterator.h> // for ResultIterator
66
67 #include <cmath> // for round, M_PI
68 #include <cstdint> // for int32_t
69 #include <cstring> // for strcmp, strcpy
70 #include <fstream> // for size_t
71 #include <iostream> // for std::cin
72 #include <locale> // for std::locale::classic
73 #include <memory> // for std::unique_ptr
74 #include <set> // for std::pair
75 #include <sstream> // for std::stringstream
76 #include <vector> // for std::vector
77
78 #include <allheaders.h> // for pixDestroy, boxCreate, boxaAddBox, box...
79 #ifdef HAVE_LIBCURL
80 # include <curl/curl.h>
81 #endif
82
83 #ifdef __linux__
84 # include <csignal> // for sigaction, SA_RESETHAND, SIGBUS, SIGFPE
85 #endif
86
87 #if defined(_WIN32)
88 # include <fcntl.h>
89 # include <io.h>
90 #else
91 # include <dirent.h> // for closedir, opendir, readdir, DIR, dirent
92 # include <libgen.h>
93 # include <sys/stat.h> // for stat, S_IFDIR
94 # include <sys/types.h>
95 # include <unistd.h>
96 #endif // _WIN32
97
98 namespace tesseract {
99
100 static BOOL_VAR(stream_filelist, false, "Stream a filelist from stdin");
101 static STRING_VAR(document_title, "", "Title of output document (used for hOCR and PDF output)");
102
103 /** Minimum sensible image size to be worth running tesseract. */
104 const int kMinRectSize = 10;
105 /** Character returned when Tesseract couldn't recognize as anything. */
106 const char kTesseractReject = '~';
107 /** Character used by UNLV error counter as a reject. */
108 const char kUNLVReject = '~';
109 /** Character used by UNLV as a suspect marker. */
110 const char kUNLVSuspect = '^';
111 /**
112 * Temp file used for storing current parameters before applying retry values.
113 */
114 static const char *kOldVarsFile = "failed_vars.txt";
115
116 #ifndef DISABLED_LEGACY_ENGINE
117 /**
118 * Filename used for input image file, from which to derive a name to search
119 * for a possible UNLV zone file, if none is specified by SetInputName.
120 */
121 static const char *kInputFile = "noname.tif";
122 static const char kUnknownFontName[] = "UnknownFont";
123
124 static STRING_VAR(classify_font_name, kUnknownFontName,
125 "Default font name to be used in training");
126
127 // Finds the name of the training font and returns it in fontname, by cutting
128 // it out based on the expectation that the filename is of the form:
129 // /path/to/dir/[lang].[fontname].exp[num]
130 // The [lang], [fontname] and [num] fields should not have '.' characters.
131 // If the global parameter classify_font_name is set, its value is used instead.
ExtractFontName(const char * filename,std::string * fontname)132 static void ExtractFontName(const char* filename, std::string* fontname) {
133 *fontname = classify_font_name;
134 if (*fontname == kUnknownFontName) {
135 // filename is expected to be of the form [lang].[fontname].exp[num]
136 // The [lang], [fontname] and [num] fields should not have '.' characters.
137 const char *basename = strrchr(filename, '/');
138 const char *firstdot = strchr(basename ? basename : filename, '.');
139 const char *lastdot = strrchr(filename, '.');
140 if (firstdot != lastdot && firstdot != nullptr && lastdot != nullptr) {
141 ++firstdot;
142 *fontname = firstdot;
143 fontname->resize(lastdot - firstdot);
144 }
145 }
146 }
147 #endif
148
149 /* Add all available languages recursively.
150 */
addAvailableLanguages(const std::string & datadir,const std::string & base,std::vector<std::string> * langs)151 static void addAvailableLanguages(const std::string &datadir, const std::string &base,
152 std::vector<std::string> *langs) {
153 auto base2 = base;
154 if (!base2.empty()) {
155 base2 += "/";
156 }
157 const size_t extlen = sizeof(kTrainedDataSuffix);
158 #ifdef _WIN32
159 WIN32_FIND_DATA data;
160 HANDLE handle = FindFirstFile((datadir + base2 + "*").c_str(), &data);
161 if (handle != INVALID_HANDLE_VALUE) {
162 BOOL result = TRUE;
163 for (; result;) {
164 char *name = data.cFileName;
165 // Skip '.', '..', and hidden files
166 if (name[0] != '.') {
167 if ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) == FILE_ATTRIBUTE_DIRECTORY) {
168 addAvailableLanguages(datadir, base2 + name, langs);
169 } else {
170 size_t len = strlen(name);
171 if (len > extlen && name[len - extlen] == '.' &&
172 strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
173 name[len - extlen] = '\0';
174 langs->push_back(base2 + name);
175 }
176 }
177 }
178 result = FindNextFile(handle, &data);
179 }
180 FindClose(handle);
181 }
182 #else // _WIN32
183 DIR *dir = opendir((datadir + base).c_str());
184 if (dir != nullptr) {
185 dirent *de;
186 while ((de = readdir(dir))) {
187 char *name = de->d_name;
188 // Skip '.', '..', and hidden files
189 if (name[0] != '.') {
190 struct stat st;
191 if (stat((datadir + base2 + name).c_str(), &st) == 0 && (st.st_mode & S_IFDIR) == S_IFDIR) {
192 addAvailableLanguages(datadir, base2 + name, langs);
193 } else {
194 size_t len = strlen(name);
195 if (len > extlen && name[len - extlen] == '.' &&
196 strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
197 name[len - extlen] = '\0';
198 langs->push_back(base2 + name);
199 }
200 }
201 }
202 }
203 closedir(dir);
204 }
205 #endif
206 }
207
TessBaseAPI()208 TessBaseAPI::TessBaseAPI()
209 : tesseract_(nullptr)
210 , osd_tesseract_(nullptr)
211 , equ_detect_(nullptr)
212 , reader_(nullptr)
213 ,
214 // thresholder_ is initialized to nullptr here, but will be set before use
215 // by: A constructor of a derived API or created
216 // implicitly when used in InternalSetImage.
217 thresholder_(nullptr)
218 , paragraph_models_(nullptr)
219 , block_list_(nullptr)
220 , page_res_(nullptr)
221 , last_oem_requested_(OEM_DEFAULT)
222 , recognition_done_(false)
223 , rect_left_(0)
224 , rect_top_(0)
225 , rect_width_(0)
226 , rect_height_(0)
227 , image_width_(0)
228 , image_height_(0) {
229 }
230
~TessBaseAPI()231 TessBaseAPI::~TessBaseAPI() {
232 End();
233 }
234
235 /**
236 * Returns the version identifier as a static string. Do not delete.
237 */
Version()238 const char *TessBaseAPI::Version() {
239 return TESSERACT_VERSION_STR;
240 }
241
242 /**
243 * If compiled with OpenCL AND an available OpenCL
244 * device is deemed faster than serial code, then
245 * "device" is populated with the cl_device_id
246 * and returns sizeof(cl_device_id)
247 * otherwise *device=nullptr and returns 0.
248 */
getOpenCLDevice(void ** data)249 size_t TessBaseAPI::getOpenCLDevice(void **data) {
250 #ifdef USE_OPENCL
251 ds_device device = OpenclDevice::getDeviceSelection();
252 if (device.type == DS_DEVICE_OPENCL_DEVICE) {
253 *data = new cl_device_id;
254 memcpy(*data, &device.oclDeviceID, sizeof(cl_device_id));
255 return sizeof(cl_device_id);
256 }
257 #endif
258
259 *data = nullptr;
260 return 0;
261 }
262
263 /**
264 * Set the name of the input file. Needed only for training and
265 * loading a UNLV zone file.
266 */
SetInputName(const char * name)267 void TessBaseAPI::SetInputName(const char *name) {
268 input_file_ = name ? name : "";
269 }
270
271 /** Set the name of the output files. Needed only for debugging. */
SetOutputName(const char * name)272 void TessBaseAPI::SetOutputName(const char *name) {
273 output_file_ = name ? name : "";
274 }
275
SetVariable(const char * name,const char * value)276 bool TessBaseAPI::SetVariable(const char *name, const char *value) {
277 if (tesseract_ == nullptr) {
278 tesseract_ = new Tesseract;
279 }
280 return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_NON_INIT_ONLY,
281 tesseract_->params());
282 }
283
SetDebugVariable(const char * name,const char * value)284 bool TessBaseAPI::SetDebugVariable(const char *name, const char *value) {
285 if (tesseract_ == nullptr) {
286 tesseract_ = new Tesseract;
287 }
288 return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_DEBUG_ONLY, tesseract_->params());
289 }
290
GetIntVariable(const char * name,int * value) const291 bool TessBaseAPI::GetIntVariable(const char *name, int *value) const {
292 auto *p = ParamUtils::FindParam<IntParam>(name, GlobalParams()->int_params,
293 tesseract_->params()->int_params);
294 if (p == nullptr) {
295 return false;
296 }
297 *value = (int32_t)(*p);
298 return true;
299 }
300
GetBoolVariable(const char * name,bool * value) const301 bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const {
302 auto *p = ParamUtils::FindParam<BoolParam>(name, GlobalParams()->bool_params,
303 tesseract_->params()->bool_params);
304 if (p == nullptr) {
305 return false;
306 }
307 *value = bool(*p);
308 return true;
309 }
310
GetStringVariable(const char * name) const311 const char *TessBaseAPI::GetStringVariable(const char *name) const {
312 auto *p = ParamUtils::FindParam<StringParam>(name, GlobalParams()->string_params,
313 tesseract_->params()->string_params);
314 return (p != nullptr) ? p->c_str() : nullptr;
315 }
316
GetDoubleVariable(const char * name,double * value) const317 bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const {
318 auto *p = ParamUtils::FindParam<DoubleParam>(name, GlobalParams()->double_params,
319 tesseract_->params()->double_params);
320 if (p == nullptr) {
321 return false;
322 }
323 *value = (double)(*p);
324 return true;
325 }
326
327 /** Get value of named variable as a string, if it exists. */
GetVariableAsString(const char * name,std::string * val) const328 bool TessBaseAPI::GetVariableAsString(const char *name, std::string *val) const {
329 return ParamUtils::GetParamAsString(name, tesseract_->params(), val);
330 }
331
332 #ifndef DISABLED_LEGACY_ENGINE
333
334 /** Print Tesseract fonts table to the given file. */
PrintFontsTable(FILE * fp) const335 void TessBaseAPI::PrintFontsTable(FILE *fp) const {
336 const int fontinfo_size = tesseract_->get_fontinfo_table().size();
337 for (int font_index = 1; font_index < fontinfo_size; ++font_index) {
338 FontInfo font = tesseract_->get_fontinfo_table().at(font_index);
339 fprintf(fp, "ID=%3d: %s is_italic=%s is_bold=%s"
340 " is_fixed_pitch=%s is_serif=%s is_fraktur=%s\n",
341 font_index, font.name,
342 font.is_italic() ? "true" : "false",
343 font.is_bold() ? "true" : "false",
344 font.is_fixed_pitch() ? "true" : "false",
345 font.is_serif() ? "true" : "false",
346 font.is_fraktur() ? "true" : "false");
347 }
348 }
349
350 #endif
351
352 /** Print Tesseract parameters to the given file. */
PrintVariables(FILE * fp) const353 void TessBaseAPI::PrintVariables(FILE *fp) const {
354 ParamUtils::PrintParams(fp, tesseract_->params());
355 }
356
357 /**
358 * The datapath must be the name of the data directory or
359 * some other file in which the data directory resides (for instance argv[0].)
360 * The language is (usually) an ISO 639-3 string or nullptr will default to eng.
361 * If numeric_mode is true, then only digits and Roman numerals will
362 * be returned.
363 * @return: 0 on success and -1 on initialization failure.
364 */
Init(const char * datapath,const char * language,OcrEngineMode oem,char ** configs,int configs_size,const std::vector<std::string> * vars_vec,const std::vector<std::string> * vars_values,bool set_only_non_debug_params)365 int TessBaseAPI::Init(const char *datapath, const char *language, OcrEngineMode oem, char **configs,
366 int configs_size, const std::vector<std::string> *vars_vec,
367 const std::vector<std::string> *vars_values, bool set_only_non_debug_params) {
368 return Init(datapath, 0, language, oem, configs, configs_size, vars_vec, vars_values,
369 set_only_non_debug_params, nullptr);
370 }
371
372 // In-memory version reads the traineddata file directly from the given
373 // data[data_size] array. Also implements the version with a datapath in data,
374 // flagged by data_size = 0.
Init(const char * data,int data_size,const char * language,OcrEngineMode oem,char ** configs,int configs_size,const std::vector<std::string> * vars_vec,const std::vector<std::string> * vars_values,bool set_only_non_debug_params,FileReader reader)375 int TessBaseAPI::Init(const char *data, int data_size, const char *language, OcrEngineMode oem,
376 char **configs, int configs_size, const std::vector<std::string> *vars_vec,
377 const std::vector<std::string> *vars_values, bool set_only_non_debug_params,
378 FileReader reader) {
379 if (language == nullptr) {
380 language = "";
381 }
382 if (data == nullptr) {
383 data = "";
384 }
385 std::string datapath = data_size == 0 ? data : language;
386 // If the datapath, OcrEngineMode or the language have changed - start again.
387 // Note that the language_ field stores the last requested language that was
388 // initialized successfully, while tesseract_->lang stores the language
389 // actually used. They differ only if the requested language was nullptr, in
390 // which case tesseract_->lang is set to the Tesseract default ("eng").
391 if (tesseract_ != nullptr &&
392 (datapath_.empty() || language_.empty() || datapath_ != datapath ||
393 last_oem_requested_ != oem || (language_ != language && tesseract_->lang != language))) {
394 delete tesseract_;
395 tesseract_ = nullptr;
396 }
397 #ifdef USE_OPENCL
398 OpenclDevice od;
399 od.InitEnv();
400 #endif
401 bool reset_classifier = true;
402 if (tesseract_ == nullptr) {
403 reset_classifier = false;
404 tesseract_ = new Tesseract;
405 if (reader != nullptr) {
406 reader_ = reader;
407 }
408 TessdataManager mgr(reader_);
409 if (data_size != 0) {
410 mgr.LoadMemBuffer(language, data, data_size);
411 }
412 if (tesseract_->init_tesseract(datapath.c_str(), output_file_.c_str(), language, oem, configs,
413 configs_size, vars_vec, vars_values, set_only_non_debug_params,
414 &mgr) != 0) {
415 return -1;
416 }
417 }
418
419 // Update datapath and language requested for the last valid initialization.
420 datapath_ = datapath;
421 if (datapath_.empty() && !tesseract_->datadir.empty()) {
422 datapath_ = tesseract_->datadir;
423 }
424
425 language_ = language;
426 last_oem_requested_ = oem;
427
428 #ifndef DISABLED_LEGACY_ENGINE
429 // For same language and datapath, just reset the adaptive classifier.
430 if (reset_classifier) {
431 tesseract_->ResetAdaptiveClassifier();
432 }
433 #endif // ndef DISABLED_LEGACY_ENGINE
434 return 0;
435 }
436
437 /**
438 * Returns the languages string used in the last valid initialization.
439 * If the last initialization specified "deu+hin" then that will be
440 * returned. If hin loaded eng automatically as well, then that will
441 * not be included in this list. To find the languages actually
442 * loaded use GetLoadedLanguagesAsVector.
443 * The returned string should NOT be deleted.
444 */
GetInitLanguagesAsString() const445 const char *TessBaseAPI::GetInitLanguagesAsString() const {
446 return language_.c_str();
447 }
448
449 /**
450 * Returns the loaded languages in the vector of std::string.
451 * Includes all languages loaded by the last Init, including those loaded
452 * as dependencies of other loaded languages.
453 */
GetLoadedLanguagesAsVector(std::vector<std::string> * langs) const454 void TessBaseAPI::GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const {
455 langs->clear();
456 if (tesseract_ != nullptr) {
457 langs->push_back(tesseract_->lang);
458 int num_subs = tesseract_->num_sub_langs();
459 for (int i = 0; i < num_subs; ++i) {
460 langs->push_back(tesseract_->get_sub_lang(i)->lang);
461 }
462 }
463 }
464
465 /**
466 * Returns the available languages in the sorted vector of std::string.
467 */
GetAvailableLanguagesAsVector(std::vector<std::string> * langs) const468 void TessBaseAPI::GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const {
469 langs->clear();
470 if (tesseract_ != nullptr) {
471 addAvailableLanguages(tesseract_->datadir, "", langs);
472 std::sort(langs->begin(), langs->end());
473 }
474 }
475
476 /**
477 * Init only for page layout analysis. Use only for calls to SetImage and
478 * AnalysePage. Calls that attempt recognition will generate an error.
479 */
InitForAnalysePage()480 void TessBaseAPI::InitForAnalysePage() {
481 if (tesseract_ == nullptr) {
482 tesseract_ = new Tesseract;
483 #ifndef DISABLED_LEGACY_ENGINE
484 tesseract_->InitAdaptiveClassifier(nullptr);
485 #endif
486 }
487 }
488
489 /**
490 * Read a "config" file containing a set of parameter name, value pairs.
491 * Searches the standard places: tessdata/configs, tessdata/tessconfigs
492 * and also accepts a relative or absolute path name.
493 */
ReadConfigFile(const char * filename)494 void TessBaseAPI::ReadConfigFile(const char *filename) {
495 tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_NON_INIT_ONLY);
496 }
497
498 /** Same as above, but only set debug params from the given config file. */
ReadDebugConfigFile(const char * filename)499 void TessBaseAPI::ReadDebugConfigFile(const char *filename) {
500 tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_DEBUG_ONLY);
501 }
502
503 /**
504 * Set the current page segmentation mode. Defaults to PSM_AUTO.
505 * The mode is stored as an IntParam so it can also be modified by
506 * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
507 */
SetPageSegMode(PageSegMode mode)508 void TessBaseAPI::SetPageSegMode(PageSegMode mode) {
509 if (tesseract_ == nullptr) {
510 tesseract_ = new Tesseract;
511 }
512 tesseract_->tessedit_pageseg_mode.set_value(mode);
513 }
514
515 /** Return the current page segmentation mode. */
GetPageSegMode() const516 PageSegMode TessBaseAPI::GetPageSegMode() const {
517 if (tesseract_ == nullptr) {
518 return PSM_SINGLE_BLOCK;
519 }
520 return static_cast<PageSegMode>(static_cast<int>(tesseract_->tessedit_pageseg_mode));
521 }
522
523 /**
524 * Recognize a rectangle from an image and return the result as a string.
525 * May be called many times for a single Init.
526 * Currently has no error checking.
527 * Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
528 * Palette color images will not work properly and must be converted to
529 * 24 bit.
530 * Binary images of 1 bit per pixel may also be given but they must be
531 * byte packed with the MSB of the first byte being the first pixel, and a
532 * one pixel is WHITE. For binary images set bytes_per_pixel=0.
533 * The recognized text is returned as a char* which is coded
534 * as UTF8 and must be freed with the delete [] operator.
535 */
TesseractRect(const unsigned char * imagedata,int bytes_per_pixel,int bytes_per_line,int left,int top,int width,int height)536 char *TessBaseAPI::TesseractRect(const unsigned char *imagedata, int bytes_per_pixel,
537 int bytes_per_line, int left, int top, int width, int height) {
538 if (tesseract_ == nullptr || width < kMinRectSize || height < kMinRectSize) {
539 return nullptr; // Nothing worth doing.
540 }
541
542 // Since this original api didn't give the exact size of the image,
543 // we have to invent a reasonable value.
544 int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8;
545 SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top, bytes_per_pixel,
546 bytes_per_line);
547 SetRectangle(left, top, width, height);
548
549 return GetUTF8Text();
550 }
551
552 #ifndef DISABLED_LEGACY_ENGINE
553 /**
554 * Call between pages or documents etc to free up memory and forget
555 * adaptive data.
556 */
ClearAdaptiveClassifier()557 void TessBaseAPI::ClearAdaptiveClassifier() {
558 if (tesseract_ == nullptr) {
559 return;
560 }
561 tesseract_->ResetAdaptiveClassifier();
562 tesseract_->ResetDocumentDictionary();
563 }
564 #endif // ndef DISABLED_LEGACY_ENGINE
565
566 /**
567 * Provide an image for Tesseract to recognize. Format is as
568 * TesseractRect above. Copies the image buffer and converts to Pix.
569 * SetImage clears all recognition results, and sets the rectangle to the
570 * full image, so it may be followed immediately by a GetUTF8Text, and it
571 * will automatically perform recognition.
572 */
SetImage(const unsigned char * imagedata,int width,int height,int bytes_per_pixel,int bytes_per_line)573 void TessBaseAPI::SetImage(const unsigned char *imagedata, int width, int height,
574 int bytes_per_pixel, int bytes_per_line) {
575 if (InternalSetImage()) {
576 thresholder_->SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line);
577 SetInputImage(thresholder_->GetPixRect());
578 }
579 }
580
SetSourceResolution(int ppi)581 void TessBaseAPI::SetSourceResolution(int ppi) {
582 if (thresholder_) {
583 thresholder_->SetSourceYResolution(ppi);
584 } else {
585 tprintf("Please call SetImage before SetSourceResolution.\n");
586 }
587 }
588
589 /**
590 * Provide an image for Tesseract to recognize. As with SetImage above,
591 * Tesseract takes its own copy of the image, so it need not persist until
592 * after Recognize.
593 * Pix vs raw, which to use?
594 * Use Pix where possible. Tesseract uses Pix as its internal representation
595 * and it is therefore more efficient to provide a Pix directly.
596 */
SetImage(Pix * pix)597 void TessBaseAPI::SetImage(Pix *pix) {
598 if (InternalSetImage()) {
599 if (pixGetSpp(pix) == 4 && pixGetInputFormat(pix) == IFF_PNG) {
600 // remove alpha channel from png
601 Pix *p1 = pixRemoveAlpha(pix);
602 pixSetSpp(p1, 3);
603 (void)pixCopy(pix, p1);
604 pixDestroy(&p1);
605 }
606 thresholder_->SetImage(pix);
607 SetInputImage(thresholder_->GetPixRect());
608 }
609 }
610
611 /**
612 * Restrict recognition to a sub-rectangle of the image. Call after SetImage.
613 * Each SetRectangle clears the recogntion results so multiple rectangles
614 * can be recognized with the same image.
615 */
SetRectangle(int left,int top,int width,int height)616 void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
617 if (thresholder_ == nullptr) {
618 return;
619 }
620 thresholder_->SetRectangle(left, top, width, height);
621 ClearResults();
622 }
623
624 /**
625 * ONLY available after SetImage if you have Leptonica installed.
626 * Get a copy of the internal thresholded image from Tesseract.
627 */
GetThresholdedImage()628 Pix *TessBaseAPI::GetThresholdedImage() {
629 if (tesseract_ == nullptr || thresholder_ == nullptr) {
630 return nullptr;
631 }
632 if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
633 return nullptr;
634 }
635 return tesseract_->pix_binary().clone();
636 }
637
638 /**
639 * Get the result of page layout analysis as a leptonica-style
640 * Boxa, Pixa pair, in reading order.
641 * Can be called before or after Recognize.
642 */
GetRegions(Pixa ** pixa)643 Boxa *TessBaseAPI::GetRegions(Pixa **pixa) {
644 return GetComponentImages(RIL_BLOCK, false, pixa, nullptr);
645 }
646
647 /**
648 * Get the textlines as a leptonica-style Boxa, Pixa pair, in reading order.
649 * Can be called before or after Recognize.
650 * If blockids is not nullptr, the block-id of each line is also returned as an
651 * array of one element per line. delete [] after use.
652 * If paraids is not nullptr, the paragraph-id of each line within its block is
653 * also returned as an array of one element per line. delete [] after use.
654 */
GetTextlines(const bool raw_image,const int raw_padding,Pixa ** pixa,int ** blockids,int ** paraids)655 Boxa *TessBaseAPI::GetTextlines(const bool raw_image, const int raw_padding, Pixa **pixa,
656 int **blockids, int **paraids) {
657 return GetComponentImages(RIL_TEXTLINE, true, raw_image, raw_padding, pixa, blockids, paraids);
658 }
659
660 /**
661 * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa
662 * pair, in reading order. Enables downstream handling of non-rectangular
663 * regions.
664 * Can be called before or after Recognize.
665 * If blockids is not nullptr, the block-id of each line is also returned as an
666 * array of one element per line. delete [] after use.
667 */
GetStrips(Pixa ** pixa,int ** blockids)668 Boxa *TessBaseAPI::GetStrips(Pixa **pixa, int **blockids) {
669 return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids);
670 }
671
672 /**
673 * Get the words as a leptonica-style
674 * Boxa, Pixa pair, in reading order.
675 * Can be called before or after Recognize.
676 */
GetWords(Pixa ** pixa)677 Boxa *TessBaseAPI::GetWords(Pixa **pixa) {
678 return GetComponentImages(RIL_WORD, true, pixa, nullptr);
679 }
680
681 /**
682 * Gets the individual connected (text) components (created
683 * after pages segmentation step, but before recognition)
684 * as a leptonica-style Boxa, Pixa pair, in reading order.
685 * Can be called before or after Recognize.
686 */
GetConnectedComponents(Pixa ** pixa)687 Boxa *TessBaseAPI::GetConnectedComponents(Pixa **pixa) {
688 return GetComponentImages(RIL_SYMBOL, true, pixa, nullptr);
689 }
690
691 /**
692 * Get the given level kind of components (block, textline, word etc.) as a
693 * leptonica-style Boxa, Pixa pair, in reading order.
694 * Can be called before or after Recognize.
695 * If blockids is not nullptr, the block-id of each component is also returned
696 * as an array of one element per component. delete [] after use.
697 * If text_only is true, then only text components are returned.
698 */
GetComponentImages(PageIteratorLevel level,bool text_only,bool raw_image,const int raw_padding,Pixa ** pixa,int ** blockids,int ** paraids)699 Boxa *TessBaseAPI::GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image,
700 const int raw_padding, Pixa **pixa, int **blockids,
701 int **paraids) {
702 /*non-const*/ std::unique_ptr</*non-const*/ PageIterator> page_it(GetIterator());
703 if (page_it == nullptr) {
704 page_it.reset(AnalyseLayout());
705 }
706 if (page_it == nullptr) {
707 return nullptr; // Failed.
708 }
709
710 // Count the components to get a size for the arrays.
711 int component_count = 0;
712 int left, top, right, bottom;
713
714 if (raw_image) {
715 // Get bounding box in original raw image with padding.
716 do {
717 if (page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom) &&
718 (!text_only || PTIsTextType(page_it->BlockType()))) {
719 ++component_count;
720 }
721 } while (page_it->Next(level));
722 } else {
723 // Get bounding box from binarized imaged. Note that this could be
724 // differently scaled from the original image.
725 do {
726 if (page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom) &&
727 (!text_only || PTIsTextType(page_it->BlockType()))) {
728 ++component_count;
729 }
730 } while (page_it->Next(level));
731 }
732
733 Boxa *boxa = boxaCreate(component_count);
734 if (pixa != nullptr) {
735 *pixa = pixaCreate(component_count);
736 }
737 if (blockids != nullptr) {
738 *blockids = new int[component_count];
739 }
740 if (paraids != nullptr) {
741 *paraids = new int[component_count];
742 }
743
744 int blockid = 0;
745 int paraid = 0;
746 int component_index = 0;
747 page_it->Begin();
748 do {
749 bool got_bounding_box;
750 if (raw_image) {
751 got_bounding_box = page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom);
752 } else {
753 got_bounding_box = page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom);
754 }
755 if (got_bounding_box && (!text_only || PTIsTextType(page_it->BlockType()))) {
756 Box *lbox = boxCreate(left, top, right - left, bottom - top);
757 boxaAddBox(boxa, lbox, L_INSERT);
758 if (pixa != nullptr) {
759 Pix *pix = nullptr;
760 if (raw_image) {
761 pix = page_it->GetImage(level, raw_padding, GetInputImage(), &left, &top);
762 } else {
763 pix = page_it->GetBinaryImage(level);
764 }
765 pixaAddPix(*pixa, pix, L_INSERT);
766 pixaAddBox(*pixa, lbox, L_CLONE);
767 }
768 if (paraids != nullptr) {
769 (*paraids)[component_index] = paraid;
770 if (page_it->IsAtFinalElement(RIL_PARA, level)) {
771 ++paraid;
772 }
773 }
774 if (blockids != nullptr) {
775 (*blockids)[component_index] = blockid;
776 if (page_it->IsAtFinalElement(RIL_BLOCK, level)) {
777 ++blockid;
778 paraid = 0;
779 }
780 }
781 ++component_index;
782 }
783 } while (page_it->Next(level));
784 return boxa;
785 }
786
GetThresholdedImageScaleFactor() const787 int TessBaseAPI::GetThresholdedImageScaleFactor() const {
788 if (thresholder_ == nullptr) {
789 return 0;
790 }
791 return thresholder_->GetScaleFactor();
792 }
793
794 /**
795 * Runs page layout analysis in the mode set by SetPageSegMode.
796 * May optionally be called prior to Recognize to get access to just
797 * the page layout results. Returns an iterator to the results.
798 * If merge_similar_words is true, words are combined where suitable for use
799 * with a line recognizer. Use if you want to use AnalyseLayout to find the
800 * textlines, and then want to process textline fragments with an external
801 * line recognizer.
802 * Returns nullptr on error or an empty page.
803 * The returned iterator must be deleted after use.
804 * WARNING! This class points to data held within the TessBaseAPI class, and
805 * therefore can only be used while the TessBaseAPI class still exists and
806 * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
807 * DetectOS, or anything else that changes the internal PAGE_RES.
808 */
AnalyseLayout()809 PageIterator *TessBaseAPI::AnalyseLayout() {
810 return AnalyseLayout(false);
811 }
812
AnalyseLayout(bool merge_similar_words)813 PageIterator *TessBaseAPI::AnalyseLayout(bool merge_similar_words) {
814 if (FindLines() == 0) {
815 if (block_list_->empty()) {
816 return nullptr; // The page was empty.
817 }
818 page_res_ = new PAGE_RES(merge_similar_words, block_list_, nullptr);
819 DetectParagraphs(false);
820 return new PageIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(),
821 thresholder_->GetScaledYResolution(), rect_left_, rect_top_,
822 rect_width_, rect_height_);
823 }
824 return nullptr;
825 }
826
/**
 * Recognize the tesseract global image and return the result as Tesseract
 * internal structures.
 *
 * The previous page_res_ is deleted and replaced by the results of this run.
 * monitor is passed through to the recognition calls; it may be nullptr (this
 * file calls Recognize(nullptr) in several places).
 *
 * Returns 0 on success (including an empty page) and -1 on failure: no
 * tesseract_ instance, FindLines failing, no PAGE_RES produced, line
 * recognizer training failing, or recog_all_words reporting failure.
 * Interactive display mode always returns -1 after discarding page_res_.
 */
int TessBaseAPI::Recognize(ETEXT_DESC *monitor) {
  if (tesseract_ == nullptr) {
    return -1;
  }
  if (FindLines() != 0) {
    return -1;
  }
  // Results of any previous run are discarded from here on.
  delete page_res_;
  if (block_list_->empty()) {
    page_res_ = new PAGE_RES(false, block_list_, &tesseract_->prev_word_best_choice_);
    return 0; // Empty page.
  }

  tesseract_->SetBlackAndWhitelist();
  // Note: the flag is set before the work below, so it stays true even if one
  // of the later steps returns -1.
  recognition_done_ = true;
#ifndef DISABLED_LEGACY_ENGINE
  // Box-file driven resegmentation builds the PAGE_RES via ApplyBoxes; the
  // boolean argument distinguishes line boxes (true) from plain boxes (false).
  if (tesseract_->tessedit_resegment_from_line_boxes) {
    page_res_ = tesseract_->ApplyBoxes(input_file_.c_str(), true, block_list_);
  } else if (tesseract_->tessedit_resegment_from_boxes) {
    page_res_ = tesseract_->ApplyBoxes(input_file_.c_str(), false, block_list_);
  } else
#endif // ndef DISABLED_LEGACY_ENGINE
  {
    // Default path (the only one when the legacy engine is disabled).
    page_res_ =
        new PAGE_RES(tesseract_->AnyLSTMLang(), block_list_, &tesseract_->prev_word_best_choice_);
  }

  if (page_res_ == nullptr) {
    return -1;
  }

  // Training modes below return early without running normal recognition.
  if (tesseract_->tessedit_train_line_recognizer) {
    if (!tesseract_->TrainLineRecognizer(input_file_.c_str(), output_file_, block_list_)) {
      return -1;
    }
    tesseract_->CorrectClassifyWords(page_res_);
    return 0;
  }
#ifndef DISABLED_LEGACY_ENGINE
  if (tesseract_->tessedit_make_boxes_from_boxes) {
    tesseract_->CorrectClassifyWords(page_res_);
    return 0;
  }
#endif // ndef DISABLED_LEGACY_ENGINE

  int result = 0;
  if (tesseract_->interactive_display_mode) {
#ifndef GRAPHICS_DISABLED
    tesseract_->pgeditor_main(rect_width_, rect_height_, page_res_);
#endif // !GRAPHICS_DISABLED
    // The page_res is invalid after an interactive session, so cleanup
    // in a way that lets us continue to the next page without crashing.
    delete page_res_;
    page_res_ = nullptr;
    return -1;
#ifndef DISABLED_LEGACY_ENGINE
  } else if (tesseract_->tessedit_train_from_boxes) {
    // Box training: the font name is derived from the output file name.
    std::string fontname;
    ExtractFontName(output_file_.c_str(), &fontname);
    tesseract_->ApplyBoxTraining(fontname, page_res_);
  } else if (tesseract_->tessedit_ambigs_training) {
    FILE *training_output_file = tesseract_->init_recog_training(input_file_.c_str());
    // OCR the page segmented into words by tesseract.
    tesseract_->recog_training_segmented(input_file_.c_str(), page_res_, monitor,
                                         training_output_file);
    fclose(training_output_file);
#endif // ndef DISABLED_LEGACY_ENGINE
  } else {
    // Now run the main recognition.
    // Paragraph detection runs either before recognition (layout only) or
    // after it (text based), depending on the "paragraph_text_based" variable;
    // text based is the default if the variable cannot be read.
    bool wait_for_text = true;
    GetBoolVariable("paragraph_text_based", &wait_for_text);
    if (!wait_for_text) {
      DetectParagraphs(false);
    }
    if (tesseract_->recog_all_words(page_res_, monitor, nullptr, nullptr, 0)) {
      if (wait_for_text) {
        DetectParagraphs(true);
      }
    } else {
      result = -1;
    }
  }
  return result;
}
915
916 // Takes ownership of the input pix.
SetInputImage(Pix * pix)917 void TessBaseAPI::SetInputImage(Pix *pix) {
918 tesseract_->set_pix_original(pix);
919 }
920
GetInputImage()921 Pix *TessBaseAPI::GetInputImage() {
922 return tesseract_->pix_original();
923 }
924
GetInputName()925 const char *TessBaseAPI::GetInputName() {
926 if (!input_file_.empty()) {
927 return input_file_.c_str();
928 }
929 return nullptr;
930 }
931
GetDatapath()932 const char *TessBaseAPI::GetDatapath() {
933 return tesseract_->datadir.c_str();
934 }
935
GetSourceYResolution()936 int TessBaseAPI::GetSourceYResolution() {
937 if (thresholder_ == nullptr)
938 return -1;
939 return thresholder_->GetSourceYResolution();
940 }
941
942 // If flist exists, get data from there. Otherwise get data from buf.
943 // Seems convoluted, but is the easiest way I know of to meet multiple
944 // goals. Support streaming from stdin, and also work on platforms
945 // lacking fmemopen.
946 // TODO: check different logic for flist/buf and simplify.
ProcessPagesFileList(FILE * flist,std::string * buf,const char * retry_config,int timeout_millisec,TessResultRenderer * renderer,int tessedit_page_number)947 bool TessBaseAPI::ProcessPagesFileList(FILE *flist, std::string *buf, const char *retry_config,
948 int timeout_millisec, TessResultRenderer *renderer,
949 int tessedit_page_number) {
950 if (!flist && !buf) {
951 return false;
952 }
953 unsigned page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
954 char pagename[MAX_PATH];
955
956 std::vector<std::string> lines;
957 if (!flist) {
958 std::string line;
959 for (const auto ch : *buf) {
960 if (ch == '\n') {
961 lines.push_back(line);
962 line.clear();
963 } else {
964 line.push_back(ch);
965 }
966 }
967 if (!line.empty()) {
968 // Add last line without terminating LF.
969 lines.push_back(line);
970 }
971 if (lines.empty()) {
972 return false;
973 }
974 }
975
976 // Skip to the requested page number.
977 for (unsigned i = 0; i < page; i++) {
978 if (flist) {
979 if (fgets(pagename, sizeof(pagename), flist) == nullptr) {
980 break;
981 }
982 }
983 }
984
985 // Begin producing output
986 if (renderer && !renderer->BeginDocument(document_title.c_str())) {
987 return false;
988 }
989
990 // Loop over all pages - or just the requested one
991 while (true) {
992 if (flist) {
993 if (fgets(pagename, sizeof(pagename), flist) == nullptr) {
994 break;
995 }
996 } else {
997 if (page >= lines.size()) {
998 break;
999 }
1000 snprintf(pagename, sizeof(pagename), "%s", lines[page].c_str());
1001 }
1002 chomp_string(pagename);
1003 Pix *pix = pixRead(pagename);
1004 if (pix == nullptr) {
1005 tprintf("Image file %s cannot be read!\n", pagename);
1006 return false;
1007 }
1008 tprintf("Page %u : %s\n", page, pagename);
1009 bool r = ProcessPage(pix, page, pagename, retry_config, timeout_millisec, renderer);
1010 pixDestroy(&pix);
1011 if (!r) {
1012 return false;
1013 }
1014 if (tessedit_page_number >= 0) {
1015 break;
1016 }
1017 ++page;
1018 }
1019
1020 // Finish producing output
1021 if (renderer && !renderer->EndDocument()) {
1022 return false;
1023 }
1024 return true;
1025 }
1026
ProcessPagesMultipageTiff(const l_uint8 * data,size_t size,const char * filename,const char * retry_config,int timeout_millisec,TessResultRenderer * renderer,int tessedit_page_number)1027 bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, size_t size, const char *filename,
1028 const char *retry_config, int timeout_millisec,
1029 TessResultRenderer *renderer,
1030 int tessedit_page_number) {
1031 Pix *pix = nullptr;
1032 int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
1033 size_t offset = 0;
1034 for (;; ++page) {
1035 if (tessedit_page_number >= 0) {
1036 page = tessedit_page_number;
1037 pix = (data) ? pixReadMemTiff(data, size, page) : pixReadTiff(filename, page);
1038 } else {
1039 pix = (data) ? pixReadMemFromMultipageTiff(data, size, &offset)
1040 : pixReadFromMultipageTiff(filename, &offset);
1041 }
1042 if (pix == nullptr) {
1043 break;
1044 }
1045 if (offset || page > 0) {
1046 // Only print page number for multipage TIFF file.
1047 tprintf("Page %d\n", page + 1);
1048 }
1049 auto page_string = std::to_string(page);
1050 SetVariable("applybox_page", page_string.c_str());
1051 bool r = ProcessPage(pix, page, filename, retry_config, timeout_millisec, renderer);
1052 pixDestroy(&pix);
1053 if (!r) {
1054 return false;
1055 }
1056 if (tessedit_page_number >= 0) {
1057 break;
1058 }
1059 if (!offset) {
1060 break;
1061 }
1062 }
1063 return true;
1064 }
1065
1066 // Master ProcessPages calls ProcessPagesInternal and then does any post-
1067 // processing required due to being in a training mode.
ProcessPages(const char * filename,const char * retry_config,int timeout_millisec,TessResultRenderer * renderer)1068 bool TessBaseAPI::ProcessPages(const char *filename, const char *retry_config, int timeout_millisec,
1069 TessResultRenderer *renderer) {
1070 bool result = ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer);
1071 #ifndef DISABLED_LEGACY_ENGINE
1072 if (result) {
1073 if (tesseract_->tessedit_train_from_boxes && !tesseract_->WriteTRFile(output_file_.c_str())) {
1074 tprintf("Write of TR file failed: %s\n", output_file_.c_str());
1075 return false;
1076 }
1077 }
1078 #endif // ndef DISABLED_LEGACY_ENGINE
1079 return result;
1080 }
1081
1082 #ifdef HAVE_LIBCURL
// Write callback handed to libcurl: appends the size * nmemb bytes at
// contents to the std::string that userp points to and returns the number of
// bytes consumed.
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
  const size_t byte_count = size * nmemb;
  auto *sink = static_cast<std::string *>(userp);
  sink->append(static_cast<const char *>(contents), byte_count);
  return byte_count;
}
1089 #endif
1090
1091 // In the ideal scenario, Tesseract will start working on data as soon
1092 // as it can. For example, if you stream a filelist through stdin, we
1093 // should start the OCR process as soon as the first filename is
1094 // available. This is particularly useful when hooking Tesseract up to
1095 // slow hardware such as a book scanning machine.
1096 //
1097 // Unfortunately there are tradeoffs. You can't seek on stdin. That
1098 // makes automatic detection of datatype (TIFF? filelist? PNG?)
1099 // impractical. So we support a command line flag to explicitly
1100 // identify the scenario that really matters: filelists on
1101 // stdin. We'll still do our best if the user likes pipes.
ProcessPagesInternal(const char * filename,const char * retry_config,int timeout_millisec,TessResultRenderer * renderer)1102 bool TessBaseAPI::ProcessPagesInternal(const char *filename, const char *retry_config,
1103 int timeout_millisec, TessResultRenderer *renderer) {
1104 bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
1105 if (stdInput) {
1106 #ifdef WIN32
1107 if (_setmode(_fileno(stdin), _O_BINARY) == -1)
1108 tprintf("ERROR: cin to binary: %s", strerror(errno));
1109 #endif // WIN32
1110 }
1111
1112 if (stream_filelist) {
1113 return ProcessPagesFileList(stdin, nullptr, retry_config, timeout_millisec, renderer,
1114 tesseract_->tessedit_page_number);
1115 }
1116
1117 // At this point we are officially in autodection territory.
1118 // That means any data in stdin must be buffered, to make it
1119 // seekable.
1120 std::string buf;
1121 const l_uint8 *data = nullptr;
1122 if (stdInput) {
1123 buf.assign((std::istreambuf_iterator<char>(std::cin)), (std::istreambuf_iterator<char>()));
1124 data = reinterpret_cast<const l_uint8 *>(buf.data());
1125 } else if (strstr(filename, "://") != nullptr) {
1126 // Get image or image list by URL.
1127 #ifdef HAVE_LIBCURL
1128 CURL *curl = curl_easy_init();
1129 if (curl == nullptr) {
1130 fprintf(stderr, "Error, curl_easy_init failed\n");
1131 return false;
1132 } else {
1133 CURLcode curlcode;
1134 auto error = [curl, &curlcode](const char *function) {
1135 fprintf(stderr, "Error, %s failed with error %s\n", function, curl_easy_strerror(curlcode));
1136 curl_easy_cleanup(curl);
1137 return false;
1138 };
1139 curlcode = curl_easy_setopt(curl, CURLOPT_URL, filename);
1140 if (curlcode != CURLE_OK) {
1141 return error("curl_easy_setopt");
1142 }
1143 curlcode = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
1144 if (curlcode != CURLE_OK) {
1145 return error("curl_easy_setopt");
1146 }
1147 curlcode = curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf);
1148 if (curlcode != CURLE_OK) {
1149 return error("curl_easy_setopt");
1150 }
1151 curlcode = curl_easy_perform(curl);
1152 if (curlcode != CURLE_OK) {
1153 return error("curl_easy_perform");
1154 }
1155 curl_easy_cleanup(curl);
1156 data = reinterpret_cast<const l_uint8 *>(buf.data());
1157 }
1158 #else
1159 fprintf(stderr, "Error, this tesseract has no URL support\n");
1160 return false;
1161 #endif
1162 } else {
1163 // Check whether the input file can be read.
1164 if (FILE *file = fopen(filename, "rb")) {
1165 fclose(file);
1166 } else {
1167 fprintf(stderr, "Error, cannot read input file %s: %s\n", filename, strerror(errno));
1168 return false;
1169 }
1170 }
1171
1172 // Here is our autodetection
1173 int format;
1174 int r =
1175 (data != nullptr) ? findFileFormatBuffer(data, &format) : findFileFormat(filename, &format);
1176
1177 // Maybe we have a filelist
1178 if (r != 0 || format == IFF_UNKNOWN) {
1179 std::string s;
1180 if (data != nullptr) {
1181 s = buf.c_str();
1182 } else {
1183 std::ifstream t(filename);
1184 std::string u((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>());
1185 s = u.c_str();
1186 }
1187 return ProcessPagesFileList(nullptr, &s, retry_config, timeout_millisec, renderer,
1188 tesseract_->tessedit_page_number);
1189 }
1190
1191 // Maybe we have a TIFF which is potentially multipage
1192 bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS || format == IFF_TIFF_RLE ||
1193 format == IFF_TIFF_G3 || format == IFF_TIFF_G4 || format == IFF_TIFF_LZW ||
1194 #if LIBLEPT_MAJOR_VERSION > 1 || LIBLEPT_MINOR_VERSION > 76
1195 format == IFF_TIFF_JPEG ||
1196 #endif
1197 format == IFF_TIFF_ZIP);
1198
1199 // Fail early if we can, before producing any output
1200 Pix *pix = nullptr;
1201 if (!tiff) {
1202 pix = (data != nullptr) ? pixReadMem(data, buf.size()) : pixRead(filename);
1203 if (pix == nullptr) {
1204 return false;
1205 }
1206 }
1207
1208 // Begin the output
1209 if (renderer && !renderer->BeginDocument(document_title.c_str())) {
1210 pixDestroy(&pix);
1211 return false;
1212 }
1213
1214 // Produce output
1215 r = (tiff) ? ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config, timeout_millisec,
1216 renderer, tesseract_->tessedit_page_number)
1217 : ProcessPage(pix, 0, filename, retry_config, timeout_millisec, renderer);
1218
1219 // Clean up memory as needed
1220 pixDestroy(&pix);
1221
1222 // End the output
1223 if (!r || (renderer && !renderer->EndDocument())) {
1224 return false;
1225 }
1226 return true;
1227 }
1228
ProcessPage(Pix * pix,int page_index,const char * filename,const char * retry_config,int timeout_millisec,TessResultRenderer * renderer)1229 bool TessBaseAPI::ProcessPage(Pix *pix, int page_index, const char *filename,
1230 const char *retry_config, int timeout_millisec,
1231 TessResultRenderer *renderer) {
1232 SetInputName(filename);
1233 SetImage(pix);
1234 bool failed = false;
1235
1236 if (tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) {
1237 // Disabled character recognition
1238 if (! std::unique_ptr<const PageIterator>(AnalyseLayout())) {
1239 failed = true;
1240 }
1241 } else if (tesseract_->tessedit_pageseg_mode == PSM_OSD_ONLY) {
1242 failed = FindLines() != 0;
1243 } else if (timeout_millisec > 0) {
1244 // Running with a timeout.
1245 ETEXT_DESC monitor;
1246 monitor.cancel = nullptr;
1247 monitor.cancel_this = nullptr;
1248 monitor.set_deadline_msecs(timeout_millisec);
1249
1250 // Now run the main recognition.
1251 failed = Recognize(&monitor) < 0;
1252 } else {
1253 // Normal layout and character recognition with no timeout.
1254 failed = Recognize(nullptr) < 0;
1255 }
1256
1257 if (tesseract_->tessedit_write_images) {
1258 Pix *page_pix = GetThresholdedImage();
1259 std::string output_filename = output_file_ + ".processed";
1260 if (page_index > 0) {
1261 output_filename += std::to_string(page_index);
1262 }
1263 output_filename += ".tif";
1264 pixWrite(output_filename.c_str(), page_pix, IFF_TIFF_G4);
1265 pixDestroy(&page_pix);
1266 }
1267
1268 if (failed && retry_config != nullptr && retry_config[0] != '\0') {
1269 // Save current config variables before switching modes.
1270 FILE *fp = fopen(kOldVarsFile, "wb");
1271 if (fp == nullptr) {
1272 tprintf("Error, failed to open file \"%s\"\n", kOldVarsFile);
1273 } else {
1274 PrintVariables(fp);
1275 fclose(fp);
1276 }
1277 // Switch to alternate mode for retry.
1278 ReadConfigFile(retry_config);
1279 SetImage(pix);
1280 Recognize(nullptr);
1281 // Restore saved config variables.
1282 ReadConfigFile(kOldVarsFile);
1283 }
1284
1285 if (renderer && !failed) {
1286 failed = !renderer->AddImage(this);
1287 }
1288
1289 return !failed;
1290 }
1291
1292 /**
1293 * Get a left-to-right iterator to the results of LayoutAnalysis and/or
1294 * Recognize. The returned iterator must be deleted after use.
1295 */
GetLTRIterator()1296 LTRResultIterator *TessBaseAPI::GetLTRIterator() {
1297 if (tesseract_ == nullptr || page_res_ == nullptr) {
1298 return nullptr;
1299 }
1300 return new LTRResultIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(),
1301 thresholder_->GetScaledYResolution(), rect_left_, rect_top_,
1302 rect_width_, rect_height_);
1303 }
1304
1305 /**
1306 * Get a reading-order iterator to the results of LayoutAnalysis and/or
1307 * Recognize. The returned iterator must be deleted after use.
1308 * WARNING! This class points to data held within the TessBaseAPI class, and
1309 * therefore can only be used while the TessBaseAPI class still exists and
1310 * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
1311 * DetectOS, or anything else that changes the internal PAGE_RES.
1312 */
GetIterator()1313 ResultIterator *TessBaseAPI::GetIterator() {
1314 if (tesseract_ == nullptr || page_res_ == nullptr) {
1315 return nullptr;
1316 }
1317 return ResultIterator::StartOfParagraph(LTRResultIterator(
1318 page_res_, tesseract_, thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(),
1319 rect_left_, rect_top_, rect_width_, rect_height_));
1320 }
1321
1322 /**
1323 * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.
1324 * The returned iterator must be deleted after use.
1325 * WARNING! This class points to data held within the TessBaseAPI class, and
1326 * therefore can only be used while the TessBaseAPI class still exists and
1327 * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
1328 * DetectOS, or anything else that changes the internal PAGE_RES.
1329 */
GetMutableIterator()1330 MutableIterator *TessBaseAPI::GetMutableIterator() {
1331 if (tesseract_ == nullptr || page_res_ == nullptr) {
1332 return nullptr;
1333 }
1334 return new MutableIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(),
1335 thresholder_->GetScaledYResolution(), rect_left_, rect_top_,
1336 rect_width_, rect_height_);
1337 }
1338
1339 /** Make a text string from the internal data structures. */
GetUTF8Text()1340 char *TessBaseAPI::GetUTF8Text() {
1341 if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1342 return nullptr;
1343 }
1344 std::string text("");
1345 const std::unique_ptr</*non-const*/ ResultIterator> it(GetIterator());
1346 do {
1347 if (it->Empty(RIL_PARA)) {
1348 continue;
1349 }
1350 const std::unique_ptr<const char[]> para_text(it->GetUTF8Text(RIL_PARA));
1351 text += para_text.get();
1352 } while (it->Next(RIL_PARA));
1353 char *result = new char[text.length() + 1];
1354 strncpy(result, text.c_str(), text.length() + 1);
1355 return result;
1356 }
1357
AddBoxToTSV(const PageIterator * it,PageIteratorLevel level,std::string & text)1358 static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::string &text) {
1359 int left, top, right, bottom;
1360 it->BoundingBox(level, &left, &top, &right, &bottom);
1361 text += "\t" + std::to_string(left);
1362 text += "\t" + std::to_string(top);
1363 text += "\t" + std::to_string(right - left);
1364 text += "\t" + std::to_string(bottom - top);
1365 }
1366
1367 /**
1368 * Make a TSV-formatted string from the internal data structures.
1369 * page_number is 0-based but will appear in the output as 1-based.
1370 * Returned string must be freed with the delete [] operator.
1371 */
GetTSVText(int page_number)1372 char *TessBaseAPI::GetTSVText(int page_number) {
1373 if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
1374 return nullptr;
1375 }
1376
1377 int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
1378 int page_id = page_number + 1; // we use 1-based page numbers.
1379
1380 int page_num = page_id;
1381 int block_num = 0;
1382 int par_num = 0;
1383 int line_num = 0;
1384 int word_num = 0;
1385
1386 std::string tsv_str;
1387 tsv_str += "1\t" + std::to_string(page_num); // level 1 - page
1388 tsv_str += "\t" + std::to_string(block_num);
1389 tsv_str += "\t" + std::to_string(par_num);
1390 tsv_str += "\t" + std::to_string(line_num);
1391 tsv_str += "\t" + std::to_string(word_num);
1392 tsv_str += "\t" + std::to_string(rect_left_);
1393 tsv_str += "\t" + std::to_string(rect_top_);
1394 tsv_str += "\t" + std::to_string(rect_width_);
1395 tsv_str += "\t" + std::to_string(rect_height_);
1396 tsv_str += "\t-1\t\n";
1397
1398 const std::unique_ptr</*non-const*/ ResultIterator> res_it(GetIterator());
1399 while (!res_it->Empty(RIL_BLOCK)) {
1400 if (res_it->Empty(RIL_WORD)) {
1401 res_it->Next(RIL_WORD);
1402 continue;
1403 }
1404
1405 // Add rows for any new block/paragraph/textline.
1406 if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
1407 block_num++;
1408 par_num = 0;
1409 line_num = 0;
1410 word_num = 0;
1411 tsv_str += "2\t" + std::to_string(page_num); // level 2 - block
1412 tsv_str += "\t" + std::to_string(block_num);
1413 tsv_str += "\t" + std::to_string(par_num);
1414 tsv_str += "\t" + std::to_string(line_num);
1415 tsv_str += "\t" + std::to_string(word_num);
1416 AddBoxToTSV(res_it.get(), RIL_BLOCK, tsv_str);
1417 tsv_str += "\t-1\t\n"; // end of row for block
1418 }
1419 if (res_it->IsAtBeginningOf(RIL_PARA)) {
1420 par_num++;
1421 line_num = 0;
1422 word_num = 0;
1423 tsv_str += "3\t" + std::to_string(page_num); // level 3 - paragraph
1424 tsv_str += "\t" + std::to_string(block_num);
1425 tsv_str += "\t" + std::to_string(par_num);
1426 tsv_str += "\t" + std::to_string(line_num);
1427 tsv_str += "\t" + std::to_string(word_num);
1428 AddBoxToTSV(res_it.get(), RIL_PARA, tsv_str);
1429 tsv_str += "\t-1\t\n"; // end of row for para
1430 }
1431 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
1432 line_num++;
1433 word_num = 0;
1434 tsv_str += "4\t" + std::to_string(page_num); // level 4 - line
1435 tsv_str += "\t" + std::to_string(block_num);
1436 tsv_str += "\t" + std::to_string(par_num);
1437 tsv_str += "\t" + std::to_string(line_num);
1438 tsv_str += "\t" + std::to_string(word_num);
1439 AddBoxToTSV(res_it.get(), RIL_TEXTLINE, tsv_str);
1440 tsv_str += "\t-1\t\n"; // end of row for line
1441 }
1442
1443 // Now, process the word...
1444 int left, top, right, bottom;
1445 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
1446 word_num++;
1447 tsv_str += "5\t" + std::to_string(page_num); // level 5 - word
1448 tsv_str += "\t" + std::to_string(block_num);
1449 tsv_str += "\t" + std::to_string(par_num);
1450 tsv_str += "\t" + std::to_string(line_num);
1451 tsv_str += "\t" + std::to_string(word_num);
1452 tsv_str += "\t" + std::to_string(left);
1453 tsv_str += "\t" + std::to_string(top);
1454 tsv_str += "\t" + std::to_string(right - left);
1455 tsv_str += "\t" + std::to_string(bottom - top);
1456 tsv_str += "\t" + std::to_string(res_it->Confidence(RIL_WORD));
1457 tsv_str += "\t";
1458
1459 // Increment counts if at end of block/paragraph/textline.
1460 if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) {
1461 lcnt++;
1462 }
1463 if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) {
1464 pcnt++;
1465 }
1466 if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) {
1467 bcnt++;
1468 }
1469
1470 do {
1471 tsv_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
1472 res_it->Next(RIL_SYMBOL);
1473 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
1474 tsv_str += "\n"; // end of row
1475 wcnt++;
1476 }
1477
1478 char *ret = new char[tsv_str.length() + 1];
1479 strcpy(ret, tsv_str.c_str());
1480 return ret;
1481 }
1482
/** The 5 numbers output for each box (the usual 4 and a page number). */
const int kNumbersPerBlob = 5;
/**
 * The number of bytes taken by each number. Since we use int16_t for ICOORD,
 * assume only 5 digits max.
 */
const int kBytesPerNumber = 5;
/**
 * Multiplier for max expected text length: (kBytesPerNumber + space)
 * * kNumbersPerBlob plus the newline. The box text buffer is sized as this
 * value per blob, plus the original UTF8 characters, plus one
 * kMaxBytesPerLine for safety.
 */
const int kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1;
/** Max bytes in the decimal representation of int64_t. */
const int kBytesPer64BitNumber = 20;
/**
 * A maximal single box could occupy kNumbersPerBlob numbers at
 * kBytesPer64BitNumber digits (if someone sneaks in a 64 bit value) and a
 * space plus the newline and the maximum length of a UNICHAR.
 * Test against this on each iteration for safety.
 */
const int kMaxBytesPerLine = kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 + UNICHAR_LEN;
1505
1506 /**
1507 * The recognized text is returned as a char* which is coded
1508 * as a UTF8 box file.
1509 * page_number is a 0-base page index that will appear in the box file.
1510 * Returned string must be freed with the delete [] operator.
1511 */
GetBoxText(int page_number)1512 char *TessBaseAPI::GetBoxText(int page_number) {
1513 if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1514 return nullptr;
1515 }
1516 int blob_count;
1517 int utf8_length = TextLength(&blob_count);
1518 int total_length = blob_count * kBytesPerBoxFileLine + utf8_length + kMaxBytesPerLine;
1519 char *result = new char[total_length];
1520 result[0] = '\0';
1521 int output_length = 0;
1522 LTRResultIterator *it = GetLTRIterator();
1523 do {
1524 int left, top, right, bottom;
1525 if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {
1526 const std::unique_ptr</*non-const*/ char[]> text(it->GetUTF8Text(RIL_SYMBOL));
1527 // Tesseract uses space for recognition failure. Fix to a reject
1528 // character, kTesseractReject so we don't create illegal box files.
1529 for (int i = 0; text[i] != '\0'; ++i) {
1530 if (text[i] == ' ') {
1531 text[i] = kTesseractReject;
1532 }
1533 }
1534 snprintf(result + output_length, total_length - output_length, "%s %d %d %d %d %d\n",
1535 text.get(), left, image_height_ - bottom, right, image_height_ - top, page_number);
1536 output_length += strlen(result + output_length);
1537 // Just in case...
1538 if (output_length + kMaxBytesPerLine > total_length) {
1539 break;
1540 }
1541 }
1542 } while (it->Next(RIL_SYMBOL));
1543 delete it;
1544 return result;
1545 }
1546
/**
 * Conversion table for non-latin characters.
 * Maps characters out of the latin set into the latin set.
 * The list is terminated by a 0 entry; the entry at index i is replaced by
 * kLatinChs[i], so the two arrays must be kept the same length.
 * TODO(rays) incorporate this translation into unicharset.
 */
const int kUniChs[] = {0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0};
/** Latin chars corresponding to the unicode chars above (0-terminated). */
const int kLatinChs[] = {0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0};
1555
1556 /**
1557 * The recognized text is returned as a char* which is coded
1558 * as UNLV format Latin-1 with specific reject and suspect codes.
1559 * Returned string must be freed with the delete [] operator.
1560 */
GetUNLVText()1561 char *TessBaseAPI::GetUNLVText() {
1562 if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1563 return nullptr;
1564 }
1565 bool tilde_crunch_written = false;
1566 bool last_char_was_newline = true;
1567 bool last_char_was_tilde = false;
1568
1569 int total_length = TextLength(nullptr);
1570 PAGE_RES_IT page_res_it(page_res_);
1571 char *result = new char[total_length];
1572 char *ptr = result;
1573 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
1574 WERD_RES *word = page_res_it.word();
1575 // Process the current word.
1576 if (word->unlv_crunch_mode != CR_NONE) {
1577 if (word->unlv_crunch_mode != CR_DELETE &&
1578 (!tilde_crunch_written ||
1579 (word->unlv_crunch_mode == CR_KEEP_SPACE && word->word->space() > 0 &&
1580 !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {
1581 if (!word->word->flag(W_BOL) && word->word->space() > 0 && !word->word->flag(W_FUZZY_NON) &&
1582 !word->word->flag(W_FUZZY_SP)) {
1583 /* Write a space to separate from preceding good text */
1584 *ptr++ = ' ';
1585 last_char_was_tilde = false;
1586 }
1587 if (!last_char_was_tilde) {
1588 // Write a reject char.
1589 last_char_was_tilde = true;
1590 *ptr++ = kUNLVReject;
1591 tilde_crunch_written = true;
1592 last_char_was_newline = false;
1593 }
1594 }
1595 } else {
1596 // NORMAL PROCESSING of non tilde crunched words.
1597 tilde_crunch_written = false;
1598 tesseract_->set_unlv_suspects(word);
1599 const char *wordstr = word->best_choice->unichar_string().c_str();
1600 const auto &lengths = word->best_choice->unichar_lengths();
1601 int length = lengths.length();
1602 int i = 0;
1603 int offset = 0;
1604
1605 if (last_char_was_tilde && word->word->space() == 0 && wordstr[offset] == ' ') {
1606 // Prevent adjacent tilde across words - we know that adjacent tildes
1607 // within words have been removed.
1608 // Skip the first character.
1609 offset = lengths[i++];
1610 }
1611 if (i < length && wordstr[offset] != 0) {
1612 if (!last_char_was_newline) {
1613 *ptr++ = ' ';
1614 } else {
1615 last_char_was_newline = false;
1616 }
1617 for (; i < length; offset += lengths[i++]) {
1618 if (wordstr[offset] == ' ' || wordstr[offset] == kTesseractReject) {
1619 *ptr++ = kUNLVReject;
1620 last_char_was_tilde = true;
1621 } else {
1622 if (word->reject_map[i].rejected()) {
1623 *ptr++ = kUNLVSuspect;
1624 }
1625 UNICHAR ch(wordstr + offset, lengths[i]);
1626 int uni_ch = ch.first_uni();
1627 for (int j = 0; kUniChs[j] != 0; ++j) {
1628 if (kUniChs[j] == uni_ch) {
1629 uni_ch = kLatinChs[j];
1630 break;
1631 }
1632 }
1633 if (uni_ch <= 0xff) {
1634 *ptr++ = static_cast<char>(uni_ch);
1635 last_char_was_tilde = false;
1636 } else {
1637 *ptr++ = kUNLVReject;
1638 last_char_was_tilde = true;
1639 }
1640 }
1641 }
1642 }
1643 }
1644 if (word->word->flag(W_EOL) && !last_char_was_newline) {
1645 /* Add a new line output */
1646 *ptr++ = '\n';
1647 tilde_crunch_written = false;
1648 last_char_was_newline = true;
1649 last_char_was_tilde = false;
1650 }
1651 }
1652 *ptr++ = '\n';
1653 *ptr = '\0';
1654 return result;
1655 }
1656
1657 #ifndef DISABLED_LEGACY_ENGINE
1658
1659 /**
1660 * Detect the orientation of the input image and apparent script (alphabet).
1661 * orient_deg is the detected clockwise rotation of the input image in degrees
1662 * (0, 90, 180, 270)
1663 * orient_conf is the confidence (15.0 is reasonably confident)
1664 * script_name is an ASCII string, the name of the script, e.g. "Latin"
1665 * script_conf is confidence level in the script
1666 * Returns true on success and writes values to each parameter as an output
1667 */
DetectOrientationScript(int * orient_deg,float * orient_conf,const char ** script_name,float * script_conf)1668 bool TessBaseAPI::DetectOrientationScript(int *orient_deg, float *orient_conf,
1669 const char **script_name, float *script_conf) {
1670 OSResults osr;
1671
1672 bool osd = DetectOS(&osr);
1673 if (!osd) {
1674 return false;
1675 }
1676
1677 int orient_id = osr.best_result.orientation_id;
1678 int script_id = osr.get_best_script(orient_id);
1679 if (orient_conf) {
1680 *orient_conf = osr.best_result.oconfidence;
1681 }
1682 if (orient_deg) {
1683 *orient_deg = orient_id * 90; // convert quadrant to degrees
1684 }
1685
1686 if (script_name) {
1687 const char *script = osr.unicharset->get_script_from_script_id(script_id);
1688
1689 *script_name = script;
1690 }
1691
1692 if (script_conf) {
1693 *script_conf = osr.best_result.sconfidence;
1694 }
1695
1696 return true;
1697 }
1698
1699 /**
1700 * The recognized text is returned as a char* which is coded
1701 * as UTF8 and must be freed with the delete [] operator.
1702 * page_number is a 0-based page index that will appear in the osd file.
1703 */
GetOsdText(int page_number)1704 char *TessBaseAPI::GetOsdText(int page_number) {
1705 int orient_deg;
1706 float orient_conf;
1707 const char *script_name;
1708 float script_conf;
1709
1710 if (!DetectOrientationScript(&orient_deg, &orient_conf, &script_name, &script_conf)) {
1711 return nullptr;
1712 }
1713
1714 // clockwise rotation needed to make the page upright
1715 int rotate = OrientationIdToValue(orient_deg / 90);
1716
1717 std::stringstream stream;
1718 // Use "C" locale (needed for float values orient_conf and script_conf).
1719 stream.imbue(std::locale::classic());
1720 // Use fixed notation with 2 digits after the decimal point for float values.
1721 stream.precision(2);
1722 stream << std::fixed << "Page number: " << page_number << "\n"
1723 << "Orientation in degrees: " << orient_deg << "\n"
1724 << "Rotate: " << rotate << "\n"
1725 << "Orientation confidence: " << orient_conf << "\n"
1726 << "Script: " << script_name << "\n"
1727 << "Script confidence: " << script_conf << "\n";
1728 const std::string &text = stream.str();
1729 char *result = new char[text.length() + 1];
1730 strcpy(result, text.c_str());
1731 return result;
1732 }
1733
1734 #endif // ndef DISABLED_LEGACY_ENGINE
1735
1736 /** Returns the average word confidence for Tesseract page result. */
MeanTextConf()1737 int TessBaseAPI::MeanTextConf() {
1738 int *conf = AllWordConfidences();
1739 if (!conf) {
1740 return 0;
1741 }
1742 int sum = 0;
1743 int *pt = conf;
1744 while (*pt >= 0) {
1745 sum += *pt++;
1746 }
1747 if (pt != conf) {
1748 sum /= pt - conf;
1749 }
1750 delete[] conf;
1751 return sum;
1752 }
1753
1754 /** Returns an array of all word confidences, terminated by -1. */
AllWordConfidences()1755 int *TessBaseAPI::AllWordConfidences() {
1756 if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1757 return nullptr;
1758 }
1759 int n_word = 0;
1760 PAGE_RES_IT res_it(page_res_);
1761 for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) {
1762 n_word++;
1763 }
1764
1765 int *conf = new int[n_word + 1];
1766 n_word = 0;
1767 for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) {
1768 WERD_RES *word = res_it.word();
1769 WERD_CHOICE *choice = word->best_choice;
1770 int w_conf = static_cast<int>(100 + 5 * choice->certainty());
1771 // This is the eq for converting Tesseract confidence to 1..100
1772 if (w_conf < 0) {
1773 w_conf = 0;
1774 }
1775 if (w_conf > 100) {
1776 w_conf = 100;
1777 }
1778 conf[n_word++] = w_conf;
1779 }
1780 conf[n_word] = -1;
1781 return conf;
1782 }
1783
1784 #ifndef DISABLED_LEGACY_ENGINE
1785 /**
1786 * Applies the given word to the adaptive classifier if possible.
1787 * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can
1788 * tell the boundaries of the graphemes.
1789 * Assumes that SetImage/SetRectangle have been used to set the image
1790 * to the given word. The mode arg should be PSM_SINGLE_WORD or
1791 * PSM_CIRCLE_WORD, as that will be used to control layout analysis.
1792 * The currently set PageSegMode is preserved.
1793 * Returns false if adaption was not possible for some reason.
1794 */
AdaptToWordStr(PageSegMode mode,const char * wordstr)1795 bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char *wordstr) {
1796 int debug = 0;
1797 GetIntVariable("applybox_debug", &debug);
1798 bool success = true;
1799 PageSegMode current_psm = GetPageSegMode();
1800 SetPageSegMode(mode);
1801 SetVariable("classify_enable_learning", "0");
1802 const std::unique_ptr<const char[]> text(GetUTF8Text());
1803 if (debug) {
1804 tprintf("Trying to adapt \"%s\" to \"%s\"\n", text.get(), wordstr);
1805 }
1806 if (text != nullptr) {
1807 PAGE_RES_IT it(page_res_);
1808 WERD_RES *word_res = it.word();
1809 if (word_res != nullptr) {
1810 word_res->word->set_text(wordstr);
1811 // Check to see if text matches wordstr.
1812 int w = 0;
1813 int t;
1814 for (t = 0; text[t] != '\0'; ++t) {
1815 if (text[t] == '\n' || text[t] == ' ') {
1816 continue;
1817 }
1818 while (wordstr[w] == ' ') {
1819 ++w;
1820 }
1821 if (text[t] != wordstr[w]) {
1822 break;
1823 }
1824 ++w;
1825 }
1826 if (text[t] != '\0' || wordstr[w] != '\0') {
1827 // No match.
1828 delete page_res_;
1829 std::vector<TBOX> boxes;
1830 page_res_ = tesseract_->SetupApplyBoxes(boxes, block_list_);
1831 tesseract_->ReSegmentByClassification(page_res_);
1832 tesseract_->TidyUp(page_res_);
1833 PAGE_RES_IT pr_it(page_res_);
1834 if (pr_it.word() == nullptr) {
1835 success = false;
1836 } else {
1837 word_res = pr_it.word();
1838 }
1839 } else {
1840 word_res->BestChoiceToCorrectText();
1841 }
1842 if (success) {
1843 tesseract_->EnableLearning = true;
1844 tesseract_->LearnWord(nullptr, word_res);
1845 }
1846 } else {
1847 success = false;
1848 }
1849 } else {
1850 success = false;
1851 }
1852 SetPageSegMode(current_psm);
1853 return success;
1854 }
1855 #endif // ndef DISABLED_LEGACY_ENGINE
1856
1857 /**
1858 * Free up recognition results and any stored image data, without actually
1859 * freeing any recognition data that would be time-consuming to reload.
1860 * Afterwards, you must call SetImage or TesseractRect before doing
1861 * any Recognize or Get* operation.
1862 */
Clear()1863 void TessBaseAPI::Clear() {
1864 if (thresholder_ != nullptr) {
1865 thresholder_->Clear();
1866 }
1867 ClearResults();
1868 if (tesseract_ != nullptr) {
1869 SetInputImage(nullptr);
1870 }
1871 }
1872
1873 /**
1874 * Close down tesseract and free up all memory. End() is equivalent to
1875 * destructing and reconstructing your TessBaseAPI.
1876 * Once End() has been used, none of the other API functions may be used
1877 * other than Init and anything declared above it in the class definition.
1878 */
End()1879 void TessBaseAPI::End() {
1880 Clear();
1881 delete thresholder_;
1882 thresholder_ = nullptr;
1883 delete page_res_;
1884 page_res_ = nullptr;
1885 delete block_list_;
1886 block_list_ = nullptr;
1887 if (paragraph_models_ != nullptr) {
1888 for (auto model : *paragraph_models_) {
1889 delete model;
1890 }
1891 delete paragraph_models_;
1892 paragraph_models_ = nullptr;
1893 }
1894 #ifndef DISABLED_LEGACY_ENGINE
1895 if (osd_tesseract_ == tesseract_) {
1896 osd_tesseract_ = nullptr;
1897 }
1898 delete osd_tesseract_;
1899 osd_tesseract_ = nullptr;
1900 delete equ_detect_;
1901 equ_detect_ = nullptr;
1902 #endif // ndef DISABLED_LEGACY_ENGINE
1903 delete tesseract_;
1904 tesseract_ = nullptr;
1905 input_file_.clear();
1906 output_file_.clear();
1907 datapath_.clear();
1908 language_.clear();
1909 }
1910
1911 // Clear any library-level memory caches.
1912 // There are a variety of expensive-to-load constant data structures (mostly
1913 // language dictionaries) that are cached globally -- surviving the Init()
1914 // and End() of individual TessBaseAPI's. This function allows the clearing
1915 // of these caches.
ClearPersistentCache()1916 void TessBaseAPI::ClearPersistentCache() {
1917 Dict::GlobalDawgCache()->DeleteUnusedDawgs();
1918 }
1919
1920 /**
1921 * Check whether a word is valid according to Tesseract's language model
1922 * returns 0 if the word is invalid, non-zero if valid
1923 */
IsValidWord(const char * word) const1924 int TessBaseAPI::IsValidWord(const char *word) const {
1925 return tesseract_->getDict().valid_word(word);
1926 }
1927 // Returns true if utf8_character is defined in the UniCharset.
IsValidCharacter(const char * utf8_character) const1928 bool TessBaseAPI::IsValidCharacter(const char *utf8_character) const {
1929 return tesseract_->unicharset.contains_unichar(utf8_character);
1930 }
1931
1932 // TODO(rays) Obsolete this function and replace with a more aptly named
1933 // function that returns image coordinates rather than tesseract coordinates.
GetTextDirection(int * out_offset,float * out_slope)1934 bool TessBaseAPI::GetTextDirection(int *out_offset, float *out_slope) {
1935 const std::unique_ptr<const PageIterator> it(AnalyseLayout());
1936 if (it == nullptr) {
1937 return false;
1938 }
1939 int x1, x2, y1, y2;
1940 it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
1941 // Calculate offset and slope (NOTE: Kind of ugly)
1942 if (x2 <= x1) {
1943 x2 = x1 + 1;
1944 }
1945 // Convert the point pair to slope/offset of the baseline (in image coords.)
1946 *out_slope = static_cast<float>(y2 - y1) / (x2 - x1);
1947 *out_offset = static_cast<int>(y1 - *out_slope * x1);
1948 // Get the y-coord of the baseline at the left and right edges of the
1949 // textline's bounding box.
1950 int left, top, right, bottom;
1951 if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom)) {
1952 return false;
1953 }
1954 int left_y = IntCastRounded(*out_slope * left + *out_offset);
1955 int right_y = IntCastRounded(*out_slope * right + *out_offset);
1956 // Shift the baseline down so it passes through the nearest bottom-corner
1957 // of the textline's bounding box. This is the difference between the y
1958 // at the lowest (max) edge of the box and the actual box bottom.
1959 *out_offset += bottom - std::max(left_y, right_y);
1960 // Switch back to bottom-up tesseract coordinates. Requires negation of
1961 // the slope and height - offset for the offset.
1962 *out_slope = -*out_slope;
1963 *out_offset = rect_height_ - *out_offset;
1964
1965 return true;
1966 }
1967
1968 /** Sets Dict::letter_is_okay_ function to point to the given function. */
SetDictFunc(DictFunc f)1969 void TessBaseAPI::SetDictFunc(DictFunc f) {
1970 if (tesseract_ != nullptr) {
1971 tesseract_->getDict().letter_is_okay_ = f;
1972 }
1973 }
1974
1975 /**
1976 * Sets Dict::probability_in_context_ function to point to the given
1977 * function.
1978 *
1979 * @param f A single function that returns the probability of the current
1980 * "character" (in general a utf-8 string), given the context of a previous
1981 * utf-8 string.
1982 */
SetProbabilityInContextFunc(ProbabilityInContextFunc f)1983 void TessBaseAPI::SetProbabilityInContextFunc(ProbabilityInContextFunc f) {
1984 if (tesseract_ != nullptr) {
1985 tesseract_->getDict().probability_in_context_ = f;
1986 // Set it for the sublangs too.
1987 int num_subs = tesseract_->num_sub_langs();
1988 for (int i = 0; i < num_subs; ++i) {
1989 tesseract_->get_sub_lang(i)->getDict().probability_in_context_ = f;
1990 }
1991 }
1992 }
1993
1994 /** Common code for setting the image. */
InternalSetImage()1995 bool TessBaseAPI::InternalSetImage() {
1996 if (tesseract_ == nullptr) {
1997 tprintf("Please call Init before attempting to set an image.\n");
1998 return false;
1999 }
2000 if (thresholder_ == nullptr) {
2001 thresholder_ = new ImageThresholder;
2002 }
2003 ClearResults();
2004 return true;
2005 }
2006
2007 /**
2008 * Run the thresholder to make the thresholded image, returned in pix,
2009 * which must not be nullptr. *pix must be initialized to nullptr, or point
2010 * to an existing pixDestroyable Pix.
2011 * The usual argument to Threshold is Tesseract::mutable_pix_binary().
2012 */
Threshold(Pix ** pix)2013 bool TessBaseAPI::Threshold(Pix **pix) {
2014 ASSERT_HOST(pix != nullptr);
2015 if (*pix != nullptr) {
2016 pixDestroy(pix);
2017 }
2018 // Zero resolution messes up the algorithms, so make sure it is credible.
2019 int user_dpi = 0;
2020 GetIntVariable("user_defined_dpi", &user_dpi);
2021 int y_res = thresholder_->GetScaledYResolution();
2022 if (user_dpi && (user_dpi < kMinCredibleResolution || user_dpi > kMaxCredibleResolution)) {
2023 tprintf(
2024 "Warning: User defined image dpi is outside of expected range "
2025 "(%d - %d)!\n",
2026 kMinCredibleResolution, kMaxCredibleResolution);
2027 }
2028 // Always use user defined dpi
2029 if (user_dpi) {
2030 thresholder_->SetSourceYResolution(user_dpi);
2031 } else if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) {
2032 if (y_res != 0) {
2033 // Show warning only if a resolution was given.
2034 tprintf("Warning: Invalid resolution %d dpi. Using %d instead.\n",
2035 y_res, kMinCredibleResolution);
2036 }
2037 thresholder_->SetSourceYResolution(kMinCredibleResolution);
2038 }
2039
2040 auto thresholding_method = static_cast<ThresholdMethod>(static_cast<int>(tesseract_->thresholding_method));
2041
2042 if (thresholding_method == ThresholdMethod::Otsu) {
2043 Image pix_binary(*pix);
2044 if (!thresholder_->ThresholdToPix(&pix_binary)) {
2045 return false;
2046 }
2047 *pix = pix_binary;
2048
2049 if (!thresholder_->IsBinary()) {
2050 tesseract_->set_pix_thresholds(thresholder_->GetPixRectThresholds());
2051 tesseract_->set_pix_grey(thresholder_->GetPixRectGrey());
2052 } else {
2053 tesseract_->set_pix_thresholds(nullptr);
2054 tesseract_->set_pix_grey(nullptr);
2055 }
2056 } else {
2057 auto [ok, pix_grey, pix_binary, pix_thresholds] = thresholder_->Threshold(this, thresholding_method);
2058
2059 if (!ok) {
2060 return false;
2061 }
2062 *pix = pix_binary;
2063
2064 tesseract_->set_pix_thresholds(pix_thresholds);
2065 tesseract_->set_pix_grey(pix_grey);
2066 }
2067
2068 thresholder_->GetImageSizes(&rect_left_, &rect_top_, &rect_width_, &rect_height_, &image_width_,
2069 &image_height_);
2070
2071 // Set the internal resolution that is used for layout parameters from the
2072 // estimated resolution, rather than the image resolution, which may be
2073 // fabricated, but we will use the image resolution, if there is one, to
2074 // report output point sizes.
2075 int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(),
2076 kMinCredibleResolution, kMaxCredibleResolution);
2077 if (estimated_res != thresholder_->GetScaledEstimatedResolution()) {
2078 tprintf(
2079 "Estimated internal resolution %d out of range! "
2080 "Corrected to %d.\n",
2081 thresholder_->GetScaledEstimatedResolution(), estimated_res);
2082 }
2083 tesseract_->set_source_resolution(estimated_res);
2084 return true;
2085 }
2086
2087 /** Find lines from the image making the BLOCK_LIST. */
FindLines()2088 int TessBaseAPI::FindLines() {
2089 if (thresholder_ == nullptr || thresholder_->IsEmpty()) {
2090 tprintf("Please call SetImage before attempting recognition.\n");
2091 return -1;
2092 }
2093 if (recognition_done_) {
2094 ClearResults();
2095 }
2096 if (!block_list_->empty()) {
2097 return 0;
2098 }
2099 if (tesseract_ == nullptr) {
2100 tesseract_ = new Tesseract;
2101 #ifndef DISABLED_LEGACY_ENGINE
2102 tesseract_->InitAdaptiveClassifier(nullptr);
2103 #endif
2104 }
2105 if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
2106 return -1;
2107 }
2108
2109 tesseract_->PrepareForPageseg();
2110
2111 #ifndef DISABLED_LEGACY_ENGINE
2112 if (tesseract_->textord_equation_detect) {
2113 if (equ_detect_ == nullptr && !datapath_.empty()) {
2114 equ_detect_ = new EquationDetect(datapath_.c_str(), nullptr);
2115 }
2116 if (equ_detect_ == nullptr) {
2117 tprintf("Warning: Could not set equation detector\n");
2118 } else {
2119 tesseract_->SetEquationDetect(equ_detect_);
2120 }
2121 }
2122 #endif // ndef DISABLED_LEGACY_ENGINE
2123
2124 Tesseract *osd_tess = osd_tesseract_;
2125 OSResults osr;
2126 #ifndef DISABLED_LEGACY_ENGINE
2127 if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == nullptr) {
2128 if (strcmp(language_.c_str(), "osd") == 0) {
2129 osd_tess = tesseract_;
2130 } else {
2131 osd_tesseract_ = new Tesseract;
2132 TessdataManager mgr(reader_);
2133 if (datapath_.empty()) {
2134 tprintf(
2135 "Warning: Auto orientation and script detection requested,"
2136 " but data path is undefined\n");
2137 delete osd_tesseract_;
2138 osd_tesseract_ = nullptr;
2139 } else if (osd_tesseract_->init_tesseract(datapath_.c_str(), "", "osd", OEM_TESSERACT_ONLY,
2140 nullptr, 0, nullptr, nullptr, false, &mgr) == 0) {
2141 osd_tess = osd_tesseract_;
2142 osd_tesseract_->set_source_resolution(thresholder_->GetSourceYResolution());
2143 } else {
2144 tprintf(
2145 "Warning: Auto orientation and script detection requested,"
2146 " but osd language failed to load\n");
2147 delete osd_tesseract_;
2148 osd_tesseract_ = nullptr;
2149 }
2150 }
2151 }
2152 #endif // ndef DISABLED_LEGACY_ENGINE
2153
2154 if (tesseract_->SegmentPage(input_file_.c_str(), block_list_, osd_tess, &osr) < 0) {
2155 return -1;
2156 }
2157
2158 // If Devanagari is being recognized, we use different images for page seg
2159 // and for OCR.
2160 tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr);
2161 return 0;
2162 }
2163
2164 /** Delete the pageres and clear the block list ready for a new page. */
ClearResults()2165 void TessBaseAPI::ClearResults() {
2166 if (tesseract_ != nullptr) {
2167 tesseract_->Clear();
2168 }
2169 delete page_res_;
2170 page_res_ = nullptr;
2171 recognition_done_ = false;
2172 if (block_list_ == nullptr) {
2173 block_list_ = new BLOCK_LIST;
2174 } else {
2175 block_list_->clear();
2176 }
2177 if (paragraph_models_ != nullptr) {
2178 for (auto model : *paragraph_models_) {
2179 delete model;
2180 }
2181 delete paragraph_models_;
2182 paragraph_models_ = nullptr;
2183 }
2184 }
2185
2186 /**
2187 * Return the length of the output text string, as UTF8, assuming
2188 * liberally two spacing marks after each word (as paragraphs end with two
2189 * newlines), and assuming a single character reject marker for each rejected
2190 * character.
2191 * Also return the number of recognized blobs in blob_count.
2192 */
TextLength(int * blob_count) const2193 int TessBaseAPI::TextLength(int *blob_count) const {
2194 if (tesseract_ == nullptr || page_res_ == nullptr) {
2195 return 0;
2196 }
2197
2198 PAGE_RES_IT page_res_it(page_res_);
2199 int total_length = 2;
2200 int total_blobs = 0;
2201 // Iterate over the data structures to extract the recognition result.
2202 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2203 WERD_RES *word = page_res_it.word();
2204 WERD_CHOICE *choice = word->best_choice;
2205 if (choice != nullptr) {
2206 total_blobs += choice->length() + 2;
2207 total_length += choice->unichar_string().length() + 2;
2208 for (int i = 0; i < word->reject_map.length(); ++i) {
2209 if (word->reject_map[i].rejected()) {
2210 ++total_length;
2211 }
2212 }
2213 }
2214 }
2215 if (blob_count != nullptr) {
2216 *blob_count = total_blobs;
2217 }
2218 return total_length;
2219 }
2220
2221 #ifndef DISABLED_LEGACY_ENGINE
2222 /**
2223 * Estimates the Orientation And Script of the image.
2224 * Returns true if the image was processed successfully.
2225 */
DetectOS(OSResults * osr)2226 bool TessBaseAPI::DetectOS(OSResults *osr) {
2227 if (tesseract_ == nullptr) {
2228 return false;
2229 }
2230 ClearResults();
2231 if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
2232 return false;
2233 }
2234
2235 if (input_file_.empty()) {
2236 input_file_ = kInputFile;
2237 }
2238 return orientation_and_script_detection(input_file_.c_str(), osr, tesseract_) > 0;
2239 }
2240 #endif // #ifndef DISABLED_LEGACY_ENGINE
2241
set_min_orientation_margin(double margin)2242 void TessBaseAPI::set_min_orientation_margin(double margin) {
2243 tesseract_->min_orientation_margin.set_value(margin);
2244 }
2245
2246 /**
2247 * Return text orientation of each block as determined in an earlier page layout
2248 * analysis operation. Orientation is returned as the number of ccw 90-degree
2249 * rotations (in [0..3]) required to make the text in the block upright
2250 * (readable). Note that this may not necessary be the block orientation
2251 * preferred for recognition (such as the case of vertical CJK text).
2252 *
2253 * Also returns whether the text in the block is believed to have vertical
2254 * writing direction (when in an upright page orientation).
2255 *
2256 * The returned array is of length equal to the number of text blocks, which may
2257 * be less than the total number of blocks. The ordering is intended to be
2258 * consistent with GetTextLines().
2259 */
GetBlockTextOrientations(int ** block_orientation,bool ** vertical_writing)2260 void TessBaseAPI::GetBlockTextOrientations(int **block_orientation, bool **vertical_writing) {
2261 delete[] * block_orientation;
2262 *block_orientation = nullptr;
2263 delete[] * vertical_writing;
2264 *vertical_writing = nullptr;
2265 BLOCK_IT block_it(block_list_);
2266
2267 block_it.move_to_first();
2268 int num_blocks = 0;
2269 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
2270 if (!block_it.data()->pdblk.poly_block()->IsText()) {
2271 continue;
2272 }
2273 ++num_blocks;
2274 }
2275 if (!num_blocks) {
2276 tprintf("WARNING: Found no blocks\n");
2277 return;
2278 }
2279 *block_orientation = new int[num_blocks];
2280 *vertical_writing = new bool[num_blocks];
2281 block_it.move_to_first();
2282 int i = 0;
2283 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
2284 if (!block_it.data()->pdblk.poly_block()->IsText()) {
2285 continue;
2286 }
2287 FCOORD re_rotation = block_it.data()->re_rotation();
2288 float re_theta = re_rotation.angle();
2289 FCOORD classify_rotation = block_it.data()->classify_rotation();
2290 float classify_theta = classify_rotation.angle();
2291 double rot_theta = -(re_theta - classify_theta) * 2.0 / M_PI;
2292 if (rot_theta < 0) {
2293 rot_theta += 4;
2294 }
2295 int num_rotations = static_cast<int>(rot_theta + 0.5);
2296 (*block_orientation)[i] = num_rotations;
2297 // The classify_rotation is non-zero only if the text has vertical
2298 // writing direction.
2299 (*vertical_writing)[i] = classify_rotation.y() != 0.0f;
2300 ++i;
2301 }
2302 }
2303
DetectParagraphs(bool after_text_recognition)2304 void TessBaseAPI::DetectParagraphs(bool after_text_recognition) {
2305 int debug_level = 0;
2306 GetIntVariable("paragraph_debug_level", &debug_level);
2307 if (paragraph_models_ == nullptr) {
2308 paragraph_models_ = new std::vector<ParagraphModel *>;
2309 }
2310 MutableIterator *result_it = GetMutableIterator();
2311 do { // Detect paragraphs for this block
2312 std::vector<ParagraphModel *> models;
2313 ::tesseract::DetectParagraphs(debug_level, after_text_recognition, result_it, &models);
2314 paragraph_models_->insert(paragraph_models_->end(), models.begin(), models.end());
2315 } while (result_it->Next(RIL_BLOCK));
2316 delete result_it;
2317 }
2318
2319 /** This method returns the string form of the specified unichar. */
GetUnichar(int unichar_id) const2320 const char *TessBaseAPI::GetUnichar(int unichar_id) const {
2321 return tesseract_->unicharset.id_to_unichar(unichar_id);
2322 }
2323
2324 /** Return the pointer to the i-th dawg loaded into tesseract_ object. */
GetDawg(int i) const2325 const Dawg *TessBaseAPI::GetDawg(int i) const {
2326 if (tesseract_ == nullptr || i >= NumDawgs()) {
2327 return nullptr;
2328 }
2329 return tesseract_->getDict().GetDawg(i);
2330 }
2331
2332 /** Return the number of dawgs loaded into tesseract_ object. */
NumDawgs() const2333 int TessBaseAPI::NumDawgs() const {
2334 return tesseract_ == nullptr ? 0 : tesseract_->getDict().NumDawgs();
2335 }
2336
/**
 * Escape a char string - replace <>&"' with HTML codes.
 *
 * @param text NUL-terminated input string; nullptr yields an empty result.
 * @return A copy of text in which '<', '>', '&', '"' and '\'' are replaced
 * by the entities &lt; &gt; &amp; &quot; and &#39;. All other bytes are
 * copied unchanged.
 */
std::string HOcrEscape(const char *text) {
  std::string ret;
  if (text == nullptr) {
    return ret;
  }
  for (const char *ptr = text; *ptr; ptr++) {
    switch (*ptr) {
      case '<':
        ret += "&lt;";
        break;
      case '>':
        ret += "&gt;";
        break;
      case '&':
        ret += "&amp;";
        break;
      case '"':
        ret += "&quot;";
        break;
      case '\'':
        ret += "&#39;";
        break;
      default:
        ret += *ptr;
    }
  }
  return ret;
}
2364
2365 } // namespace tesseract
2366