1 /**********************************************************************
2  * File:        applybox.cpp  (Formerly applybox.c)
3  * Description: Re segment rows according to box file data
4  * Author:      Phil Cheatle
5  *
6  * (C) Copyright 1993, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #ifndef DISABLED_LEGACY_ENGINE
20 #  include <allheaders.h>
21 #  include <cctype>
22 #  include <cerrno>
23 #  include <cstring>
24 #  include "boxread.h"
25 #endif // ndef DISABLED_LEGACY_ENGINE
26 #include <tesseract/unichar.h>
27 #include "pageres.h"
28 #include "tesseractclass.h"
29 #include "unicharset.h"
30 
31 #ifndef DISABLED_LEGACY_ENGINE
32 /** Max number of blobs to classify together in FindSegmentation. */
33 const int kMaxGroupSize = 4;
34 /// Max fraction of median allowed as deviation in xheight before switching
35 /// to median.
36 const double kMaxXHeightDeviationFraction = 0.125;
37 #endif // ndef DISABLED_LEGACY_ENGINE
38 
39 /**
40  * The box file is assumed to contain box definitions, one per line, of the
41  * following format for blob-level boxes:
42  * @verbatim
43  *   <UTF8 str> <left> <bottom> <right> <top> <page id>
44  * @endverbatim
45  * and for word/line-level boxes:
46  * @verbatim
47  *   WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
48  * @endverbatim
49  * NOTES:
50  * The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
51  *
52  * <page id> is 0-based, and the page number is used for multipage input (tiff).
53  *
54  * In the blob-level form, each line represents a recognizable unit, which may
55  * be several UTF-8 bytes, but there is a bounding box around each recognizable
56  * unit, and no classifier is needed to train in this mode (bootstrapping.)
57  *
58  * In the word/line-level form, the line begins with the literal "WordStr", and
59  * the bounding box bounds either a whole line or a whole word. The recognizable
60  * units in the word/line are listed after the # at the end of the line and
61  * are space delimited, ignoring any original spaces on the line.
62  * Eg.
63  * @verbatim
64  * word -> #w o r d
65  * multi word line -> #m u l t i w o r d l i n e
66  * @endverbatim
67  * The recognizable units must be space-delimited in order to allow multiple
68  * unicodes to be used for a single recognizable unit, eg Hindi.
69  *
70  * In this mode, the classifier must have been pre-trained with the desired
71  * character set, or it will not be able to find the character segmentations.
72  */
73 
74 namespace tesseract {
75 
76 #ifndef DISABLED_LEGACY_ENGINE
clear_any_old_text(BLOCK_LIST * block_list)77 static void clear_any_old_text(BLOCK_LIST *block_list) {
78   BLOCK_IT block_it(block_list);
79   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
80     ROW_IT row_it(block_it.data()->row_list());
81     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
82       WERD_IT word_it(row_it.data()->word_list());
83       for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
84         word_it.data()->set_text("");
85       }
86     }
87   }
88 }
89 
90 // Applies the box file based on the image name filename, and resegments
91 // the words in the block_list (page), with:
92 // blob-mode: one blob per line in the box file, words as input.
93 // word/line-mode: one blob per space-delimited unit after the #, and one word
94 // per line in the box file. (See comment above for box file format.)
95 // If find_segmentation is true, (word/line mode) then the classifier is used
96 // to re-segment words/lines to match the space-delimited truth string for
97 // each box. In this case, the input box may be for a word or even a whole
98 // text line, and the output words will contain multiple blobs corresponding
99 // to the space-delimited input string.
100 // With find_segmentation false, no classifier is needed, but the chopper
101 // can still be used to correctly segment touching characters with the help
102 // of the input boxes.
103 // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
104 // from normal classification, ie. with a word, chopped_word, rebuild_word,
105 // seam_array, denorm, box_word, and best_state, but NO best_choice or
106 // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
107 // Instead, the correct_text member of WERD_RES is set, and this may be later
108 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
109 // is not required before calling ApplyBoxTraining.
ApplyBoxes(const char * filename,bool find_segmentation,BLOCK_LIST * block_list)110 PAGE_RES *Tesseract::ApplyBoxes(const char *filename, bool find_segmentation,
111                                 BLOCK_LIST *block_list) {
112   std::vector<TBOX> boxes;
113   std::vector<std::string> texts, full_texts;
114   if (!ReadAllBoxes(applybox_page, true, filename, &boxes, &texts, &full_texts, nullptr)) {
115     return nullptr; // Can't do it.
116   }
117 
118   const int box_count = boxes.size();
119   int box_failures = 0;
120 
121   // In word mode, we use the boxes to make a word for each box, but
122   // in blob mode we use the existing words and maximally chop them first.
123   PAGE_RES *page_res = find_segmentation ? nullptr : SetupApplyBoxes(boxes, block_list);
124   clear_any_old_text(block_list);
125 
126   for (int i = 0; i < box_count; i++) {
127     bool foundit = false;
128     if (page_res != nullptr) {
129       foundit =
130           ResegmentCharBox(page_res, (i == 0) ? nullptr : &boxes[i - 1], boxes[i],
131                            (i == box_count - 1) ? nullptr : &boxes[i + 1], full_texts[i].c_str());
132     } else {
133       foundit = ResegmentWordBox(block_list, boxes[i],
134                                  (i == box_count - 1) ? nullptr : &boxes[i + 1], texts[i].c_str());
135     }
136     if (!foundit) {
137       box_failures++;
138       ReportFailedBox(i, boxes[i], texts[i].c_str(), "FAILURE! Couldn't find a matching blob");
139     }
140   }
141 
142   if (page_res == nullptr) {
143     // In word/line mode, we now maximally chop all the words and resegment
144     // them with the classifier.
145     page_res = SetupApplyBoxes(boxes, block_list);
146     ReSegmentByClassification(page_res);
147   }
148   if (applybox_debug > 0) {
149     tprintf("APPLY_BOXES:\n");
150     tprintf("   Boxes read from boxfile:  %6d\n", box_count);
151     if (box_failures > 0) {
152       tprintf("   Boxes failed resegmentation:  %6d\n", box_failures);
153     }
154   }
155   TidyUp(page_res);
156   return page_res;
157 }
158 
159 // Helper computes median xheight in the image.
MedianXHeight(BLOCK_LIST * block_list)160 static double MedianXHeight(BLOCK_LIST *block_list) {
161   BLOCK_IT block_it(block_list);
162   STATS xheights(0, block_it.data()->pdblk.bounding_box().height());
163   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
164     ROW_IT row_it(block_it.data()->row_list());
165     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
166       xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
167     }
168   }
169   return xheights.median();
170 }
171 
172 /// Any row xheight that is significantly different from the median is set
173 /// to the median.
PreenXHeights(BLOCK_LIST * block_list)174 void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
175   const double median_xheight = MedianXHeight(block_list);
176   const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
177   // Strip all fuzzy space markers to simplify the PAGE_RES.
178   BLOCK_IT b_it(block_list);
179   for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
180     BLOCK *block = b_it.data();
181     ROW_IT r_it(block->row_list());
182     for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
183       ROW *row = r_it.data();
184       const double diff = fabs(row->x_height() - median_xheight);
185       if (diff > max_deviation) {
186         if (applybox_debug) {
187           tprintf("row xheight=%g, but median xheight = %g\n", row->x_height(), median_xheight);
188         }
189         row->set_x_height(static_cast<float>(median_xheight));
190       }
191     }
192   }
193 }
194 
195 /// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
196 /// All fuzzy spaces are removed, and all the words are maximally chopped.
SetupApplyBoxes(const std::vector<TBOX> & boxes,BLOCK_LIST * block_list)197 PAGE_RES *Tesseract::SetupApplyBoxes(const std::vector<TBOX> &boxes, BLOCK_LIST *block_list) {
198   PreenXHeights(block_list);
199   // Strip all fuzzy space markers to simplify the PAGE_RES.
200   BLOCK_IT b_it(block_list);
201   for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
202     BLOCK *block = b_it.data();
203     ROW_IT r_it(block->row_list());
204     for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
205       ROW *row = r_it.data();
206       WERD_IT w_it(row->word_list());
207       for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
208         WERD *word = w_it.data();
209         if (word->cblob_list()->empty()) {
210           delete w_it.extract();
211         } else {
212           word->set_flag(W_FUZZY_SP, false);
213           word->set_flag(W_FUZZY_NON, false);
214         }
215       }
216     }
217   }
218   auto *page_res = new PAGE_RES(false, block_list, nullptr);
219   PAGE_RES_IT pr_it(page_res);
220   WERD_RES *word_res;
221   while ((word_res = pr_it.word()) != nullptr) {
222     MaximallyChopWord(boxes, pr_it.block()->block, pr_it.row()->row, word_res);
223     pr_it.forward();
224   }
225   return page_res;
226 }
227 
228 /// Tests the chopper by exhaustively running chop_one_blob.
229 /// The word_res will contain filled chopped_word, seam_array, denorm,
230 /// box_word and best_state for the maximally chopped word.
MaximallyChopWord(const std::vector<TBOX> & boxes,BLOCK * block,ROW * row,WERD_RES * word_res)231 void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block, ROW *row,
232                                   WERD_RES *word_res) {
233   if (!word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
234                                      classify_bln_numeric_mode, textord_use_cjk_fp_model,
235                                      poly_allow_detailed_fx, row, block)) {
236     word_res->CloneChoppedToRebuild();
237     return;
238   }
239   if (chop_debug) {
240     tprintf("Maximally chopping word at:");
241     word_res->word->bounding_box().print();
242   }
243   std::vector<BLOB_CHOICE *> blob_choices;
244   ASSERT_HOST(!word_res->chopped_word->blobs.empty());
245   auto rating = static_cast<float>(INT8_MAX);
246   for (unsigned i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
247     // The rating and certainty are not quite arbitrary. Since
248     // select_blob_to_chop uses the worst certainty to choose, they all have
249     // to be different, so starting with INT8_MAX, subtract 1/8 for each blob
250     // in here, and then divide by e each time they are chopped, which
251     // should guarantee a set of unequal values for the whole tree of blobs
252     // produced, however much chopping is required. The chops are thus only
253     // limited by the ability of the chopper to find suitable chop points,
254     // and not by the value of the certainties.
255     auto *choice = new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
256     blob_choices.push_back(choice);
257     rating -= 0.125f;
258   }
259   const double e = exp(1.0); // The base of natural logs.
260   unsigned blob_number;
261   int right_chop_index = 0;
262   if (!assume_fixed_pitch_char_segment) {
263     // We only chop if the language is not fixed pitch like CJK.
264     SEAM *seam = nullptr;
265     while ((seam = chop_one_blob(boxes, blob_choices, word_res, &blob_number)) != nullptr) {
266       word_res->InsertSeam(blob_number, seam);
267       BLOB_CHOICE *left_choice = blob_choices[blob_number];
268       rating = left_choice->rating() / e;
269       left_choice->set_rating(rating);
270       left_choice->set_certainty(-rating);
271       // combine confidence w/ serial #
272       auto *right_choice = new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating, -1, 0.0f,
273                                            0.0f, 0.0f, BCC_FAKE);
274       blob_choices.insert(blob_choices.begin() + blob_number + 1, right_choice);
275     }
276   }
277   word_res->CloneChoppedToRebuild();
278   word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
279 }
280 
281 /// Helper to compute the dispute resolution metric.
282 /// Disputed blob resolution. The aim is to give the blob to the most
283 /// appropriate boxfile box. Most of the time it is obvious, but if
284 /// two boxfile boxes overlap significantly it is not. If a small boxfile
285 /// box takes most of the blob, and a large boxfile box does too, then
286 /// we want the small boxfile box to get it, but if the small box
287 /// is much smaller than the blob, we don't want it to get it.
288 /// Details of the disputed blob resolution:
289 /// Given a box with area A, and a blob with area B, with overlap area C,
290 /// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
291 /// miss metric gets the blob.
BoxMissMetric(const TBOX & box1,const TBOX & box2)292 static double BoxMissMetric(const TBOX &box1, const TBOX &box2) {
293   const int overlap_area = box1.intersection(box2).area();
294   const int a = box1.area();
295   const int b = box2.area();
296   ASSERT_HOST(a != 0 && b != 0);
297   return 1.0 * (a - overlap_area) * (b - overlap_area) / a / b;
298 }
299 
300 /// Gather consecutive blobs that match the given box into the best_state
301 /// and corresponding correct_text.
302 ///
303 /// Fights over which box owns which blobs are settled by pre-chopping and
304 /// applying the blobs to box or next_box with the least non-overlap.
305 /// @return false if the box was in error, which can only be caused by
306 /// failing to find an appropriate blob for a box.
307 ///
308 /// This means that occasionally, blobs may be incorrectly segmented if the
309 /// chopper fails to find a suitable chop point.
ResegmentCharBox(PAGE_RES * page_res,const TBOX * prev_box,const TBOX & box,const TBOX * next_box,const char * correct_text)310 bool Tesseract::ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box,
311                                  const TBOX *next_box, const char *correct_text) {
312   if (applybox_debug > 1) {
313     tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
314   }
315   PAGE_RES_IT page_res_it(page_res);
316   WERD_RES *word_res;
317   for (word_res = page_res_it.word(); word_res != nullptr; word_res = page_res_it.forward()) {
318     if (!word_res->box_word->bounding_box().major_overlap(box)) {
319       continue;
320     }
321     if (applybox_debug > 1) {
322       tprintf("Checking word box:");
323       word_res->box_word->bounding_box().print();
324     }
325     int word_len = word_res->box_word->length();
326     for (int i = 0; i < word_len; ++i) {
327       TBOX char_box = TBOX();
328       int blob_count = 0;
329       for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
330         TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
331         if (!blob_box.major_overlap(box)) {
332           break;
333         }
334         if (word_res->correct_text[i + blob_count].length() > 0) {
335           break; // Blob is claimed already.
336         }
337         if (next_box != nullptr) {
338           const double current_box_miss_metric = BoxMissMetric(blob_box, box);
339           const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
340           if (applybox_debug > 2) {
341             tprintf("Checking blob:");
342             blob_box.print();
343             tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
344                     next_box_miss_metric);
345           }
346           if (current_box_miss_metric > next_box_miss_metric) {
347             break; // Blob is a better match for next box.
348           }
349         }
350         char_box += blob_box;
351       }
352       if (blob_count > 0) {
353         if (applybox_debug > 1) {
354           tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
355         }
356         if (!char_box.almost_equal(box, 3) &&
357             ((next_box != nullptr && box.x_gap(*next_box) < -3) ||
358              (prev_box != nullptr && prev_box->x_gap(box) < -3))) {
359           return false;
360         }
361         // We refine just the box_word, best_state and correct_text here.
362         // The rebuild_word is made in TidyUp.
363         // blob_count blobs are put together to match the box. Merge the
364         // box_word boxes, save the blob_count in the state and the text.
365         word_res->box_word->MergeBoxes(i, i + blob_count);
366         word_res->best_state[i] = blob_count;
367         word_res->correct_text[i] = correct_text;
368         if (applybox_debug > 2) {
369           tprintf("%d Blobs match: blob box:", blob_count);
370           word_res->box_word->BlobBox(i).print();
371           tprintf("Matches box:");
372           box.print();
373           if (next_box != nullptr) {
374             tprintf("With next box:");
375             next_box->print();
376           }
377         }
378         // Eliminated best_state and correct_text entries for the consumed
379         // blobs.
380         for (int j = 1; j < blob_count; ++j) {
381           word_res->best_state.erase(word_res->best_state.begin() + i + 1);
382           word_res->correct_text.erase(word_res->correct_text.begin() + i + 1);
383         }
384         // Assume that no box spans multiple source words, so we are done with
385         // this box.
386         if (applybox_debug > 1) {
387           tprintf("Best state = ");
388           for (auto best_state : word_res->best_state) {
389             tprintf("%d ", best_state);
390           }
391           tprintf("\n");
392           tprintf("Correct text = [[ ");
393           for (auto &it : word_res->correct_text) {
394             tprintf("%s ", it.c_str());
395           }
396           tprintf("]]\n");
397         }
398         return true;
399       }
400     }
401   }
402   if (applybox_debug > 0) {
403     tprintf("FAIL!\n");
404   }
405   return false; // Failure.
406 }
407 
408 /// Consume all source blobs that strongly overlap the given box,
409 /// putting them into a new word, with the correct_text label.
410 /// Fights over which box owns which blobs are settled by
411 /// applying the blobs to box or next_box with the least non-overlap.
412 /// @return false if the box was in error, which can only be caused by
413 /// failing to find an overlapping blob for a box.
ResegmentWordBox(BLOCK_LIST * block_list,const TBOX & box,const TBOX * next_box,const char * correct_text)414 bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box,
415                                  const char *correct_text) {
416   if (applybox_debug > 1) {
417     tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
418   }
419   WERD *new_word = nullptr;
420   BLOCK_IT b_it(block_list);
421   for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
422     BLOCK *block = b_it.data();
423     if (!box.major_overlap(block->pdblk.bounding_box())) {
424       continue;
425     }
426     ROW_IT r_it(block->row_list());
427     for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
428       ROW *row = r_it.data();
429       if (!box.major_overlap(row->bounding_box())) {
430         continue;
431       }
432       WERD_IT w_it(row->word_list());
433       for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
434         WERD *word = w_it.data();
435         if (applybox_debug > 2) {
436           tprintf("Checking word:");
437           word->bounding_box().print();
438         }
439         if (word->text() != nullptr && word->text()[0] != '\0') {
440           continue; // Ignore words that are already done.
441         }
442         if (!box.major_overlap(word->bounding_box())) {
443           continue;
444         }
445         C_BLOB_IT blob_it(word->cblob_list());
446         for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
447           C_BLOB *blob = blob_it.data();
448           TBOX blob_box = blob->bounding_box();
449           if (!blob_box.major_overlap(box)) {
450             continue;
451           }
452           if (next_box != nullptr) {
453             const double current_box_miss_metric = BoxMissMetric(blob_box, box);
454             const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
455             if (applybox_debug > 2) {
456               tprintf("Checking blob:");
457               blob_box.print();
458               tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
459                       next_box_miss_metric);
460             }
461             if (current_box_miss_metric > next_box_miss_metric) {
462               continue; // Blob is a better match for next box.
463             }
464           }
465           if (applybox_debug > 2) {
466             tprintf("Blob match: blob:");
467             blob_box.print();
468             tprintf("Matches box:");
469             box.print();
470             if (next_box != nullptr) {
471               tprintf("With next box:");
472               next_box->print();
473             }
474           }
475           if (new_word == nullptr) {
476             // Make a new word with a single blob.
477             new_word = word->shallow_copy();
478             new_word->set_text(correct_text);
479             w_it.add_to_end(new_word);
480           }
481           C_BLOB_IT new_blob_it(new_word->cblob_list());
482           new_blob_it.add_to_end(blob_it.extract());
483         }
484       }
485     }
486   }
487   if (new_word == nullptr && applybox_debug > 0) {
488     tprintf("FAIL!\n");
489   }
490   return new_word != nullptr;
491 }
492 
493 /// Resegments the words by running the classifier in an attempt to find the
494 /// correct segmentation that produces the required string.
ReSegmentByClassification(PAGE_RES * page_res)495 void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {
496   PAGE_RES_IT pr_it(page_res);
497   WERD_RES *word_res;
498   for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
499     const WERD *word = word_res->word;
500     if (word->text() == nullptr || word->text()[0] == '\0') {
501       continue; // Ignore words that have no text.
502     }
503     // Convert the correct text to a vector of UNICHAR_ID
504     std::vector<UNICHAR_ID> target_text;
505     if (!ConvertStringToUnichars(word->text(), &target_text)) {
506       tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text());
507       pr_it.DeleteCurrentWord();
508       continue;
509     }
510     if (!FindSegmentation(target_text, word_res)) {
511       tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n", word->text());
512       pr_it.DeleteCurrentWord();
513       continue;
514     }
515   }
516 }
517 
518 /// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
519 /// @return false if an invalid UNICHAR_ID is encountered.
ConvertStringToUnichars(const char * utf8,std::vector<UNICHAR_ID> * class_ids)520 bool Tesseract::ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids) {
521   for (int step = 0; *utf8 != '\0'; utf8 += step) {
522     const char *next_space = strchr(utf8, ' ');
523     if (next_space == nullptr) {
524       next_space = utf8 + strlen(utf8);
525     }
526     step = next_space - utf8;
527     UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
528     if (class_id == INVALID_UNICHAR_ID) {
529       return false;
530     }
531     while (utf8[step] == ' ') {
532       ++step;
533     }
534     class_ids->push_back(class_id);
535   }
536   return true;
537 }
538 
539 /// Resegments the word to achieve the target_text from the classifier.
540 /// Returns false if the re-segmentation fails.
541 /// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
542 /// applies a full search on the classifier results to find the best classified
543 /// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
544 /// substitutions ARE used.
FindSegmentation(const std::vector<UNICHAR_ID> & target_text,WERD_RES * word_res)545 bool Tesseract::FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res) {
546   // Classify all required combinations of blobs and save results in choices.
547   const int word_length = word_res->box_word->length();
548   auto *choices = new std::vector<BLOB_CHOICE_LIST *>[word_length];
549   for (int i = 0; i < word_length; ++i) {
550     for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
551       BLOB_CHOICE_LIST *match_result =
552           classify_piece(word_res->seam_array, i, i + j - 1, "Applybox", word_res->chopped_word,
553                          word_res->blamer_bundle);
554       if (applybox_debug > 2) {
555         tprintf("%d+%d:", i, j);
556         print_ratings_list("Segment:", match_result, unicharset);
557       }
558       choices[i].push_back(match_result);
559     }
560   }
561   // Search the segmentation graph for the target text. Must be an exact
562   // match. Using wildcards makes it difficult to find the correct
563   // segmentation even when it is there.
564   word_res->best_state.clear();
565   std::vector<int> search_segmentation;
566   float best_rating = 0.0f;
567   SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating,
568                 &word_res->best_state);
569   for (int i = 0; i < word_length; ++i) {
570     for (auto choice : choices[i]) {
571       delete choice;
572     }
573   }
574   delete[] choices;
575   if (word_res->best_state.empty()) {
576     // Build the original segmentation and if it is the same length as the
577     // truth, assume it will do.
578     int blob_count = 1;
579     for (auto s : word_res->seam_array) {
580       SEAM *seam = s;
581       if (!seam->HasAnySplits()) {
582         word_res->best_state.push_back(blob_count);
583         blob_count = 1;
584       } else {
585         ++blob_count;
586       }
587     }
588     word_res->best_state.push_back(blob_count);
589     if (word_res->best_state.size() != target_text.size()) {
590       word_res->best_state.clear(); // No good. Original segmentation bad size.
591       return false;
592     }
593   }
594   word_res->correct_text.clear();
595   for (auto &text : target_text) {
596     word_res->correct_text.emplace_back(unicharset.id_to_unichar(text));
597   }
598   return true;
599 }
600 
601 /// Recursive helper to find a match to the target_text (from text_index
602 /// position) in the choices (from choices_pos position).
603 /// @param choices is an array of vectors of length choices_length,
604 /// with each element representing a starting position in the word, and the
605 /// #vector holding classification results for a sequence of consecutive
606 /// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
607 /// @param choices_pos
608 /// @param choices_length
609 /// @param target_text
610 /// @param text_index
611 /// @param rating
612 /// @param segmentation
613 /// @param best_rating
614 /// @param best_segmentation
SearchForText(const std::vector<BLOB_CHOICE_LIST * > * choices,int choices_pos,unsigned choices_length,const std::vector<UNICHAR_ID> & target_text,unsigned text_index,float rating,std::vector<int> * segmentation,float * best_rating,std::vector<int> * best_segmentation)615 void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
616                               unsigned choices_length, const std::vector<UNICHAR_ID> &target_text,
617                               unsigned text_index, float rating, std::vector<int> *segmentation,
618                               float *best_rating, std::vector<int> *best_segmentation) {
619   const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs();
620   for (unsigned length = 1; length <= choices[choices_pos].size(); ++length) {
621     // Rating of matching choice or worst choice if no match.
622     float choice_rating = 0.0f;
623     // Find the corresponding best BLOB_CHOICE.
624     BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
625     for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
626       const BLOB_CHOICE *choice = choice_it.data();
627       choice_rating = choice->rating();
628       auto class_id = choice->unichar_id();
629       if (class_id == target_text[text_index]) {
630         break;
631       }
632       // Search ambigs table.
633       if (static_cast<size_t>(class_id) < table.size() && table[class_id] != nullptr) {
634         AmbigSpec_IT spec_it(table[class_id]);
635         for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) {
636           const AmbigSpec *ambig_spec = spec_it.data();
637           // We'll only do 1-1.
638           if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
639               ambig_spec->correct_ngram_id == target_text[text_index]) {
640             break;
641           }
642         }
643         if (!spec_it.cycled_list()) {
644           break; // Found an ambig.
645         }
646       }
647     }
648     if (choice_it.cycled_list()) {
649       continue; // No match.
650     }
651     segmentation->push_back(length);
652     if (choices_pos + length == choices_length && text_index + 1 == target_text.size()) {
653       // This is a complete match. If the rating is good record a new best.
654       if (applybox_debug > 2) {
655         tprintf("Complete match, rating = %g, best=%g, seglength=%zu, best=%zu\n",
656                 rating + choice_rating, *best_rating, segmentation->size(),
657                 best_segmentation->size());
658       }
659       if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
660         *best_segmentation = *segmentation;
661         *best_rating = rating + choice_rating;
662       }
663     } else if (choices_pos + length < choices_length && text_index + 1 < target_text.size()) {
664       if (applybox_debug > 3) {
665         tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n", target_text[text_index],
666                 unicharset.id_to_unichar(target_text[text_index]),
667                 choice_it.data()->unichar_id() == target_text[text_index] ? "Match" : "Ambig",
668                 choices_pos, length);
669       }
670       SearchForText(choices, choices_pos + length, choices_length, target_text, text_index + 1,
671                     rating + choice_rating, segmentation, best_rating, best_segmentation);
672       if (applybox_debug > 3) {
673         tprintf("End recursion for %d=%s\n", target_text[text_index],
674                 unicharset.id_to_unichar(target_text[text_index]));
675       }
676     }
677     segmentation->resize(segmentation->size() - 1);
678   }
679 }
680 
681 /// - Counts up the labelled words and the blobs within.
682 /// - Deletes all unused or emptied words, counting the unused ones.
683 /// - Resets W_BOL and W_EOL flags correctly.
684 /// - Builds the rebuild_word and rebuilds the box_word and the best_choice.
TidyUp(PAGE_RES * page_res)685 void Tesseract::TidyUp(PAGE_RES *page_res) {
686   int ok_blob_count = 0;
687   int bad_blob_count = 0;
688   int ok_word_count = 0;
689   int unlabelled_words = 0;
690   PAGE_RES_IT pr_it(page_res);
691   WERD_RES *word_res;
692   for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
693     int ok_in_word = 0;
694     int blob_count = word_res->correct_text.size();
695     auto *word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
696     word_choice->set_permuter(TOP_CHOICE_PERM);
697     for (int c = 0; c < blob_count; ++c) {
698       if (word_res->correct_text[c].length() > 0) {
699         ++ok_in_word;
700       }
701       // Since we only need a fake word_res->best_choice, the actual
702       // unichar_ids do not matter. Which is fortunate, since TidyUp()
703       // can be called while training Tesseract, at the stage where
704       // unicharset is not meaningful yet.
705       word_choice->append_unichar_id_space_allocated(INVALID_UNICHAR_ID, word_res->best_state[c],
706                                                      1.0f, -1.0f);
707     }
708     if (ok_in_word > 0) {
709       ok_blob_count += ok_in_word;
710       bad_blob_count += word_res->correct_text.size() - ok_in_word;
711       word_res->LogNewRawChoice(word_choice);
712       word_res->LogNewCookedChoice(1, false, word_choice);
713     } else {
714       ++unlabelled_words;
715       if (applybox_debug > 0) {
716         tprintf("APPLY_BOXES: Unlabelled word at :");
717         word_res->word->bounding_box().print();
718       }
719       pr_it.DeleteCurrentWord();
720       delete word_choice;
721     }
722   }
723   pr_it.restart_page();
724   for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
725     // Denormalize back to a BoxWord.
726     word_res->RebuildBestState();
727     word_res->SetupBoxWord();
728     word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
729     word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
730   }
731   if (applybox_debug > 0) {
732     tprintf("   Found %d good blobs.\n", ok_blob_count);
733     if (bad_blob_count > 0) {
734       tprintf("   Leaving %d unlabelled blobs in %d words.\n", bad_blob_count, ok_word_count);
735     }
736     if (unlabelled_words > 0) {
737       tprintf("   %d remaining unlabelled words deleted.\n", unlabelled_words);
738     }
739   }
740 }
741 
742 /** Logs a bad box by line in the box file and box coords.*/
ReportFailedBox(int boxfile_lineno,TBOX box,const char * box_ch,const char * err_msg)743 void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch,
744                                 const char *err_msg) {
745   tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n", boxfile_lineno + 1, box_ch,
746           box.left(), box.bottom(), box.right(), box.top(), err_msg);
747 }
748 
749 /// Calls #LearnWord to extract features for labelled blobs within each word.
750 /// Features are stored in an internal buffer.
ApplyBoxTraining(const std::string & fontname,PAGE_RES * page_res)751 void Tesseract::ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res) {
752   PAGE_RES_IT pr_it(page_res);
753   int word_count = 0;
754   for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {
755     LearnWord(fontname.c_str(), word_res);
756     ++word_count;
757   }
758   tprintf("Generated training data for %d words\n", word_count);
759 }
760 
761 #endif // ndef DISABLED_LEGACY_ENGINE
762 
763 /** Creates a fake best_choice entry in each WERD_RES with the correct text.*/
CorrectClassifyWords(PAGE_RES * page_res)764 void Tesseract::CorrectClassifyWords(PAGE_RES *page_res) {
765   PAGE_RES_IT pr_it(page_res);
766   for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {
767     auto *choice = new WERD_CHOICE(word_res->uch_set, word_res->correct_text.size());
768     for (auto &correct_text : word_res->correct_text) {
769       // The part before the first space is the real ground truth, and the
770       // rest is the bounding box location and page number.
771       std::vector<std::string> tokens = split(correct_text, ' ');
772       UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].c_str());
773       choice->append_unichar_id_space_allocated(char_id, word_res->best_state[&correct_text - &word_res->correct_text[0]], 0.0f, 0.0f);
774     }
775     word_res->ClearWordChoices();
776     word_res->LogNewRawChoice(choice);
777     word_res->LogNewCookedChoice(1, false, choice);
778   }
779 }
780 
781 } // namespace tesseract
782