1 /**********************************************************************
2 * File: pageres.cpp (Formerly page_res.c)
3 * Description: Hierarchy of results classes from PAGE_RES to WERD_RES
4 * and an iterator class to iterate over the words.
5 * Main purposes:
6 * Easy way to iterate over the words without a 3-nested loop.
7 * Holds data used during word recognition.
8 * Holds information about alternative spacing paths.
9 * Author: Phil Cheatle
10 *
11 * (C) Copyright 1992, Hewlett-Packard Ltd.
12 ** Licensed under the Apache License, Version 2.0 (the "License");
13 ** you may not use this file except in compliance with the License.
14 ** You may obtain a copy of the License at
15 ** http://www.apache.org/licenses/LICENSE-2.0
16 ** Unless required by applicable law or agreed to in writing, software
17 ** distributed under the License is distributed on an "AS IS" BASIS,
18 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 ** See the License for the specific language governing permissions and
20 ** limitations under the License.
21 *
22 **********************************************************************/
23
#include "pageres.h"

#include "blamer.h"   // for BlamerBundle
#include "blobs.h"    // for TWERD, TBLOB
#include "boxword.h"  // for BoxWord
#include "errcode.h"  // for ASSERT_HOST
#include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)
#include "ocrrow.h"   // for ROW, ROW_IT
#include "pdblock.h"  // for PDBLK
#include "polyblk.h"  // for POLY_BLOCK
#include "seam.h"     // for SEAM, start_seam_list
#include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
#include "tprintf.h"  // for tprintf

#include <tesseract/publictypes.h> // for OcrEngineMode, OEM_LSTM_ONLY

#include <cassert> // for assert
#include <cstdint> // for INT32_MAX
#include <cstring> // for strlen
#include <utility> // for std::move
43
44 struct Pix;
45
46 namespace tesseract {
47
// Multiplier applied to the difference of two adjust factors when deriving
// the certainty threshold that decides whether a word is ambiguous.
static const double kStopperAmbiguityThresholdGain = 8.0;
// Constant subtracted from the scaled adjust-factor difference when deriving
// the certainty threshold that decides whether a word is ambiguous.
static const double kStopperAmbiguityThresholdOffset = 1.5;
// Upper limit on the number of broken pieces to associate.
const int kWordrecMaxNumJoinChunks = 4;
// Largest ratio of word box height to line size for which the word may still
// be processed as a line together with other words.
const double kMaxWordSizeRatio = 1.25;
// Largest ratio of line box height to line size for which a new word may
// still be added.
const double kMaxLineSizeRatio = 1.25;
// Largest ratio of word gap to line size for which a new word may still be
// added.
const double kMaxWordGapRatio = 2.0;

// Returns the certainty-difference threshold used to decide which words to
// keep. It is derived from the adjustment factors f1 and f2 of the two words
// as (f2 - f1) * gain - offset.
// TODO(rays) This is horrible. Replace with an enhance params training model.
static double StopperAmbigThreshold(double f1, double f2) {
  const double factor_delta = f2 - f1;
  return factor_delta * kStopperAmbiguityThresholdGain -
         kStopperAmbiguityThresholdOffset;
}
71
72 /*************************************************************************
73 * PAGE_RES::PAGE_RES
74 *
75 * Constructor for page results
76 *************************************************************************/
PAGE_RES(bool merge_similar_words,BLOCK_LIST * the_block_list,WERD_CHOICE ** prev_word_best_choice_ptr)77 PAGE_RES::PAGE_RES(bool merge_similar_words, BLOCK_LIST *the_block_list,
78 WERD_CHOICE **prev_word_best_choice_ptr) {
79 Init();
80 BLOCK_IT block_it(the_block_list);
81 BLOCK_RES_IT block_res_it(&block_res_list);
82 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
83 block_res_it.add_to_end(
84 new BLOCK_RES(merge_similar_words, block_it.data()));
85 }
86 prev_word_best_choice = prev_word_best_choice_ptr;
87 }
88
89 /*************************************************************************
90 * BLOCK_RES::BLOCK_RES
91 *
92 * Constructor for BLOCK results
93 *************************************************************************/
94
BLOCK_RES(bool merge_similar_words,BLOCK * the_block)95 BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) {
96 ROW_IT row_it(the_block->row_list());
97 ROW_RES_IT row_res_it(&row_res_list);
98
99 char_count = 0;
100 rej_count = 0;
101 font_class = -1; // not assigned
102 x_height = -1.0;
103 font_assigned = false;
104 row_count = 0;
105
106 block = the_block;
107
108 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
109 row_res_it.add_to_end(new ROW_RES(merge_similar_words, row_it.data()));
110 }
111 }
112
113 /*************************************************************************
114 * ROW_RES::ROW_RES
115 *
116 * Constructor for ROW results
117 *************************************************************************/
118
ROW_RES(bool merge_similar_words,ROW * the_row)119 ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) {
120 WERD_IT word_it(the_row->word_list());
121 WERD_RES_IT word_res_it(&word_res_list);
122 WERD_RES *combo = nullptr; // current combination of fuzzies
123 WERD *copy_word;
124
125 char_count = 0;
126 rej_count = 0;
127 whole_word_rej_count = 0;
128
129 row = the_row;
130 bool add_next_word = false;
131 TBOX union_box;
132 float line_height =
133 the_row->x_height() + the_row->ascenders() - the_row->descenders();
134 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
135 auto *word_res = new WERD_RES(word_it.data());
136 word_res->x_height = the_row->x_height();
137 if (add_next_word) {
138 ASSERT_HOST(combo != nullptr);
139 // We are adding this word to the combination.
140 word_res->part_of_combo = true;
141 combo->copy_on(word_res);
142 } else if (merge_similar_words) {
143 union_box = word_res->word->bounding_box();
144 add_next_word = !word_res->word->flag(W_REP_CHAR) &&
145 union_box.height() <= line_height * kMaxWordSizeRatio;
146 word_res->odd_size = !add_next_word;
147 }
148 WERD *next_word = word_it.data_relative(1);
149 if (merge_similar_words) {
150 if (add_next_word && !next_word->flag(W_REP_CHAR)) {
151 // Next word will be added on if all of the following are true:
152 // Not a rep char.
153 // Box height small enough.
154 // Union box height small enough.
155 // Horizontal gap small enough.
156 TBOX next_box = next_word->bounding_box();
157 int prev_right = union_box.right();
158 union_box += next_box;
159 if (next_box.height() > line_height * kMaxWordSizeRatio ||
160 union_box.height() > line_height * kMaxLineSizeRatio ||
161 next_box.left() > prev_right + line_height * kMaxWordGapRatio) {
162 add_next_word = false;
163 }
164 }
165 next_word->set_flag(W_FUZZY_NON, add_next_word);
166 } else {
167 add_next_word = next_word->flag(W_FUZZY_NON);
168 }
169 if (add_next_word) {
170 if (combo == nullptr) {
171 copy_word = new WERD;
172 *copy_word = *(word_it.data()); // deep copy
173 combo = new WERD_RES(copy_word);
174 combo->x_height = the_row->x_height();
175 combo->combination = true;
176 word_res_it.add_to_end(combo);
177 }
178 word_res->part_of_combo = true;
179 } else {
180 combo = nullptr;
181 }
182 word_res_it.add_to_end(word_res);
183 }
184 }
185
// Copy assignment. Clears *this and then copies from source:
//  - word is deep copied when source is a combination, otherwise the WERD
//    pointer is shared with source;
//  - bln_boxes, chopped_word, rebuild_word, box_word, the cooked choices,
//    raw_choice, ep_choice and blamer_bundle are copied into new objects;
//  - the ratings matrix and the seam_array are NOT copied.
// best_choice is set to the first element of the copied best_choices list,
// or nullptr if that list is empty.
// Assigning an object to itself is a no-op.
WERD_RES &WERD_RES::operator=(const WERD_RES &source) {
  // Clear() runs before anything is read from source, so on self-assignment
  // the data about to be copied would presumably already be gone. Bail out.
  if (this == &source) {
    return *this;
  }
  this->ELIST_LINK::operator=(source);
  Clear();
  if (source.combination) {
    word = new WERD;
    *word = *(source.word); // deep copy
  } else {
    word = source.word; // pt to same word
  }
  // NOTE(review): the pointer members below are only assigned when the source
  // has a value, which assumes Clear() leaves them null - confirm.
  if (source.bln_boxes != nullptr) {
    bln_boxes = new tesseract::BoxWord(*source.bln_boxes);
  }
  if (source.chopped_word != nullptr) {
    chopped_word = new TWERD(*source.chopped_word);
  }
  if (source.rebuild_word != nullptr) {
    rebuild_word = new TWERD(*source.rebuild_word);
  }
  // TODO(rays) Do we ever need to copy the seam_array?
  blob_row = source.blob_row;
  denorm = source.denorm;
  if (source.box_word != nullptr) {
    box_word = new tesseract::BoxWord(*source.box_word);
  }
  best_state = source.best_state;
  correct_text = source.correct_text;
  blob_widths = source.blob_widths;
  blob_gaps = source.blob_gaps;
  // None of the uses of operator= require the ratings matrix to be copied,
  // so don't as it would be really slow.

  // Copy the cooked choices, preserving their order.
  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST *>(&source.best_choices));
  WERD_CHOICE_IT wc_dest_it(&best_choices);
  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
    const WERD_CHOICE *choice = wc_it.data();
    wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice));
  }
  // best_choice points at the head of the copied list.
  if (!wc_dest_it.empty()) {
    wc_dest_it.move_to_first();
    best_choice = wc_dest_it.data();
  } else {
    best_choice = nullptr;
  }

  if (source.raw_choice != nullptr) {
    raw_choice = new WERD_CHOICE(*source.raw_choice);
  } else {
    raw_choice = nullptr;
  }
  if (source.ep_choice != nullptr) {
    ep_choice = new WERD_CHOICE(*source.ep_choice);
  } else {
    ep_choice = nullptr;
  }
  reject_map = source.reject_map;
  combination = source.combination;
  part_of_combo = source.part_of_combo;
  CopySimpleFields(source);
  if (source.blamer_bundle != nullptr) {
    blamer_bundle = new BlamerBundle(*(source.blamer_bundle));
  }
  return *this;
}
250
251 // Copies basic fields that don't involve pointers that might be useful
252 // to copy when making one WERD_RES from another.
CopySimpleFields(const WERD_RES & source)253 void WERD_RES::CopySimpleFields(const WERD_RES &source) {
254 tess_failed = source.tess_failed;
255 tess_accepted = source.tess_accepted;
256 tess_would_adapt = source.tess_would_adapt;
257 done = source.done;
258 unlv_crunch_mode = source.unlv_crunch_mode;
259 small_caps = source.small_caps;
260 odd_size = source.odd_size;
261 fontinfo = source.fontinfo;
262 fontinfo2 = source.fontinfo2;
263 fontinfo_id_count = source.fontinfo_id_count;
264 fontinfo_id2_count = source.fontinfo_id2_count;
265 x_height = source.x_height;
266 caps_height = source.caps_height;
267 baseline_shift = source.baseline_shift;
268 guessed_x_ht = source.guessed_x_ht;
269 guessed_caps_ht = source.guessed_caps_ht;
270 reject_spaces = source.reject_spaces;
271 uch_set = source.uch_set;
272 tesseract = source.tesseract;
273 }
274
275 // Initializes a blank (default constructed) WERD_RES from one that has
276 // already been recognized.
277 // Use SetupFor*Recognition afterwards to complete the setup and make
278 // it ready for a retry recognition.
InitForRetryRecognition(const WERD_RES & source)279 void WERD_RES::InitForRetryRecognition(const WERD_RES &source) {
280 word = source.word;
281 CopySimpleFields(source);
282 if (source.blamer_bundle != nullptr) {
283 blamer_bundle = new BlamerBundle();
284 blamer_bundle->CopyTruth(*source.blamer_bundle);
285 }
286 }
287
288 // Sets up the members used in recognition: bln_boxes, chopped_word,
289 // seam_array, denorm. Returns false if
290 // the word is empty and sets up fake results. If use_body_size is
291 // true and row->body_size is set, then body_size will be used for
292 // blob normalization instead of xheight + ascrise. This flag is for
293 // those languages that are using CJK pitch model and thus it has to
294 // be true if and only if tesseract->textord_use_cjk_fp_model is
295 // true.
296 // If allow_detailed_fx is true, the feature extractor will receive fine
297 // precision outline information, allowing smoother features and better
298 // features on low resolution images.
299 // The norm_mode_hint sets the default mode for normalization in absence
300 // of any of the above flags.
301 // norm_box is used to override the word bounding box to determine the
302 // normalization scale and offset.
303 // Returns false if the word is empty and sets up fake results.
SetupForRecognition(const UNICHARSET & unicharset_in,tesseract::Tesseract * tess,Image pix,int norm_mode,const TBOX * norm_box,bool numeric_mode,bool use_body_size,bool allow_detailed_fx,ROW * row,const BLOCK * block)304 bool WERD_RES::SetupForRecognition(const UNICHARSET &unicharset_in,
305 tesseract::Tesseract *tess, Image pix,
306 int norm_mode, const TBOX *norm_box,
307 bool numeric_mode, bool use_body_size,
308 bool allow_detailed_fx, ROW *row,
309 const BLOCK *block) {
310 auto norm_mode_hint = static_cast<tesseract::OcrEngineMode>(norm_mode);
311 tesseract = tess;
312 POLY_BLOCK *pb = block != nullptr ? block->pdblk.poly_block() : nullptr;
313 if ((norm_mode_hint != tesseract::OEM_LSTM_ONLY &&
314 word->cblob_list()->empty()) ||
315 (pb != nullptr && !pb->IsText())) {
316 // Empty words occur when all the blobs have been moved to the rej_blobs
317 // list, which seems to occur frequently in junk.
318 SetupFake(unicharset_in);
319 word->set_flag(W_REP_CHAR, false);
320 return false;
321 }
322 ClearResults();
323 SetupWordScript(unicharset_in);
324 chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word);
325 float word_xheight =
326 use_body_size && row != nullptr && row->body_size() > 0.0f
327 ? row->body_size()
328 : x_height;
329 chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE),
330 word_xheight, baseline_shift, numeric_mode,
331 norm_mode_hint, norm_box, &denorm);
332 blob_row = row;
333 SetupBasicsFromChoppedWord(unicharset_in);
334 SetupBlamerBundle();
335 int num_blobs = chopped_word->NumBlobs();
336 ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks);
337 tess_failed = false;
338 return true;
339 }
340
341 // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
342 // accumulators from a made chopped word. We presume the fields are already
343 // empty.
SetupBasicsFromChoppedWord(const UNICHARSET & unicharset_in)344 void WERD_RES::SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in) {
345 bln_boxes = tesseract::BoxWord::CopyFromNormalized(chopped_word);
346 start_seam_list(chopped_word, &seam_array);
347 SetupBlobWidthsAndGaps();
348 ClearWordChoices();
349 }
350
351 // Sets up the members used in recognition for an empty recognition result:
352 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
SetupFake(const UNICHARSET & unicharset_in)353 void WERD_RES::SetupFake(const UNICHARSET &unicharset_in) {
354 ClearResults();
355 SetupWordScript(unicharset_in);
356 chopped_word = new TWERD;
357 rebuild_word = new TWERD;
358 bln_boxes = new tesseract::BoxWord;
359 box_word = new tesseract::BoxWord;
360 int blob_count = word->cblob_list()->length();
361 if (blob_count > 0) {
362 auto **fake_choices = new BLOB_CHOICE *[blob_count];
363 // For non-text blocks, just pass any blobs through to the box_word
364 // and call the word failed with a fake classification.
365 C_BLOB_IT b_it(word->cblob_list());
366 int blob_id = 0;
367 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
368 TBOX box = b_it.data()->bounding_box();
369 box_word->InsertBox(box_word->length(), box);
370 fake_choices[blob_id++] = new BLOB_CHOICE;
371 }
372 FakeClassifyWord(blob_count, fake_choices);
373 delete[] fake_choices;
374 } else {
375 auto *word = new WERD_CHOICE(&unicharset_in);
376 word->make_bad();
377 LogNewRawChoice(word);
378 // Ownership of word is taken by *this WERD_RES in LogNewCookedChoice.
379 LogNewCookedChoice(1, false, word);
380 }
381 tess_failed = true;
382 done = true;
383 }
384
SetupWordScript(const UNICHARSET & uch)385 void WERD_RES::SetupWordScript(const UNICHARSET &uch) {
386 uch_set = &uch;
387 int script = uch.default_sid();
388 word->set_script_id(script);
389 word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight());
390 word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid());
391 }
392
393 // Sets up the blamer_bundle if it is not null, using the initialized denorm.
SetupBlamerBundle()394 void WERD_RES::SetupBlamerBundle() {
395 if (blamer_bundle != nullptr) {
396 blamer_bundle->SetupNormTruthWord(denorm);
397 }
398 }
399
400 // Computes the blob_widths and blob_gaps from the chopped_word.
SetupBlobWidthsAndGaps()401 void WERD_RES::SetupBlobWidthsAndGaps() {
402 blob_widths.clear();
403 blob_gaps.clear();
404 int num_blobs = chopped_word->NumBlobs();
405 for (int b = 0; b < num_blobs; ++b) {
406 TBLOB *blob = chopped_word->blobs[b];
407 TBOX box = blob->bounding_box();
408 blob_widths.push_back(box.width());
409 if (b + 1 < num_blobs) {
410 blob_gaps.push_back(chopped_word->blobs[b + 1]->bounding_box().left() -
411 box.right());
412 }
413 }
414 }
415
416 // Updates internal data to account for a new SEAM (chop) at the given
417 // blob_number. Fixes the ratings matrix and states in the choices, as well
418 // as the blob widths and gaps.
InsertSeam(int blob_number,SEAM * seam)419 void WERD_RES::InsertSeam(int blob_number, SEAM *seam) {
420 // Insert the seam into the SEAMS array.
421 seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true);
422 seam_array.insert(seam_array.begin() + blob_number, seam);
423 if (ratings != nullptr) {
424 // Expand the ratings matrix.
425 ratings = ratings->ConsumeAndMakeBigger(blob_number);
426 // Fix all the segmentation states.
427 if (raw_choice != nullptr) {
428 raw_choice->UpdateStateForSplit(blob_number);
429 }
430 WERD_CHOICE_IT wc_it(&best_choices);
431 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
432 WERD_CHOICE *choice = wc_it.data();
433 choice->UpdateStateForSplit(blob_number);
434 }
435 SetupBlobWidthsAndGaps();
436 }
437 }
438
439 // Returns true if all the word choices except the first have adjust_factors
440 // worse than the given threshold.
AlternativeChoiceAdjustmentsWorseThan(float threshold) const441 bool WERD_RES::AlternativeChoiceAdjustmentsWorseThan(float threshold) const {
442 // The choices are not changed by this iteration.
443 WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST *>(&best_choices));
444 for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) {
445 WERD_CHOICE *choice = wc_it.data();
446 if (choice->adjust_factor() <= threshold) {
447 return false;
448 }
449 }
450 return true;
451 }
452
453 // Returns true if the current word is ambiguous (by number of answers or
454 // by dangerous ambigs.)
IsAmbiguous()455 bool WERD_RES::IsAmbiguous() {
456 return !best_choices.singleton() || best_choice->dangerous_ambig_found();
457 }
458
459 // Returns true if the ratings matrix size matches the sum of each of the
460 // segmentation states.
StatesAllValid()461 bool WERD_RES::StatesAllValid() {
462 unsigned ratings_dim = ratings->dimension();
463 if (raw_choice->TotalOfStates() != ratings_dim) {
464 tprintf("raw_choice has total of states = %u vs ratings dim of %u\n",
465 raw_choice->TotalOfStates(), ratings_dim);
466 return false;
467 }
468 WERD_CHOICE_IT it(&best_choices);
469 unsigned index = 0;
470 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
471 WERD_CHOICE *choice = it.data();
472 if (choice->TotalOfStates() != ratings_dim) {
473 tprintf("Cooked #%u has total of states = %u vs ratings dim of %u\n",
474 index, choice->TotalOfStates(), ratings_dim);
475 return false;
476 }
477 }
478 return true;
479 }
480
481 // Prints a list of words found if debug is true or the word result matches
482 // the word_to_debug.
DebugWordChoices(bool debug,const char * word_to_debug)483 void WERD_RES::DebugWordChoices(bool debug, const char *word_to_debug) {
484 if (debug || (word_to_debug != nullptr && *word_to_debug != '\0' &&
485 best_choice != nullptr &&
486 best_choice->unichar_string() == std::string(word_to_debug))) {
487 if (raw_choice != nullptr) {
488 raw_choice->print("\nBest Raw Choice");
489 }
490
491 WERD_CHOICE_IT it(&best_choices);
492 int index = 0;
493 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
494 WERD_CHOICE *choice = it.data();
495 std::string label;
496 label += "\nCooked Choice #" + std::to_string(index);
497 choice->print(label.c_str());
498 }
499 }
500 }
501
502 // Prints the top choice along with the accepted/done flags.
DebugTopChoice(const char * msg) const503 void WERD_RES::DebugTopChoice(const char *msg) const {
504 tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ", tess_accepted,
505 tess_would_adapt, done);
506 if (best_choice == nullptr) {
507 tprintf("<Null choice>\n");
508 } else {
509 best_choice->print(msg);
510 }
511 }
512
513 // Removes from best_choices all choices which are not within a reasonable
514 // range of the best choice.
515 // TODO(rays) incorporate the information used here into the params training
516 // re-ranker, in place of this heuristic that is based on the previous
517 // adjustment factor.
FilterWordChoices(int debug_level)518 void WERD_RES::FilterWordChoices(int debug_level) {
519 if (best_choice == nullptr || best_choices.singleton()) {
520 return;
521 }
522
523 if (debug_level >= 2) {
524 best_choice->print("\nFiltering against best choice");
525 }
526 WERD_CHOICE_IT it(&best_choices);
527 int index = 0;
528 for (it.forward(); !it.at_first(); it.forward(), ++index) {
529 WERD_CHOICE *choice = it.data();
530 float threshold = StopperAmbigThreshold(best_choice->adjust_factor(),
531 choice->adjust_factor());
532 // i, j index the blob choice in choice, best_choice.
533 // chunk is an index into the chopped_word blobs (AKA chunks).
534 // Since the two words may use different segmentations of the chunks, we
535 // iterate over the chunks to find out whether a comparable blob
536 // classification is much worse than the best result.
537 unsigned i = 0, j = 0, chunk = 0;
538 // Each iteration of the while deals with 1 chunk. On entry choice_chunk
539 // and best_chunk are the indices of the first chunk in the NEXT blob,
540 // i.e. we don't have to increment i, j while chunk < choice_chunk and
541 // best_chunk respectively.
542 auto choice_chunk = choice->state(0), best_chunk = best_choice->state(0);
543 while (i < choice->length() && j < best_choice->length()) {
544 if (choice->unichar_id(i) != best_choice->unichar_id(j) &&
545 choice->certainty(i) - best_choice->certainty(j) < threshold) {
546 if (debug_level >= 2) {
547 choice->print("WorstCertaintyDiffWorseThan");
548 tprintf(
549 "i %u j %u Choice->Blob[i].Certainty %.4g"
550 " WorstOtherChoiceCertainty %g Threshold %g\n",
551 i, j, choice->certainty(i), best_choice->certainty(j), threshold);
552 tprintf("Discarding bad choice #%d\n", index);
553 }
554 delete it.extract();
555 break;
556 }
557 ++chunk;
558 // If needed, advance choice_chunk to keep up with chunk.
559 while (choice_chunk < chunk && ++i < choice->length()) {
560 choice_chunk += choice->state(i);
561 }
562 // If needed, advance best_chunk to keep up with chunk.
563 while (best_chunk < chunk && ++j < best_choice->length()) {
564 best_chunk += best_choice->state(j);
565 }
566 }
567 }
568 }
569
ComputeAdaptionThresholds(float certainty_scale,float min_rating,float max_rating,float rating_margin,float * thresholds)570 void WERD_RES::ComputeAdaptionThresholds(float certainty_scale,
571 float min_rating, float max_rating,
572 float rating_margin,
573 float *thresholds) {
574 int chunk = 0;
575 int end_chunk = best_choice->state(0);
576 int end_raw_chunk = raw_choice->state(0);
577 int raw_blob = 0;
578 for (unsigned i = 0; i < best_choice->length(); i++, thresholds++) {
579 float avg_rating = 0.0f;
580 int num_error_chunks = 0;
581
582 // For each chunk in best choice blob i, count non-matching raw results.
583 while (chunk < end_chunk) {
584 if (chunk >= end_raw_chunk) {
585 ++raw_blob;
586 end_raw_chunk += raw_choice->state(raw_blob);
587 }
588 if (best_choice->unichar_id(i) != raw_choice->unichar_id(raw_blob)) {
589 avg_rating += raw_choice->certainty(raw_blob);
590 ++num_error_chunks;
591 }
592 ++chunk;
593 }
594
595 if (num_error_chunks > 0) {
596 avg_rating /= num_error_chunks;
597 *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);
598 } else {
599 *thresholds = max_rating;
600 }
601
602 if (*thresholds > max_rating) {
603 *thresholds = max_rating;
604 }
605 if (*thresholds < min_rating) {
606 *thresholds = min_rating;
607 }
608 }
609 }
610
611 // Saves a copy of the word_choice if it has the best unadjusted rating.
612 // Returns true if the word_choice was the new best.
LogNewRawChoice(WERD_CHOICE * word_choice)613 bool WERD_RES::LogNewRawChoice(WERD_CHOICE *word_choice) {
614 if (raw_choice == nullptr || word_choice->rating() < raw_choice->rating()) {
615 delete raw_choice;
616 raw_choice = new WERD_CHOICE(*word_choice);
617 raw_choice->set_permuter(TOP_CHOICE_PERM);
618 return true;
619 }
620 return false;
621 }
622
623 // Consumes word_choice by adding it to best_choices, (taking ownership) if
624 // the certainty for word_choice is some distance of the best choice in
625 // best_choices, or by deleting the word_choice and returning false.
626 // The best_choices list is kept in sorted order by rating. Duplicates are
627 // removed, and the list is kept no longer than max_num_choices in length.
628 // Returns true if the word_choice is still a valid pointer.
LogNewCookedChoice(int max_num_choices,bool debug,WERD_CHOICE * word_choice)629 bool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug,
630 WERD_CHOICE *word_choice) {
631 if (best_choice != nullptr) {
632 // Throw out obviously bad choices to save some work.
633 // TODO(rays) Get rid of this! This piece of code produces different
634 // results according to the order in which words are found, which is an
635 // undesirable behavior. It would be better to keep all the choices and
636 // prune them later when more information is available.
637 float max_certainty_delta = StopperAmbigThreshold(
638 best_choice->adjust_factor(), word_choice->adjust_factor());
639 if (max_certainty_delta > -kStopperAmbiguityThresholdOffset) {
640 max_certainty_delta = -kStopperAmbiguityThresholdOffset;
641 }
642 if (word_choice->certainty() - best_choice->certainty() <
643 max_certainty_delta) {
644 if (debug) {
645 std::string bad_string;
646 word_choice->string_and_lengths(&bad_string, nullptr);
647 tprintf(
648 "Discarding choice \"%s\" with an overly low certainty"
649 " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n",
650 bad_string.c_str(), word_choice->certainty(),
651 best_choice->certainty(),
652 max_certainty_delta + best_choice->certainty());
653 }
654 delete word_choice;
655 return false;
656 }
657 }
658
659 // Insert in the list in order of increasing rating, but knock out worse
660 // string duplicates.
661 WERD_CHOICE_IT it(&best_choices);
662 const std::string &new_str = word_choice->unichar_string();
663 bool inserted = false;
664 int num_choices = 0;
665 if (!it.empty()) {
666 do {
667 WERD_CHOICE *choice = it.data();
668 if (choice->rating() > word_choice->rating() && !inserted) {
669 // Time to insert.
670 it.add_before_stay_put(word_choice);
671 inserted = true;
672 if (num_choices == 0) {
673 best_choice = word_choice; // This is the new best.
674 }
675 ++num_choices;
676 }
677 if (choice->unichar_string() == new_str) {
678 if (inserted) {
679 // New is better.
680 delete it.extract();
681 } else {
682 // Old is better.
683 if (debug) {
684 tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n",
685 new_str.c_str(), word_choice->rating(), choice->rating());
686 }
687 delete word_choice;
688 return false;
689 }
690 } else {
691 ++num_choices;
692 if (num_choices > max_num_choices) {
693 delete it.extract();
694 }
695 }
696 it.forward();
697 } while (!it.at_first());
698 }
699 if (!inserted && num_choices < max_num_choices) {
700 it.add_to_end(word_choice);
701 inserted = true;
702 if (num_choices == 0) {
703 best_choice = word_choice; // This is the new best.
704 }
705 }
706 if (debug) {
707 if (inserted) {
708 tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary");
709 } else {
710 tprintf("Poor");
711 }
712 word_choice->print(" Word Choice");
713 }
714 if (!inserted) {
715 delete word_choice;
716 return false;
717 }
718 return true;
719 }
720
// Simple helper that transfers ownership of the object pointed to by *src to
// *dest: whatever *dest held before is deleted first, and *src is set to
// nullptr afterwards.
template <class T>
static void MovePointerData(T **dest, T **src) {
  T *&target = *dest;
  T *&origin = *src;
  delete target;
  target = origin;
  origin = nullptr;
}
729
730 // Prints a brief list of all the best choices.
PrintBestChoices() const731 void WERD_RES::PrintBestChoices() const {
732 std::string alternates_str;
733 WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST *>(&best_choices));
734 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
735 if (!it.at_first()) {
736 alternates_str += "\", \"";
737 }
738 alternates_str += it.data()->unichar_string();
739 }
740 tprintf("Alternates for \"%s\": {\"%s\"}\n",
741 best_choice->unichar_string().c_str(), alternates_str.c_str());
742 }
743
744 // Returns the sum of the widths of the blob between start_blob and last_blob
745 // inclusive.
GetBlobsWidth(int start_blob,int last_blob) const746 int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) const {
747 int result = 0;
748 for (int b = start_blob; b <= last_blob; ++b) {
749 result += blob_widths[b];
750 if (b < last_blob) {
751 result += blob_gaps[b];
752 }
753 }
754 return result;
755 }
756 // Returns the width of a gap between the specified blob and the next one.
GetBlobsGap(unsigned blob_index) const757 int WERD_RES::GetBlobsGap(unsigned blob_index) const {
758 if (blob_index >= blob_gaps.size()) {
759 return 0;
760 }
761 return blob_gaps[blob_index];
762 }
763
764 // Returns the BLOB_CHOICE corresponding to the given index in the
765 // best choice word taken from the appropriate cell in the ratings MATRIX.
766 // Borrowed pointer, so do not delete. May return nullptr if there is no
767 // BLOB_CHOICE matching the unichar_id at the given index.
GetBlobChoice(unsigned index) const768 BLOB_CHOICE *WERD_RES::GetBlobChoice(unsigned index) const {
769 if (index >= best_choice->length()) {
770 return nullptr;
771 }
772 BLOB_CHOICE_LIST *choices = GetBlobChoices(index);
773 return FindMatchingChoice(best_choice->unichar_id(index), choices);
774 }
775
776 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the
777 // best choice word taken from the appropriate cell in the ratings MATRIX.
778 // Borrowed pointer, so do not delete.
GetBlobChoices(int index) const779 BLOB_CHOICE_LIST *WERD_RES::GetBlobChoices(int index) const {
780 return best_choice->blob_choices(index, ratings);
781 }
782
783 // Moves the results fields from word to this. This takes ownership of all
784 // the data, so src can be destructed.
ConsumeWordResults(WERD_RES * word)785 void WERD_RES::ConsumeWordResults(WERD_RES *word) {
786 denorm = word->denorm;
787 blob_row = word->blob_row;
788 MovePointerData(&chopped_word, &word->chopped_word);
789 MovePointerData(&rebuild_word, &word->rebuild_word);
790 MovePointerData(&box_word, &word->box_word);
791 for (auto data : seam_array) {
792 delete data;
793 }
794 seam_array = word->seam_array;
795 word->seam_array.clear();
796 // TODO: optimize moves.
797 best_state = word->best_state;
798 word->best_state.clear();
799 correct_text = word->correct_text;
800 word->correct_text.clear();
801 blob_widths = word->blob_widths;
802 word->blob_widths.clear();
803 blob_gaps = word->blob_gaps;
804 word->blob_gaps.clear();
805 if (ratings != nullptr) {
806 ratings->delete_matrix_pointers();
807 }
808 MovePointerData(&ratings, &word->ratings);
809 best_choice = word->best_choice;
810 MovePointerData(&raw_choice, &word->raw_choice);
811 best_choices.clear();
812 WERD_CHOICE_IT wc_it(&best_choices);
813 wc_it.add_list_after(&word->best_choices);
814 reject_map = word->reject_map;
815 if (word->blamer_bundle != nullptr) {
816 assert(blamer_bundle != nullptr);
817 blamer_bundle->CopyResults(*(word->blamer_bundle));
818 }
819 CopySimpleFields(*word);
820 }
821
822 // Replace the best choice and rebuild box word.
823 // choice must be from the current best_choices list.
ReplaceBestChoice(WERD_CHOICE * choice)824 void WERD_RES::ReplaceBestChoice(WERD_CHOICE *choice) {
825 best_choice = choice;
826 RebuildBestState();
827 SetupBoxWord();
828 // Make up a fake reject map of the right length to keep the
829 // rejection pass happy.
830 reject_map.initialise(best_state.size());
831 done = tess_accepted = tess_would_adapt = true;
832 SetScriptPositions();
833 }
834
835 // Builds the rebuild_word and sets the best_state from the chopped_word and
836 // the best_choice->state.
RebuildBestState()837 void WERD_RES::RebuildBestState() {
838 ASSERT_HOST(best_choice != nullptr);
839 delete rebuild_word;
840 rebuild_word = new TWERD;
841 if (seam_array.empty()) {
842 start_seam_list(chopped_word, &seam_array);
843 }
844 best_state.clear();
845 int start = 0;
846 for (unsigned i = 0; i < best_choice->length(); ++i) {
847 int length = best_choice->state(i);
848 best_state.push_back(length);
849 if (length > 1) {
850 SEAM::JoinPieces(seam_array, chopped_word->blobs, start,
851 start + length - 1);
852 }
853 TBLOB *blob = chopped_word->blobs[start];
854 rebuild_word->blobs.push_back(new TBLOB(*blob));
855 if (length > 1) {
856 SEAM::BreakPieces(seam_array, chopped_word->blobs, start,
857 start + length - 1);
858 }
859 start += length;
860 }
861 }
862
863 // Copies the chopped_word to the rebuild_word, faking a best_state as well.
864 // Also sets up the output box_word.
CloneChoppedToRebuild()865 void WERD_RES::CloneChoppedToRebuild() {
866 delete rebuild_word;
867 rebuild_word = new TWERD(*chopped_word);
868 SetupBoxWord();
869 auto word_len = box_word->length();
870 best_state.reserve(word_len);
871 correct_text.reserve(word_len);
872 for (unsigned i = 0; i < word_len; ++i) {
873 best_state.push_back(1);
874 correct_text.emplace_back("");
875 }
876 }
877
878 // Sets/replaces the box_word with one made from the rebuild_word.
SetupBoxWord()879 void WERD_RES::SetupBoxWord() {
880 delete box_word;
881 rebuild_word->ComputeBoundingBoxes();
882 box_word = tesseract::BoxWord::CopyFromNormalized(rebuild_word);
883 box_word->ClipToOriginalWord(denorm.block(), word);
884 }
885
886 // Sets up the script positions in the output best_choice using the best_choice
887 // to get the unichars, and the unicharset to get the target positions.
SetScriptPositions()888 void WERD_RES::SetScriptPositions() {
889 best_choice->SetScriptPositions(small_caps, chopped_word);
890 }
891 // Sets all the blobs in all the words (raw choice and best choices) to be
892 // the given position. (When a sub/superscript is recognized as a separate
893 // word, it falls victim to the rule that a whole word cannot be sub or
894 // superscript, so this function overrides that problem.)
SetAllScriptPositions(tesseract::ScriptPos position)895 void WERD_RES::SetAllScriptPositions(tesseract::ScriptPos position) {
896 raw_choice->SetAllScriptPositions(position);
897 WERD_CHOICE_IT wc_it(&best_choices);
898 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
899 wc_it.data()->SetAllScriptPositions(position);
900 }
901 }
902
903 // Classifies the word with some already-calculated BLOB_CHOICEs.
904 // The choices are an array of blob_count pointers to BLOB_CHOICE,
905 // providing a single classifier result for each blob.
906 // The BLOB_CHOICEs are consumed and the word takes ownership.
907 // The number of blobs in the box_word must match blob_count.
FakeClassifyWord(unsigned blob_count,BLOB_CHOICE ** choices)908 void WERD_RES::FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices) {
909 // Setup the WERD_RES.
910 ASSERT_HOST(box_word != nullptr);
911 ASSERT_HOST(blob_count == box_word->length());
912 ClearWordChoices();
913 ClearRatings();
914 ratings = new MATRIX(blob_count, 1);
915 for (unsigned c = 0; c < blob_count; ++c) {
916 auto *choice_list = new BLOB_CHOICE_LIST;
917 BLOB_CHOICE_IT choice_it(choice_list);
918 choice_it.add_after_then_move(choices[c]);
919 ratings->put(c, c, choice_list);
920 }
921 FakeWordFromRatings(TOP_CHOICE_PERM);
922 reject_map.initialise(blob_count);
923 best_state.clear();
924 best_state.resize(blob_count, 1);
925 done = true;
926 }
927
928 // Creates a WERD_CHOICE for the word using the top choices from the leading
929 // diagonal of the ratings matrix.
FakeWordFromRatings(PermuterType permuter)930 void WERD_RES::FakeWordFromRatings(PermuterType permuter) {
931 int num_blobs = ratings->dimension();
932 auto *word_choice = new WERD_CHOICE(uch_set, num_blobs);
933 word_choice->set_permuter(permuter);
934 for (int b = 0; b < num_blobs; ++b) {
935 UNICHAR_ID unichar_id = UNICHAR_SPACE;
936 // Initialize rating and certainty like in WERD_CHOICE::make_bad().
937 float rating = WERD_CHOICE::kBadRating;
938 float certainty = -FLT_MAX;
939 BLOB_CHOICE_LIST *choices = ratings->get(b, b);
940 if (choices != nullptr && !choices->empty()) {
941 BLOB_CHOICE_IT bc_it(choices);
942 BLOB_CHOICE *choice = bc_it.data();
943 unichar_id = choice->unichar_id();
944 rating = choice->rating();
945 certainty = choice->certainty();
946 }
947 word_choice->append_unichar_id_space_allocated(unichar_id, 1, rating,
948 certainty);
949 }
950 LogNewRawChoice(word_choice);
951 // Ownership of word_choice taken by word here.
952 LogNewCookedChoice(1, false, word_choice);
953 }
954
955 // Copies the best_choice strings to the correct_text for adaption/training.
BestChoiceToCorrectText()956 void WERD_RES::BestChoiceToCorrectText() {
957 correct_text.clear();
958 ASSERT_HOST(best_choice != nullptr);
959 for (unsigned i = 0; i < best_choice->length(); ++i) {
960 UNICHAR_ID choice_id = best_choice->unichar_id(i);
961 const char *blob_choice = uch_set->id_to_unichar(choice_id);
962 correct_text.emplace_back(blob_choice);
963 }
964 }
965
966 // Merges 2 adjacent blobs in the result if the permanent callback
967 // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
968 // callback box_cb is nullptr or returns true, setting the merged blob
969 // result to the class returned from class_cb.
970 // Returns true if anything was merged.
ConditionalBlobMerge(const std::function<UNICHAR_ID (UNICHAR_ID,UNICHAR_ID)> & class_cb,const std::function<bool (const TBOX &,const TBOX &)> & box_cb)971 bool WERD_RES::ConditionalBlobMerge(
972 const std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &class_cb,
973 const std::function<bool(const TBOX &, const TBOX &)> &box_cb) {
974 ASSERT_HOST(best_choice->empty() || ratings != nullptr);
975 bool modified = false;
976 for (unsigned i = 0; i + 1 < best_choice->length(); ++i) {
977 UNICHAR_ID new_id =
978 class_cb(best_choice->unichar_id(i), best_choice->unichar_id(i + 1));
979 if (new_id != INVALID_UNICHAR_ID &&
980 (box_cb == nullptr ||
981 box_cb(box_word->BlobBox(i), box_word->BlobBox(i + 1)))) {
982 // Raw choice should not be fixed.
983 best_choice->set_unichar_id(new_id, i);
984 modified = true;
985 MergeAdjacentBlobs(i);
986 const MATRIX_COORD &coord = best_choice->MatrixCoord(i);
987 if (!coord.Valid(*ratings)) {
988 ratings->IncreaseBandSize(coord.row + 1 - coord.col);
989 }
990 BLOB_CHOICE_LIST *blob_choices = GetBlobChoices(i);
991 if (FindMatchingChoice(new_id, blob_choices) == nullptr) {
992 // Insert a fake result.
993 auto *blob_choice = new BLOB_CHOICE;
994 blob_choice->set_unichar_id(new_id);
995 BLOB_CHOICE_IT bc_it(blob_choices);
996 bc_it.add_before_then_move(blob_choice);
997 }
998 }
999 }
1000 return modified;
1001 }
1002
1003 // Merges 2 adjacent blobs in the result (index and index+1) and corrects
1004 // all the data to account for the change.
MergeAdjacentBlobs(unsigned index)1005 void WERD_RES::MergeAdjacentBlobs(unsigned index) {
1006 if (reject_map.length() == best_choice->length()) {
1007 reject_map.remove_pos(index);
1008 }
1009 best_choice->remove_unichar_id(index + 1);
1010 rebuild_word->MergeBlobs(index, index + 2);
1011 box_word->MergeBoxes(index, index + 2);
1012 if (index + 1 < best_state.size()) {
1013 best_state[index] += best_state[index + 1];
1014 best_state.erase(best_state.begin() + index + 1);
1015 }
1016 }
1017
1018 // TODO(tkielbus) Decide between keeping this behavior here or modifying the
1019 // training data.
1020
// Utility function for fix_quotes.
// Returns non-zero if the character at the start of signed_str, whose UTF-8
// encoding is length bytes long, is a single quote: either the 1-byte ' or `,
// or one of the 3-byte curved quotes E2 80 98 / E2 80 99.
static int is_simple_quote(const char *signed_str, int length) {
  const auto *bytes = reinterpret_cast<const unsigned char *>(signed_str);
  if (length == 1) {
    // Standard 1 byte quotes.
    return bytes[0] == '\'' || bytes[0] == '`';
  }
  if (length == 3) {
    // UTF-8 3 byte curved quotes share the same two leading bytes.
    return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
           (bytes[2] == 0x98 || bytes[2] == 0x99);
  }
  return false;
}
1033
1034 // Callback helper for fix_quotes returns a double quote if both
1035 // arguments are quote, otherwise INVALID_UNICHAR_ID.
BothQuotes(UNICHAR_ID id1,UNICHAR_ID id2)1036 UNICHAR_ID WERD_RES::BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2) {
1037 const char *ch = uch_set->id_to_unichar(id1);
1038 const char *next_ch = uch_set->id_to_unichar(id2);
1039 if (is_simple_quote(ch, strlen(ch)) &&
1040 is_simple_quote(next_ch, strlen(next_ch))) {
1041 return uch_set->unichar_to_id("\"");
1042 }
1043 return INVALID_UNICHAR_ID;
1044 }
1045
1046 // Change pairs of quotes to double quotes.
fix_quotes()1047 void WERD_RES::fix_quotes() {
1048 if (!uch_set->contains_unichar("\"") ||
1049 !uch_set->get_enabled(uch_set->unichar_to_id("\""))) {
1050 return; // Don't create it if it is disallowed.
1051 }
1052
1053 using namespace std::placeholders; // for _1, _2
1054 ConditionalBlobMerge(std::bind(&WERD_RES::BothQuotes, this, _1, _2), nullptr);
1055 }
1056
1057 // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
1058 // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
BothHyphens(UNICHAR_ID id1,UNICHAR_ID id2)1059 UNICHAR_ID WERD_RES::BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2) {
1060 const char *ch = uch_set->id_to_unichar(id1);
1061 const char *next_ch = uch_set->id_to_unichar(id2);
1062 if (strlen(ch) == 1 && strlen(next_ch) == 1 && (*ch == '-' || *ch == '~') &&
1063 (*next_ch == '-' || *next_ch == '~')) {
1064 return uch_set->unichar_to_id("-");
1065 }
1066 return INVALID_UNICHAR_ID;
1067 }
1068
1069 // Callback helper for fix_hyphens returns true if box1 and box2 overlap
1070 // (assuming both on the same textline, are in order and a chopped em dash.)
HyphenBoxesOverlap(const TBOX & box1,const TBOX & box2)1071 bool WERD_RES::HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2) {
1072 return box1.right() >= box2.left();
1073 }
1074
1075 // Change pairs of hyphens to a single hyphen if the bounding boxes touch
1076 // Typically a long dash which has been segmented.
fix_hyphens()1077 void WERD_RES::fix_hyphens() {
1078 if (!uch_set->contains_unichar("-") ||
1079 !uch_set->get_enabled(uch_set->unichar_to_id("-"))) {
1080 return; // Don't create it if it is disallowed.
1081 }
1082
1083 using namespace std::placeholders; // for _1, _2
1084 ConditionalBlobMerge(std::bind(&WERD_RES::BothHyphens, this, _1, _2),
1085 std::bind(&WERD_RES::HyphenBoxesOverlap, this, _1, _2));
1086 }
1087
1088 // Callback helper for merge_tess_fails returns a space if both
1089 // arguments are space, otherwise INVALID_UNICHAR_ID.
BothSpaces(UNICHAR_ID id1,UNICHAR_ID id2)1090 UNICHAR_ID WERD_RES::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) {
1091 if (id1 == id2 && id1 == uch_set->unichar_to_id(" ")) {
1092 return id1;
1093 } else {
1094 return INVALID_UNICHAR_ID;
1095 }
1096 }
1097
1098 // Change pairs of tess failures to a single one
merge_tess_fails()1099 void WERD_RES::merge_tess_fails() {
1100 using namespace std::placeholders; // for _1, _2
1101 if (ConditionalBlobMerge(std::bind(&WERD_RES::BothSpaces, this, _1, _2),
1102 nullptr)) {
1103 unsigned len = best_choice->length();
1104 ASSERT_HOST(reject_map.length() == len);
1105 ASSERT_HOST(box_word->length() == len);
1106 }
1107 }
1108
1109 // Returns true if the collection of count pieces, starting at start, are all
1110 // natural connected components, ie there are no real chops involved.
PiecesAllNatural(int start,int count) const1111 bool WERD_RES::PiecesAllNatural(int start, int count) const {
1112 // all seams must have no splits.
1113 for (int index = start; index < start + count - 1; ++index) {
1114 if (index >= 0 && static_cast<size_t>(index) < seam_array.size()) {
1115 SEAM *seam = seam_array[index];
1116 if (seam != nullptr && seam->HasAnySplits()) {
1117 return false;
1118 }
1119 }
1120 }
1121 return true;
1122 }
1123
~WERD_RES()1124 WERD_RES::~WERD_RES() {
1125 Clear();
1126 }
1127
Clear()1128 void WERD_RES::Clear() {
1129 if (combination) {
1130 delete word;
1131 }
1132 word = nullptr;
1133 delete blamer_bundle;
1134 blamer_bundle = nullptr;
1135 ClearResults();
1136 }
1137
ClearResults()1138 void WERD_RES::ClearResults() {
1139 done = false;
1140 fontinfo = nullptr;
1141 fontinfo2 = nullptr;
1142 fontinfo_id_count = 0;
1143 fontinfo_id2_count = 0;
1144 delete bln_boxes;
1145 bln_boxes = nullptr;
1146 blob_row = nullptr;
1147 delete chopped_word;
1148 chopped_word = nullptr;
1149 delete rebuild_word;
1150 rebuild_word = nullptr;
1151 delete box_word;
1152 box_word = nullptr;
1153 best_state.clear();
1154 correct_text.clear();
1155 for (auto data : seam_array) {
1156 delete data;
1157 }
1158 seam_array.clear();
1159 blob_widths.clear();
1160 blob_gaps.clear();
1161 ClearRatings();
1162 ClearWordChoices();
1163 if (blamer_bundle != nullptr) {
1164 blamer_bundle->ClearResults();
1165 }
1166 }
ClearWordChoices()1167 void WERD_RES::ClearWordChoices() {
1168 best_choice = nullptr;
1169 delete raw_choice;
1170 raw_choice = nullptr;
1171 best_choices.clear();
1172 delete ep_choice;
1173 ep_choice = nullptr;
1174 }
ClearRatings()1175 void WERD_RES::ClearRatings() {
1176 if (ratings != nullptr) {
1177 ratings->delete_matrix_pointers();
1178 delete ratings;
1179 ratings = nullptr;
1180 }
1181 }
1182
cmp(const PAGE_RES_IT & other) const1183 int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
1184 ASSERT_HOST(page_res == other.page_res);
1185 if (other.block_res == nullptr) {
1186 // other points to the end of the page.
1187 if (block_res == nullptr) {
1188 return 0;
1189 }
1190 return -1;
1191 }
1192 if (block_res == nullptr) {
1193 return 1; // we point to the end of the page.
1194 }
1195 if (block_res == other.block_res) {
1196 if (other.row_res == nullptr || row_res == nullptr) {
1197 // this should only happen if we hit an image block.
1198 return 0;
1199 }
1200 if (row_res == other.row_res) {
1201 // we point to the same block and row.
1202 ASSERT_HOST(other.word_res != nullptr && word_res != nullptr);
1203 if (word_res == other.word_res) {
1204 // we point to the same word!
1205 return 0;
1206 }
1207
1208 WERD_RES_IT word_res_it(&row_res->word_res_list);
1209 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
1210 word_res_it.forward()) {
1211 if (word_res_it.data() == word_res) {
1212 return -1;
1213 } else if (word_res_it.data() == other.word_res) {
1214 return 1;
1215 }
1216 }
1217 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1218 }
1219
1220 // we both point to the same block, but different rows.
1221 ROW_RES_IT row_res_it(&block_res->row_res_list);
1222 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
1223 row_res_it.forward()) {
1224 if (row_res_it.data() == row_res) {
1225 return -1;
1226 } else if (row_res_it.data() == other.row_res) {
1227 return 1;
1228 }
1229 }
1230 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1231 }
1232
1233 // We point to different blocks.
1234 BLOCK_RES_IT block_res_it(&page_res->block_res_list);
1235 for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
1236 block_res_it.forward()) {
1237 if (block_res_it.data() == block_res) {
1238 return -1;
1239 } else if (block_res_it.data() == other.block_res) {
1240 return 1;
1241 }
1242 }
1243 // Shouldn't happen...
1244 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1245 return 0;
1246 }
1247
1248 // Inserts the new_word as a combination owned by a corresponding WERD_RES
1249 // before the current position. The simple fields of the WERD_RES are copied
1250 // from clone_res and the resulting WERD_RES is returned for further setup
1251 // with best_choice etc.
InsertSimpleCloneWord(const WERD_RES & clone_res,WERD * new_word)1252 WERD_RES *PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES &clone_res,
1253 WERD *new_word) {
1254 // Make a WERD_RES for the new_word.
1255 auto *new_res = new WERD_RES(new_word);
1256 new_res->CopySimpleFields(clone_res);
1257 new_res->combination = true;
1258 // Insert into the appropriate place in the ROW_RES.
1259 WERD_RES_IT wr_it(&row()->word_res_list);
1260 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1261 WERD_RES *word = wr_it.data();
1262 if (word == word_res) {
1263 break;
1264 }
1265 }
1266 ASSERT_HOST(!wr_it.cycled_list());
1267 wr_it.add_before_then_move(new_res);
1268 if (wr_it.at_first()) {
1269 // This is the new first word, so reset the member iterator so it
1270 // detects the cycled_list state correctly.
1271 ResetWordIterator();
1272 }
1273 return new_res;
1274 }
1275
1276 // Helper computes the boundaries between blobs in the word. The blob bounds
1277 // are likely very poor, if they come from LSTM, where it only outputs the
1278 // character at one pixel within it, so we find the midpoints between them.
ComputeBlobEnds(const WERD_RES & word,const TBOX & clip_box,C_BLOB_LIST * next_word_blobs,std::vector<int> * blob_ends)1279 static void ComputeBlobEnds(const WERD_RES &word, const TBOX &clip_box,
1280 C_BLOB_LIST *next_word_blobs,
1281 std::vector<int> *blob_ends) {
1282 C_BLOB_IT blob_it(word.word->cblob_list());
1283 for (int length : word.best_state) {
1284 // Get the bounding box of the fake blobs
1285 TBOX blob_box = blob_it.data()->bounding_box();
1286 blob_it.forward();
1287 for (int b = 1; b < length; ++b) {
1288 blob_box += blob_it.data()->bounding_box();
1289 blob_it.forward();
1290 }
1291 // This blob_box is crap, so for now we are only looking for the
1292 // boundaries between them.
1293 int blob_end = INT32_MAX;
1294 if (!blob_it.at_first() || next_word_blobs != nullptr) {
1295 if (blob_it.at_first()) {
1296 blob_it.set_to_list(next_word_blobs);
1297 }
1298 blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2;
1299 }
1300 blob_end = ClipToRange<int>(blob_end, clip_box.left(), clip_box.right());
1301 blob_ends->push_back(blob_end);
1302 }
1303 blob_ends->back() = clip_box.right();
1304 }
1305
1306 // Helper computes the bounds of a word by restricting it to existing words
1307 // that significantly overlap.
ComputeWordBounds(const tesseract::PointerVector<WERD_RES> & words,int w_index,TBOX prev_box,WERD_RES_IT w_it)1308 static TBOX ComputeWordBounds(const tesseract::PointerVector<WERD_RES> &words,
1309 int w_index, TBOX prev_box, WERD_RES_IT w_it) {
1310 constexpr int kSignificantOverlapFraction = 4;
1311 TBOX clipped_box;
1312 TBOX current_box = words[w_index]->word->bounding_box();
1313 TBOX next_box;
1314 if (static_cast<size_t>(w_index + 1) < words.size() &&
1315 words[w_index + 1] != nullptr && words[w_index + 1]->word != nullptr) {
1316 next_box = words[w_index + 1]->word->bounding_box();
1317 }
1318 for (w_it.forward(); !w_it.at_first() && w_it.data()->part_of_combo;
1319 w_it.forward()) {
1320 if (w_it.data() == nullptr || w_it.data()->word == nullptr) {
1321 continue;
1322 }
1323 TBOX w_box = w_it.data()->word->bounding_box();
1324 int height_limit = std::min<int>(w_box.height(), w_box.width() / 2);
1325 int width_limit = w_box.width() / kSignificantOverlapFraction;
1326 int min_significant_overlap = std::max(height_limit, width_limit);
1327 int overlap = w_box.intersection(current_box).width();
1328 int prev_overlap = w_box.intersection(prev_box).width();
1329 int next_overlap = w_box.intersection(next_box).width();
1330 if (overlap > min_significant_overlap) {
1331 if (prev_overlap > min_significant_overlap) {
1332 // We have no choice but to use the LSTM word edge.
1333 clipped_box.set_left(current_box.left());
1334 } else if (next_overlap > min_significant_overlap) {
1335 // We have no choice but to use the LSTM word edge.
1336 clipped_box.set_right(current_box.right());
1337 } else {
1338 clipped_box += w_box;
1339 }
1340 }
1341 }
1342 if (clipped_box.height() <= 0) {
1343 clipped_box.set_top(current_box.top());
1344 clipped_box.set_bottom(current_box.bottom());
1345 }
1346 if (clipped_box.width() <= 0) {
1347 clipped_box = current_box;
1348 }
1349 return clipped_box;
1350 }
1351
1352 // Helper moves the blob from src to dest. If it isn't contained by clip_box,
1353 // the blob is replaced by a fake that is contained.
MoveAndClipBlob(C_BLOB_IT * src_it,C_BLOB_IT * dest_it,const TBOX & clip_box)1354 static TBOX MoveAndClipBlob(C_BLOB_IT *src_it, C_BLOB_IT *dest_it,
1355 const TBOX &clip_box) {
1356 C_BLOB *src_blob = src_it->extract();
1357 TBOX box = src_blob->bounding_box();
1358 if (!clip_box.contains(box)) {
1359 int left =
1360 ClipToRange<int>(box.left(), clip_box.left(), clip_box.right() - 1);
1361 int right =
1362 ClipToRange<int>(box.right(), clip_box.left() + 1, clip_box.right());
1363 int top =
1364 ClipToRange<int>(box.top(), clip_box.bottom() + 1, clip_box.top());
1365 int bottom =
1366 ClipToRange<int>(box.bottom(), clip_box.bottom(), clip_box.top() - 1);
1367 box = TBOX(left, bottom, right, top);
1368 delete src_blob;
1369 src_blob = C_BLOB::FakeBlob(box);
1370 }
1371 dest_it->add_after_then_move(src_blob);
1372 return box;
1373 }
1374
// Replaces the current WERD/WERD_RES with the given words. The given words
// contain fake blobs that indicate the position of the characters. These are
// replaced with real blobs from the current word as much as possible.
//
// If words is empty, the current word is simply deleted. Otherwise every
// entry of words is inserted before the current word (ownership is taken
// from the vector, which is cleared), the current word is deleted, and the
// word iterator is reset.
void PAGE_RES_IT::ReplaceCurrentWord(
    tesseract::PointerVector<WERD_RES> *words) {
  if (words->empty()) {
    DeleteCurrentWord();
    return;
  }
  WERD_RES *input_word = word();
  // Set the BOL/EOL flags on the words from the input word.
  // When the input word is not at the beginning of a line, the first
  // replacement inherits its leading blank count instead.
  if (input_word->word->flag(W_BOL)) {
    (*words)[0]->word->set_flag(W_BOL, true);
  } else {
    (*words)[0]->word->set_blanks(input_word->word->space());
  }
  words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));

  // Move the blobs from the input word to the new set of words.
  // If the input word_res is a combination, then the replacements will also be
  // combinations, and will own their own words. If the input word_res is not a
  // combination, then the final replacements will not be either, (although it
  // is allowed for the input words to be combinations) and their words
  // will get put on the row list. This maintains the ownership rules.
  WERD_IT w_it(row()->row->word_list());
  if (!input_word->combination) {
    for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
      WERD *word = w_it.data();
      if (word == input_word->word) {
        break;
      }
    }
    // w_it is now set to the input_word's word.
    ASSERT_HOST(!w_it.cycled_list());
  }
  // Insert into the appropriate place in the ROW_RES.
  // First position wr_it on the input word; it must be in the list.
  WERD_RES_IT wr_it(&row()->word_res_list);
  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
    WERD_RES *word = wr_it.data();
    if (word == input_word) {
      break;
    }
  }
  ASSERT_HOST(!wr_it.cycled_list());
  // Since we only have an estimate of the bounds between blobs, use the blob
  // x-middle as the determiner of where to put the blobs
  // Both the accepted and the rejected blobs of the input word are sorted by
  // x-middle so that they can be consumed left to right.
  C_BLOB_IT src_b_it(input_word->word->cblob_list());
  src_b_it.sort(&C_BLOB::SortByXMiddle);
  C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
  rej_b_it.sort(&C_BLOB::SortByXMiddle);
  TBOX clip_box;
  for (size_t w = 0; w < words->size(); ++w) {
    WERD_RES *word_w = (*words)[w];
    // The clip_box of the previous iteration is passed in as the previous
    // word's box.
    clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
    // Compute blob boundaries.
    std::vector<int> blob_ends;
    C_BLOB_LIST *next_word_blobs =
        w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr;
    ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);
    // Remove the fake blobs on the current word, but keep safe for back-up if
    // no blob can be found.
    C_BLOB_LIST fake_blobs;
    C_BLOB_IT fake_b_it(&fake_blobs);
    fake_b_it.add_list_after(word_w->word->cblob_list());
    fake_b_it.move_to_first();
    word_w->word->cblob_list()->clear();
    C_BLOB_IT dest_it(word_w->word->cblob_list());
    // Build the box word as we move the blobs.
    auto *box_word = new tesseract::BoxWord;
    // fake_b_it advances in step with i, one fake blob per boundary.
    for (size_t i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) {
      int end_x = blob_ends[i];
      TBOX blob_box;
      // Add the blobs up to end_x.
      while (!src_b_it.empty() &&
             src_b_it.data()->bounding_box().x_middle() < end_x) {
        blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box);
        src_b_it.forward();
      }
      // Rejected blobs up to end_x are moved across in the same way.
      while (!rej_b_it.empty() &&
             rej_b_it.data()->bounding_box().x_middle() < end_x) {
        blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box);
        rej_b_it.forward();
      }
      if (blob_box.null_box()) {
        // Use the original box as a back-up.
        blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box);
      }
      box_word->InsertBox(i, blob_box);
    }
    delete word_w->box_word;
    word_w->box_word = box_word;
    if (!input_word->combination) {
      // Insert word_w->word into the ROW. It doesn't own its word, so the
      // ROW needs to own it.
      w_it.add_before_stay_put(word_w->word);
      word_w->combination = false;
    }
    (*words)[w] = nullptr; // We are taking ownership.
    wr_it.add_before_stay_put(word_w);
  }
  // We have taken ownership of the words.
  words->clear();
  // Delete the current word, which has been replaced. We could just call
  // DeleteCurrentWord, but that would iterate both lists again, and we know
  // we are already in the right place.
  if (!input_word->combination) {
    delete w_it.extract();
  }
  delete wr_it.extract();
  ResetWordIterator();
}
1486
1487 // Deletes the current WERD_RES and its underlying WERD.
DeleteCurrentWord()1488 void PAGE_RES_IT::DeleteCurrentWord() {
1489 // Check that this word is as we expect. part_of_combos are NEVER iterated
1490 // by the normal iterator, so we should never be trying to delete them.
1491 ASSERT_HOST(!word_res->part_of_combo);
1492 if (!word_res->combination) {
1493 // Combinations own their own word, so we won't find the word on the
1494 // row's word_list, but it is legitimate to try to delete them.
1495 // Delete word from the ROW when not a combination.
1496 WERD_IT w_it(row()->row->word_list());
1497 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1498 if (w_it.data() == word_res->word) {
1499 break;
1500 }
1501 }
1502 ASSERT_HOST(!w_it.cycled_list());
1503 delete w_it.extract();
1504 }
1505 // Remove the WERD_RES for the new_word.
1506 // Remove the WORD_RES from the ROW_RES.
1507 WERD_RES_IT wr_it(&row()->word_res_list);
1508 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1509 if (wr_it.data() == word_res) {
1510 word_res = nullptr;
1511 break;
1512 }
1513 }
1514 ASSERT_HOST(!wr_it.cycled_list());
1515 delete wr_it.extract();
1516 ResetWordIterator();
1517 }
1518
1519 // Makes the current word a fuzzy space if not already fuzzy. Updates
1520 // corresponding part of combo if required.
MakeCurrentWordFuzzy()1521 void PAGE_RES_IT::MakeCurrentWordFuzzy() {
1522 WERD *real_word = word_res->word;
1523 if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
1524 real_word->set_flag(W_FUZZY_SP, true);
1525 if (word_res->combination) {
1526 // The next word should be the corresponding part of combo, but we have
1527 // already stepped past it, so find it by search.
1528 WERD_RES_IT wr_it(&row()->word_res_list);
1529 for (wr_it.mark_cycle_pt();
1530 !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
1531 }
1532 wr_it.forward();
1533 ASSERT_HOST(wr_it.data()->part_of_combo);
1534 real_word = wr_it.data()->word;
1535 ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
1536 !real_word->flag(W_FUZZY_NON));
1537 real_word->set_flag(W_FUZZY_SP, true);
1538 }
1539 }
1540 }
1541
1542 /*************************************************************************
1543 * PAGE_RES_IT::restart_page
1544 *
1545 * Set things up at the start of the page
1546 *************************************************************************/
1547
start_page(bool empty_ok)1548 WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) {
1549 block_res_it.set_to_list(&page_res->block_res_list);
1550 block_res_it.mark_cycle_pt();
1551 prev_block_res = nullptr;
1552 prev_row_res = nullptr;
1553 prev_word_res = nullptr;
1554 block_res = nullptr;
1555 row_res = nullptr;
1556 word_res = nullptr;
1557 next_block_res = nullptr;
1558 next_row_res = nullptr;
1559 next_word_res = nullptr;
1560 internal_forward(true, empty_ok);
1561 return internal_forward(false, empty_ok);
1562 }
1563
1564 // Recovers from operations on the current word, such as in InsertCloneWord
1565 // and DeleteCurrentWord.
1566 // Resets the word_res_it so that it is one past the next_word_res, as
1567 // it should be after internal_forward. If next_row_res != row_res,
1568 // then the next_word_res is in the next row, so there is no need to do
1569 // anything to word_res_it, but it is still a good idea to reset the pointers
1570 // word_res and prev_word_res, which are still in the current row.
ResetWordIterator()1571 void PAGE_RES_IT::ResetWordIterator() {
1572 if (row_res == next_row_res) {
1573 // Reset the member iterator so it can move forward and detect the
1574 // cycled_list state correctly.
1575 word_res_it.move_to_first();
1576 for (word_res_it.mark_cycle_pt();
1577 !word_res_it.cycled_list() && word_res_it.data() != next_word_res;
1578 word_res_it.forward()) {
1579 if (!word_res_it.data()->part_of_combo) {
1580 if (prev_row_res == row_res) {
1581 prev_word_res = word_res;
1582 }
1583 word_res = word_res_it.data();
1584 }
1585 }
1586 ASSERT_HOST(!word_res_it.cycled_list());
1587 wr_it_of_next_word = word_res_it;
1588 word_res_it.forward();
1589 } else {
1590 // word_res_it is OK, but reset word_res and prev_word_res if needed.
1591 WERD_RES_IT wr_it(&row_res->word_res_list);
1592 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1593 if (!wr_it.data()->part_of_combo) {
1594 if (prev_row_res == row_res) {
1595 prev_word_res = word_res;
1596 }
1597 word_res = wr_it.data();
1598 }
1599 }
1600 }
1601 }
1602
/*************************************************************************
 * PAGE_RES_IT::internal_forward
 *
 * Find the next word on the page. If empty_ok is true, then non-text blocks
 * and text blocks with no text are visited as if they contain a single
 * imaginary word in a single imaginary row. (word() and row() both return
 * nullptr in such a block and the return value is nullptr.) If empty_ok is
 * false, the old behaviour is maintained. Each real word is visited and empty
 * and non-text blocks and rows are skipped. new_block is used to initialize
 * the iterators for a new block. The iterator maintains pointers to block,
 * row and word for the previous, current and next words. These are correct,
 * regardless of block/row boundaries. nullptr values denote start and end of
 * the page.
 *
 * Returns the new current word (the word that was "next" before the call).
 *************************************************************************/

WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
  bool new_row = false;

  // Shift the window: current becomes previous and next becomes current.
  prev_block_res = block_res;
  prev_row_res = row_res;
  prev_word_res = word_res;
  block_res = next_block_res;
  row_res = next_row_res;
  word_res = next_word_res;
  wr_it_of_current_word = wr_it_of_next_word;
  // The next pointers stay null unless the search below finds something.
  next_block_res = nullptr;
  next_row_res = nullptr;
  next_word_res = nullptr;

  while (!block_res_it.cycled_list()) {
    if (new_block) {
      new_block = false;
      row_res_it.set_to_list(&block_res_it.data()->row_res_list);
      row_res_it.mark_cycle_pt();
      if (row_res_it.empty() && empty_ok) {
        // A block without rows is reported as the next position, with null
        // row and word.
        next_block_res = block_res_it.data();
        break;
      }
      new_row = true;
    }
    while (!row_res_it.cycled_list()) {
      if (new_row) {
        new_row = false;
        word_res_it.set_to_list(&row_res_it.data()->word_res_list);
        word_res_it.mark_cycle_pt();
      }
      // Skip any part_of_combo words.
      while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo) {
        word_res_it.forward();
      }
      if (!word_res_it.cycled_list()) {
        // Found the next word: record it, remember an iterator positioned on
        // it, and leave word_res_it one step past it.
        next_block_res = block_res_it.data();
        next_row_res = row_res_it.data();
        next_word_res = word_res_it.data();
        wr_it_of_next_word = word_res_it;
        word_res_it.forward();
        goto foundword;
      }
      // end of row reached
      row_res_it.forward();
      new_row = true;
    }
    // end of block reached
    block_res_it.forward();
    new_block = true;
  }
foundword:
  // Update prev_word_best_choice pointer.
  // new_block is still true here only when the block list was exhausted (or
  // the loop was never entered); in that case, or when there is no previous
  // word, the pointer is cleared.
  if (page_res != nullptr && page_res->prev_word_best_choice != nullptr) {
    *page_res->prev_word_best_choice = (new_block || prev_word_res == nullptr)
                                           ? nullptr
                                           : prev_word_res->best_choice;
  }
  return word_res;
}
1677
1678 /*************************************************************************
1679 * PAGE_RES_IT::restart_row()
1680 *
1681 * Move to the beginning (leftmost word) of the current row.
1682 *************************************************************************/
restart_row()1683 WERD_RES *PAGE_RES_IT::restart_row() {
1684 ROW_RES *row = this->row();
1685 if (!row) {
1686 return nullptr;
1687 }
1688 for (restart_page(); this->row() != row; forward()) {
1689 // pass
1690 }
1691 return word();
1692 }
1693
1694 /*************************************************************************
1695 * PAGE_RES_IT::forward_paragraph
1696 *
1697 * Move to the beginning of the next paragraph, allowing empty blocks.
1698 *************************************************************************/
1699
forward_paragraph()1700 WERD_RES *PAGE_RES_IT::forward_paragraph() {
1701 while (block_res == next_block_res &&
1702 (next_row_res != nullptr && next_row_res->row != nullptr &&
1703 row_res->row->para() == next_row_res->row->para())) {
1704 internal_forward(false, true);
1705 }
1706 return internal_forward(false, true);
1707 }
1708
1709 /*************************************************************************
1710 * PAGE_RES_IT::forward_block
1711 *
1712 * Move to the beginning of the next block, allowing empty blocks.
1713 *************************************************************************/
1714
forward_block()1715 WERD_RES *PAGE_RES_IT::forward_block() {
1716 while (block_res == next_block_res) {
1717 internal_forward(false, true);
1718 }
1719 return internal_forward(false, true);
1720 }
1721
rej_stat_word()1722 void PAGE_RES_IT::rej_stat_word() {
1723 int16_t chars_in_word;
1724 int16_t rejects_in_word = 0;
1725
1726 chars_in_word = word_res->reject_map.length();
1727 page_res->char_count += chars_in_word;
1728 block_res->char_count += chars_in_word;
1729 row_res->char_count += chars_in_word;
1730
1731 rejects_in_word = word_res->reject_map.reject_count();
1732
1733 page_res->rej_count += rejects_in_word;
1734 block_res->rej_count += rejects_in_word;
1735 row_res->rej_count += rejects_in_word;
1736 if (chars_in_word == rejects_in_word) {
1737 row_res->whole_word_rej_count += rejects_in_word;
1738 }
1739 }
1740
1741 } // namespace tesseract
1742