1 /**********************************************************************
2 * File: reject.cpp (Formerly reject.c)
3 * Description: Rejection functions used in tessedit
4 * Author: Phil Cheatle
5 *
6 * (C) Copyright 1992, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23
24 #include "reject.h"
25
26 #ifdef DISABLED_LEGACY_ENGINE
27
28 # include "tesseractclass.h"
29
30 namespace tesseract {
31
safe_dict_word(const WERD_RES * werd_res)32 int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
33 const WERD_CHOICE &word = *werd_res->best_choice;
34 int dict_word_type = werd_res->tesseract->dict_word(word);
35 return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
36 }
37 } // namespace tesseract
38
39 #else
40
41 # include "control.h"
42 # include "docqual.h"
43 # include "tesseractclass.h"
44 # include "tessvars.h"
45
46 # include "helpers.h"
47
48 # include <algorithm> // for std::sort
49 # include <cctype>
50 # include <cerrno>
51 # include <cstring>
52 # include <vector> // for std::vector
53
54 namespace tesseract {
55
56 /*************************************************************************
57 * set_done()
58 *
59 * Set the done flag based on the word acceptability criteria
60 *************************************************************************/
61
set_done(WERD_RES * word,int16_t pass)62 void Tesseract::set_done(WERD_RES *word, int16_t pass) {
63 word->done =
64 word->tess_accepted && (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr);
65 bool word_is_ambig = word->best_choice->dangerous_ambig_found();
66 bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
67 word->best_choice->permuter() == FREQ_DAWG_PERM ||
68 word->best_choice->permuter() == USER_DAWG_PERM;
69 if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
70 one_ell_conflict(word, false)) {
71 if (tessedit_rejection_debug) {
72 tprintf("one_ell_conflict detected\n");
73 }
74 word->done = false;
75 }
76 if (word->done &&
77 ((!word_from_dict && word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
78 if (tessedit_rejection_debug) {
79 tprintf("non-dict or ambig word detected\n");
80 }
81 word->done = false;
82 }
83 if (tessedit_rejection_debug) {
84 tprintf("set_done(): done=%d\n", word->done);
85 word->best_choice->print("");
86 }
87 }
88
89 /*************************************************************************
90 * make_reject_map()
91 *
92 * Sets the done flag to indicate whether the result is acceptable.
93 *
94 * Sets a reject map for the word.
95 *************************************************************************/
make_reject_map(WERD_RES * word,ROW * row,int16_t pass)96 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
97 flip_0O(word);
98 check_debug_pt(word, -1); // For trap only
99 set_done(word, pass); // Set acceptance
100 word->reject_map.initialise(word->best_choice->unichar_lengths().length());
101 reject_blanks(word);
102 /*
103 0: Rays original heuristic - the baseline
104 */
105 if (tessedit_reject_mode == 0) {
106 if (!word->done) {
107 reject_poor_matches(word);
108 }
109 } else if (tessedit_reject_mode == 5) {
110 /*
111 5: Reject I/1/l from words where there is no strong contextual confirmation;
112 the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
113 and the whole of any words which are very small
114 */
115 if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
116 word->reject_map.rej_word_small_xht();
117 } else {
118 one_ell_conflict(word, true);
119 /*
120 Originally the code here just used the done flag. Now I have duplicated
121 and unpacked the conditions for setting the done flag so that each
122 mechanism can be turned on or off independently. This works WITHOUT
123 affecting the done flag setting.
124 */
125 if (rej_use_tess_accepted && !word->tess_accepted) {
126 word->reject_map.rej_word_not_tess_accepted();
127 }
128
129 if (rej_use_tess_blanks &&
130 (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) {
131 word->reject_map.rej_word_contains_blanks();
132 }
133
134 WERD_CHOICE *best_choice = word->best_choice;
135 if (rej_use_good_perm) {
136 if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
137 best_choice->permuter() == FREQ_DAWG_PERM ||
138 best_choice->permuter() == USER_DAWG_PERM) &&
139 (!rej_use_sensible_wd ||
140 acceptable_word_string(*word->uch_set, best_choice->unichar_string().c_str(),
141 best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE)) {
142 // PASSED TEST
143 } else if (best_choice->permuter() == NUMBER_PERM) {
144 if (rej_alphas_in_number_perm) {
145 for (int i = 0, offset = 0; best_choice->unichar_string()[offset] != '\0';
146 offset += best_choice->unichar_lengths()[i++]) {
147 if (word->reject_map[i].accepted() &&
148 word->uch_set->get_isalpha(best_choice->unichar_string().c_str() + offset,
149 best_choice->unichar_lengths()[i])) {
150 word->reject_map[i].setrej_bad_permuter();
151 }
152 // rej alpha
153 }
154 }
155 } else {
156 word->reject_map.rej_word_bad_permuter();
157 }
158 }
159 /* Ambig word rejection was here once !!*/
160 }
161 } else {
162 tprintf("BAD tessedit_reject_mode\n");
163 ASSERT_HOST("Fatal error encountered!" == nullptr);
164 }
165
166 if (tessedit_image_border > -1) {
167 reject_edge_blobs(word);
168 }
169
170 check_debug_pt(word, 10);
171 if (tessedit_rejection_debug) {
172 tprintf("Permuter Type = %d\n", word->best_choice->permuter());
173 tprintf("Certainty: %f Rating: %f\n", word->best_choice->certainty(),
174 word->best_choice->rating());
175 tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
176 }
177
178 flip_hyphens(word);
179 check_debug_pt(word, 20);
180 }
181
reject_blanks(WERD_RES * word)182 void reject_blanks(WERD_RES *word) {
183 int16_t i;
184 int16_t offset;
185
186 for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
187 offset += word->best_choice->unichar_lengths()[i], i += 1) {
188 if (word->best_choice->unichar_string()[offset] == ' ') {
189 // rej unrecognised blobs
190 word->reject_map[i].setrej_tess_failure();
191 }
192 }
193 }
194
reject_I_1_L(WERD_RES * word)195 void Tesseract::reject_I_1_L(WERD_RES *word) {
196 int16_t i;
197 int16_t offset;
198
199 for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
200 offset += word->best_choice->unichar_lengths()[i], i += 1) {
201 if (conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])) {
202 // rej 1Il conflict
203 word->reject_map[i].setrej_1Il_conflict();
204 }
205 }
206 }
207
reject_poor_matches(WERD_RES * word)208 void reject_poor_matches(WERD_RES *word) {
209 float threshold = compute_reject_threshold(word->best_choice);
210 for (unsigned i = 0; i < word->best_choice->length(); ++i) {
211 if (word->best_choice->unichar_id(i) == UNICHAR_SPACE) {
212 word->reject_map[i].setrej_tess_failure();
213 } else if (word->best_choice->certainty(i) < threshold) {
214 word->reject_map[i].setrej_poor_match();
215 }
216 }
217 }
218
219 /**********************************************************************
220 * compute_reject_threshold
221 *
222 * Set a rejection threshold for this word.
223 * Initially this is a trivial function which looks for the largest
224 * gap in the certainty value.
225 **********************************************************************/
226
compute_reject_threshold(WERD_CHOICE * word)227 float compute_reject_threshold(WERD_CHOICE *word) {
228 float threshold; // rejection threshold
229 float bestgap = 0.0f; // biggest gap
230 float gapstart; // bottom of gap
231
232 auto blob_count = word->length();
233 std::vector<float> ratings;
234 ratings.reserve(blob_count);
235 for (unsigned i = 0; i < blob_count; ++i) {
236 ratings.push_back(word->certainty(i));
237 }
238 std::sort(ratings.begin(), ratings.end());
239 gapstart = ratings[0] - 1; // all reject if none better
240 if (blob_count >= 3) {
241 for (unsigned index = 0; index < blob_count - 1; index++) {
242 if (ratings[index + 1] - ratings[index] > bestgap) {
243 bestgap = ratings[index + 1] - ratings[index];
244 // find biggest
245 gapstart = ratings[index];
246 }
247 }
248 }
249 threshold = gapstart + bestgap / 2;
250
251 return threshold;
252 }
253
254 /*************************************************************************
255 * reject_edge_blobs()
256 *
257 * If the word is perilously close to the edge of the image, reject those blobs
258 * in the word which are too close to the edge as they could be clipped.
259 *************************************************************************/
reject_edge_blobs(WERD_RES * word)260 void Tesseract::reject_edge_blobs(WERD_RES *word) {
261 TBOX word_box = word->word->bounding_box();
262 // Use the box_word as it is already denormed back to image coordinates.
263 int blobcount = word->box_word->length();
264
265 if (word_box.left() < tessedit_image_border || word_box.bottom() < tessedit_image_border ||
266 word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
267 word_box.top() + tessedit_image_border > ImageHeight() - 1) {
268 ASSERT_HOST(word->reject_map.length() == blobcount);
269 for (int blobindex = 0; blobindex < blobcount; blobindex++) {
270 TBOX blob_box = word->box_word->BlobBox(blobindex);
271 if (blob_box.left() < tessedit_image_border || blob_box.bottom() < tessedit_image_border ||
272 blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
273 blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
274 word->reject_map[blobindex].setrej_edge_char();
275 // Close to edge
276 }
277 }
278 }
279 }
280
281 /**********************************************************************
282 * one_ell_conflict()
283 *
284 * Identify words where there is a potential I/l/1 error.
285 * - A bundle of contextual heuristics!
286 **********************************************************************/
one_ell_conflict(WERD_RES * word_res,bool update_map)287 bool Tesseract::one_ell_conflict(WERD_RES *word_res, bool update_map) {
288 const char *word;
289 const char *lengths;
290 int16_t word_len; // its length
291 int16_t first_alphanum_index_;
292 int16_t first_alphanum_offset_;
293 int16_t i;
294 int16_t offset;
295 bool non_conflict_set_char; // non conf set a/n?
296 bool conflict = false;
297 bool allow_1s;
298 ACCEPTABLE_WERD_TYPE word_type;
299 bool dict_perm_type;
300 bool dict_word_ok;
301 int dict_word_type;
302
303 word = word_res->best_choice->unichar_string().c_str();
304 lengths = word_res->best_choice->unichar_lengths().c_str();
305 word_len = strlen(lengths);
306 /*
307 If there are no occurrences of the conflict set characters then the word
308 is OK.
309 */
310 if (strpbrk(word, conflict_set_I_l_1.c_str()) == nullptr) {
311 return false;
312 }
313
314 /*
315 There is a conflict if there are NO other (confirmed) alphanumerics apart
316 from those in the conflict set.
317 */
318
319 for (i = 0, offset = 0, non_conflict_set_char = false; (i < word_len) && !non_conflict_set_char;
320 offset += lengths[i++]) {
321 non_conflict_set_char = (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
322 word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
323 !conflict_set_I_l_1.contains(word[offset]);
324 }
325 if (!non_conflict_set_char) {
326 if (update_map) {
327 reject_I_1_L(word_res);
328 }
329 return true;
330 }
331
332 /*
333 If the word is accepted by a dawg permuter, and the first alpha character
334 is "I" or "l", check to see if the alternative is also a dawg word. If it
335 is, then there is a potential error otherwise the word is ok.
336 */
337
338 dict_perm_type = (word_res->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
339 (word_res->best_choice->permuter() == USER_DAWG_PERM) ||
340 (rej_trust_doc_dawg && (word_res->best_choice->permuter() == DOC_DAWG_PERM)) ||
341 (word_res->best_choice->permuter() == FREQ_DAWG_PERM);
342 dict_word_type = dict_word(*(word_res->best_choice));
343 dict_word_ok = (dict_word_type > 0) && (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
344
345 if ((rej_1Il_use_dict_word && dict_word_ok) || (rej_1Il_trust_permuter_type && dict_perm_type) ||
346 (dict_perm_type && dict_word_ok)) {
347 first_alphanum_index_ = first_alphanum_index(word, lengths);
348 first_alphanum_offset_ = first_alphanum_offset(word, lengths);
349 if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
350 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
351 if (safe_dict_word(word_res) > 0) {
352 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
353 if (update_map) {
354 word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
355 }
356 return true;
357 } else {
358 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
359 return false;
360 }
361 }
362
363 if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
364 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
365 if (safe_dict_word(word_res) > 0) {
366 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
367 if (update_map) {
368 word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
369 }
370 return true;
371 } else {
372 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
373 return false;
374 }
375 }
376 return false;
377 }
378
379 /*
380 NEW 1Il code. The old code relied on permuter types too much. In fact,
381 tess will use TOP_CHOICE permute for good things like "palette".
382 In this code the string is examined independently to see if it looks like
383 a well formed word.
384 */
385
386 /*
387 REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
388 dictionary word.
389 */
390 first_alphanum_index_ = first_alphanum_index(word, lengths);
391 first_alphanum_offset_ = first_alphanum_offset(word, lengths);
392 if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
393 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
394 if (safe_dict_word(word_res) > 0) {
395 return false;
396 } else {
397 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
398 }
399 } else if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
400 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
401 if (safe_dict_word(word_res) > 0) {
402 return false;
403 } else {
404 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
405 }
406 }
407 /*
408 For strings containing digits:
409 If there are no alphas OR the numeric permuter liked the word,
410 reject any non 1 conflict chs
411 Else reject all conflict chs
412 */
413 if (word_contains_non_1_digit(word, lengths)) {
414 allow_1s =
415 (alpha_count(word, lengths) == 0) || (word_res->best_choice->permuter() == NUMBER_PERM);
416
417 int16_t offset;
418 conflict = false;
419 for (i = 0, offset = 0; word[offset] != '\0';
420 offset += word_res->best_choice->unichar_lengths()[i++]) {
421 if ((!allow_1s || (word[offset] != '1')) &&
422 conflict_set_I_l_1.contains(word[offset])) {
423 if (update_map) {
424 word_res->reject_map[i].setrej_1Il_conflict();
425 }
426 conflict = true;
427 }
428 }
429 return conflict;
430 }
431 /*
432 For anything else. See if it conforms to an acceptable word type. If so,
433 treat accordingly.
434 */
435 word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
436 if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
437 first_alphanum_index_ = first_alphanum_index(word, lengths);
438 first_alphanum_offset_ = first_alphanum_offset(word, lengths);
439 if (conflict_set_I_l_1.contains(word[first_alphanum_offset_])) {
440 if (update_map) {
441 word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
442 }
443 return true;
444 } else {
445 return false;
446 }
447 } else if (word_type == AC_UPPER_CASE) {
448 return false;
449 } else {
450 if (update_map) {
451 reject_I_1_L(word_res);
452 }
453 return true;
454 }
455 }
456
first_alphanum_index(const char * word,const char * word_lengths)457 int16_t Tesseract::first_alphanum_index(const char *word, const char *word_lengths) {
458 int16_t i;
459 int16_t offset;
460
461 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
462 if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
463 unicharset.get_isdigit(word + offset, word_lengths[i])) {
464 return i;
465 }
466 }
467 return -1;
468 }
469
first_alphanum_offset(const char * word,const char * word_lengths)470 int16_t Tesseract::first_alphanum_offset(const char *word, const char *word_lengths) {
471 int16_t i;
472 int16_t offset;
473
474 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
475 if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
476 unicharset.get_isdigit(word + offset, word_lengths[i])) {
477 return offset;
478 }
479 }
480 return -1;
481 }
482
alpha_count(const char * word,const char * word_lengths)483 int16_t Tesseract::alpha_count(const char *word, const char *word_lengths) {
484 int16_t i;
485 int16_t offset;
486 int16_t count = 0;
487
488 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
489 if (unicharset.get_isalpha(word + offset, word_lengths[i])) {
490 count++;
491 }
492 }
493 return count;
494 }
495
word_contains_non_1_digit(const char * word,const char * word_lengths)496 bool Tesseract::word_contains_non_1_digit(const char *word, const char *word_lengths) {
497 int16_t i;
498 int16_t offset;
499
500 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
501 if (unicharset.get_isdigit(word + offset, word_lengths[i]) &&
502 (word_lengths[i] != 1 || word[offset] != '1')) {
503 return true;
504 }
505 }
506 return false;
507 }
508
509 /*************************************************************************
510 * dont_allow_1Il()
511 * Don't unreject LONE accepted 1Il conflict set chars
512 *************************************************************************/
dont_allow_1Il(WERD_RES * word)513 void Tesseract::dont_allow_1Il(WERD_RES *word) {
514 int word_len = word->reject_map.length();
515 const char *s = word->best_choice->unichar_string().c_str();
516 const char *lengths = word->best_choice->unichar_lengths().c_str();
517 bool accepted_1Il = false;
518
519 for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
520 if (word->reject_map[i].accepted()) {
521 if (conflict_set_I_l_1.contains(s[offset])) {
522 accepted_1Il = true;
523 } else {
524 if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
525 word->uch_set->get_isdigit(s + offset, lengths[i])) {
526 return; // >=1 non 1Il ch accepted
527 }
528 }
529 }
530 }
531 if (!accepted_1Il) {
532 return; // Nothing to worry about
533 }
534
535 for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
536 if (conflict_set_I_l_1.contains(s[offset]) && word->reject_map[i].accepted()) {
537 word->reject_map[i].setrej_postNN_1Il();
538 }
539 }
540 }
541
count_alphanums(WERD_RES * word_res)542 int16_t Tesseract::count_alphanums(WERD_RES *word_res) {
543 int count = 0;
544 const WERD_CHOICE *best_choice = word_res->best_choice;
545 for (unsigned i = 0; i < word_res->reject_map.length(); ++i) {
546 if ((word_res->reject_map[i].accepted()) &&
547 (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
548 word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
549 count++;
550 }
551 }
552 return count;
553 }
554
555 // reject all if most rejected.
reject_mostly_rejects(WERD_RES * word)556 void Tesseract::reject_mostly_rejects(WERD_RES *word) {
557 /* Reject the whole of the word if the fraction of rejects exceeds a limit */
558
559 if (static_cast<float>(word->reject_map.reject_count()) / word->reject_map.length() >=
560 rej_whole_of_mostly_reject_word_fract) {
561 word->reject_map.rej_word_mostly_rej();
562 }
563 }
564
repeated_nonalphanum_wd(WERD_RES * word,ROW * row)565 bool Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
566 if (word->best_choice->unichar_lengths().length() <= 1) {
567 return false;
568 }
569
570 if (!ok_repeated_ch_non_alphanum_wds.contains(word->best_choice->unichar_string()[0])) {
571 return false;
572 }
573
574 UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
575 for (unsigned i = 1; i < word->best_choice->length(); ++i) {
576 if (word->best_choice->unichar_id(i) != uch_id) {
577 return false;
578 }
579 }
580
581 int16_t char_quality;
582 int16_t accepted_char_quality;
583 word_char_quality(word, &char_quality, &accepted_char_quality);
584
585 if ((word->best_choice->unichar_lengths().length() == static_cast<size_t>(char_quality)) &&
586 (char_quality == accepted_char_quality)) {
587 return true;
588 } else {
589 return false;
590 }
591 }
592
safe_dict_word(const WERD_RES * werd_res)593 int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
594 const WERD_CHOICE &word = *werd_res->best_choice;
595 int dict_word_type = werd_res->tesseract->dict_word(word);
596 return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
597 }
598
599 // Note: After running this function word_res->ratings
600 // might not contain the right BLOB_CHOICE corresponding to each character
601 // in word_res->best_choice.
flip_hyphens(WERD_RES * word_res)602 void Tesseract::flip_hyphens(WERD_RES *word_res) {
603 WERD_CHOICE *best_choice = word_res->best_choice;
604 int prev_right = -9999;
605 int next_left;
606 TBOX out_box;
607 float aspect_ratio;
608
609 if (tessedit_lower_flip_hyphen <= 1) {
610 return;
611 }
612
613 auto num_blobs = word_res->rebuild_word->NumBlobs();
614 UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
615 for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) {
616 TBLOB *blob = word_res->rebuild_word->blobs[i];
617 out_box = blob->bounding_box();
618 if (i + 1 == num_blobs) {
619 next_left = 9999;
620 } else {
621 next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
622 }
623 // Don't touch small or touching blobs - it is too dangerous.
624 if ((out_box.width() > 8 * word_res->denorm.x_scale()) && (out_box.left() > prev_right) &&
625 (out_box.right() < next_left)) {
626 aspect_ratio = out_box.width() / static_cast<float>(out_box.height());
627 if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
628 if (aspect_ratio >= tessedit_upper_flip_hyphen &&
629 word_res->uch_set->contains_unichar_id(unichar_dash) &&
630 word_res->uch_set->get_enabled(unichar_dash)) {
631 /* Certain HYPHEN */
632 best_choice->set_unichar_id(unichar_dash, i);
633 if (word_res->reject_map[i].rejected()) {
634 word_res->reject_map[i].setrej_hyphen_accept();
635 }
636 }
637 if ((aspect_ratio > tessedit_lower_flip_hyphen) && word_res->reject_map[i].accepted()) {
638 // Suspected HYPHEN
639 word_res->reject_map[i].setrej_hyphen();
640 }
641 } else if (best_choice->unichar_id(i) == unichar_dash) {
642 if ((aspect_ratio >= tessedit_upper_flip_hyphen) && (word_res->reject_map[i].rejected())) {
643 word_res->reject_map[i].setrej_hyphen_accept();
644 }
645 // Certain HYPHEN
646
647 if ((aspect_ratio <= tessedit_lower_flip_hyphen) && (word_res->reject_map[i].accepted())) {
648 // Suspected HYPHEN
649 word_res->reject_map[i].setrej_hyphen();
650 }
651 }
652 }
653 prev_right = out_box.right();
654 }
655 }
656
657 // Note: After running this function word_res->ratings
658 // might not contain the right BLOB_CHOICE corresponding to each character
659 // in word_res->best_choice.
flip_0O(WERD_RES * word_res)660 void Tesseract::flip_0O(WERD_RES *word_res) {
661 WERD_CHOICE *best_choice = word_res->best_choice;
662 TBOX out_box;
663
664 if (!tessedit_flip_0O) {
665 return;
666 }
667
668 auto num_blobs = word_res->rebuild_word->NumBlobs();
669 for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) {
670 TBLOB *blob = word_res->rebuild_word->blobs[i];
671 if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
672 word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
673 out_box = blob->bounding_box();
674 if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
675 (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4)) {
676 return; // Beware words with sub/superscripts
677 }
678 }
679 }
680 UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
681 UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
682 if (unichar_0 == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_0) ||
683 unichar_O == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_O)) {
684 return; // 0 or O are not present/enabled in unicharset
685 }
686 for (unsigned i = 1; i < best_choice->length(); ++i) {
687 if (best_choice->unichar_id(i) == unichar_0 || best_choice->unichar_id(i) == unichar_O) {
688 /* A0A */
689 if ((i + 1) < best_choice->length() &&
690 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
691 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
692 best_choice->set_unichar_id(unichar_O, i);
693 }
694 /* A00A */
695 if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
696 (i + 1) < best_choice->length() &&
697 (best_choice->unichar_id(i + 1) == unichar_0 ||
698 best_choice->unichar_id(i + 1) == unichar_O) &&
699 (i + 2) < best_choice->length() &&
700 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 2))) {
701 best_choice->set_unichar_id(unichar_O, i);
702 i++;
703 }
704 /* AA0<non digit or end of word> */
705 if ((i > 1) && non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 2)) &&
706 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
707 (((i + 1) < best_choice->length() &&
708 !word_res->uch_set->get_isdigit(best_choice->unichar_id(i + 1)) &&
709 !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "l") &&
710 !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "I")) ||
711 (i == best_choice->length() - 1))) {
712 best_choice->set_unichar_id(unichar_O, i);
713 }
714 /* 9O9 */
715 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
716 (i + 1) < best_choice->length() &&
717 non_0_digit(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
718 best_choice->set_unichar_id(unichar_0, i);
719 }
720 /* 9OOO */
721 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
722 (i + 2) < best_choice->length() &&
723 (best_choice->unichar_id(i + 1) == unichar_0 ||
724 best_choice->unichar_id(i + 1) == unichar_O) &&
725 (best_choice->unichar_id(i + 2) == unichar_0 ||
726 best_choice->unichar_id(i + 2) == unichar_O)) {
727 best_choice->set_unichar_id(unichar_0, i);
728 best_choice->set_unichar_id(unichar_0, i + 1);
729 best_choice->set_unichar_id(unichar_0, i + 2);
730 i += 2;
731 }
732 /* 9OO<non upper> */
733 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
734 (i + 2) < best_choice->length() &&
735 (best_choice->unichar_id(i + 1) == unichar_0 ||
736 best_choice->unichar_id(i + 1) == unichar_O) &&
737 !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 2))) {
738 best_choice->set_unichar_id(unichar_0, i);
739 best_choice->set_unichar_id(unichar_0, i + 1);
740 i++;
741 }
742 /* 9O<non upper> */
743 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
744 (i + 1) < best_choice->length() &&
745 !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 1))) {
746 best_choice->set_unichar_id(unichar_0, i);
747 }
748 /* 9[.,]OOO.. */
749 if ((i > 1) &&
750 (word_res->uch_set->eq(best_choice->unichar_id(i - 1), ".") ||
751 word_res->uch_set->eq(best_choice->unichar_id(i - 1), ",")) &&
752 (word_res->uch_set->get_isdigit(best_choice->unichar_id(i - 2)) ||
753 best_choice->unichar_id(i - 2) == unichar_O)) {
754 if (best_choice->unichar_id(i - 2) == unichar_O) {
755 best_choice->set_unichar_id(unichar_0, i - 2);
756 }
757 while (i < best_choice->length() && (best_choice->unichar_id(i) == unichar_O ||
758 best_choice->unichar_id(i) == unichar_0)) {
759 best_choice->set_unichar_id(unichar_0, i);
760 i++;
761 }
762 i--;
763 }
764 }
765 }
766 }
767
non_O_upper(const UNICHARSET & ch_set,UNICHAR_ID unichar_id)768 bool Tesseract::non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) {
769 return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
770 }
771
non_0_digit(const UNICHARSET & ch_set,UNICHAR_ID unichar_id)772 bool Tesseract::non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) {
773 return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
774 }
775 } // namespace tesseract
776
777 #endif // def DISABLED_LEGACY_ENGINE
778