1 /**********************************************************************
2  * File:        reject.cpp  (Formerly reject.c)
3  * Description: Rejection functions used in tessedit
4  * Author:      Phil Cheatle
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 #  include "config_auto.h"
22 #endif
23 
24 #include "reject.h"
25 
26 #ifdef DISABLED_LEGACY_ENGINE
27 
28 #  include "tesseractclass.h"
29 
30 namespace tesseract {
31 
safe_dict_word(const WERD_RES * werd_res)32 int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
33   const WERD_CHOICE &word = *werd_res->best_choice;
34   int dict_word_type = werd_res->tesseract->dict_word(word);
35   return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
36 }
37 } // namespace tesseract
38 
39 #else
40 
41 #  include "control.h"
42 #  include "docqual.h"
43 #  include "tesseractclass.h"
44 #  include "tessvars.h"
45 
46 #  include "helpers.h"
47 
48 #  include <algorithm> // for std::sort
49 #  include <cctype>
50 #  include <cerrno>
51 #  include <cstring>
52 #  include <vector> // for std::vector
53 
54 namespace tesseract {
55 
56 /*************************************************************************
57  * set_done()
58  *
59  * Set the done flag based on the word acceptability criteria
60  *************************************************************************/
61 
set_done(WERD_RES * word,int16_t pass)62 void Tesseract::set_done(WERD_RES *word, int16_t pass) {
63   word->done =
64       word->tess_accepted && (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr);
65   bool word_is_ambig = word->best_choice->dangerous_ambig_found();
66   bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
67                         word->best_choice->permuter() == FREQ_DAWG_PERM ||
68                         word->best_choice->permuter() == USER_DAWG_PERM;
69   if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
70       one_ell_conflict(word, false)) {
71     if (tessedit_rejection_debug) {
72       tprintf("one_ell_conflict detected\n");
73     }
74     word->done = false;
75   }
76   if (word->done &&
77       ((!word_from_dict && word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
78     if (tessedit_rejection_debug) {
79       tprintf("non-dict or ambig word detected\n");
80     }
81     word->done = false;
82   }
83   if (tessedit_rejection_debug) {
84     tprintf("set_done(): done=%d\n", word->done);
85     word->best_choice->print("");
86   }
87 }
88 
89 /*************************************************************************
90  * make_reject_map()
91  *
92  * Sets the done flag to indicate whether the resylt is acceptable.
93  *
94  * Sets a reject map for the word.
95  *************************************************************************/
make_reject_map(WERD_RES * word,ROW * row,int16_t pass)96 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
97   flip_0O(word);
98   check_debug_pt(word, -1); // For trap only
99   set_done(word, pass);     // Set acceptance
100   word->reject_map.initialise(word->best_choice->unichar_lengths().length());
101   reject_blanks(word);
102   /*
103 0: Rays original heuristic - the baseline
104 */
105   if (tessedit_reject_mode == 0) {
106     if (!word->done) {
107       reject_poor_matches(word);
108     }
109   } else if (tessedit_reject_mode == 5) {
110     /*
111 5: Reject I/1/l from words where there is no strong contextual confirmation;
112   the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
113   and the whole of any words which are very small
114 */
115     if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
116       word->reject_map.rej_word_small_xht();
117     } else {
118       one_ell_conflict(word, true);
119       /*
120   Originally the code here just used the done flag. Now I have duplicated
121   and unpacked the conditions for setting the done flag so that each
122   mechanism can be turned on or off independently. This works WITHOUT
123   affecting the done flag setting.
124 */
125       if (rej_use_tess_accepted && !word->tess_accepted) {
126         word->reject_map.rej_word_not_tess_accepted();
127       }
128 
129       if (rej_use_tess_blanks &&
130           (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) {
131         word->reject_map.rej_word_contains_blanks();
132       }
133 
134       WERD_CHOICE *best_choice = word->best_choice;
135       if (rej_use_good_perm) {
136         if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
137              best_choice->permuter() == FREQ_DAWG_PERM ||
138              best_choice->permuter() == USER_DAWG_PERM) &&
139             (!rej_use_sensible_wd ||
140              acceptable_word_string(*word->uch_set, best_choice->unichar_string().c_str(),
141                                     best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE)) {
142           // PASSED TEST
143         } else if (best_choice->permuter() == NUMBER_PERM) {
144           if (rej_alphas_in_number_perm) {
145             for (int i = 0, offset = 0; best_choice->unichar_string()[offset] != '\0';
146                  offset += best_choice->unichar_lengths()[i++]) {
147               if (word->reject_map[i].accepted() &&
148                   word->uch_set->get_isalpha(best_choice->unichar_string().c_str() + offset,
149                                              best_choice->unichar_lengths()[i])) {
150                 word->reject_map[i].setrej_bad_permuter();
151               }
152               // rej alpha
153             }
154           }
155         } else {
156           word->reject_map.rej_word_bad_permuter();
157         }
158       }
159       /* Ambig word rejection was here once !!*/
160     }
161   } else {
162     tprintf("BAD tessedit_reject_mode\n");
163     ASSERT_HOST("Fatal error encountered!" == nullptr);
164   }
165 
166   if (tessedit_image_border > -1) {
167     reject_edge_blobs(word);
168   }
169 
170   check_debug_pt(word, 10);
171   if (tessedit_rejection_debug) {
172     tprintf("Permuter Type = %d\n", word->best_choice->permuter());
173     tprintf("Certainty: %f     Rating: %f\n", word->best_choice->certainty(),
174             word->best_choice->rating());
175     tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
176   }
177 
178   flip_hyphens(word);
179   check_debug_pt(word, 20);
180 }
181 
reject_blanks(WERD_RES * word)182 void reject_blanks(WERD_RES *word) {
183   int16_t i;
184   int16_t offset;
185 
186   for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
187        offset += word->best_choice->unichar_lengths()[i], i += 1) {
188     if (word->best_choice->unichar_string()[offset] == ' ') {
189       // rej unrecognised blobs
190       word->reject_map[i].setrej_tess_failure();
191     }
192   }
193 }
194 
reject_I_1_L(WERD_RES * word)195 void Tesseract::reject_I_1_L(WERD_RES *word) {
196   int16_t i;
197   int16_t offset;
198 
199   for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
200        offset += word->best_choice->unichar_lengths()[i], i += 1) {
201     if (conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])) {
202       // rej 1Il conflict
203       word->reject_map[i].setrej_1Il_conflict();
204     }
205   }
206 }
207 
reject_poor_matches(WERD_RES * word)208 void reject_poor_matches(WERD_RES *word) {
209   float threshold = compute_reject_threshold(word->best_choice);
210   for (unsigned i = 0; i < word->best_choice->length(); ++i) {
211     if (word->best_choice->unichar_id(i) == UNICHAR_SPACE) {
212       word->reject_map[i].setrej_tess_failure();
213     } else if (word->best_choice->certainty(i) < threshold) {
214       word->reject_map[i].setrej_poor_match();
215     }
216   }
217 }
218 
219 /**********************************************************************
220  * compute_reject_threshold
221  *
222  * Set a rejection threshold for this word.
223  * Initially this is a trivial function which looks for the largest
224  * gap in the certainty value.
225  **********************************************************************/
226 
compute_reject_threshold(WERD_CHOICE * word)227 float compute_reject_threshold(WERD_CHOICE *word) {
228   float threshold;      // rejection threshold
229   float bestgap = 0.0f; // biggest gap
230   float gapstart;       // bottom of gap
231 
232   auto blob_count = word->length();
233   std::vector<float> ratings;
234   ratings.reserve(blob_count);
235   for (unsigned i = 0; i < blob_count; ++i) {
236     ratings.push_back(word->certainty(i));
237   }
238   std::sort(ratings.begin(), ratings.end());
239   gapstart = ratings[0] - 1; // all reject if none better
240   if (blob_count >= 3) {
241     for (unsigned index = 0; index < blob_count - 1; index++) {
242       if (ratings[index + 1] - ratings[index] > bestgap) {
243         bestgap = ratings[index + 1] - ratings[index];
244         // find biggest
245         gapstart = ratings[index];
246       }
247     }
248   }
249   threshold = gapstart + bestgap / 2;
250 
251   return threshold;
252 }
253 
254 /*************************************************************************
255  * reject_edge_blobs()
256  *
257  * If the word is perilously close to the edge of the image, reject those blobs
258  * in the word which are too close to the edge as they could be clipped.
259  *************************************************************************/
reject_edge_blobs(WERD_RES * word)260 void Tesseract::reject_edge_blobs(WERD_RES *word) {
261   TBOX word_box = word->word->bounding_box();
262   // Use the box_word as it is already denormed back to image coordinates.
263   int blobcount = word->box_word->length();
264 
265   if (word_box.left() < tessedit_image_border || word_box.bottom() < tessedit_image_border ||
266       word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
267       word_box.top() + tessedit_image_border > ImageHeight() - 1) {
268     ASSERT_HOST(word->reject_map.length() == blobcount);
269     for (int blobindex = 0; blobindex < blobcount; blobindex++) {
270       TBOX blob_box = word->box_word->BlobBox(blobindex);
271       if (blob_box.left() < tessedit_image_border || blob_box.bottom() < tessedit_image_border ||
272           blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
273           blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
274         word->reject_map[blobindex].setrej_edge_char();
275         // Close to edge
276       }
277     }
278   }
279 }
280 
281 /**********************************************************************
282  * one_ell_conflict()
283  *
284  * Identify words where there is a potential I/l/1 error.
285  * - A bundle of contextual heuristics!
286  **********************************************************************/
one_ell_conflict(WERD_RES * word_res,bool update_map)287 bool Tesseract::one_ell_conflict(WERD_RES *word_res, bool update_map) {
288   const char *word;
289   const char *lengths;
290   int16_t word_len; // its length
291   int16_t first_alphanum_index_;
292   int16_t first_alphanum_offset_;
293   int16_t i;
294   int16_t offset;
295   bool non_conflict_set_char; // non conf set a/n?
296   bool conflict = false;
297   bool allow_1s;
298   ACCEPTABLE_WERD_TYPE word_type;
299   bool dict_perm_type;
300   bool dict_word_ok;
301   int dict_word_type;
302 
303   word = word_res->best_choice->unichar_string().c_str();
304   lengths = word_res->best_choice->unichar_lengths().c_str();
305   word_len = strlen(lengths);
306   /*
307   If there are no occurrences of the conflict set characters then the word
308   is OK.
309 */
310   if (strpbrk(word, conflict_set_I_l_1.c_str()) == nullptr) {
311     return false;
312   }
313 
314   /*
315   There is a conflict if there are NO other (confirmed) alphanumerics apart
316   from those in the conflict set.
317 */
318 
319   for (i = 0, offset = 0, non_conflict_set_char = false; (i < word_len) && !non_conflict_set_char;
320        offset += lengths[i++]) {
321     non_conflict_set_char = (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
322                              word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
323                             !conflict_set_I_l_1.contains(word[offset]);
324   }
325   if (!non_conflict_set_char) {
326     if (update_map) {
327       reject_I_1_L(word_res);
328     }
329     return true;
330   }
331 
332   /*
333   If the word is accepted by a dawg permuter, and the first alpha character
334   is "I" or "l", check to see if the alternative is also a dawg word. If it
335   is, then there is a potential error otherwise the word is ok.
336 */
337 
338   dict_perm_type = (word_res->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
339                    (word_res->best_choice->permuter() == USER_DAWG_PERM) ||
340                    (rej_trust_doc_dawg && (word_res->best_choice->permuter() == DOC_DAWG_PERM)) ||
341                    (word_res->best_choice->permuter() == FREQ_DAWG_PERM);
342   dict_word_type = dict_word(*(word_res->best_choice));
343   dict_word_ok = (dict_word_type > 0) && (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
344 
345   if ((rej_1Il_use_dict_word && dict_word_ok) || (rej_1Il_trust_permuter_type && dict_perm_type) ||
346       (dict_perm_type && dict_word_ok)) {
347     first_alphanum_index_ = first_alphanum_index(word, lengths);
348     first_alphanum_offset_ = first_alphanum_offset(word, lengths);
349     if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
350       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
351       if (safe_dict_word(word_res) > 0) {
352         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
353         if (update_map) {
354           word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
355         }
356         return true;
357       } else {
358         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
359         return false;
360       }
361     }
362 
363     if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
364       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
365       if (safe_dict_word(word_res) > 0) {
366         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
367         if (update_map) {
368           word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
369         }
370         return true;
371       } else {
372         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
373         return false;
374       }
375     }
376     return false;
377   }
378 
379   /*
380   NEW 1Il code. The old code relied on permuter types too much. In fact,
381   tess will use TOP_CHOICE permute for good things like "palette".
382   In this code the string is examined independently to see if it looks like
383   a well formed word.
384 */
385 
386   /*
387   REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
388   dictionary word.
389 */
390   first_alphanum_index_ = first_alphanum_index(word, lengths);
391   first_alphanum_offset_ = first_alphanum_offset(word, lengths);
392   if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
393     word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
394     if (safe_dict_word(word_res) > 0) {
395       return false;
396     } else {
397       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
398     }
399   } else if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
400     word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
401     if (safe_dict_word(word_res) > 0) {
402       return false;
403     } else {
404       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
405     }
406   }
407   /*
408   For strings containing digits:
409     If there are no alphas OR the numeric permuter liked the word,
410       reject any non 1 conflict chs
411     Else reject all conflict chs
412 */
413   if (word_contains_non_1_digit(word, lengths)) {
414     allow_1s =
415         (alpha_count(word, lengths) == 0) || (word_res->best_choice->permuter() == NUMBER_PERM);
416 
417     int16_t offset;
418     conflict = false;
419     for (i = 0, offset = 0; word[offset] != '\0';
420          offset += word_res->best_choice->unichar_lengths()[i++]) {
421       if ((!allow_1s || (word[offset] != '1')) &&
422           conflict_set_I_l_1.contains(word[offset])) {
423         if (update_map) {
424           word_res->reject_map[i].setrej_1Il_conflict();
425         }
426         conflict = true;
427       }
428     }
429     return conflict;
430   }
431   /*
432   For anything else. See if it conforms to an acceptable word type. If so,
433   treat accordingly.
434 */
435   word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
436   if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
437     first_alphanum_index_ = first_alphanum_index(word, lengths);
438     first_alphanum_offset_ = first_alphanum_offset(word, lengths);
439     if (conflict_set_I_l_1.contains(word[first_alphanum_offset_])) {
440       if (update_map) {
441         word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
442       }
443       return true;
444     } else {
445       return false;
446     }
447   } else if (word_type == AC_UPPER_CASE) {
448     return false;
449   } else {
450     if (update_map) {
451       reject_I_1_L(word_res);
452     }
453     return true;
454   }
455 }
456 
first_alphanum_index(const char * word,const char * word_lengths)457 int16_t Tesseract::first_alphanum_index(const char *word, const char *word_lengths) {
458   int16_t i;
459   int16_t offset;
460 
461   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
462     if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
463         unicharset.get_isdigit(word + offset, word_lengths[i])) {
464       return i;
465     }
466   }
467   return -1;
468 }
469 
first_alphanum_offset(const char * word,const char * word_lengths)470 int16_t Tesseract::first_alphanum_offset(const char *word, const char *word_lengths) {
471   int16_t i;
472   int16_t offset;
473 
474   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
475     if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
476         unicharset.get_isdigit(word + offset, word_lengths[i])) {
477       return offset;
478     }
479   }
480   return -1;
481 }
482 
alpha_count(const char * word,const char * word_lengths)483 int16_t Tesseract::alpha_count(const char *word, const char *word_lengths) {
484   int16_t i;
485   int16_t offset;
486   int16_t count = 0;
487 
488   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
489     if (unicharset.get_isalpha(word + offset, word_lengths[i])) {
490       count++;
491     }
492   }
493   return count;
494 }
495 
word_contains_non_1_digit(const char * word,const char * word_lengths)496 bool Tesseract::word_contains_non_1_digit(const char *word, const char *word_lengths) {
497   int16_t i;
498   int16_t offset;
499 
500   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
501     if (unicharset.get_isdigit(word + offset, word_lengths[i]) &&
502         (word_lengths[i] != 1 || word[offset] != '1')) {
503       return true;
504     }
505   }
506   return false;
507 }
508 
509 /*************************************************************************
510  * dont_allow_1Il()
511  * Don't unreject LONE accepted 1Il conflict set chars
512  *************************************************************************/
dont_allow_1Il(WERD_RES * word)513 void Tesseract::dont_allow_1Il(WERD_RES *word) {
514   int word_len = word->reject_map.length();
515   const char *s = word->best_choice->unichar_string().c_str();
516   const char *lengths = word->best_choice->unichar_lengths().c_str();
517   bool accepted_1Il = false;
518 
519   for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
520     if (word->reject_map[i].accepted()) {
521       if (conflict_set_I_l_1.contains(s[offset])) {
522         accepted_1Il = true;
523       } else {
524         if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
525             word->uch_set->get_isdigit(s + offset, lengths[i])) {
526           return; // >=1 non 1Il ch accepted
527         }
528       }
529     }
530   }
531   if (!accepted_1Il) {
532     return; // Nothing to worry about
533   }
534 
535   for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
536     if (conflict_set_I_l_1.contains(s[offset]) && word->reject_map[i].accepted()) {
537       word->reject_map[i].setrej_postNN_1Il();
538     }
539   }
540 }
541 
count_alphanums(WERD_RES * word_res)542 int16_t Tesseract::count_alphanums(WERD_RES *word_res) {
543   int count = 0;
544   const WERD_CHOICE *best_choice = word_res->best_choice;
545   for (unsigned i = 0; i < word_res->reject_map.length(); ++i) {
546     if ((word_res->reject_map[i].accepted()) &&
547         (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
548          word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
549       count++;
550     }
551   }
552   return count;
553 }
554 
555 // reject all if most rejected.
reject_mostly_rejects(WERD_RES * word)556 void Tesseract::reject_mostly_rejects(WERD_RES *word) {
557   /* Reject the whole of the word if the fraction of rejects exceeds a limit */
558 
559   if (static_cast<float>(word->reject_map.reject_count()) / word->reject_map.length() >=
560       rej_whole_of_mostly_reject_word_fract) {
561     word->reject_map.rej_word_mostly_rej();
562   }
563 }
564 
repeated_nonalphanum_wd(WERD_RES * word,ROW * row)565 bool Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
566   if (word->best_choice->unichar_lengths().length() <= 1) {
567     return false;
568   }
569 
570   if (!ok_repeated_ch_non_alphanum_wds.contains(word->best_choice->unichar_string()[0])) {
571     return false;
572   }
573 
574   UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
575   for (unsigned i = 1; i < word->best_choice->length(); ++i) {
576     if (word->best_choice->unichar_id(i) != uch_id) {
577       return false;
578     }
579   }
580 
581   int16_t char_quality;
582   int16_t accepted_char_quality;
583   word_char_quality(word, &char_quality, &accepted_char_quality);
584 
585   if ((word->best_choice->unichar_lengths().length() == static_cast<size_t>(char_quality)) &&
586       (char_quality == accepted_char_quality)) {
587     return true;
588   } else {
589     return false;
590   }
591 }
592 
safe_dict_word(const WERD_RES * werd_res)593 int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
594   const WERD_CHOICE &word = *werd_res->best_choice;
595   int dict_word_type = werd_res->tesseract->dict_word(word);
596   return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
597 }
598 
599 // Note: After running this function word_res->ratings
600 // might not contain the right BLOB_CHOICE corresponding to each character
601 // in word_res->best_choice.
flip_hyphens(WERD_RES * word_res)602 void Tesseract::flip_hyphens(WERD_RES *word_res) {
603   WERD_CHOICE *best_choice = word_res->best_choice;
604   int prev_right = -9999;
605   int next_left;
606   TBOX out_box;
607   float aspect_ratio;
608 
609   if (tessedit_lower_flip_hyphen <= 1) {
610     return;
611   }
612 
613   auto num_blobs = word_res->rebuild_word->NumBlobs();
614   UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
615   for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) {
616     TBLOB *blob = word_res->rebuild_word->blobs[i];
617     out_box = blob->bounding_box();
618     if (i + 1 == num_blobs) {
619       next_left = 9999;
620     } else {
621       next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
622     }
623     // Don't touch small or touching blobs - it is too dangerous.
624     if ((out_box.width() > 8 * word_res->denorm.x_scale()) && (out_box.left() > prev_right) &&
625         (out_box.right() < next_left)) {
626       aspect_ratio = out_box.width() / static_cast<float>(out_box.height());
627       if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
628         if (aspect_ratio >= tessedit_upper_flip_hyphen &&
629             word_res->uch_set->contains_unichar_id(unichar_dash) &&
630             word_res->uch_set->get_enabled(unichar_dash)) {
631           /* Certain HYPHEN */
632           best_choice->set_unichar_id(unichar_dash, i);
633           if (word_res->reject_map[i].rejected()) {
634             word_res->reject_map[i].setrej_hyphen_accept();
635           }
636         }
637         if ((aspect_ratio > tessedit_lower_flip_hyphen) && word_res->reject_map[i].accepted()) {
638           // Suspected HYPHEN
639           word_res->reject_map[i].setrej_hyphen();
640         }
641       } else if (best_choice->unichar_id(i) == unichar_dash) {
642         if ((aspect_ratio >= tessedit_upper_flip_hyphen) && (word_res->reject_map[i].rejected())) {
643           word_res->reject_map[i].setrej_hyphen_accept();
644         }
645         // Certain HYPHEN
646 
647         if ((aspect_ratio <= tessedit_lower_flip_hyphen) && (word_res->reject_map[i].accepted())) {
648           // Suspected HYPHEN
649           word_res->reject_map[i].setrej_hyphen();
650         }
651       }
652     }
653     prev_right = out_box.right();
654   }
655 }
656 
657 // Note: After running this function word_res->ratings
658 // might not contain the right BLOB_CHOICE corresponding to each character
659 // in word_res->best_choice.
flip_0O(WERD_RES * word_res)660 void Tesseract::flip_0O(WERD_RES *word_res) {
661   WERD_CHOICE *best_choice = word_res->best_choice;
662   TBOX out_box;
663 
664   if (!tessedit_flip_0O) {
665     return;
666   }
667 
668   auto num_blobs = word_res->rebuild_word->NumBlobs();
669   for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) {
670     TBLOB *blob = word_res->rebuild_word->blobs[i];
671     if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
672         word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
673       out_box = blob->bounding_box();
674       if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
675           (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4)) {
676         return; // Beware words with sub/superscripts
677       }
678     }
679   }
680   UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
681   UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
682   if (unichar_0 == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_0) ||
683       unichar_O == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_O)) {
684     return; // 0 or O are not present/enabled in unicharset
685   }
686   for (unsigned i = 1; i < best_choice->length(); ++i) {
687     if (best_choice->unichar_id(i) == unichar_0 || best_choice->unichar_id(i) == unichar_O) {
688       /* A0A */
689       if ((i + 1) < best_choice->length() &&
690           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
691           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
692         best_choice->set_unichar_id(unichar_O, i);
693       }
694       /* A00A */
695       if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
696           (i + 1) < best_choice->length() &&
697           (best_choice->unichar_id(i + 1) == unichar_0 ||
698            best_choice->unichar_id(i + 1) == unichar_O) &&
699           (i + 2) < best_choice->length() &&
700           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 2))) {
701         best_choice->set_unichar_id(unichar_O, i);
702         i++;
703       }
704       /* AA0<non digit or end of word> */
705       if ((i > 1) && non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 2)) &&
706           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
707           (((i + 1) < best_choice->length() &&
708             !word_res->uch_set->get_isdigit(best_choice->unichar_id(i + 1)) &&
709             !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "l") &&
710             !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "I")) ||
711            (i == best_choice->length() - 1))) {
712         best_choice->set_unichar_id(unichar_O, i);
713       }
714       /* 9O9 */
715       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
716           (i + 1) < best_choice->length() &&
717           non_0_digit(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
718         best_choice->set_unichar_id(unichar_0, i);
719       }
720       /* 9OOO */
721       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
722           (i + 2) < best_choice->length() &&
723           (best_choice->unichar_id(i + 1) == unichar_0 ||
724            best_choice->unichar_id(i + 1) == unichar_O) &&
725           (best_choice->unichar_id(i + 2) == unichar_0 ||
726            best_choice->unichar_id(i + 2) == unichar_O)) {
727         best_choice->set_unichar_id(unichar_0, i);
728         best_choice->set_unichar_id(unichar_0, i + 1);
729         best_choice->set_unichar_id(unichar_0, i + 2);
730         i += 2;
731       }
732       /* 9OO<non upper> */
733       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
734           (i + 2) < best_choice->length() &&
735           (best_choice->unichar_id(i + 1) == unichar_0 ||
736            best_choice->unichar_id(i + 1) == unichar_O) &&
737           !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 2))) {
738         best_choice->set_unichar_id(unichar_0, i);
739         best_choice->set_unichar_id(unichar_0, i + 1);
740         i++;
741       }
742       /* 9O<non upper> */
743       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
744           (i + 1) < best_choice->length() &&
745           !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 1))) {
746         best_choice->set_unichar_id(unichar_0, i);
747       }
748       /* 9[.,]OOO.. */
749       if ((i > 1) &&
750           (word_res->uch_set->eq(best_choice->unichar_id(i - 1), ".") ||
751            word_res->uch_set->eq(best_choice->unichar_id(i - 1), ",")) &&
752           (word_res->uch_set->get_isdigit(best_choice->unichar_id(i - 2)) ||
753            best_choice->unichar_id(i - 2) == unichar_O)) {
754         if (best_choice->unichar_id(i - 2) == unichar_O) {
755           best_choice->set_unichar_id(unichar_0, i - 2);
756         }
757         while (i < best_choice->length() && (best_choice->unichar_id(i) == unichar_O ||
758                                              best_choice->unichar_id(i) == unichar_0)) {
759           best_choice->set_unichar_id(unichar_0, i);
760           i++;
761         }
762         i--;
763       }
764     }
765   }
766 }
767 
non_O_upper(const UNICHARSET & ch_set,UNICHAR_ID unichar_id)768 bool Tesseract::non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) {
769   return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
770 }
771 
non_0_digit(const UNICHARSET & ch_set,UNICHAR_ID unichar_id)772 bool Tesseract::non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) {
773   return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
774 }
775 } // namespace tesseract
776 
777 #endif // def DISABLED_LEGACY_ENGINE
778