1 /******************************************************************
2 * File: superscript.cpp
3 * Description: Correction pass to fix superscripts and subscripts.
4 * Author: David Eger
5 *
6 * (C) Copyright 2012, Google, Inc.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 #include "normalis.h"
20 #include "tesseractclass.h"
21
22 namespace tesseract {
23
LeadingUnicharsToChopped(WERD_RES * word,int num_unichars)24 static int LeadingUnicharsToChopped(WERD_RES *word, int num_unichars) {
25 int num_chopped = 0;
26 for (int i = 0; i < num_unichars; i++) {
27 num_chopped += word->best_state[i];
28 }
29 return num_chopped;
30 }
31
TrailingUnicharsToChopped(WERD_RES * word,int num_unichars)32 static int TrailingUnicharsToChopped(WERD_RES *word, int num_unichars) {
33 int num_chopped = 0;
34 for (int i = 0; i < num_unichars; i++) {
35 num_chopped += word->best_state[word->best_state.size() - 1 - i];
36 }
37 return num_chopped;
38 }
39
40 /**
41 * Given a recognized blob, see if a contiguous collection of sub-pieces
42 * (chopped blobs) starting at its left might qualify as being a subscript
43 * or superscript letter based only on y position. Also do this for the
44 * right side.
45 */
YOutlierPieces(WERD_RES * word,int rebuilt_blob_index,int super_y_bottom,int sub_y_top,ScriptPos * leading_pos,int * num_leading_outliers,ScriptPos * trailing_pos,int * num_trailing_outliers)46 static void YOutlierPieces(WERD_RES *word, int rebuilt_blob_index, int super_y_bottom,
47 int sub_y_top, ScriptPos *leading_pos, int *num_leading_outliers,
48 ScriptPos *trailing_pos, int *num_trailing_outliers) {
49 ScriptPos sp_unused1, sp_unused2;
50 int unused1, unused2;
51 if (!leading_pos) {
52 leading_pos = &sp_unused1;
53 }
54 if (!num_leading_outliers) {
55 num_leading_outliers = &unused1;
56 }
57 if (!trailing_pos) {
58 trailing_pos = &sp_unused2;
59 }
60 if (!num_trailing_outliers) {
61 num_trailing_outliers = &unused2;
62 }
63
64 *num_leading_outliers = *num_trailing_outliers = 0;
65 *leading_pos = *trailing_pos = SP_NORMAL;
66
67 int chopped_start = LeadingUnicharsToChopped(word, rebuilt_blob_index);
68 int num_chopped_pieces = word->best_state[rebuilt_blob_index];
69 ScriptPos last_pos = SP_NORMAL;
70 int trailing_outliers = 0;
71 for (int i = 0; i < num_chopped_pieces; i++) {
72 TBOX box = word->chopped_word->blobs[chopped_start + i]->bounding_box();
73 ScriptPos pos = SP_NORMAL;
74 if (box.bottom() >= super_y_bottom) {
75 pos = SP_SUPERSCRIPT;
76 } else if (box.top() <= sub_y_top) {
77 pos = SP_SUBSCRIPT;
78 }
79 if (pos == SP_NORMAL) {
80 if (trailing_outliers == i) {
81 *num_leading_outliers = trailing_outliers;
82 *leading_pos = last_pos;
83 }
84 trailing_outliers = 0;
85 } else {
86 if (pos == last_pos) {
87 trailing_outliers++;
88 } else {
89 trailing_outliers = 1;
90 }
91 }
92 last_pos = pos;
93 }
94 *num_trailing_outliers = trailing_outliers;
95 *trailing_pos = last_pos;
96 }
97
98 /**
99 * Attempt to split off any high (or low) bits at the ends of the word with poor
100 * certainty and recognize them separately. If the certainty gets much better
101 * and other sanity checks pass, accept.
102 *
103 * This superscript fix is meant to be called in the second pass of recognition
104 * when we have tried once and already have a preliminary answer for word.
105 *
106 * @return Whether we modified the given word.
107 */
SubAndSuperscriptFix(WERD_RES * word)108 bool Tesseract::SubAndSuperscriptFix(WERD_RES *word) {
109 if (word->tess_failed || word->word->flag(W_REP_CHAR) || !word->best_choice) {
110 return false;
111 }
112 int num_leading, num_trailing;
113 ScriptPos sp_leading, sp_trailing;
114 float leading_certainty, trailing_certainty;
115 float avg_certainty, unlikely_threshold;
116
117 // Calculate the number of whole suspicious characters at the edges.
118 GetSubAndSuperscriptCandidates(word, &num_leading, &sp_leading, &leading_certainty, &num_trailing,
119 &sp_trailing, &trailing_certainty, &avg_certainty,
120 &unlikely_threshold);
121
122 const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
123 const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
124
125 int num_blobs = word->best_choice->length();
126
127 // Calculate the remainder (partial characters) at the edges.
128 // This accounts for us having classified the best version of
129 // a word as [speaker?'] when it was instead [speaker.^{21}]
130 // (that is we accidentally thought the 2 was attached to the period).
131 int num_remainder_leading = 0, num_remainder_trailing = 0;
132 if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
133 int super_y_bottom = kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
134 int sub_y_top = kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
135 int last_word_char = num_blobs - 1 - num_trailing;
136 float last_char_certainty = word->best_choice->certainty(last_word_char);
137 if (word->best_choice->unichar_id(last_word_char) != 0 &&
138 last_char_certainty <= unlikely_threshold) {
139 ScriptPos rpos;
140 YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top, nullptr, nullptr, &rpos,
141 &num_remainder_trailing);
142 if (num_trailing > 0 && rpos != sp_trailing) {
143 num_remainder_trailing = 0;
144 }
145 if (num_remainder_trailing > 0 && last_char_certainty < trailing_certainty) {
146 trailing_certainty = last_char_certainty;
147 }
148 }
149 bool another_blob_available =
150 (num_remainder_trailing == 0) || num_leading + num_trailing + 1 < num_blobs;
151 int first_char_certainty = word->best_choice->certainty(num_leading);
152 if (another_blob_available && word->best_choice->unichar_id(num_leading) != 0 &&
153 first_char_certainty <= unlikely_threshold) {
154 ScriptPos lpos;
155 YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top, &lpos, &num_remainder_leading,
156 nullptr, nullptr);
157 if (num_leading > 0 && lpos != sp_leading) {
158 num_remainder_leading = 0;
159 }
160 if (num_remainder_leading > 0 && first_char_certainty < leading_certainty) {
161 leading_certainty = first_char_certainty;
162 }
163 }
164 }
165
166 // If nothing to do, bail now.
167 if (num_leading + num_trailing + num_remainder_leading + num_remainder_trailing == 0) {
168 return false;
169 }
170
171 if (superscript_debug >= 1) {
172 tprintf("Candidate for superscript detection: %s (",
173 word->best_choice->unichar_string().c_str());
174 if (num_leading || num_remainder_leading) {
175 tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading, leading_pos);
176 }
177 if (num_trailing || num_remainder_trailing) {
178 tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing, trailing_pos);
179 }
180 tprintf(")\n");
181 }
182 if (superscript_debug >= 3) {
183 word->best_choice->print();
184 }
185 if (superscript_debug >= 2) {
186 tprintf(" Certainties -- Average: %.2f Unlikely thresh: %.2f ", avg_certainty,
187 unlikely_threshold);
188 if (num_leading) {
189 tprintf("Orig. leading (min): %.2f ", leading_certainty);
190 }
191 if (num_trailing) {
192 tprintf("Orig. trailing (min): %.2f ", trailing_certainty);
193 }
194 tprintf("\n");
195 }
196
197 // We've now calculated the number of rebuilt blobs we want to carve off.
198 // However, split_word() works from TBLOBs in chopped_word, so we need to
199 // convert to those.
200 int num_chopped_leading = LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
201 int num_chopped_trailing = TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
202
203 int retry_leading = 0;
204 int retry_trailing = 0;
205 bool is_good = false;
206 WERD_RES *revised = TrySuperscriptSplits(num_chopped_leading, leading_certainty, sp_leading,
207 num_chopped_trailing, trailing_certainty, sp_trailing,
208 word, &is_good, &retry_leading, &retry_trailing);
209 if (is_good) {
210 word->ConsumeWordResults(revised);
211 } else if (retry_leading || retry_trailing) {
212 int retry_chopped_leading = LeadingUnicharsToChopped(revised, retry_leading);
213 int retry_chopped_trailing = TrailingUnicharsToChopped(revised, retry_trailing);
214 WERD_RES *revised2 = TrySuperscriptSplits(
215 retry_chopped_leading, leading_certainty, sp_leading, retry_chopped_trailing,
216 trailing_certainty, sp_trailing, revised, &is_good, &retry_leading, &retry_trailing);
217 if (is_good) {
218 word->ConsumeWordResults(revised2);
219 }
220 delete revised2;
221 }
222 delete revised;
223 return is_good;
224 }
225
226 /**
227 * Determine how many characters (rebuilt blobs) on each end of a given word
228 * might plausibly be superscripts so SubAndSuperscriptFix can try to
229 * re-recognize them. Even if we find no whole blobs at either end,
230 * we will set *unlikely_threshold to a certainty that might be used to
231 * select "bad enough" outlier characters. If *unlikely_threshold is set to 0,
232 * though, there's really no hope.
233 *
234 * @param[in] word The word to examine.
235 * @param[out] num_rebuilt_leading the number of rebuilt blobs at the start
236 * of the word which are all up or down and
237 * seem badly classified.
238 * @param[out] leading_pos "super" or "sub" (for debugging)
239 * @param[out] leading_certainty the worst certainty in the leading blobs.
240 * @param[out] num_rebuilt_trailing the number of rebuilt blobs at the end
241 * of the word which are all up or down and
242 * seem badly classified.
243 * @param[out] trailing_pos "super" or "sub" (for debugging)
244 * @param[out] trailing_certainty the worst certainty in the trailing blobs.
245 * @param[out] avg_certainty the average certainty of "normal" blobs in
246 * the word.
247 * @param[out] unlikely_threshold the threshold (on certainty) we used to
248 * select "bad enough" outlier characters.
249 */
GetSubAndSuperscriptCandidates(const WERD_RES * word,int * num_rebuilt_leading,ScriptPos * leading_pos,float * leading_certainty,int * num_rebuilt_trailing,ScriptPos * trailing_pos,float * trailing_certainty,float * avg_certainty,float * unlikely_threshold)250 void Tesseract::GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading,
251 ScriptPos *leading_pos, float *leading_certainty,
252 int *num_rebuilt_trailing, ScriptPos *trailing_pos,
253 float *trailing_certainty, float *avg_certainty,
254 float *unlikely_threshold) {
255 *avg_certainty = *unlikely_threshold = 0.0f;
256 *num_rebuilt_leading = *num_rebuilt_trailing = 0;
257 *leading_certainty = *trailing_certainty = 0.0f;
258
259 int super_y_bottom = kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
260 int sub_y_top = kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
261
262 // Step one: Get an average certainty for "normally placed" characters.
263
264 // Counts here are of blobs in the rebuild_word / unichars in best_choice.
265 *leading_pos = *trailing_pos = SP_NORMAL;
266 int leading_outliers = 0;
267 int trailing_outliers = 0;
268 int num_normal = 0;
269 float normal_certainty_total = 0.0f;
270 float worst_normal_certainty = 0.0f;
271 ScriptPos last_pos = SP_NORMAL;
272 int num_blobs = word->rebuild_word->NumBlobs();
273 for (int b = 0; b < num_blobs; ++b) {
274 TBOX box = word->rebuild_word->blobs[b]->bounding_box();
275 ScriptPos pos = SP_NORMAL;
276 if (box.bottom() >= super_y_bottom) {
277 pos = SP_SUPERSCRIPT;
278 } else if (box.top() <= sub_y_top) {
279 pos = SP_SUBSCRIPT;
280 }
281 if (pos == SP_NORMAL) {
282 if (word->best_choice->unichar_id(b) != 0) {
283 float char_certainty = word->best_choice->certainty(b);
284 if (char_certainty < worst_normal_certainty) {
285 worst_normal_certainty = char_certainty;
286 }
287 num_normal++;
288 normal_certainty_total += char_certainty;
289 }
290 if (trailing_outliers == b) {
291 leading_outliers = trailing_outliers;
292 *leading_pos = last_pos;
293 }
294 trailing_outliers = 0;
295 } else {
296 if (last_pos == pos) {
297 trailing_outliers++;
298 } else {
299 trailing_outliers = 1;
300 }
301 }
302 last_pos = pos;
303 }
304 *trailing_pos = last_pos;
305 if (num_normal >= 3) { // throw out the worst as an outlier.
306 num_normal--;
307 normal_certainty_total -= worst_normal_certainty;
308 }
309 if (num_normal > 0) {
310 *avg_certainty = normal_certainty_total / num_normal;
311 *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
312 }
313 if (num_normal == 0 || (leading_outliers == 0 && trailing_outliers == 0)) {
314 return;
315 }
316
317 // Step two: Try to split off bits of the word that are both outliers
318 // and have much lower certainty than average
319 // Calculate num_leading and leading_certainty.
320 for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0; *num_rebuilt_leading < leading_outliers;
321 (*num_rebuilt_leading)++) {
322 float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
323 if (char_certainty > *unlikely_threshold) {
324 break;
325 }
326 if (char_certainty < *leading_certainty) {
327 *leading_certainty = char_certainty;
328 }
329 }
330
331 // Calculate num_trailing and trailing_certainty.
332 for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
333 *num_rebuilt_trailing < trailing_outliers; (*num_rebuilt_trailing)++) {
334 int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
335 float char_certainty = word->best_choice->certainty(blob_idx);
336 if (char_certainty > *unlikely_threshold) {
337 break;
338 }
339 if (char_certainty < *trailing_certainty) {
340 *trailing_certainty = char_certainty;
341 }
342 }
343 }
344
345 /**
346 * Try splitting off the given number of (chopped) blobs from the front and
347 * back of the given word and recognizing the pieces.
348 *
349 * @param[in] num_chopped_leading how many chopped blobs from the left
350 * end of the word to chop off and try recognizing as a
351 * superscript (or subscript)
352 * @param[in] leading_certainty the (minimum) certainty had by the
353 * characters in the original leading section.
354 * @param[in] leading_pos "super" or "sub" (for debugging)
355 * @param[in] num_chopped_trailing how many chopped blobs from the right
356 * end of the word to chop off and try recognizing as a
357 * superscript (or subscript)
358 * @param[in] trailing_certainty the (minimum) certainty had by the
359 * characters in the original trailing section.
360 * @param[in] trailing_pos "super" or "sub" (for debugging)
361 * @param[in] word the word to try to chop up.
362 * @param[out] is_good do we believe our result?
363 * @param[out] retry_rebuild_leading, retry_rebuild_trailing
364 * If non-zero, and !is_good, then the caller may have luck trying
365 * to split the returned word with this number of (rebuilt) leading
366 * and trailing blobs / unichars.
367 * @return A word which is the result of re-recognizing as asked.
368 */
TrySuperscriptSplits(int num_chopped_leading,float leading_certainty,ScriptPos leading_pos,int num_chopped_trailing,float trailing_certainty,ScriptPos trailing_pos,WERD_RES * word,bool * is_good,int * retry_rebuild_leading,int * retry_rebuild_trailing)369 WERD_RES *Tesseract::TrySuperscriptSplits(int num_chopped_leading, float leading_certainty,
370 ScriptPos leading_pos, int num_chopped_trailing,
371 float trailing_certainty, ScriptPos trailing_pos,
372 WERD_RES *word, bool *is_good, int *retry_rebuild_leading,
373 int *retry_rebuild_trailing) {
374 int num_chopped = word->chopped_word->NumBlobs();
375
376 *retry_rebuild_leading = *retry_rebuild_trailing = 0;
377
378 // Chop apart the word into up to three pieces.
379
380 BlamerBundle *bb0 = nullptr;
381 BlamerBundle *bb1 = nullptr;
382 WERD_RES *prefix = nullptr;
383 WERD_RES *core = nullptr;
384 WERD_RES *suffix = nullptr;
385 if (num_chopped_leading > 0) {
386 prefix = new WERD_RES(*word);
387 split_word(prefix, num_chopped_leading, &core, &bb0);
388 } else {
389 core = new WERD_RES(*word);
390 }
391
392 if (num_chopped_trailing > 0) {
393 int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
394 split_word(core, split_pt, &suffix, &bb1);
395 }
396
397 // Recognize the pieces in turn.
398 int saved_cp_multiplier = classify_class_pruner_multiplier;
399 int saved_im_multiplier = classify_integer_matcher_multiplier;
400 if (prefix) {
401 // Turn off Tesseract's y-position penalties for the leading superscript.
402 classify_class_pruner_multiplier.set_value(0);
403 classify_integer_matcher_multiplier.set_value(0);
404
405 // Adjust our expectations about the baseline for this prefix.
406 if (superscript_debug >= 3) {
407 tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
408 }
409 recog_word_recursive(prefix);
410 if (superscript_debug >= 2) {
411 tprintf(" The leading bits look like %s %s\n", ScriptPosToString(leading_pos),
412 prefix->best_choice->unichar_string().c_str());
413 }
414
415 // Restore the normal y-position penalties.
416 classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
417 classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
418 }
419
420 if (superscript_debug >= 3) {
421 tprintf(" recognizing middle %d chopped blobs\n",
422 num_chopped - num_chopped_leading - num_chopped_trailing);
423 }
424
425 if (suffix) {
426 // Turn off Tesseract's y-position penalties for the trailing superscript.
427 classify_class_pruner_multiplier.set_value(0);
428 classify_integer_matcher_multiplier.set_value(0);
429
430 if (superscript_debug >= 3) {
431 tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
432 }
433 recog_word_recursive(suffix);
434 if (superscript_debug >= 2) {
435 tprintf(" The trailing bits look like %s %s\n", ScriptPosToString(trailing_pos),
436 suffix->best_choice->unichar_string().c_str());
437 }
438
439 // Restore the normal y-position penalties.
440 classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
441 classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
442 }
443
444 // Evaluate whether we think the results are believably better
445 // than what we already had.
446 bool good_prefix =
447 !prefix || BelievableSuperscript(superscript_debug >= 1, *prefix,
448 superscript_bettered_certainty * leading_certainty,
449 retry_rebuild_leading, nullptr);
450 bool good_suffix =
451 !suffix || BelievableSuperscript(superscript_debug >= 1, *suffix,
452 superscript_bettered_certainty * trailing_certainty, nullptr,
453 retry_rebuild_trailing);
454
455 *is_good = good_prefix && good_suffix;
456 if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
457 // None of it is any good. Quit now.
458 delete core;
459 delete prefix;
460 delete suffix;
461 delete bb1;
462 return nullptr;
463 }
464 recog_word_recursive(core);
465
466 // Now paste the results together into core.
467 if (suffix) {
468 suffix->SetAllScriptPositions(trailing_pos);
469 join_words(core, suffix, bb1);
470 }
471 if (prefix) {
472 prefix->SetAllScriptPositions(leading_pos);
473 join_words(prefix, core, bb0);
474 core = prefix;
475 prefix = nullptr;
476 }
477
478 if (superscript_debug >= 1) {
479 tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
480 core->best_choice->unichar_string().c_str());
481 }
482 return core;
483 }
484
485 /**
486 * Return whether this is believable superscript or subscript text.
487 *
488 * We insist that:
489 * + there are no punctuation marks.
490 * + there are no italics.
491 * + no normal-sized character is smaller than superscript_scaledown_ratio
492 * of what it ought to be, and
493 * + each character is at least as certain as certainty_threshold.
494 *
495 * @param[in] debug If true, spew debug output
496 * @param[in] word The word whose best_choice we're evaluating
497 * @param[in] certainty_threshold If any of the characters have less
498 * certainty than this, reject.
499 * @param[out] left_ok How many left-side characters were ok?
500 * @param[out] right_ok How many right-side characters were ok?
501 * @return Whether the complete best choice is believable as a superscript.
502 */
BelievableSuperscript(bool debug,const WERD_RES & word,float certainty_threshold,int * left_ok,int * right_ok) const503 bool Tesseract::BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold,
504 int *left_ok, int *right_ok) const {
505 unsigned initial_ok_run_count = 0;
506 unsigned ok_run_count = 0;
507 float worst_certainty = 0.0f;
508 const WERD_CHOICE &wc = *word.best_choice;
509
510 const UnicityTable<FontInfo> &fontinfo_table = get_fontinfo_table();
511 for (unsigned i = 0; i < wc.length(); i++) {
512 TBLOB *blob = word.rebuild_word->blobs[i];
513 UNICHAR_ID unichar_id = wc.unichar_id(i);
514 float char_certainty = wc.certainty(i);
515 bool bad_certainty = char_certainty < certainty_threshold;
516 bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
517 bool is_italic = word.fontinfo && word.fontinfo->is_italic();
518 BLOB_CHOICE *choice = word.GetBlobChoice(i);
519 if (choice && fontinfo_table.size() > 0) {
520 // Get better information from the specific choice, if available.
521 int font_id1 = choice->fontinfo_id();
522 bool font1_is_italic = font_id1 >= 0 ? fontinfo_table.at(font_id1).is_italic() : false;
523 int font_id2 = choice->fontinfo_id2();
524 is_italic = font1_is_italic && (font_id2 < 0 || fontinfo_table.at(font_id2).is_italic());
525 }
526
527 float height_fraction = 1.0f;
528 float char_height = blob->bounding_box().height();
529 float normal_height = char_height;
530 if (wc.unicharset()->top_bottom_useful()) {
531 int min_bot, max_bot, min_top, max_top;
532 wc.unicharset()->get_top_bottom(unichar_id, &min_bot, &max_bot, &min_top, &max_top);
533 float hi_height = max_top - max_bot;
534 float lo_height = min_top - min_bot;
535 normal_height = (hi_height + lo_height) / 2;
536 if (normal_height >= kBlnXHeight) {
537 // Only ding characters that we have decent information for because
538 // they're supposed to be normal sized, not tiny specks or dashes.
539 height_fraction = char_height / normal_height;
540 }
541 }
542 bool bad_height = height_fraction < superscript_scaledown_ratio;
543
544 if (debug) {
545 if (is_italic) {
546 tprintf(" Rejecting: superscript is italic.\n");
547 }
548 if (is_punc) {
549 tprintf(" Rejecting: punctuation present.\n");
550 }
551 const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
552 if (bad_certainty) {
553 tprintf(
554 " Rejecting: don't believe character %s with certainty %.2f "
555 "which is less than threshold %.2f\n",
556 char_str, char_certainty, certainty_threshold);
557 }
558 if (bad_height) {
559 tprintf(
560 " Rejecting: character %s seems too small @ %.2f versus "
561 "expected %.2f\n",
562 char_str, char_height, normal_height);
563 }
564 }
565 if (bad_certainty || bad_height || is_punc || is_italic) {
566 if (ok_run_count == i) {
567 initial_ok_run_count = ok_run_count;
568 }
569 ok_run_count = 0;
570 } else {
571 ok_run_count++;
572 }
573 if (char_certainty < worst_certainty) {
574 worst_certainty = char_certainty;
575 }
576 }
577 bool all_ok = ok_run_count == wc.length();
578 if (all_ok && debug) {
579 tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
580 }
581 if (!all_ok) {
582 if (left_ok) {
583 *left_ok = initial_ok_run_count;
584 }
585 if (right_ok) {
586 *right_ok = ok_run_count;
587 }
588 }
589 return all_ok;
590 }
591
592 } // namespace tesseract
593