1 /******************************************************************
2  * File:        superscript.cpp
3  * Description: Correction pass to fix superscripts and subscripts.
4  * Author:      David Eger
5  *
6  * (C) Copyright 2012, Google, Inc.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #include "normalis.h"
20 #include "tesseractclass.h"
21 
22 namespace tesseract {
23 
LeadingUnicharsToChopped(WERD_RES * word,int num_unichars)24 static int LeadingUnicharsToChopped(WERD_RES *word, int num_unichars) {
25   int num_chopped = 0;
26   for (int i = 0; i < num_unichars; i++) {
27     num_chopped += word->best_state[i];
28   }
29   return num_chopped;
30 }
31 
TrailingUnicharsToChopped(WERD_RES * word,int num_unichars)32 static int TrailingUnicharsToChopped(WERD_RES *word, int num_unichars) {
33   int num_chopped = 0;
34   for (int i = 0; i < num_unichars; i++) {
35     num_chopped += word->best_state[word->best_state.size() - 1 - i];
36   }
37   return num_chopped;
38 }
39 
40 /**
41  * Given a recognized blob, see if a contiguous collection of sub-pieces
42  * (chopped blobs) starting at its left might qualify as being a subscript
43  * or superscript letter based only on y position.  Also do this for the
44  * right side.
45  */
YOutlierPieces(WERD_RES * word,int rebuilt_blob_index,int super_y_bottom,int sub_y_top,ScriptPos * leading_pos,int * num_leading_outliers,ScriptPos * trailing_pos,int * num_trailing_outliers)46 static void YOutlierPieces(WERD_RES *word, int rebuilt_blob_index, int super_y_bottom,
47                            int sub_y_top, ScriptPos *leading_pos, int *num_leading_outliers,
48                            ScriptPos *trailing_pos, int *num_trailing_outliers) {
49   ScriptPos sp_unused1, sp_unused2;
50   int unused1, unused2;
51   if (!leading_pos) {
52     leading_pos = &sp_unused1;
53   }
54   if (!num_leading_outliers) {
55     num_leading_outliers = &unused1;
56   }
57   if (!trailing_pos) {
58     trailing_pos = &sp_unused2;
59   }
60   if (!num_trailing_outliers) {
61     num_trailing_outliers = &unused2;
62   }
63 
64   *num_leading_outliers = *num_trailing_outliers = 0;
65   *leading_pos = *trailing_pos = SP_NORMAL;
66 
67   int chopped_start = LeadingUnicharsToChopped(word, rebuilt_blob_index);
68   int num_chopped_pieces = word->best_state[rebuilt_blob_index];
69   ScriptPos last_pos = SP_NORMAL;
70   int trailing_outliers = 0;
71   for (int i = 0; i < num_chopped_pieces; i++) {
72     TBOX box = word->chopped_word->blobs[chopped_start + i]->bounding_box();
73     ScriptPos pos = SP_NORMAL;
74     if (box.bottom() >= super_y_bottom) {
75       pos = SP_SUPERSCRIPT;
76     } else if (box.top() <= sub_y_top) {
77       pos = SP_SUBSCRIPT;
78     }
79     if (pos == SP_NORMAL) {
80       if (trailing_outliers == i) {
81         *num_leading_outliers = trailing_outliers;
82         *leading_pos = last_pos;
83       }
84       trailing_outliers = 0;
85     } else {
86       if (pos == last_pos) {
87         trailing_outliers++;
88       } else {
89         trailing_outliers = 1;
90       }
91     }
92     last_pos = pos;
93   }
94   *num_trailing_outliers = trailing_outliers;
95   *trailing_pos = last_pos;
96 }
97 
98 /**
99  * Attempt to split off any high (or low) bits at the ends of the word with poor
100  * certainty and recognize them separately.  If the certainty gets much better
101  * and other sanity checks pass, accept.
102  *
103  * This superscript fix is meant to be called in the second pass of recognition
104  * when we have tried once and already have a preliminary answer for word.
105  *
106  * @return Whether we modified the given word.
107  */
SubAndSuperscriptFix(WERD_RES * word)108 bool Tesseract::SubAndSuperscriptFix(WERD_RES *word) {
109   if (word->tess_failed || word->word->flag(W_REP_CHAR) || !word->best_choice) {
110     return false;
111   }
112   int num_leading, num_trailing;
113   ScriptPos sp_leading, sp_trailing;
114   float leading_certainty, trailing_certainty;
115   float avg_certainty, unlikely_threshold;
116 
117   // Calculate the number of whole suspicious characters at the edges.
118   GetSubAndSuperscriptCandidates(word, &num_leading, &sp_leading, &leading_certainty, &num_trailing,
119                                  &sp_trailing, &trailing_certainty, &avg_certainty,
120                                  &unlikely_threshold);
121 
122   const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
123   const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
124 
125   int num_blobs = word->best_choice->length();
126 
127   // Calculate the remainder (partial characters) at the edges.
128   // This accounts for us having classified the best version of
129   // a word as [speaker?'] when it was instead [speaker.^{21}]
130   // (that is we accidentally thought the 2 was attached to the period).
131   int num_remainder_leading = 0, num_remainder_trailing = 0;
132   if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
133     int super_y_bottom = kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
134     int sub_y_top = kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
135     int last_word_char = num_blobs - 1 - num_trailing;
136     float last_char_certainty = word->best_choice->certainty(last_word_char);
137     if (word->best_choice->unichar_id(last_word_char) != 0 &&
138         last_char_certainty <= unlikely_threshold) {
139       ScriptPos rpos;
140       YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top, nullptr, nullptr, &rpos,
141                      &num_remainder_trailing);
142       if (num_trailing > 0 && rpos != sp_trailing) {
143         num_remainder_trailing = 0;
144       }
145       if (num_remainder_trailing > 0 && last_char_certainty < trailing_certainty) {
146         trailing_certainty = last_char_certainty;
147       }
148     }
149     bool another_blob_available =
150         (num_remainder_trailing == 0) || num_leading + num_trailing + 1 < num_blobs;
151     int first_char_certainty = word->best_choice->certainty(num_leading);
152     if (another_blob_available && word->best_choice->unichar_id(num_leading) != 0 &&
153         first_char_certainty <= unlikely_threshold) {
154       ScriptPos lpos;
155       YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top, &lpos, &num_remainder_leading,
156                      nullptr, nullptr);
157       if (num_leading > 0 && lpos != sp_leading) {
158         num_remainder_leading = 0;
159       }
160       if (num_remainder_leading > 0 && first_char_certainty < leading_certainty) {
161         leading_certainty = first_char_certainty;
162       }
163     }
164   }
165 
166   // If nothing to do, bail now.
167   if (num_leading + num_trailing + num_remainder_leading + num_remainder_trailing == 0) {
168     return false;
169   }
170 
171   if (superscript_debug >= 1) {
172     tprintf("Candidate for superscript detection: %s (",
173             word->best_choice->unichar_string().c_str());
174     if (num_leading || num_remainder_leading) {
175       tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading, leading_pos);
176     }
177     if (num_trailing || num_remainder_trailing) {
178       tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing, trailing_pos);
179     }
180     tprintf(")\n");
181   }
182   if (superscript_debug >= 3) {
183     word->best_choice->print();
184   }
185   if (superscript_debug >= 2) {
186     tprintf(" Certainties -- Average: %.2f  Unlikely thresh: %.2f  ", avg_certainty,
187             unlikely_threshold);
188     if (num_leading) {
189       tprintf("Orig. leading (min): %.2f  ", leading_certainty);
190     }
191     if (num_trailing) {
192       tprintf("Orig. trailing (min): %.2f  ", trailing_certainty);
193     }
194     tprintf("\n");
195   }
196 
197   // We've now calculated the number of rebuilt blobs we want to carve off.
198   // However, split_word() works from TBLOBs in chopped_word, so we need to
199   // convert to those.
200   int num_chopped_leading = LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
201   int num_chopped_trailing = TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
202 
203   int retry_leading = 0;
204   int retry_trailing = 0;
205   bool is_good = false;
206   WERD_RES *revised = TrySuperscriptSplits(num_chopped_leading, leading_certainty, sp_leading,
207                                            num_chopped_trailing, trailing_certainty, sp_trailing,
208                                            word, &is_good, &retry_leading, &retry_trailing);
209   if (is_good) {
210     word->ConsumeWordResults(revised);
211   } else if (retry_leading || retry_trailing) {
212     int retry_chopped_leading = LeadingUnicharsToChopped(revised, retry_leading);
213     int retry_chopped_trailing = TrailingUnicharsToChopped(revised, retry_trailing);
214     WERD_RES *revised2 = TrySuperscriptSplits(
215         retry_chopped_leading, leading_certainty, sp_leading, retry_chopped_trailing,
216         trailing_certainty, sp_trailing, revised, &is_good, &retry_leading, &retry_trailing);
217     if (is_good) {
218       word->ConsumeWordResults(revised2);
219     }
220     delete revised2;
221   }
222   delete revised;
223   return is_good;
224 }
225 
226 /**
227  * Determine how many characters (rebuilt blobs) on each end of a given word
228  * might plausibly be superscripts so SubAndSuperscriptFix can try to
229  * re-recognize them.  Even if we find no whole blobs at either end,
230  * we will set *unlikely_threshold to a certainty that might be used to
231  * select "bad enough" outlier characters.  If *unlikely_threshold is set to 0,
232  * though, there's really no hope.
233  *
234  * @param[in]  word    The word to examine.
235  * @param[out] num_rebuilt_leading   the number of rebuilt blobs at the start
236  *                                   of the word which are all up or down and
237  *                                   seem badly classified.
238  * @param[out] leading_pos        "super" or "sub" (for debugging)
239  * @param[out] leading_certainty  the worst certainty in the leading blobs.
240  * @param[out] num_rebuilt_trailing   the number of rebuilt blobs at the end
241  *                                    of the word which are all up or down and
242  *                                    seem badly classified.
243  * @param[out] trailing_pos        "super" or "sub" (for debugging)
244  * @param[out] trailing_certainty  the worst certainty in the trailing blobs.
245  * @param[out] avg_certainty       the average certainty of "normal" blobs in
246  *                                 the word.
247  * @param[out] unlikely_threshold  the threshold (on certainty) we used to
248  *                                 select "bad enough" outlier characters.
249  */
GetSubAndSuperscriptCandidates(const WERD_RES * word,int * num_rebuilt_leading,ScriptPos * leading_pos,float * leading_certainty,int * num_rebuilt_trailing,ScriptPos * trailing_pos,float * trailing_certainty,float * avg_certainty,float * unlikely_threshold)250 void Tesseract::GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading,
251                                                ScriptPos *leading_pos, float *leading_certainty,
252                                                int *num_rebuilt_trailing, ScriptPos *trailing_pos,
253                                                float *trailing_certainty, float *avg_certainty,
254                                                float *unlikely_threshold) {
255   *avg_certainty = *unlikely_threshold = 0.0f;
256   *num_rebuilt_leading = *num_rebuilt_trailing = 0;
257   *leading_certainty = *trailing_certainty = 0.0f;
258 
259   int super_y_bottom = kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
260   int sub_y_top = kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
261 
262   // Step one: Get an average certainty for "normally placed" characters.
263 
264   // Counts here are of blobs in the rebuild_word / unichars in best_choice.
265   *leading_pos = *trailing_pos = SP_NORMAL;
266   int leading_outliers = 0;
267   int trailing_outliers = 0;
268   int num_normal = 0;
269   float normal_certainty_total = 0.0f;
270   float worst_normal_certainty = 0.0f;
271   ScriptPos last_pos = SP_NORMAL;
272   int num_blobs = word->rebuild_word->NumBlobs();
273   for (int b = 0; b < num_blobs; ++b) {
274     TBOX box = word->rebuild_word->blobs[b]->bounding_box();
275     ScriptPos pos = SP_NORMAL;
276     if (box.bottom() >= super_y_bottom) {
277       pos = SP_SUPERSCRIPT;
278     } else if (box.top() <= sub_y_top) {
279       pos = SP_SUBSCRIPT;
280     }
281     if (pos == SP_NORMAL) {
282       if (word->best_choice->unichar_id(b) != 0) {
283         float char_certainty = word->best_choice->certainty(b);
284         if (char_certainty < worst_normal_certainty) {
285           worst_normal_certainty = char_certainty;
286         }
287         num_normal++;
288         normal_certainty_total += char_certainty;
289       }
290       if (trailing_outliers == b) {
291         leading_outliers = trailing_outliers;
292         *leading_pos = last_pos;
293       }
294       trailing_outliers = 0;
295     } else {
296       if (last_pos == pos) {
297         trailing_outliers++;
298       } else {
299         trailing_outliers = 1;
300       }
301     }
302     last_pos = pos;
303   }
304   *trailing_pos = last_pos;
305   if (num_normal >= 3) { // throw out the worst as an outlier.
306     num_normal--;
307     normal_certainty_total -= worst_normal_certainty;
308   }
309   if (num_normal > 0) {
310     *avg_certainty = normal_certainty_total / num_normal;
311     *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
312   }
313   if (num_normal == 0 || (leading_outliers == 0 && trailing_outliers == 0)) {
314     return;
315   }
316 
317   // Step two: Try to split off bits of the word that are both outliers
318   //           and have much lower certainty than average
319   // Calculate num_leading and leading_certainty.
320   for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0; *num_rebuilt_leading < leading_outliers;
321        (*num_rebuilt_leading)++) {
322     float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
323     if (char_certainty > *unlikely_threshold) {
324       break;
325     }
326     if (char_certainty < *leading_certainty) {
327       *leading_certainty = char_certainty;
328     }
329   }
330 
331   // Calculate num_trailing and trailing_certainty.
332   for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
333        *num_rebuilt_trailing < trailing_outliers; (*num_rebuilt_trailing)++) {
334     int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
335     float char_certainty = word->best_choice->certainty(blob_idx);
336     if (char_certainty > *unlikely_threshold) {
337       break;
338     }
339     if (char_certainty < *trailing_certainty) {
340       *trailing_certainty = char_certainty;
341     }
342   }
343 }
344 
345 /**
346  * Try splitting off the given number of (chopped) blobs from the front and
347  * back of the given word and recognizing the pieces.
348  *
349  * @param[in]  num_chopped_leading   how many chopped blobs from the left
350  *                    end of the word to chop off and try recognizing as a
351  *                    superscript (or subscript)
352  * @param[in]  leading_certainty     the (minimum) certainty had by the
353  *                    characters in the original leading section.
354  * @param[in]  leading_pos    "super" or "sub" (for debugging)
355  * @param[in]  num_chopped_trailing  how many chopped blobs from the right
356  *                    end of the word to chop off and try recognizing as a
357  *                    superscript (or subscript)
358  * @param[in]  trailing_certainty    the (minimum) certainty had by the
359  *                    characters in the original trailing section.
360  * @param[in]  trailing_pos      "super" or "sub" (for debugging)
361  * @param[in]  word              the word to try to chop up.
362  * @param[out] is_good           do we believe our result?
363  * @param[out] retry_rebuild_leading, retry_rebuild_trailing
364  *         If non-zero, and !is_good, then the caller may have luck trying
365  *         to split the returned word with this number of (rebuilt) leading
366  *         and trailing blobs / unichars.
367  * @return A word which is the result of re-recognizing as asked.
368  */
TrySuperscriptSplits(int num_chopped_leading,float leading_certainty,ScriptPos leading_pos,int num_chopped_trailing,float trailing_certainty,ScriptPos trailing_pos,WERD_RES * word,bool * is_good,int * retry_rebuild_leading,int * retry_rebuild_trailing)369 WERD_RES *Tesseract::TrySuperscriptSplits(int num_chopped_leading, float leading_certainty,
370                                           ScriptPos leading_pos, int num_chopped_trailing,
371                                           float trailing_certainty, ScriptPos trailing_pos,
372                                           WERD_RES *word, bool *is_good, int *retry_rebuild_leading,
373                                           int *retry_rebuild_trailing) {
374   int num_chopped = word->chopped_word->NumBlobs();
375 
376   *retry_rebuild_leading = *retry_rebuild_trailing = 0;
377 
378   // Chop apart the word into up to three pieces.
379 
380   BlamerBundle *bb0 = nullptr;
381   BlamerBundle *bb1 = nullptr;
382   WERD_RES *prefix = nullptr;
383   WERD_RES *core = nullptr;
384   WERD_RES *suffix = nullptr;
385   if (num_chopped_leading > 0) {
386     prefix = new WERD_RES(*word);
387     split_word(prefix, num_chopped_leading, &core, &bb0);
388   } else {
389     core = new WERD_RES(*word);
390   }
391 
392   if (num_chopped_trailing > 0) {
393     int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
394     split_word(core, split_pt, &suffix, &bb1);
395   }
396 
397   //  Recognize the pieces in turn.
398   int saved_cp_multiplier = classify_class_pruner_multiplier;
399   int saved_im_multiplier = classify_integer_matcher_multiplier;
400   if (prefix) {
401     // Turn off Tesseract's y-position penalties for the leading superscript.
402     classify_class_pruner_multiplier.set_value(0);
403     classify_integer_matcher_multiplier.set_value(0);
404 
405     // Adjust our expectations about the baseline for this prefix.
406     if (superscript_debug >= 3) {
407       tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
408     }
409     recog_word_recursive(prefix);
410     if (superscript_debug >= 2) {
411       tprintf(" The leading bits look like %s %s\n", ScriptPosToString(leading_pos),
412               prefix->best_choice->unichar_string().c_str());
413     }
414 
415     // Restore the normal y-position penalties.
416     classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
417     classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
418   }
419 
420   if (superscript_debug >= 3) {
421     tprintf(" recognizing middle %d chopped blobs\n",
422             num_chopped - num_chopped_leading - num_chopped_trailing);
423   }
424 
425   if (suffix) {
426     // Turn off Tesseract's y-position penalties for the trailing superscript.
427     classify_class_pruner_multiplier.set_value(0);
428     classify_integer_matcher_multiplier.set_value(0);
429 
430     if (superscript_debug >= 3) {
431       tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
432     }
433     recog_word_recursive(suffix);
434     if (superscript_debug >= 2) {
435       tprintf(" The trailing bits look like %s %s\n", ScriptPosToString(trailing_pos),
436               suffix->best_choice->unichar_string().c_str());
437     }
438 
439     // Restore the normal y-position penalties.
440     classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
441     classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
442   }
443 
444   // Evaluate whether we think the results are believably better
445   // than what we already had.
446   bool good_prefix =
447       !prefix || BelievableSuperscript(superscript_debug >= 1, *prefix,
448                                        superscript_bettered_certainty * leading_certainty,
449                                        retry_rebuild_leading, nullptr);
450   bool good_suffix =
451       !suffix || BelievableSuperscript(superscript_debug >= 1, *suffix,
452                                        superscript_bettered_certainty * trailing_certainty, nullptr,
453                                        retry_rebuild_trailing);
454 
455   *is_good = good_prefix && good_suffix;
456   if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
457     // None of it is any good. Quit now.
458     delete core;
459     delete prefix;
460     delete suffix;
461     delete bb1;
462     return nullptr;
463   }
464   recog_word_recursive(core);
465 
466   // Now paste the results together into core.
467   if (suffix) {
468     suffix->SetAllScriptPositions(trailing_pos);
469     join_words(core, suffix, bb1);
470   }
471   if (prefix) {
472     prefix->SetAllScriptPositions(leading_pos);
473     join_words(prefix, core, bb0);
474     core = prefix;
475     prefix = nullptr;
476   }
477 
478   if (superscript_debug >= 1) {
479     tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
480             core->best_choice->unichar_string().c_str());
481   }
482   return core;
483 }
484 
485 /**
486  * Return whether this is believable superscript or subscript text.
487  *
488  * We insist that:
489  *   + there are no punctuation marks.
490  *   + there are no italics.
491  *   + no normal-sized character is smaller than superscript_scaledown_ratio
492  *     of what it ought to be, and
493  *   + each character is at least as certain as certainty_threshold.
494  *
495  *  @param[in]  debug  If true, spew debug output
496  *  @param[in]  word   The word whose best_choice we're evaluating
497  *  @param[in]  certainty_threshold   If any of the characters have less
498  *                    certainty than this, reject.
499  *  @param[out]  left_ok  How many left-side characters were ok?
500  *  @param[out]  right_ok  How many right-side characters were ok?
501  *  @return  Whether the complete best choice is believable as a superscript.
502  */
BelievableSuperscript(bool debug,const WERD_RES & word,float certainty_threshold,int * left_ok,int * right_ok) const503 bool Tesseract::BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold,
504                                       int *left_ok, int *right_ok) const {
505   unsigned initial_ok_run_count = 0;
506   unsigned ok_run_count = 0;
507   float worst_certainty = 0.0f;
508   const WERD_CHOICE &wc = *word.best_choice;
509 
510   const UnicityTable<FontInfo> &fontinfo_table = get_fontinfo_table();
511   for (unsigned i = 0; i < wc.length(); i++) {
512     TBLOB *blob = word.rebuild_word->blobs[i];
513     UNICHAR_ID unichar_id = wc.unichar_id(i);
514     float char_certainty = wc.certainty(i);
515     bool bad_certainty = char_certainty < certainty_threshold;
516     bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
517     bool is_italic = word.fontinfo && word.fontinfo->is_italic();
518     BLOB_CHOICE *choice = word.GetBlobChoice(i);
519     if (choice && fontinfo_table.size() > 0) {
520       // Get better information from the specific choice, if available.
521       int font_id1 = choice->fontinfo_id();
522       bool font1_is_italic = font_id1 >= 0 ? fontinfo_table.at(font_id1).is_italic() : false;
523       int font_id2 = choice->fontinfo_id2();
524       is_italic = font1_is_italic && (font_id2 < 0 || fontinfo_table.at(font_id2).is_italic());
525     }
526 
527     float height_fraction = 1.0f;
528     float char_height = blob->bounding_box().height();
529     float normal_height = char_height;
530     if (wc.unicharset()->top_bottom_useful()) {
531       int min_bot, max_bot, min_top, max_top;
532       wc.unicharset()->get_top_bottom(unichar_id, &min_bot, &max_bot, &min_top, &max_top);
533       float hi_height = max_top - max_bot;
534       float lo_height = min_top - min_bot;
535       normal_height = (hi_height + lo_height) / 2;
536       if (normal_height >= kBlnXHeight) {
537         // Only ding characters that we have decent information for because
538         // they're supposed to be normal sized, not tiny specks or dashes.
539         height_fraction = char_height / normal_height;
540       }
541     }
542     bool bad_height = height_fraction < superscript_scaledown_ratio;
543 
544     if (debug) {
545       if (is_italic) {
546         tprintf(" Rejecting: superscript is italic.\n");
547       }
548       if (is_punc) {
549         tprintf(" Rejecting: punctuation present.\n");
550       }
551       const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
552       if (bad_certainty) {
553         tprintf(
554             " Rejecting: don't believe character %s with certainty %.2f "
555             "which is less than threshold %.2f\n",
556             char_str, char_certainty, certainty_threshold);
557       }
558       if (bad_height) {
559         tprintf(
560             " Rejecting: character %s seems too small @ %.2f versus "
561             "expected %.2f\n",
562             char_str, char_height, normal_height);
563       }
564     }
565     if (bad_certainty || bad_height || is_punc || is_italic) {
566       if (ok_run_count == i) {
567         initial_ok_run_count = ok_run_count;
568       }
569       ok_run_count = 0;
570     } else {
571       ok_run_count++;
572     }
573     if (char_certainty < worst_certainty) {
574       worst_certainty = char_certainty;
575     }
576   }
577   bool all_ok = ok_run_count == wc.length();
578   if (all_ok && debug) {
579     tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
580   }
581   if (!all_ok) {
582     if (left_ok) {
583       *left_ok = initial_ok_run_count;
584     }
585     if (right_ok) {
586       *right_ok = ok_run_count;
587     }
588   }
589   return all_ok;
590 }
591 
592 } // namespace tesseract
593