1 /**********************************************************************
2  * File:        paragraphs.cpp
3  * Description: Paragraph detection for tesseract.
4  * Author:      David Eger
5  *
6  * (C) Copyright 2011, Google Inc.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #include "paragraphs.h"
20 
21 #include "helpers.h"             // for UpdateRange, ClipToRange
22 #include "host.h"                // for NearlyEqual
23 #include "mutableiterator.h"     // for MutableIterator
24 #include "ocrblock.h"            // for BLOCK
25 #include "ocrpara.h"             // for ParagraphModel, PARA, PARA_IT, PARA...
26 #include "ocrrow.h"              // for ROW
27 #include "pageres.h"             // for PAGE_RES_IT, WERD_RES, ROW_RES, BLO...
28 #include "paragraphs_internal.h" // for RowScratchRegisters, SetOfModels
29 #include "pdblock.h"             // for PDBLK
30 #include "polyblk.h"             // for POLY_BLOCK
31 #include "ratngs.h"              // for WERD_CHOICE
32 #include "rect.h"                // for TBOX
33 #include "statistc.h"            // for STATS
34 #include "tprintf.h"             // for tprintf
35 #include "unicharset.h"          // for UNICHARSET
36 #include "werd.h"                // for WERD, W_REP_CHAR
37 
38 #include <tesseract/pageiterator.h> // for PageIterator
39 #include <tesseract/publictypes.h>  // for JUSTIFICATION_LEFT, JUSTIFICATION_R...
40 #include <tesseract/unichar.h>      // for UNICHAR, UNICHAR_ID
41 
42 #include <algorithm> // for max
43 #include <cctype>    // for isspace
44 #include <cmath>     // for abs
45 #include <cstdio>    // for snprintf
46 #include <cstdlib>   // for abs
47 #include <cstring>   // for strchr, strlen
48 #include <memory>    // for unique_ptr
49 
50 static const char *const kRLE = "\u202A"; // Right-to-Left Embedding
51 static const char *const kPDF = "\u202C"; // Pop Directional Formatting
52 
53 namespace tesseract {
54 
55 // Special "weak" ParagraphModels.
56 const ParagraphModel *kCrownLeft =
57     reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD111F));
58 const ParagraphModel *kCrownRight =
59     reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD888F));
60 
61 // Do the text and geometry of two rows support a paragraph break between them?
62 static bool LikelyParagraphStart(const RowScratchRegisters &before,
63                                  const RowScratchRegisters &after,
64                                  tesseract::ParagraphJustification j);
65 
66 // Given the width of a typical space between words, what is the threshold
67 // by which by which we think left and right alignments for paragraphs
68 // can vary and still be aligned.
Epsilon(int space_pix)69 static int Epsilon(int space_pix) {
70   return space_pix * 4 / 5;
71 }
72 
AcceptableRowArgs(int debug_level,int min_num_rows,const char * function_name,const std::vector<RowScratchRegisters> * rows,int row_start,int row_end)73 static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name,
74                               const std::vector<RowScratchRegisters> *rows, int row_start,
75                               int row_end) {
76   if (row_start < 0 || static_cast<size_t>(row_end) > rows->size() || row_start > row_end) {
77     tprintf("Invalid arguments rows[%d, %d) while rows is of size %zu.\n", row_start, row_end,
78             rows->size());
79     return false;
80   }
81   if (row_end - row_start < min_num_rows) {
82     if (debug_level > 1) {
83       tprintf("# Too few rows[%d, %d) for %s.\n", row_start, row_end, function_name);
84     }
85     return false;
86   }
87   return true;
88 }
89 
90 // =============================== Debug Code ================================
91 
92 // Given a row-major matrix of unicode text and a column separator, print
93 // a formatted table.  For ASCII, we get good column alignment.
PrintTable(const std::vector<std::vector<std::string>> & rows,const char * colsep)94 static void PrintTable(const std::vector<std::vector<std::string>> &rows, const char *colsep) {
95   std::vector<int> max_col_widths;
96   for (const auto &row : rows) {
97     auto num_columns = row.size();
98     for (size_t c = 0; c < num_columns; c++) {
99       int num_unicodes = 0;
100       for (char i : row[c]) {
101         if ((i & 0xC0) != 0x80) {
102           num_unicodes++;
103         }
104       }
105       if (c >= max_col_widths.size()) {
106         max_col_widths.push_back(num_unicodes);
107       } else {
108         if (num_unicodes > max_col_widths[c]) {
109           max_col_widths[c] = num_unicodes;
110         }
111       }
112     }
113   }
114 
115   std::vector<std::string> col_width_patterns;
116   col_width_patterns.reserve(max_col_widths.size());
117   for (int max_col_width : max_col_widths) {
118     col_width_patterns.push_back(std::string("%-") + std::to_string(max_col_width) + "s");
119   }
120 
121   for (const auto &row : rows) {
122     for (unsigned c = 0; c < row.size(); c++) {
123       if (c > 0) {
124         tprintf("%s", colsep);
125       }
126       tprintf(col_width_patterns[c].c_str(), row[c].c_str());
127     }
128     tprintf("\n");
129   }
130 }
131 
RtlEmbed(const std::string & word,bool rtlify)132 static std::string RtlEmbed(const std::string &word, bool rtlify) {
133   if (rtlify) {
134     return std::string(kRLE) + word + std::string(kPDF);
135   }
136   return word;
137 }
138 
139 // Print the current thoughts of the paragraph detector.
PrintDetectorState(const ParagraphTheory & theory,const std::vector<RowScratchRegisters> & rows)140 static void PrintDetectorState(const ParagraphTheory &theory,
141                                const std::vector<RowScratchRegisters> &rows) {
142   std::vector<std::vector<std::string>> output;
143   output.emplace_back();
144   output.back().push_back("#row");
145   output.back().push_back("space");
146   output.back().push_back("..");
147   output.back().push_back("lword[widthSEL]");
148   output.back().push_back("rword[widthSEL]");
149   RowScratchRegisters::AppendDebugHeaderFields(output.back());
150   output.back().push_back("text");
151 
152   for (unsigned i = 0; i < rows.size(); i++) {
153     output.emplace_back();
154     std::vector<std::string> &row = output.back();
155     const RowInfo &ri = *rows[i].ri_;
156     row.push_back(std::to_string(i));
157     row.push_back(std::to_string(ri.average_interword_space));
158     row.emplace_back(ri.has_leaders ? ".." : " ");
159     row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) + "[" + std::to_string(ri.lword_box.width()) +
160                   (ri.lword_likely_starts_idea ? "S" : "s") +
161                   (ri.lword_likely_ends_idea ? "E" : "e") +
162                   (ri.lword_indicates_list_item ? "L" : "l") + "]");
163     row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) + "[" + std::to_string(ri.rword_box.width()) +
164                   (ri.rword_likely_starts_idea ? "S" : "s") +
165                   (ri.rword_likely_ends_idea ? "E" : "e") +
166                   (ri.rword_indicates_list_item ? "L" : "l") + "]");
167     rows[i].AppendDebugInfo(theory, row);
168     row.push_back(RtlEmbed(ri.text, !ri.ltr));
169   }
170   PrintTable(output, " ");
171 
172   tprintf("Active Paragraph Models:\n");
173   unsigned m = 0;
174   for (const auto &model : theory.models()) {
175     tprintf(" %d: %s\n", ++m, model->ToString().c_str());
176   }
177 }
178 
DebugDump(bool should_print,const char * phase,const ParagraphTheory & theory,const std::vector<RowScratchRegisters> & rows)179 static void DebugDump(bool should_print, const char *phase, const ParagraphTheory &theory,
180                       const std::vector<RowScratchRegisters> &rows) {
181   if (!should_print) {
182     return;
183   }
184   tprintf("# %s\n", phase);
185   PrintDetectorState(theory, rows);
186 }
187 
188 // Print out the text for rows[row_start, row_end)
PrintRowRange(const std::vector<RowScratchRegisters> & rows,int row_start,int row_end)189 static void PrintRowRange(const std::vector<RowScratchRegisters> &rows, int row_start,
190                           int row_end) {
191   tprintf("======================================\n");
192   for (int row = row_start; row < row_end; row++) {
193     tprintf("%s\n", rows[row].ri_->text.c_str());
194   }
195   tprintf("======================================\n");
196 }
197 
198 // ============= Brain Dead Language Model (ASCII Version) ===================
199 
IsLatinLetter(int ch)200 static bool IsLatinLetter(int ch) {
201   return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
202 }
203 
IsDigitLike(int ch)204 static bool IsDigitLike(int ch) {
205   return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';
206 }
207 
IsOpeningPunct(int ch)208 static bool IsOpeningPunct(int ch) {
209   return strchr("'\"({[", ch) != nullptr;
210 }
211 
IsTerminalPunct(int ch)212 static bool IsTerminalPunct(int ch) {
213   return strchr(":'\".?!]})", ch) != nullptr;
214 }
215 
216 // Return a pointer after consuming as much text as qualifies as roman numeral.
SkipChars(const char * str,const char * toskip)217 static const char *SkipChars(const char *str, const char *toskip) {
218   while (*str != '\0' && strchr(toskip, *str)) {
219     str++;
220   }
221   return str;
222 }
223 
SkipChars(const char * str,bool (* skip)(int))224 static const char *SkipChars(const char *str, bool (*skip)(int)) {
225   while (*str != '\0' && skip(*str)) {
226     str++;
227   }
228   return str;
229 }
230 
SkipOne(const char * str,const char * toskip)231 static const char *SkipOne(const char *str, const char *toskip) {
232   if (*str != '\0' && strchr(toskip, *str)) {
233     return str + 1;
234   }
235   return str;
236 }
237 
238 // Return whether it is very likely that this is a numeral marker that could
239 // start a list item.  Some examples include:
240 //   A   I   iii.   VI   (2)   3.5.   [C-4]
LikelyListNumeral(const std::string & word)241 static bool LikelyListNumeral(const std::string &word) {
242   const char *kRomans = "ivxlmdIVXLMD";
243   const char *kDigits = "012345789";
244   const char *kOpen = "[{(";
245   const char *kSep = ":;-.,";
246   const char *kClose = "]})";
247 
248   int num_segments = 0;
249   const char *pos = word.c_str();
250   while (*pos != '\0' && num_segments < 3) {
251     // skip up to two open parens.
252     const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
253     const char *numeral_end = SkipChars(numeral_start, kRomans);
254     if (numeral_end != numeral_start) {
255       // Got Roman Numeral. Great.
256     } else {
257       numeral_end = SkipChars(numeral_start, kDigits);
258       if (numeral_end == numeral_start) {
259         // If there's a single latin letter, we can use that.
260         numeral_end = SkipChars(numeral_start, IsLatinLetter);
261         if (numeral_end - numeral_start != 1) {
262           break;
263         }
264       }
265     }
266     // We got some sort of numeral.
267     num_segments++;
268     // Skip any trailing parens or punctuation.
269     pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
270     if (pos == numeral_end) {
271       break;
272     }
273   }
274   return *pos == '\0';
275 }
276 
LikelyListMark(const std::string & word)277 static bool LikelyListMark(const std::string &word) {
278   const char *kListMarks = "0Oo*.,+.";
279   return word.size() == 1 && strchr(kListMarks, word[0]) != nullptr;
280 }
281 
AsciiLikelyListItem(const std::string & word)282 bool AsciiLikelyListItem(const std::string &word) {
283   return LikelyListMark(word) || LikelyListNumeral(word);
284 }
285 
286 // ========== Brain Dead Language Model (Tesseract Version) ================
287 
288 // Return the first Unicode Codepoint from werd[pos].
UnicodeFor(const UNICHARSET * u,const WERD_CHOICE * werd,unsigned pos)289 static int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, unsigned pos) {
290   if (!u || !werd || pos > werd->length()) {
291     return 0;
292   }
293   return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
294 }
295 
296 // A useful helper class for finding the first j >= i so that word[j]
297 // does not have given character type.
298 class UnicodeSpanSkipper {
299 public:
UnicodeSpanSkipper(const UNICHARSET * unicharset,const WERD_CHOICE * word)300   UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
301       : u_(unicharset), word_(word), wordlen_(word->length()) {
302   }
303 
304   // Given an input position, return the first position >= pos not punc.
305   unsigned SkipPunc(unsigned pos);
306   // Given an input position, return the first position >= pos not digit.
307   unsigned SkipDigits(unsigned pos);
308   // Given an input position, return the first position >= pos not roman.
309   unsigned SkipRomans(unsigned pos);
310   // Given an input position, return the first position >= pos not alpha.
311   unsigned SkipAlpha(unsigned pos);
312 
313 private:
314   const UNICHARSET *u_;
315   const WERD_CHOICE *word_;
316   unsigned wordlen_;
317 };
318 
SkipPunc(unsigned pos)319 unsigned UnicodeSpanSkipper::SkipPunc(unsigned pos) {
320   while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) {
321     pos++;
322   }
323   return pos;
324 }
325 
SkipDigits(unsigned pos)326 unsigned UnicodeSpanSkipper::SkipDigits(unsigned pos) {
327   while (pos < wordlen_ &&
328          (u_->get_isdigit(word_->unichar_id(pos)) || IsDigitLike(UnicodeFor(u_, word_, pos)))) {
329     pos++;
330   }
331   return pos;
332 }
333 
SkipRomans(unsigned pos)334 unsigned UnicodeSpanSkipper::SkipRomans(unsigned pos) {
335   const char *kRomans = "ivxlmdIVXLMD";
336   while (pos < wordlen_) {
337     int ch = UnicodeFor(u_, word_, pos);
338     if (ch >= 0xF0 || strchr(kRomans, ch) == nullptr) {
339       break;
340     }
341     pos++;
342   }
343   return pos;
344 }
345 
SkipAlpha(unsigned pos)346 unsigned UnicodeSpanSkipper::SkipAlpha(unsigned pos) {
347   while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) {
348     pos++;
349   }
350   return pos;
351 }
352 
LikelyListMarkUnicode(int ch)353 static bool LikelyListMarkUnicode(int ch) {
354   if (ch < 0x80) {
355     std::string single_ch;
356     single_ch += ch;
357     return LikelyListMark(single_ch);
358   }
359   switch (ch) {
360     // TODO(eger) expand this list of unicodes as needed.
361     case 0x00B0: // degree sign
362     case 0x2022: // bullet
363     case 0x25E6: // white bullet
364     case 0x00B7: // middle dot
365     case 0x25A1: // white square
366     case 0x25A0: // black square
367     case 0x25AA: // black small square
368     case 0x2B1D: // black very small square
369     case 0x25BA: // black right-pointing pointer
370     case 0x25CF: // black circle
371     case 0x25CB: // white circle
372       return true;
373     default:
374       break; // fall through
375   }
376   return false;
377 }
378 
379 // Return whether it is very likely that this is a numeral marker that could
380 // start a list item.  Some examples include:
381 //   A   I   iii.   VI   (2)   3.5.   [C-4]
UniLikelyListItem(const UNICHARSET * u,const WERD_CHOICE * werd)382 static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
383   if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0))) {
384     return true;
385   }
386 
387   UnicodeSpanSkipper m(u, werd);
388   int num_segments = 0;
389   unsigned pos = 0;
390   while (pos < werd->length() && num_segments < 3) {
391     auto numeral_start = m.SkipPunc(pos);
392     if (numeral_start > pos + 1) {
393       break;
394     }
395     auto numeral_end = m.SkipRomans(numeral_start);
396     if (numeral_end == numeral_start) {
397       numeral_end = m.SkipDigits(numeral_start);
398       if (numeral_end == numeral_start) {
399         // If there's a single latin letter, we can use that.
400         numeral_end = m.SkipAlpha(numeral_start);
401         if (numeral_end - numeral_start != 1) {
402           break;
403         }
404       }
405     }
406     // We got some sort of numeral.
407     num_segments++;
408     // Skip any trailing punctuation.
409     pos = m.SkipPunc(numeral_end);
410     if (pos == numeral_end) {
411       break;
412     }
413   }
414   return pos == werd->length();
415 }
416 
417 template<class T>
push_back_new(std::vector<T> & vector,const T & data)418 void push_back_new(std::vector<T> &vector, const T &data) {
419   if (std::find(vector.begin(), vector.end(), data) == vector.end()) {
420     vector.push_back(data);
421   }
422 }
423 
424 // ========= Brain Dead Language Model (combined entry points) ================
425 
426 // Given the leftmost word of a line either as a Tesseract unicharset + werd
427 // or a utf8 string, set the following attributes for it:
428 //   is_list -      this word might be a list number or bullet.
429 //   starts_idea -  this word is likely to start a sentence.
430 //   ends_idea -    this word is likely to end a sentence.
LeftWordAttributes(const UNICHARSET * unicharset,const WERD_CHOICE * werd,const std::string & utf8,bool * is_list,bool * starts_idea,bool * ends_idea)431 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
432                         bool *is_list, bool *starts_idea, bool *ends_idea) {
433   *is_list = false;
434   *starts_idea = false;
435   *ends_idea = false;
436   if (utf8.empty() || (werd != nullptr && werd->empty())) { // Empty
437     *ends_idea = true;
438     return;
439   }
440 
441   if (unicharset && werd) { // We have a proper werd and unicharset so use it.
442     if (UniLikelyListItem(unicharset, werd)) {
443       *is_list = true;
444       *starts_idea = true;
445       *ends_idea = true;
446     }
447     if (unicharset->get_isupper(werd->unichar_id(0))) {
448       *starts_idea = true;
449     }
450     if (unicharset->get_ispunctuation(werd->unichar_id(0))) {
451       *starts_idea = true;
452       *ends_idea = true;
453     }
454   } else { // Assume utf8 is mostly ASCII
455     if (AsciiLikelyListItem(utf8)) {
456       *is_list = true;
457       *starts_idea = true;
458     }
459     int start_letter = utf8[0];
460     if (IsOpeningPunct(start_letter)) {
461       *starts_idea = true;
462     }
463     if (IsTerminalPunct(start_letter)) {
464       *ends_idea = true;
465     }
466     if (start_letter >= 'A' && start_letter <= 'Z') {
467       *starts_idea = true;
468     }
469   }
470 }
471 
472 // Given the rightmost word of a line either as a Tesseract unicharset + werd
473 // or a utf8 string, set the following attributes for it:
474 //   is_list -      this word might be a list number or bullet.
475 //   starts_idea -  this word is likely to start a sentence.
476 //   ends_idea -    this word is likely to end a sentence.
RightWordAttributes(const UNICHARSET * unicharset,const WERD_CHOICE * werd,const std::string & utf8,bool * is_list,bool * starts_idea,bool * ends_idea)477 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
478                          bool *is_list, bool *starts_idea, bool *ends_idea) {
479   *is_list = false;
480   *starts_idea = false;
481   *ends_idea = false;
482   if (utf8.empty() || (werd != nullptr && werd->empty())) { // Empty
483     *ends_idea = true;
484     return;
485   }
486 
487   if (unicharset && werd) { // We have a proper werd and unicharset so use it.
488     if (UniLikelyListItem(unicharset, werd)) {
489       *is_list = true;
490       *starts_idea = true;
491     }
492     UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);
493     if (unicharset->get_ispunctuation(last_letter)) {
494       *ends_idea = true;
495     }
496   } else { // Assume utf8 is mostly ASCII
497     if (AsciiLikelyListItem(utf8)) {
498       *is_list = true;
499       *starts_idea = true;
500     }
501     int last_letter = utf8[utf8.size() - 1];
502     if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
503       *ends_idea = true;
504     }
505   }
506 }
507 
508 // =============== Implementation of RowScratchRegisters =====================
509 /* static */
AppendDebugHeaderFields(std::vector<std::string> & header)510 void RowScratchRegisters::AppendDebugHeaderFields(std::vector<std::string> &header) {
511   header.emplace_back("[lmarg,lind;rind,rmarg]");
512   header.emplace_back("model");
513 }
514 
AppendDebugInfo(const ParagraphTheory & theory,std::vector<std::string> & dbg) const515 void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
516                                           std::vector<std::string> &dbg) const {
517   char s[30];
518   snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]", lmargin_, lindent_, rindent_, rmargin_);
519   dbg.emplace_back(s);
520   std::string model_string;
521   model_string += static_cast<char>(GetLineType());
522   model_string += ":";
523 
524   int model_numbers = 0;
525   for (const auto &hypothese : hypotheses_) {
526     if (hypothese.model == nullptr) {
527       continue;
528     }
529     if (model_numbers > 0) {
530       model_string += ",";
531     }
532     if (StrongModel(hypothese.model)) {
533       model_string += std::to_string(1 + theory.IndexOf(hypothese.model));
534     } else if (hypothese.model == kCrownLeft) {
535       model_string += "CrL";
536     } else if (hypothese.model == kCrownRight) {
537       model_string += "CrR";
538     }
539     model_numbers++;
540   }
541   if (model_numbers == 0) {
542     model_string += "0";
543   }
544 
545   dbg.push_back(model_string);
546 }
547 
Init(const RowInfo & row)548 void RowScratchRegisters::Init(const RowInfo &row) {
549   ri_ = &row;
550   lmargin_ = 0;
551   lindent_ = row.pix_ldistance;
552   rmargin_ = 0;
553   rindent_ = row.pix_rdistance;
554 }
555 
GetLineType() const556 LineType RowScratchRegisters::GetLineType() const {
557   if (hypotheses_.empty()) {
558     return LT_UNKNOWN;
559   }
560   bool has_start = false;
561   bool has_body = false;
562   for (const auto &hypothese : hypotheses_) {
563     switch (hypothese.ty) {
564       case LT_START:
565         has_start = true;
566         break;
567       case LT_BODY:
568         has_body = true;
569         break;
570       default:
571         tprintf("Encountered bad value in hypothesis list: %c\n", hypothese.ty);
572         break;
573     }
574   }
575   if (has_start && has_body) {
576     return LT_MULTIPLE;
577   }
578   return has_start ? LT_START : LT_BODY;
579 }
580 
GetLineType(const ParagraphModel * model) const581 LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const {
582   if (hypotheses_.empty()) {
583     return LT_UNKNOWN;
584   }
585   bool has_start = false;
586   bool has_body = false;
587   for (const auto &hypothese : hypotheses_) {
588     if (hypothese.model != model) {
589       continue;
590     }
591     switch (hypothese.ty) {
592       case LT_START:
593         has_start = true;
594         break;
595       case LT_BODY:
596         has_body = true;
597         break;
598       default:
599         tprintf("Encountered bad value in hypothesis list: %c\n", hypothese.ty);
600         break;
601     }
602   }
603   if (has_start && has_body) {
604     return LT_MULTIPLE;
605   }
606   return has_start ? LT_START : LT_BODY;
607 }
608 
SetStartLine()609 void RowScratchRegisters::SetStartLine() {
610   LineType current_lt = GetLineType();
611   if (current_lt != LT_UNKNOWN && current_lt != LT_START) {
612     tprintf("Trying to set a line to be START when it's already BODY.\n");
613   }
614   if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) {
615     push_back_new(hypotheses_, LineHypothesis(LT_START, nullptr));
616   }
617 }
618 
SetBodyLine()619 void RowScratchRegisters::SetBodyLine() {
620   LineType current_lt = GetLineType();
621   if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) {
622     tprintf("Trying to set a line to be BODY when it's already START.\n");
623   }
624   if (current_lt == LT_UNKNOWN || current_lt == LT_START) {
625     push_back_new(hypotheses_, LineHypothesis(LT_BODY, nullptr));
626   }
627 }
628 
AddStartLine(const ParagraphModel * model)629 void RowScratchRegisters::AddStartLine(const ParagraphModel *model) {
630   push_back_new(hypotheses_, LineHypothesis(LT_START, model));
631   auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_START, nullptr));
632   if (found != hypotheses_.end()) {
633     hypotheses_.erase(found);
634   }
635 }
636 
AddBodyLine(const ParagraphModel * model)637 void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) {
638   push_back_new(hypotheses_, LineHypothesis(LT_BODY, model));
639   auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_BODY, nullptr));
640   if (found != hypotheses_.end()) {
641     hypotheses_.erase(found);
642   }
643 }
644 
StartHypotheses(SetOfModels * models) const645 void RowScratchRegisters::StartHypotheses(SetOfModels *models) const {
646   for (const auto &hypothese : hypotheses_) {
647     if (hypothese.ty == LT_START && StrongModel(hypothese.model)) {
648       push_back_new(*models, hypothese.model);
649     }
650   }
651 }
652 
StrongHypotheses(SetOfModels * models) const653 void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const {
654   for (const auto &hypothese : hypotheses_) {
655     if (StrongModel(hypothese.model)) {
656       push_back_new(*models, hypothese.model);
657     }
658   }
659 }
660 
NonNullHypotheses(SetOfModels * models) const661 void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const {
662   for (const auto &hypothese : hypotheses_) {
663     if (hypothese.model != nullptr) {
664       push_back_new(*models, hypothese.model);
665     }
666   }
667 }
668 
UniqueStartHypothesis() const669 const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const {
670   if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START) {
671     return nullptr;
672   }
673   return hypotheses_[0].model;
674 }
675 
UniqueBodyHypothesis() const676 const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const {
677   if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY) {
678     return nullptr;
679   }
680   return hypotheses_[0].model;
681 }
682 
683 // Discard any hypotheses whose model is not in the given list.
DiscardNonMatchingHypotheses(const SetOfModels & models)684 void RowScratchRegisters::DiscardNonMatchingHypotheses(const SetOfModels &models) {
685   if (models.empty()) {
686     return;
687   }
688   for (int h = hypotheses_.size() - 1; h >= 0; h--) {
689     if (!contains(models, hypotheses_[h].model)) {
690       hypotheses_.erase(hypotheses_.begin() + h);
691     }
692   }
693 }
694 
695 // ============ Geometry based Paragraph Detection Algorithm =================
696 
697 struct Cluster {
Clustertesseract::Cluster698   Cluster() : center(0), count(0) {}
Clustertesseract::Cluster699   Cluster(int cen, int num) : center(cen), count(num) {}
700 
701   int center; // The center of the cluster.
702   int count;  // The number of entries within the cluster.
703 };
704 
705 class SimpleClusterer {
706 public:
SimpleClusterer(int max_cluster_width)707   explicit SimpleClusterer(int max_cluster_width) : max_cluster_width_(max_cluster_width) {}
Add(int value)708   void Add(int value) {
709     values_.push_back(value);
710   }
size() const711   size_t size() const {
712     return values_.size();
713   }
714   void GetClusters(std::vector<Cluster> *clusters);
715 
716 private:
717   int max_cluster_width_;
718   std::vector<int> values_;
719 };
720 
721 // Return the index of the cluster closest to value.
ClosestCluster(const std::vector<Cluster> & clusters,int value)722 static int ClosestCluster(const std::vector<Cluster> &clusters, int value) {
723   unsigned best_index = 0;
724   for (unsigned i = 0; i < clusters.size(); i++) {
725     if (abs(value - clusters[i].center) < abs(value - clusters[best_index].center)) {
726       best_index = i;
727     }
728   }
729   return best_index;
730 }
731 
GetClusters(std::vector<Cluster> * clusters)732 void SimpleClusterer::GetClusters(std::vector<Cluster> *clusters) {
733   clusters->clear();
734   std::sort(values_.begin(), values_.end());
735   for (unsigned i = 0; i < values_.size();) {
736     int orig_i = i;
737     int lo = values_[i];
738     int hi = lo;
739     while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {
740       hi = values_[i];
741     }
742     clusters->push_back(Cluster((hi + lo) / 2, i - orig_i));
743   }
744 }
745 
746 // Calculate left- and right-indent tab stop values seen in
747 // rows[row_start, row_end) given a tolerance of tolerance.
CalculateTabStops(std::vector<RowScratchRegisters> * rows,int row_start,int row_end,int tolerance,std::vector<Cluster> * left_tabs,std::vector<Cluster> * right_tabs)748 static void CalculateTabStops(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
749                               int tolerance, std::vector<Cluster> *left_tabs,
750                               std::vector<Cluster> *right_tabs) {
751   if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end)) {
752     return;
753   }
754   // First pass: toss all left and right indents into clusterers.
755   SimpleClusterer initial_lefts(tolerance);
756   SimpleClusterer initial_rights(tolerance);
757   std::vector<Cluster> initial_left_tabs;
758   std::vector<Cluster> initial_right_tabs;
759   for (int i = row_start; i < row_end; i++) {
760     initial_lefts.Add((*rows)[i].lindent_);
761     initial_rights.Add((*rows)[i].rindent_);
762   }
763   initial_lefts.GetClusters(&initial_left_tabs);
764   initial_rights.GetClusters(&initial_right_tabs);
765 
766   // Second pass: cluster only lines that are not "stray"
767   //   An example of a stray line is a page number -- a line whose start
768   //   and end tab-stops are far outside the typical start and end tab-stops
769   //   for the block.
770   //   Put another way, we only cluster data from lines whose start or end
771   //   tab stop is frequent.
772   SimpleClusterer lefts(tolerance);
773   SimpleClusterer rights(tolerance);
774 
775   // Outlier elimination.  We might want to switch this to test outlier-ness
776   // based on how strange a position an outlier is in instead of or in addition
777   // to how rare it is.  These outliers get re-added if we end up having too
778   // few tab stops, to work with, however.
779   int infrequent_enough_to_ignore = 0;
780   if (row_end - row_start >= 8) {
781     infrequent_enough_to_ignore = 1;
782   }
783   if (row_end - row_start >= 20) {
784     infrequent_enough_to_ignore = 2;
785   }
786 
787   for (int i = row_start; i < row_end; i++) {
788     int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
789     int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
790     if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
791         initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
792       lefts.Add((*rows)[i].lindent_);
793       rights.Add((*rows)[i].rindent_);
794     }
795   }
796   lefts.GetClusters(left_tabs);
797   rights.GetClusters(right_tabs);
798 
799   if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
800       (right_tabs->size() == 1 && left_tabs->size() >= 4)) {
801     // One side is really ragged, and the other only has one tab stop,
802     // so those "insignificant outliers" are probably important, actually.
803     // This often happens on a page of an index.  Add back in the ones
804     // we omitted in the first pass.
805     for (int i = row_start; i < row_end; i++) {
806       int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
807       int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
808       if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
809             initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
810         lefts.Add((*rows)[i].lindent_);
811         rights.Add((*rows)[i].rindent_);
812       }
813     }
814   }
815   lefts.GetClusters(left_tabs);
816   rights.GetClusters(right_tabs);
817 
818   // If one side is almost a two-indent aligned side, and the other clearly
819   // isn't, try to prune out the least frequent tab stop from that side.
820   if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
821     int to_prune = -1;
822     for (int i = left_tabs->size() - 1; i >= 0; i--) {
823       if (to_prune < 0 || (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
824         to_prune = i;
825       }
826     }
827     if (to_prune >= 0 && (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
828       left_tabs->erase(left_tabs->begin() + to_prune);
829     }
830   }
831   if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
832     int to_prune = -1;
833     for (int i = right_tabs->size() - 1; i >= 0; i--) {
834       if (to_prune < 0 || (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
835         to_prune = i;
836       }
837     }
838     if (to_prune >= 0 && (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
839       right_tabs->erase(right_tabs->begin() + to_prune);
840     }
841   }
842 }
843 
844 // Given a paragraph model mark rows[row_start, row_end) as said model
845 // start or body lines.
846 //
847 // Case 1: model->first_indent_ != model->body_indent_
848 //   Differentiating the paragraph start lines from the paragraph body lines in
849 //   this case is easy, we just see how far each line is indented.
850 //
851 // Case 2: model->first_indent_ == model->body_indent_
852 //   Here, we find end-of-paragraph lines by looking for "short lines."
853 //   What constitutes a "short line" changes depending on whether the text
854 //   ragged-right[left] or fully justified (aligned left and right).
855 //
856 //   Case 2a: Ragged Right (or Left) text.  (eop_threshold == 0)
857 //     We have a new paragraph it the first word would have at the end
858 //     of the previous line.
859 //
860 //   Case 2b: Fully Justified.  (eop_threshold > 0)
861 //     We mark a line as short (end of paragraph) if the offside indent
862 //     is greater than eop_threshold.
MarkRowsWithModel(std::vector<RowScratchRegisters> * rows,int row_start,int row_end,const ParagraphModel * model,bool ltr,int eop_threshold)863 static void MarkRowsWithModel(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
864                               const ParagraphModel *model, bool ltr, int eop_threshold) {
865   if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
866     return;
867   }
868   for (int row = row_start; row < row_end; row++) {
869     bool valid_first = ValidFirstLine(rows, row, model);
870     bool valid_body = ValidBodyLine(rows, row, model);
871     if (valid_first && !valid_body) {
872       (*rows)[row].AddStartLine(model);
873     } else if (valid_body && !valid_first) {
874       (*rows)[row].AddBodyLine(model);
875     } else if (valid_body && valid_first) {
876       bool after_eop = (row == row_start);
877       if (row > row_start) {
878         if (eop_threshold > 0) {
879           if (model->justification() == JUSTIFICATION_LEFT) {
880             after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
881           } else {
882             after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
883           }
884         } else {
885           after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row], model->justification());
886         }
887       }
888       if (after_eop) {
889         (*rows)[row].AddStartLine(model);
890       } else {
891         (*rows)[row].AddBodyLine(model);
892       }
893     } else {
894       // Do nothing. Stray row.
895     }
896   }
897 }
898 
899 // GeometricClassifierState holds all of the information we'll use while
900 // trying to determine a paragraph model for the text lines in a block of
901 // text:
902 //   + the rows under consideration [row_start, row_end)
903 //   + the common left- and right-indent tab stops
904 //   + does the block start out left-to-right or right-to-left
905 // Further, this struct holds the data we amass for the (single) ParagraphModel
906 // we'll assign to the text lines (assuming we get that far).
907 struct GeometricClassifierState {
GeometricClassifierStatetesseract::GeometricClassifierState908   GeometricClassifierState(int dbg_level, std::vector<RowScratchRegisters> *r, int r_start,
909                            int r_end)
910       : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) {
911     tolerance = InterwordSpace(*r, r_start, r_end);
912     CalculateTabStops(r, r_start, r_end, tolerance, &left_tabs, &right_tabs);
913     if (debug_level >= 3) {
914       tprintf(
915           "Geometry: TabStop cluster tolerance = %d; "
916           "%zu left tabs; %zu right tabs\n",
917           tolerance, left_tabs.size(), right_tabs.size());
918     }
919     ltr = (*r)[r_start].ri_->ltr;
920   }
921 
AssumeLeftJustificationtesseract::GeometricClassifierState922   void AssumeLeftJustification() {
923     just = tesseract::JUSTIFICATION_LEFT;
924     margin = (*rows)[row_start].lmargin_;
925   }
926 
AssumeRightJustificationtesseract::GeometricClassifierState927   void AssumeRightJustification() {
928     just = tesseract::JUSTIFICATION_RIGHT;
929     margin = (*rows)[row_start].rmargin_;
930   }
931 
932   // Align tabs are the tab stops the text is aligned to.
AlignTabstesseract::GeometricClassifierState933   const std::vector<Cluster> &AlignTabs() const {
934     if (just == tesseract::JUSTIFICATION_RIGHT) {
935       return right_tabs;
936     }
937     return left_tabs;
938   }
939 
940   // Offside tabs are the tab stops opposite the tabs used to align the text.
941   //
942   // Note that for a left-to-right text which is aligned to the right such as
943   //     this function comment, the offside tabs are the horizontal tab stops
944   //                 marking the beginning of ("Note", "this" and "marking").
OffsideTabstesseract::GeometricClassifierState945   const std::vector<Cluster> &OffsideTabs() const {
946     if (just == tesseract::JUSTIFICATION_RIGHT) {
947       return left_tabs;
948     }
949     return right_tabs;
950   }
951 
952   // Return whether the i'th row extends from the leftmost left tab stop
953   // to the right most right tab stop.
IsFullRowtesseract::GeometricClassifierState954   bool IsFullRow(int i) const {
955     return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 &&
956            ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0;
957   }
958 
AlignsideTabIndextesseract::GeometricClassifierState959   int AlignsideTabIndex(int row_idx) const {
960     return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));
961   }
962 
963   // Given what we know about the paragraph justification (just), would the
964   // first word of row_b have fit at the end of row_a?
FirstWordWouldHaveFittesseract::GeometricClassifierState965   bool FirstWordWouldHaveFit(int row_a, int row_b) {
966     return ::tesseract::FirstWordWouldHaveFit((*rows)[row_a], (*rows)[row_b], just);
967   }
968 
PrintRowstesseract::GeometricClassifierState969   void PrintRows() const {
970     PrintRowRange(*rows, row_start, row_end);
971   }
972 
Failtesseract::GeometricClassifierState973   void Fail(int min_debug_level, const char *why) const {
974     if (debug_level < min_debug_level) {
975       return;
976     }
977     tprintf("# %s\n", why);
978     PrintRows();
979   }
980 
Modeltesseract::GeometricClassifierState981   ParagraphModel Model() const {
982     return ParagraphModel(just, margin, first_indent, body_indent, tolerance);
983   }
984 
985   // We print out messages with a debug level at least as great as debug_level.
986   int debug_level = 0;
987 
988   // The Geometric Classifier was asked to find a single paragraph model
989   // to fit the text rows (*rows)[row_start, row_end)
990   std::vector<RowScratchRegisters> *rows;
991   int row_start = 0;
992   int row_end = 0;
993 
994   // The amount by which we expect the text edge can vary and still be aligned.
995   int tolerance = 0;
996 
997   // Is the script in this text block left-to-right?
998   // HORRIBLE ROUGH APPROXIMATION.  TODO(eger): Improve
999   bool ltr = false;
1000 
1001   // These left and right tab stops were determined to be the common tab
1002   // stops for the given text.
1003   std::vector<Cluster> left_tabs;
1004   std::vector<Cluster> right_tabs;
1005 
1006   // These are parameters we must determine to create a ParagraphModel.
1007   tesseract::ParagraphJustification just = JUSTIFICATION_UNKNOWN;
1008   int margin = 0;
1009   int first_indent = 0;
1010   int body_indent = 0;
1011 
1012   // eop_threshold > 0 if the text is fully justified.  See MarkRowsWithModel()
1013   int eop_threshold = 0;
1014 };
1015 
1016 // Given a section of text where strong textual clues did not help identifying
1017 // paragraph breaks, and for which the left and right indents have exactly
1018 // three tab stops between them, attempt to find the paragraph breaks based
1019 // solely on the outline of the text and whether the script is left-to-right.
1020 //
1021 // Algorithm Detail:
1022 //   The selected rows are in the form of a rectangle except
1023 //   for some number of "short lines" of the same length:
1024 //
1025 //   (A1)  xxxxxxxxxxxxx  (B1) xxxxxxxxxxxx
1026 //           xxxxxxxxxxx       xxxxxxxxxx    # A "short" line.
1027 //         xxxxxxxxxxxxx       xxxxxxxxxxxx
1028 //         xxxxxxxxxxxxx       xxxxxxxxxxxx
1029 //
1030 //   We have a slightly different situation if the only short
1031 //   line is at the end of the excerpt.
1032 //
1033 //   (A2) xxxxxxxxxxxxx  (B2) xxxxxxxxxxxx
1034 //        xxxxxxxxxxxxx       xxxxxxxxxxxx
1035 //        xxxxxxxxxxxxx       xxxxxxxxxxxx
1036 //          xxxxxxxxxxx       xxxxxxxxxx     # A "short" line.
1037 //
1038 //   We'll interpret these as follows based on the reasoning in the comment for
1039 //   GeometricClassify():
1040 //       [script direction: first indent, body indent]
1041 //   (A1) LtR: 2,0  RtL: 0,0   (B1) LtR: 0,0  RtL: 2,0
1042 //   (A2) LtR: 2,0  RtL: CrR   (B2) LtR: CrL  RtL: 2,0
GeometricClassifyThreeTabStopTextBlock(int debug_level,GeometricClassifierState & s,ParagraphTheory * theory)1043 static void GeometricClassifyThreeTabStopTextBlock(int debug_level, GeometricClassifierState &s,
1044                                                    ParagraphTheory *theory) {
1045   int num_rows = s.row_end - s.row_start;
1046   int num_full_rows = 0;
1047   int last_row_full = 0;
1048   for (int i = s.row_start; i < s.row_end; i++) {
1049     if (s.IsFullRow(i)) {
1050       num_full_rows++;
1051       if (i == s.row_end - 1) {
1052         last_row_full++;
1053       }
1054     }
1055   }
1056 
1057   if (num_full_rows < 0.7 * num_rows) {
1058     s.Fail(1, "Not enough full lines to know which lines start paras.");
1059     return;
1060   }
1061 
1062   // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
1063   s.eop_threshold = 0;
1064 
1065   if (s.ltr) {
1066     s.AssumeLeftJustification();
1067   } else {
1068     s.AssumeRightJustification();
1069   }
1070 
1071   if (debug_level > 0) {
1072     tprintf(
1073         "# Not enough variety for clear outline classification. "
1074         "Guessing these are %s aligned based on script.\n",
1075         s.ltr ? "left" : "right");
1076     s.PrintRows();
1077   }
1078 
1079   if (s.AlignTabs().size() == 2) { // case A1 or A2
1080     s.first_indent = s.AlignTabs()[1].center;
1081     s.body_indent = s.AlignTabs()[0].center;
1082   } else { // case B1 or B2
1083     if (num_rows - 1 == num_full_rows - last_row_full) {
1084       // case B2
1085       const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
1086       (*s.rows)[s.row_start].AddStartLine(model);
1087       for (int i = s.row_start + 1; i < s.row_end; i++) {
1088         (*s.rows)[i].AddBodyLine(model);
1089       }
1090       return;
1091     } else {
1092       // case B1
1093       s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1094       s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
1095     }
1096   }
1097   const ParagraphModel *model = theory->AddModel(s.Model());
1098   MarkRowsWithModel(s.rows, s.row_start, s.row_end, model, s.ltr, s.eop_threshold);
1099   return;
1100 }
1101 
1102 // This function is called if strong textual clues were not available, but
1103 // the caller hopes that the paragraph breaks will be super obvious just
1104 // by the outline of the text.
1105 //
1106 // The particularly difficult case is figuring out what's going on if you
1107 // don't have enough short paragraph end lines to tell us what's going on.
1108 //
1109 // For instance, let's say you have the following outline:
1110 //
1111 //   (A1)  xxxxxxxxxxxxxxxxxxxxxx
1112 //           xxxxxxxxxxxxxxxxxxxx
1113 //         xxxxxxxxxxxxxxxxxxxxxx
1114 //         xxxxxxxxxxxxxxxxxxxxxx
1115 //
1116 // Even if we know that the text is left-to-right and so will probably be
1117 // left-aligned, both of the following are possible texts:
1118 //
1119 //  (A1a)  1. Here our list item
1120 //           with two full lines.
1121 //         2. Here a second item.
1122 //         3. Here our third one.
1123 //
1124 //  (A1b)  so ends paragraph one.
1125 //           Here  starts another
1126 //         paragraph  we want  to
1127 //         read.  This  continues
1128 //
1129 // These examples are obvious from the text and should have been caught
1130 // by the StrongEvidenceClassify pass.  However, for languages where we don't
1131 // have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),
1132 // it's worth guessing that (A1b) is the correct interpretation if there are
1133 // far more "full" lines than "short" lines.
GeometricClassify(int debug_level,std::vector<RowScratchRegisters> * rows,int row_start,int row_end,ParagraphTheory * theory)1134 static void GeometricClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
1135                               int row_start, int row_end, ParagraphTheory *theory) {
1136   if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end)) {
1137     return;
1138   }
1139   if (debug_level > 1) {
1140     tprintf("###############################################\n");
1141     tprintf("##### GeometricClassify( rows[%d:%d) )   ####\n", row_start, row_end);
1142     tprintf("###############################################\n");
1143   }
1144   RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
1145 
1146   GeometricClassifierState s(debug_level, rows, row_start, row_end);
1147   if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
1148     s.Fail(2, "Too much variety for simple outline classification.");
1149     return;
1150   }
1151   if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
1152     s.Fail(1, "Not enough variety for simple outline classification.");
1153     return;
1154   }
1155   if (s.left_tabs.size() + s.right_tabs.size() == 3) {
1156     GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
1157     return;
1158   }
1159 
1160   // At this point, we know that one side has at least two tab stops, and the
1161   // other side has one or two tab stops.
1162   // Left to determine:
1163   //   (1) Which is the body indent and which is the first line indent?
1164   //   (2) Is the text fully justified?
1165 
1166   // If one side happens to have three or more tab stops, assume that side
1167   // is opposite of the aligned side.
1168   if (s.right_tabs.size() > 2) {
1169     s.AssumeLeftJustification();
1170   } else if (s.left_tabs.size() > 2) {
1171     s.AssumeRightJustification();
1172   } else if (s.ltr) { // guess based on script direction
1173     s.AssumeLeftJustification();
1174   } else {
1175     s.AssumeRightJustification();
1176   }
1177 
1178   if (s.AlignTabs().size() == 2) {
1179     // For each tab stop on the aligned side, how many of them appear
1180     // to be paragraph start lines?  [first lines]
1181     int firsts[2] = {0, 0};
1182     // Count the first line as a likely paragraph start line.
1183     firsts[s.AlignsideTabIndex(s.row_start)]++;
1184     // For each line, if the first word would have fit on the previous
1185     // line count it as a likely paragraph start line.
1186     bool jam_packed = true;
1187     for (int i = s.row_start + 1; i < s.row_end; i++) {
1188       if (s.FirstWordWouldHaveFit(i - 1, i)) {
1189         firsts[s.AlignsideTabIndex(i)]++;
1190         jam_packed = false;
1191       }
1192     }
1193     // Make an extra accounting for the last line of the paragraph just
1194     // in case it's the only short line in the block.  That is, take its
1195     // first word as typical and see if this looks like the *last* line
1196     // of a paragraph.  If so, mark the *other* indent as probably a first.
1197     if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
1198       firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
1199     }
1200 
1201     int percent0firsts, percent1firsts;
1202     percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
1203     percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;
1204 
1205     // TODO(eger): Tune these constants if necessary.
1206     if ((percent0firsts < 20 && 30 < percent1firsts) || percent0firsts + 30 < percent1firsts) {
1207       s.first_indent = s.AlignTabs()[1].center;
1208       s.body_indent = s.AlignTabs()[0].center;
1209     } else if ((percent1firsts < 20 && 30 < percent0firsts) ||
1210                percent1firsts + 30 < percent0firsts) {
1211       s.first_indent = s.AlignTabs()[0].center;
1212       s.body_indent = s.AlignTabs()[1].center;
1213     } else {
1214       // Ambiguous! Probably lineated (poetry)
1215       if (debug_level > 1) {
1216         tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
1217                 s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
1218         tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1219                 s.AlignTabs()[0].center, percent0firsts);
1220         tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1221                 s.AlignTabs()[1].center, percent1firsts);
1222         s.PrintRows();
1223       }
1224       return;
1225     }
1226   } else {
1227     // There's only one tab stop for the "aligned to" side.
1228     s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1229   }
1230 
1231   // At this point, we have our model.
1232   const ParagraphModel *model = theory->AddModel(s.Model());
1233 
1234   // Now all we have to do is figure out if the text is fully justified or not.
1235   // eop_threshold: default to fully justified unless we see evidence below.
1236   //    See description on MarkRowsWithModel()
1237   s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
1238   // If the text is not fully justified, re-set the eop_threshold to 0.
1239   if (s.AlignTabs().size() == 2) {
1240     // Paragraphs with a paragraph-start indent.
1241     for (int i = s.row_start; i < s.row_end - 1; i++) {
1242       if (ValidFirstLine(s.rows, i + 1, model) &&
1243           !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just),
1244                        s.tolerance)) {
1245         // We found a non-end-of-paragraph short line: not fully justified.
1246         s.eop_threshold = 0;
1247         break;
1248       }
1249     }
1250   } else {
1251     // Paragraphs with no paragraph-start indent.
1252     for (int i = s.row_start; i < s.row_end - 1; i++) {
1253       if (!s.FirstWordWouldHaveFit(i, i + 1) &&
1254           !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just),
1255                        s.tolerance)) {
1256         // We found a non-end-of-paragraph short line: not fully justified.
1257         s.eop_threshold = 0;
1258         break;
1259       }
1260     }
1261   }
1262   MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
1263 }
1264 
1265 // =============== Implementation of ParagraphTheory =====================
1266 
AddModel(const ParagraphModel & model)1267 const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) {
1268   for (const auto &m : *models_) {
1269     if (m->Comparable(model)) {
1270       return m;
1271     }
1272   }
1273   auto *m = new ParagraphModel(model);
1274   models_->push_back(m);
1275   push_back_new(models_we_added_, m);
1276   return m;
1277 }
1278 
DiscardUnusedModels(const SetOfModels & used_models)1279 void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {
1280   size_t w = 0;
1281   for (size_t r = 0; r < models_->size(); r++) {
1282     ParagraphModel *m = (*models_)[r];
1283     if (!contains(used_models, static_cast<const ParagraphModel *>(m)) && contains(models_we_added_, m)) {
1284       delete m;
1285     } else {
1286       if (r > w) {
1287         (*models_)[w] = m;
1288       }
1289       w++;
1290     }
1291   }
1292   models_->resize(w);
1293 }
1294 
1295 // Examine rows[start, end) and try to determine if an existing non-centered
1296 // paragraph model would fit them perfectly.  If so, return a pointer to it.
1297 // If not, return nullptr.
Fits(const std::vector<RowScratchRegisters> * rows,int start,int end) const1298 const ParagraphModel *ParagraphTheory::Fits(const std::vector<RowScratchRegisters> *rows,
1299                                             int start, int end) const {
1300   for (const auto *model : *models_) {
1301     if (model->justification() != JUSTIFICATION_CENTER && RowsFitModel(rows, start, end, model)) {
1302       return model;
1303     }
1304   }
1305   return nullptr;
1306 }
1307 
NonCenteredModels(SetOfModels * models)1308 void ParagraphTheory::NonCenteredModels(SetOfModels *models) {
1309   for (const auto *model : *models_) {
1310     if (model->justification() != JUSTIFICATION_CENTER) {
1311       push_back_new(*models, model);
1312     }
1313   }
1314 }
1315 
IndexOf(const ParagraphModel * model) const1316 int ParagraphTheory::IndexOf(const ParagraphModel *model) const {
1317   int i = 0;
1318   for (const auto *m : *models_) {
1319     if (m == model) {
1320       return i;
1321     }
1322     i++;
1323   }
1324   return -1;
1325 }
1326 
ValidFirstLine(const std::vector<RowScratchRegisters> * rows,int row,const ParagraphModel * model)1327 bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
1328                     const ParagraphModel *model) {
1329   if (!StrongModel(model)) {
1330     tprintf("ValidFirstLine() should only be called with strong models!\n");
1331   }
1332   return StrongModel(model) && model->ValidFirstLine((*rows)[row].lmargin_, (*rows)[row].lindent_,
1333                                                      (*rows)[row].rindent_, (*rows)[row].rmargin_);
1334 }
1335 
ValidBodyLine(const std::vector<RowScratchRegisters> * rows,int row,const ParagraphModel * model)1336 bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
1337                    const ParagraphModel *model) {
1338   if (!StrongModel(model)) {
1339     tprintf("ValidBodyLine() should only be called with strong models!\n");
1340   }
1341   return StrongModel(model) && model->ValidBodyLine((*rows)[row].lmargin_, (*rows)[row].lindent_,
1342                                                     (*rows)[row].rindent_, (*rows)[row].rmargin_);
1343 }
1344 
CrownCompatible(const std::vector<RowScratchRegisters> * rows,int a,int b,const ParagraphModel * model)1345 bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
1346                      const ParagraphModel *model) {
1347   if (model != kCrownRight && model != kCrownLeft) {
1348     tprintf("CrownCompatible() should only be called with crown models!\n");
1349     return false;
1350   }
1351   auto &row_a = (*rows)[a];
1352   auto &row_b = (*rows)[b];
1353   if (model == kCrownRight) {
1354     return NearlyEqual(row_a.rindent_ + row_a.rmargin_, row_b.rindent_ + row_b.rmargin_,
1355                        Epsilon(row_a.ri_->average_interword_space));
1356   }
1357   return NearlyEqual(row_a.lindent_ + row_a.lmargin_, row_b.lindent_ + row_b.lmargin_,
1358                      Epsilon(row_a.ri_->average_interword_space));
1359 }
1360 
1361 // =============== Implementation of ParagraphModelSmearer ====================
1362 
ParagraphModelSmearer(std::vector<RowScratchRegisters> * rows,int row_start,int row_end,ParagraphTheory * theory)1363 ParagraphModelSmearer::ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows,
1364                                              int row_start, int row_end, ParagraphTheory *theory)
1365     : theory_(theory), rows_(rows), row_start_(row_start), row_end_(row_end) {
1366   if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
1367     row_start_ = 0;
1368     row_end_ = 0;
1369     return;
1370   }
1371   open_models_.resize(open_models_.size() + row_end - row_start + 2);
1372 }
1373 
1374 // see paragraphs_internal.h
CalculateOpenModels(int row_start,int row_end)1375 void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
1376   SetOfModels no_models;
1377   if (row_start < row_start_) {
1378     row_start = row_start_;
1379   }
1380   if (row_end > row_end_) {
1381     row_end = row_end_;
1382   }
1383 
1384   for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end; row++) {
1385     if ((*rows_)[row].ri_->num_words == 0) {
1386       OpenModels(row + 1) = no_models;
1387     } else {
1388       SetOfModels &opened = OpenModels(row);
1389       (*rows_)[row].StartHypotheses(&opened);
1390 
1391       // Which models survive the transition from row to row + 1?
1392       SetOfModels still_open;
1393       for (auto &m : opened) {
1394         if (ValidFirstLine(rows_, row, m) || ValidBodyLine(rows_, row, m)) {
1395           // This is basic filtering; we check likely paragraph starty-ness down
1396           // below in Smear() -- you know, whether the first word would have fit
1397           // and such.
1398           push_back_new(still_open, m);
1399         }
1400       }
1401       OpenModels(row + 1) = still_open;
1402     }
1403   }
1404 }
1405 
1406 // see paragraphs_internal.h
Smear()1407 void ParagraphModelSmearer::Smear() {
1408   CalculateOpenModels(row_start_, row_end_);
1409 
1410   // For each row which we're unsure about (that is, it is LT_UNKNOWN or
1411   // we have multiple LT_START hypotheses), see if there's a model that
1412   // was recently used (an "open" model) which might model it well.
1413   for (int i = row_start_; i < row_end_; i++) {
1414     RowScratchRegisters &row = (*rows_)[i];
1415     if (row.ri_->num_words == 0) {
1416       continue;
1417     }
1418 
1419     // Step One:
1420     //   Figure out if there are "open" models which are left-alined or
1421     //   right-aligned.  This is important for determining whether the
1422     //   "first" word in a row would fit at the "end" of the previous row.
1423     bool left_align_open = false;
1424     bool right_align_open = false;
1425     for (auto &m : OpenModels(i)) {
1426       switch (m->justification()) {
1427         case JUSTIFICATION_LEFT:
1428           left_align_open = true;
1429           break;
1430         case JUSTIFICATION_RIGHT:
1431           right_align_open = true;
1432           break;
1433         default:
1434           left_align_open = right_align_open = true;
1435       }
1436     }
1437     // Step Two:
1438     //   Use that knowledge to figure out if this row is likely to
1439     //   start a paragraph.
1440     bool likely_start;
1441     if (i == 0) {
1442       likely_start = true;
1443     } else {
1444       if ((left_align_open && right_align_open) || (!left_align_open && !right_align_open)) {
1445         likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT) ||
1446                        LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT);
1447       } else if (left_align_open) {
1448         likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT);
1449       } else {
1450         likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT);
1451       }
1452     }
1453 
1454     // Step Three:
1455     //   If this text line seems like an obvious first line of an
1456     //   open model, or an obvious continuation of an existing
1457     //   modelled paragraph, mark it up.
1458     if (likely_start) {
1459       // Add Start Hypotheses for all Open models that fit.
1460       for (unsigned m = 0; m < OpenModels(i).size(); m++) {
1461         if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {
1462           row.AddStartLine(OpenModels(i)[m]);
1463         }
1464       }
1465     } else {
1466       // Add relevant body line hypotheses.
1467       SetOfModels last_line_models;
1468       if (i > 0) {
1469         (*rows_)[i - 1].StrongHypotheses(&last_line_models);
1470       } else {
1471         theory_->NonCenteredModels(&last_line_models);
1472       }
1473       for (auto model : last_line_models) {
1474         if (ValidBodyLine(rows_, i, model)) {
1475           row.AddBodyLine(model);
1476         }
1477       }
1478     }
1479 
1480     // Step Four:
1481     //   If we're still quite unsure about this line, go through all
1482     //   models in our theory and see if this row could be the start
1483     //   of any of our  models.
1484     if (row.GetLineType() == LT_UNKNOWN ||
1485         (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) {
1486       SetOfModels all_models;
1487       theory_->NonCenteredModels(&all_models);
1488       for (auto &all_model : all_models) {
1489         if (ValidFirstLine(rows_, i, all_model)) {
1490           row.AddStartLine(all_model);
1491         }
1492       }
1493     }
1494     // Step Five:
1495     //   Since we may have updated the hypotheses about this row, we need
1496     //   to recalculate the Open models for the rest of rows[i + 1, row_end)
1497     if (row.GetLineType() != LT_UNKNOWN) {
1498       CalculateOpenModels(i + 1, row_end_);
1499     }
1500   }
1501 }
1502 
1503 // ================ Main Paragraph Detection Algorithm =======================
1504 
1505 // Find out what ParagraphModels are actually used, and discard any
1506 // that are not.
DiscardUnusedModels(const std::vector<RowScratchRegisters> & rows,ParagraphTheory * theory)1507 static void DiscardUnusedModels(const std::vector<RowScratchRegisters> &rows,
1508                                 ParagraphTheory *theory) {
1509   SetOfModels used_models;
1510   for (const auto &row : rows) {
1511     row.StrongHypotheses(&used_models);
1512   }
1513   theory->DiscardUnusedModels(used_models);
1514 }
1515 
1516 // DowngradeWeakestToCrowns:
1517 //   Forget any flush-{left, right} models unless we see two or more
1518 //   of them in sequence.
1519 //
1520 // In pass 3, we start to classify even flush-left paragraphs (paragraphs
1521 // where the first line and body indent are the same) as having proper Models.
1522 // This is generally dangerous, since if you start imagining that flush-left
1523 // is a typical paragraph model when it is not, it will lead you to chop normal
1524 // indented paragraphs in the middle whenever a sentence happens to start on a
1525 // new line (see "This" above).  What to do?
1526 //   What we do is to take any paragraph which is flush left and is not
1527 // preceded by another paragraph of the same model and convert it to a "Crown"
1528 // paragraph.  This is a weak pseudo-ParagraphModel which is a placeholder
1529 // for later.  It means that the paragraph is flush, but it would be desirable
1530 // to mark it as the same model as following text if it fits.  This downgrade
1531 // FlushLeft -> CrownLeft -> Model of following paragraph.  Means that we
1532 // avoid making flush left Paragraph Models whenever we see a top-of-the-page
1533 // half-of-a-paragraph. and instead we mark it the same as normal body text.
1534 //
1535 // Implementation:
1536 //
1537 //   Comb backwards through the row scratch registers, and turn any
1538 //   sequences of body lines of equivalent type abutted against the beginning
1539 //   or a body or start line of a different type into a crown paragraph.
DowngradeWeakestToCrowns(int debug_level,ParagraphTheory * theory,std::vector<RowScratchRegisters> * rows)1540 static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
1541                                      std::vector<RowScratchRegisters> *rows) {
1542   int start;
1543   for (int end = rows->size(); end > 0; end = start) {
1544     // Search back for a body line of a unique type.
1545     const ParagraphModel *model = nullptr;
1546     while (end > 0 && (model = (*rows)[end - 1].UniqueBodyHypothesis()) == nullptr) {
1547       end--;
1548     }
1549     if (end == 0) {
1550       break;
1551     }
1552     start = end - 1;
1553     while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
1554       start--; // walk back to the first line that is not the same body type.
1555     }
1556     if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model && StrongModel(model) &&
1557         NearlyEqual(model->first_indent(), model->body_indent(), model->tolerance())) {
1558       start--;
1559     }
1560     start++;
1561     // Now rows[start, end) is a sequence of unique body hypotheses of model.
1562     if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER) {
1563       continue;
1564     }
1565     if (!StrongModel(model)) {
1566       while (start > 0 && CrownCompatible(rows, start - 1, start, model)) {
1567         start--;
1568       }
1569     }
1570     if (start == 0 || (!StrongModel(model)) ||
1571         (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
1572       // crownify rows[start, end)
1573       const ParagraphModel *crown_model = model;
1574       if (StrongModel(model)) {
1575         if (model->justification() == JUSTIFICATION_LEFT) {
1576           crown_model = kCrownLeft;
1577         } else {
1578           crown_model = kCrownRight;
1579         }
1580       }
1581       (*rows)[start].SetUnknown();
1582       (*rows)[start].AddStartLine(crown_model);
1583       for (int row = start + 1; row < end; row++) {
1584         (*rows)[row].SetUnknown();
1585         (*rows)[row].AddBodyLine(crown_model);
1586       }
1587     }
1588   }
1589   DiscardUnusedModels(*rows, theory);
1590 }
1591 
1592 // Clear all hypotheses about lines [start, end) and reset margins.
1593 //
1594 // The empty space between the left of a row and the block boundary (and
1595 // similarly for the right) is split into two pieces: margin and indent.
1596 // In initial processing, we assume the block is tight and the margin for
1597 // all lines is set to zero.   However, if our first pass does not yield
1598 // models for  everything,  it may be  due to an  inset paragraph like a
1599 // block-quote.   In that case, we make a second pass over that unmarked
1600 // section of the page and reset the "margin" portion of the empty space
1601 // to the common amount of space at  the ends of the lines under consid-
1602 // eration.    This would be equivalent to percentile set to 0. However,
1603 // sometimes we have a single character sticking out in the right margin
1604 // of a text block  (like the 'r' in 'for' on line 3 above),  and we can
1605 // really  just ignore it as an outlier.   To express this, we allow the
1606 // user to specify  the percentile (0..100)  of indent values  to use as
1607 // the common margin for each row in the run of rows[start, end).
RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> * rows,int start,int end,int percentile)1608 void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
1609                                         int end, int percentile) {
1610   if (!AcceptableRowArgs(0, 0, __func__, rows, start, end)) {
1611     return;
1612   }
1613 
1614   int lmin, lmax, rmin, rmax;
1615   lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
1616   rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
1617   for (int i = start; i < end; i++) {
1618     RowScratchRegisters &sr = (*rows)[i];
1619     sr.SetUnknown();
1620     if (sr.ri_->num_words == 0) {
1621       continue;
1622     }
1623     UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
1624     UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);
1625   }
1626   STATS lefts(lmin, lmax + 1);
1627   STATS rights(rmin, rmax + 1);
1628   for (int i = start; i < end; i++) {
1629     RowScratchRegisters &sr = (*rows)[i];
1630     if (sr.ri_->num_words == 0) {
1631       continue;
1632     }
1633     lefts.add(sr.lmargin_ + sr.lindent_, 1);
1634     rights.add(sr.rmargin_ + sr.rindent_, 1);
1635   }
1636   int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);
1637   int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);
1638   for (int i = start; i < end; i++) {
1639     RowScratchRegisters &sr = (*rows)[i];
1640     int ldelta = ignorable_left - sr.lmargin_;
1641     sr.lmargin_ += ldelta;
1642     sr.lindent_ -= ldelta;
1643     int rdelta = ignorable_right - sr.rmargin_;
1644     sr.rmargin_ += rdelta;
1645     sr.rindent_ -= rdelta;
1646   }
1647 }
1648 
1649 // Return the median inter-word space in rows[row_start, row_end).
InterwordSpace(const std::vector<RowScratchRegisters> & rows,int row_start,int row_end)1650 int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end) {
1651   if (row_end < row_start + 1) {
1652     return 1;
1653   }
1654   int word_height =
1655       (rows[row_start].ri_->lword_box.height() + rows[row_end - 1].ri_->lword_box.height()) / 2;
1656   int word_width =
1657       (rows[row_start].ri_->lword_box.width() + rows[row_end - 1].ri_->lword_box.width()) / 2;
1658   STATS spacing_widths(0, 5 + word_width);
1659   for (int i = row_start; i < row_end; i++) {
1660     if (rows[i].ri_->num_words > 1) {
1661       spacing_widths.add(rows[i].ri_->average_interword_space, 1);
1662     }
1663   }
1664   int minimum_reasonable_space = word_height / 3;
1665   if (minimum_reasonable_space < 2) {
1666     minimum_reasonable_space = 2;
1667   }
1668   int median = spacing_widths.median();
1669   return (median > minimum_reasonable_space) ? median : minimum_reasonable_space;
1670 }
1671 
1672 // Return whether the first word on the after line can fit in the space at
1673 // the end of the before line (knowing which way the text is aligned and read).
FirstWordWouldHaveFit(const RowScratchRegisters & before,const RowScratchRegisters & after,tesseract::ParagraphJustification justification)1674 bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after,
1675                            tesseract::ParagraphJustification justification) {
1676   if (before.ri_->num_words == 0 || after.ri_->num_words == 0) {
1677     return true;
1678   }
1679 
1680   if (justification == JUSTIFICATION_UNKNOWN) {
1681     tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
1682   }
1683   int available_space;
1684   if (justification == JUSTIFICATION_CENTER) {
1685     available_space = before.lindent_ + before.rindent_;
1686   } else {
1687     available_space = before.OffsideIndent(justification);
1688   }
1689   available_space -= before.ri_->average_interword_space;
1690 
1691   if (before.ri_->ltr) {
1692     return after.ri_->lword_box.width() < available_space;
1693   }
1694   return after.ri_->rword_box.width() < available_space;
1695 }
1696 
1697 // Return whether the first word on the after line can fit in the space at
1698 // the end of the before line (not knowing which way the text goes) in a left
1699 // or right alignment.
FirstWordWouldHaveFit(const RowScratchRegisters & before,const RowScratchRegisters & after)1700 bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after) {
1701   if (before.ri_->num_words == 0 || after.ri_->num_words == 0) {
1702     return true;
1703   }
1704 
1705   int available_space = before.lindent_;
1706   if (before.rindent_ > available_space) {
1707     available_space = before.rindent_;
1708   }
1709   available_space -= before.ri_->average_interword_space;
1710 
1711   if (before.ri_->ltr) {
1712     return after.ri_->lword_box.width() < available_space;
1713   }
1714   return after.ri_->rword_box.width() < available_space;
1715 }
1716 
TextSupportsBreak(const RowScratchRegisters & before,const RowScratchRegisters & after)1717 static bool TextSupportsBreak(const RowScratchRegisters &before, const RowScratchRegisters &after) {
1718   if (before.ri_->ltr) {
1719     return before.ri_->rword_likely_ends_idea && after.ri_->lword_likely_starts_idea;
1720   } else {
1721     return before.ri_->lword_likely_ends_idea && after.ri_->rword_likely_starts_idea;
1722   }
1723 }
1724 
LikelyParagraphStart(const RowScratchRegisters & before,const RowScratchRegisters & after,tesseract::ParagraphJustification j)1725 static bool LikelyParagraphStart(const RowScratchRegisters &before,
1726                                  const RowScratchRegisters &after,
1727                                  tesseract::ParagraphJustification j) {
1728   return before.ri_->num_words == 0 ||
1729          (FirstWordWouldHaveFit(before, after, j) && TextSupportsBreak(before, after));
1730 }
1731 
1732 // Examine rows[start, end) and try to determine what sort of ParagraphModel
1733 // would fit them as a single paragraph.
1734 // If we can't produce a unique model justification_ = JUSTIFICATION_UNKNOWN.
1735 // If the rows given could be a consistent start to a paragraph, set *consistent
1736 // true.
InternalParagraphModelByOutline(const std::vector<RowScratchRegisters> * rows,int start,int end,int tolerance,bool * consistent)1737 static ParagraphModel InternalParagraphModelByOutline(
1738     const std::vector<RowScratchRegisters> *rows, int start, int end, int tolerance,
1739     bool *consistent) {
1740   int ltr_line_count = 0;
1741   for (int i = start; i < end; i++) {
1742     ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
1743   }
1744   bool ltr = (ltr_line_count >= (end - start) / 2);
1745 
1746   *consistent = true;
1747   if (!AcceptableRowArgs(0, 2, __func__, rows, start, end)) {
1748     return ParagraphModel();
1749   }
1750 
1751   // Ensure the caller only passed us a region with a common rmargin and
1752   // lmargin.
1753   int lmargin = (*rows)[start].lmargin_;
1754   int rmargin = (*rows)[start].rmargin_;
1755   int lmin, lmax, rmin, rmax, cmin, cmax;
1756   lmin = lmax = (*rows)[start + 1].lindent_;
1757   rmin = rmax = (*rows)[start + 1].rindent_;
1758   cmin = cmax = 0;
1759   for (int i = start + 1; i < end; i++) {
1760     if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
1761       tprintf("Margins don't match! Software error.\n");
1762       *consistent = false;
1763       return ParagraphModel();
1764     }
1765     UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
1766     UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
1767     UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
1768   }
1769   int ldiff = lmax - lmin;
1770   int rdiff = rmax - rmin;
1771   int cdiff = cmax - cmin;
1772   if (rdiff > tolerance && ldiff > tolerance) {
1773     if (cdiff < tolerance * 2) {
1774       if (end - start < 3) {
1775         return ParagraphModel();
1776       }
1777       return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
1778     }
1779     *consistent = false;
1780     return ParagraphModel();
1781   }
1782   if (end - start < 3) { // Don't return a model for two line paras.
1783     return ParagraphModel();
1784   }
1785 
1786   // These booleans keep us from saying something is aligned left when the body
1787   // left variance is too large.
1788   bool body_admits_left_alignment = ldiff < tolerance;
1789   bool body_admits_right_alignment = rdiff < tolerance;
1790 
1791   ParagraphModel left_model = ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
1792                                              (lmin + lmax) / 2, tolerance);
1793   ParagraphModel right_model = ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
1794                                               (rmin + rmax) / 2, tolerance);
1795 
1796   // These booleans keep us from having an indent on the "wrong side" for the
1797   // first line.
1798   bool text_admits_left_alignment = ltr || left_model.is_flush();
1799   bool text_admits_right_alignment = !ltr || right_model.is_flush();
1800 
1801   // At least one of the edges is less than tolerance in variance.
1802   // If the other is obviously ragged, it can't be the one aligned to.
1803   // [Note the last line is included in this raggedness.]
1804   if (tolerance < rdiff) {
1805     if (body_admits_left_alignment && text_admits_left_alignment) {
1806       return left_model;
1807     }
1808     *consistent = false;
1809     return ParagraphModel();
1810   }
1811   if (tolerance < ldiff) {
1812     if (body_admits_right_alignment && text_admits_right_alignment) {
1813       return right_model;
1814     }
1815     *consistent = false;
1816     return ParagraphModel();
1817   }
1818 
1819   // At this point, we know the body text doesn't vary much on either side.
1820 
1821   // If the first line juts out oddly in one direction or the other,
1822   // that likely indicates the side aligned to.
1823   int first_left = (*rows)[start].lindent_;
1824   int first_right = (*rows)[start].rindent_;
1825 
1826   if (ltr && body_admits_left_alignment && (first_left < lmin || first_left > lmax)) {
1827     return left_model;
1828   }
1829   if (!ltr && body_admits_right_alignment && (first_right < rmin || first_right > rmax)) {
1830     return right_model;
1831   }
1832 
1833   *consistent = false;
1834   return ParagraphModel();
1835 }
1836 
1837 // Examine rows[start, end) and try to determine what sort of ParagraphModel
1838 // would fit them as a single paragraph.   If nothing fits,
1839 // justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug
1840 // output if we're debugging.
ParagraphModelByOutline(int debug_level,const std::vector<RowScratchRegisters> * rows,int start,int end,int tolerance)1841 static ParagraphModel ParagraphModelByOutline(int debug_level,
1842                                               const std::vector<RowScratchRegisters> *rows,
1843                                               int start, int end, int tolerance) {
1844   bool unused_consistent;
1845   ParagraphModel retval =
1846       InternalParagraphModelByOutline(rows, start, end, tolerance, &unused_consistent);
1847   if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) {
1848     tprintf("Could not determine a model for this paragraph:\n");
1849     PrintRowRange(*rows, start, end);
1850   }
1851   return retval;
1852 }
1853 
1854 // Do rows[start, end) form a single instance of the given paragraph model?
RowsFitModel(const std::vector<RowScratchRegisters> * rows,int start,int end,const ParagraphModel * model)1855 bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
1856                   const ParagraphModel *model) {
1857   if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) {
1858     return false;
1859   }
1860   if (!ValidFirstLine(rows, start, model)) {
1861     return false;
1862   }
1863   for (int i = start + 1; i < end; i++) {
1864     if (!ValidBodyLine(rows, i, model)) {
1865       return false;
1866     }
1867   }
1868   return true;
1869 }
1870 
1871 // Examine rows[row_start, row_end) as an independent section of text,
1872 // and mark rows that are exceptionally clear as start-of-paragraph
1873 // and paragraph-body lines.
1874 //
1875 // We presume that any lines surrounding rows[row_start, row_end) may
1876 // have wildly different paragraph models, so we don't key any data off
1877 // of those lines.
1878 //
1879 // We only take the very strongest signals, as we don't want to get
1880 // confused and marking up centered text, poetry, or source code as
1881 // clearly part of a typical paragraph.
MarkStrongEvidence(std::vector<RowScratchRegisters> * rows,int row_start,int row_end)1882 static void MarkStrongEvidence(std::vector<RowScratchRegisters> *rows, int row_start,
1883                                int row_end) {
1884   // Record patently obvious body text.
1885   for (int i = row_start + 1; i < row_end; i++) {
1886     const RowScratchRegisters &prev = (*rows)[i - 1];
1887     RowScratchRegisters &curr = (*rows)[i];
1888     tesseract::ParagraphJustification typical_justification =
1889         prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1890     if (!curr.ri_->rword_likely_starts_idea && !curr.ri_->lword_likely_starts_idea &&
1891         !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
1892       curr.SetBodyLine();
1893     }
1894   }
1895 
1896   // Record patently obvious start paragraph lines.
1897   //
1898   // It's an extremely good signal of the start of a paragraph that
1899   // the first word would have fit on the end of the previous line.
1900   // However, applying just that signal would have us mark random
1901   // start lines of lineated text (poetry and source code) and some
1902   // centered headings as paragraph start lines.  Therefore, we use
1903   // a second qualification for a paragraph start: Not only should
1904   // the first word of this line have fit on the previous line,
1905   // but also, this line should go full to the right of the block,
1906   // disallowing a subsequent word from having fit on this line.
1907 
1908   // First row:
1909   {
1910     RowScratchRegisters &curr = (*rows)[row_start];
1911     RowScratchRegisters &next = (*rows)[row_start + 1];
1912     tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1913     if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) &&
1914         (curr.ri_->lword_likely_starts_idea || curr.ri_->rword_likely_starts_idea)) {
1915       curr.SetStartLine();
1916     }
1917   }
1918   // Middle rows
1919   for (int i = row_start + 1; i < row_end - 1; i++) {
1920     RowScratchRegisters &prev = (*rows)[i - 1];
1921     RowScratchRegisters &curr = (*rows)[i];
1922     RowScratchRegisters &next = (*rows)[i + 1];
1923     tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1924     if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) &&
1925         LikelyParagraphStart(prev, curr, j)) {
1926       curr.SetStartLine();
1927     }
1928   }
1929   // Last row
1930   { // the short circuit at the top means we have at least two lines.
1931     RowScratchRegisters &prev = (*rows)[row_end - 2];
1932     RowScratchRegisters &curr = (*rows)[row_end - 1];
1933     tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1934     if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, curr, j) &&
1935         LikelyParagraphStart(prev, curr, j)) {
1936       curr.SetStartLine();
1937     }
1938   }
1939 }
1940 
1941 // Look for sequences of a start line followed by some body lines in
1942 // rows[row_start, row_end) and create ParagraphModels for them if
1943 // they seem coherent.
ModelStrongEvidence(int debug_level,std::vector<RowScratchRegisters> * rows,int row_start,int row_end,bool allow_flush_models,ParagraphTheory * theory)1944 static void ModelStrongEvidence(int debug_level, std::vector<RowScratchRegisters> *rows,
1945                                 int row_start, int row_end, bool allow_flush_models,
1946                                 ParagraphTheory *theory) {
1947   if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) {
1948     return;
1949   }
1950 
1951   int start = row_start;
1952   while (start < row_end) {
1953     while (start < row_end && (*rows)[start].GetLineType() != LT_START) {
1954       start++;
1955     }
1956     if (start >= row_end - 1) {
1957       break;
1958     }
1959 
1960     int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
1961     int end = start;
1962     ParagraphModel last_model;
1963     bool next_consistent;
1964     do {
1965       ++end;
1966       // rows[row, end) was consistent.
1967       // If rows[row, end + 1) is not consistent,
1968       //   just model rows[row, end)
1969       if (end < row_end - 1) {
1970         RowScratchRegisters &next = (*rows)[end];
1971         LineType lt = next.GetLineType();
1972         next_consistent = lt == LT_BODY || (lt == LT_UNKNOWN &&
1973                                             !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
1974       } else {
1975         next_consistent = false;
1976       }
1977       if (next_consistent) {
1978         ParagraphModel next_model =
1979             InternalParagraphModelByOutline(rows, start, end + 1, tolerance, &next_consistent);
1980         if (((*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_LEFT &&
1981              next_model.justification() != JUSTIFICATION_LEFT) ||
1982             (!(*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_RIGHT &&
1983              next_model.justification() != JUSTIFICATION_RIGHT)) {
1984           next_consistent = false;
1985         }
1986         last_model = next_model;
1987       } else {
1988         next_consistent = false;
1989       }
1990     } while (next_consistent && end < row_end);
1991     // At this point, rows[start, end) looked like it could have been a
1992     // single paragraph.  If we can make a good ParagraphModel for it,
1993     // do so and mark this sequence with that model.
1994     if (end > start + 1) {
1995       // emit a new paragraph if we have more than one line.
1996       const ParagraphModel *model = nullptr;
1997       ParagraphModel new_model = ParagraphModelByOutline(
1998           debug_level, rows, start, end, Epsilon(InterwordSpace(*rows, start, end)));
1999       if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
2000         // couldn't create a good model, oh well.
2001       } else if (new_model.is_flush()) {
2002         if (end == start + 2) {
2003           // It's very likely we just got two paragraph starts in a row.
2004           end = start + 1;
2005         } else if (start == row_start) {
2006           // Mark this as a Crown.
2007           if (new_model.justification() == JUSTIFICATION_LEFT) {
2008             model = kCrownLeft;
2009           } else {
2010             model = kCrownRight;
2011           }
2012         } else if (allow_flush_models) {
2013           model = theory->AddModel(new_model);
2014         }
2015       } else {
2016         model = theory->AddModel(new_model);
2017       }
2018       if (model) {
2019         (*rows)[start].AddStartLine(model);
2020         for (int i = start + 1; i < end; i++) {
2021           (*rows)[i].AddBodyLine(model);
2022         }
2023       }
2024     }
2025     start = end;
2026   }
2027 }
2028 
2029 // We examine rows[row_start, row_end) and do the following:
2030 //   (1) Clear all existing hypotheses for the rows being considered.
2031 //   (2) Mark up any rows as exceptionally likely to be paragraph starts
2032 //       or paragraph body lines as such using both geometric and textual
2033 //       clues.
2034 //   (3) Form models for any sequence of start + continuation lines.
2035 //   (4) Smear the paragraph models to cover surrounding text.
StrongEvidenceClassify(int debug_level,std::vector<RowScratchRegisters> * rows,int row_start,int row_end,ParagraphTheory * theory)2036 static void StrongEvidenceClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
2037                                    int row_start, int row_end, ParagraphTheory *theory) {
2038   if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) {
2039     return;
2040   }
2041 
2042   if (debug_level > 1) {
2043     tprintf("#############################################\n");
2044     tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
2045     tprintf("#############################################\n");
2046   }
2047 
2048   RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
2049   MarkStrongEvidence(rows, row_start, row_end);
2050 
2051   DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);
2052 
2053   // Create paragraph models.
2054   ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);
2055 
2056   DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows);
2057 
2058   // At this point, some rows are marked up as paragraphs with model numbers,
2059   // and some rows are marked up as either LT_START or LT_BODY.  Now let's
2060   // smear any good paragraph hypotheses forward and backward.
2061   ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
2062   smearer.Smear();
2063 }
2064 
SeparateSimpleLeaderLines(std::vector<RowScratchRegisters> * rows,int row_start,int row_end,ParagraphTheory * theory)2065 static void SeparateSimpleLeaderLines(std::vector<RowScratchRegisters> *rows, int row_start,
2066                                       int row_end, ParagraphTheory *theory) {
2067   for (int i = row_start + 1; i < row_end - 1; i++) {
2068     if ((*rows)[i - 1].ri_->has_leaders && (*rows)[i].ri_->has_leaders &&
2069         (*rows)[i + 1].ri_->has_leaders) {
2070       const ParagraphModel *model =
2071           theory->AddModel(ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
2072       (*rows)[i].AddStartLine(model);
2073     }
2074   }
2075 }
2076 
2077 // Collect sequences of unique hypotheses in row registers and create proper
2078 // paragraphs for them, referencing the paragraphs in row_owners.
ConvertHypothesizedModelRunsToParagraphs(int debug_level,std::vector<RowScratchRegisters> & rows,std::vector<PARA * > * row_owners,ParagraphTheory * theory)2079 static void ConvertHypothesizedModelRunsToParagraphs(int debug_level,
2080                                                      std::vector<RowScratchRegisters> &rows,
2081                                                      std::vector<PARA *> *row_owners,
2082                                                      ParagraphTheory *theory) {
2083   int end = rows.size();
2084   int start;
2085   for (; end > 0; end = start) {
2086     start = end - 1;
2087     const ParagraphModel *model = nullptr;
2088     // TODO(eger): Be smarter about dealing with multiple hypotheses.
2089     bool single_line_paragraph = false;
2090     SetOfModels models;
2091     rows[start].NonNullHypotheses(&models);
2092     if (!models.empty()) {
2093       model = models[0];
2094       if (rows[start].GetLineType(model) != LT_BODY) {
2095         single_line_paragraph = true;
2096       }
2097     }
2098     if (model && !single_line_paragraph) {
2099       // walk back looking for more body lines and then a start line.
2100       while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
2101         // do nothing
2102       }
2103       if (start < 0 || rows[start].GetLineType(model) != LT_START) {
2104         model = nullptr;
2105       }
2106     }
2107     if (model == nullptr) {
2108       continue;
2109     }
2110     // rows[start, end) should be a paragraph.
2111     PARA *p = new PARA();
2112     if (model == kCrownLeft || model == kCrownRight) {
2113       p->is_very_first_or_continuation = true;
2114       // Crown paragraph.
2115       //   If we can find an existing ParagraphModel that fits, use it,
2116       //   else create a new one.
2117       for (unsigned row = end; row < rows.size(); row++) {
2118         if ((*row_owners)[row] &&
2119             (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
2120              (start == 0 || ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
2121           model = (*row_owners)[row]->model;
2122           break;
2123         }
2124       }
2125       if (model == kCrownLeft) {
2126         // No subsequent model fits, so cons one up.
2127         model = theory->AddModel(ParagraphModel(JUSTIFICATION_LEFT,
2128                                                 rows[start].lmargin_ + rows[start].lindent_, 0, 0,
2129                                                 Epsilon(rows[start].ri_->average_interword_space)));
2130       } else if (model == kCrownRight) {
2131         // No subsequent model fits, so cons one up.
2132         model = theory->AddModel(ParagraphModel(JUSTIFICATION_RIGHT,
2133                                                 rows[start].rmargin_ + rows[start].rmargin_, 0, 0,
2134                                                 Epsilon(rows[start].ri_->average_interword_space)));
2135       }
2136     }
2137     rows[start].SetUnknown();
2138     rows[start].AddStartLine(model);
2139     for (int i = start + 1; i < end; i++) {
2140       rows[i].SetUnknown();
2141       rows[i].AddBodyLine(model);
2142     }
2143     p->model = model;
2144     p->has_drop_cap = rows[start].ri_->has_drop_cap;
2145     p->is_list_item = model->justification() == JUSTIFICATION_RIGHT
2146                           ? rows[start].ri_->rword_indicates_list_item
2147                           : rows[start].ri_->lword_indicates_list_item;
2148     for (int row = start; row < end; row++) {
2149       if ((*row_owners)[row] != nullptr) {
2150         tprintf(
2151             "Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "
2152             "more than once!\n");
2153         delete (*row_owners)[row];
2154       }
2155       (*row_owners)[row] = p;
2156     }
2157   }
2158 }
2159 
2160 struct Interval {
Intervaltesseract::Interval2161   Interval() : begin(0), end(0) {}
Intervaltesseract::Interval2162   Interval(int b, int e) : begin(b), end(e) {}
2163 
2164   int begin;
2165   int end;
2166 };
2167 
2168 // Return whether rows[row] appears to be stranded, meaning that the evidence
2169 // for this row is very weak due to context.  For instance, two lines of source
2170 // code may happen to be indented at the same tab vector as body text starts,
2171 // leading us to think they are two start-of-paragraph lines.  This is not
2172 // optimal.  However, we also don't want to mark a sequence of short dialog
2173 // as "weak," so our heuristic is:
2174 //   (1) If a line is surrounded by lines of unknown type, it's weak.
2175 //   (2) If two lines in a row are start lines for a given paragraph type, but
2176 //       after that the same paragraph type does not continue, they're weak.
RowIsStranded(const std::vector<RowScratchRegisters> & rows,int row)2177 static bool RowIsStranded(const std::vector<RowScratchRegisters> &rows, int row) {
2178   SetOfModels row_models;
2179   rows[row].StrongHypotheses(&row_models);
2180 
2181   for (auto &row_model : row_models) {
2182     bool all_starts = rows[row].GetLineType();
2183     int run_length = 1;
2184     bool continues = true;
2185     for (int i = row - 1; i >= 0 && continues; i--) {
2186       SetOfModels models;
2187       rows[i].NonNullHypotheses(&models);
2188       switch (rows[i].GetLineType(row_model)) {
2189         case LT_START:
2190           run_length++;
2191           break;
2192         case LT_MULTIPLE: // explicit fall-through
2193         case LT_BODY:
2194           run_length++;
2195           all_starts = false;
2196           break;
2197         case LT_UNKNOWN: // explicit fall-through
2198         default:
2199           continues = false;
2200       }
2201     }
2202     continues = true;
2203     for (unsigned i = row + 1; i < rows.size() && continues; i++) {
2204       SetOfModels models;
2205       rows[i].NonNullHypotheses(&models);
2206       switch (rows[i].GetLineType(row_model)) {
2207         case LT_START:
2208           run_length++;
2209           break;
2210         case LT_MULTIPLE: // explicit fall-through
2211         case LT_BODY:
2212           run_length++;
2213           all_starts = false;
2214           break;
2215         case LT_UNKNOWN: // explicit fall-through
2216         default:
2217           continues = false;
2218       }
2219     }
2220     if (run_length > 2 || (!all_starts && run_length > 1)) {
2221       return false;
2222     }
2223   }
2224   return true;
2225 }
2226 
2227 // Go through rows[row_start, row_end) and gather up sequences that need better
2228 // classification.
2229 // + Sequences of non-empty rows without hypotheses.
2230 // + Crown paragraphs not immediately followed by a strongly modeled line.
2231 // + Single line paragraphs surrounded by text that doesn't match the
2232 //   model.
LeftoverSegments(const std::vector<RowScratchRegisters> & rows,std::vector<Interval> * to_fix,int row_start,int row_end)2233 static void LeftoverSegments(const std::vector<RowScratchRegisters> &rows,
2234                              std::vector<Interval> *to_fix, int row_start, int row_end) {
2235   to_fix->clear();
2236   for (int i = row_start; i < row_end; i++) {
2237     bool needs_fixing = false;
2238 
2239     SetOfModels models;
2240     SetOfModels models_w_crowns;
2241     rows[i].StrongHypotheses(&models);
2242     rows[i].NonNullHypotheses(&models_w_crowns);
2243     if (models.empty() && !models_w_crowns.empty()) {
2244       // Crown paragraph.  Is it followed by a modeled line?
2245       for (unsigned end = i + 1; end < rows.size(); end++) {
2246         SetOfModels end_models;
2247         SetOfModels strong_end_models;
2248         rows[end].NonNullHypotheses(&end_models);
2249         rows[end].StrongHypotheses(&strong_end_models);
2250         if (end_models.empty()) {
2251           needs_fixing = true;
2252           break;
2253         } else if (!strong_end_models.empty()) {
2254           needs_fixing = false;
2255           break;
2256         }
2257       }
2258     } else if (models.empty() && rows[i].ri_->num_words > 0) {
2259       // No models at all.
2260       needs_fixing = true;
2261     }
2262 
2263     if (!needs_fixing && !models.empty()) {
2264       needs_fixing = RowIsStranded(rows, i);
2265     }
2266 
2267     if (needs_fixing) {
2268       if (!to_fix->empty() && to_fix->back().end == i - 1) {
2269         to_fix->back().end = i;
2270       } else {
2271         to_fix->push_back(Interval(i, i));
2272       }
2273     }
2274   }
2275   // Convert inclusive intervals to half-open intervals.
2276   for (auto &i : *to_fix) {
2277     i.end = i.end + 1;
2278   }
2279 }
2280 
2281 // Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
2282 // normalize each row_owner to point to an actual PARA, and output the
2283 // paragraphs in order onto paragraphs.
CanonicalizeDetectionResults(std::vector<PARA * > * row_owners,PARA_LIST * paragraphs)2284 void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs) {
2285   std::vector<PARA *> &rows = *row_owners;
2286   paragraphs->clear();
2287   PARA_IT out(paragraphs);
2288   PARA *formerly_null = nullptr;
2289   for (unsigned i = 0; i < rows.size(); i++) {
2290     if (rows[i] == nullptr) {
2291       if (i == 0 || rows[i - 1] != formerly_null) {
2292         rows[i] = formerly_null = new PARA();
2293       } else {
2294         rows[i] = formerly_null;
2295         continue;
2296       }
2297     } else if (i > 0 && rows[i - 1] == rows[i]) {
2298       continue;
2299     }
2300     out.add_after_then_move(rows[i]);
2301   }
2302 }
2303 
2304 // Main entry point for Paragraph Detection Algorithm.
2305 //
2306 // Given a set of equally spaced textlines (described by row_infos),
2307 // Split them into paragraphs.
2308 //
2309 // Output:
2310 //   row_owners - one pointer for each row, to the paragraph it belongs to.
2311 //   paragraphs - this is the actual list of PARA objects.
2312 //   models - the list of paragraph models referenced by the PARA objects.
2313 //            caller is responsible for deleting the models.
DetectParagraphs(int debug_level,std::vector<RowInfo> * row_infos,std::vector<PARA * > * row_owners,PARA_LIST * paragraphs,std::vector<ParagraphModel * > * models)2314 void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
2315                       std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
2316                       std::vector<ParagraphModel *> *models) {
2317   ParagraphTheory theory(models);
2318 
2319   // Initialize row_owners to be a bunch of nullptr pointers.
2320   row_owners->clear();
2321   row_owners->resize(row_infos->size());
2322 
2323   // Set up row scratch registers for the main algorithm.
2324   std::vector<RowScratchRegisters> rows(row_infos->size());
2325   for (unsigned i = 0; i < row_infos->size(); i++) {
2326     rows[i].Init((*row_infos)[i]);
2327   }
2328 
2329   // Pass 1:
2330   //   Detect sequences of lines that all contain leader dots (.....)
2331   //   These are likely Tables of Contents.  If there are three text lines in
2332   //   a row with leader dots, it's pretty safe to say the middle one should
2333   //   be a paragraph of its own.
2334   SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);
2335 
2336   DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
2337 
2338   std::vector<Interval> leftovers;
2339   LeftoverSegments(rows, &leftovers, 0, rows.size());
2340   for (auto &leftover : leftovers) {
2341     // Pass 2a:
2342     //   Find any strongly evidenced start-of-paragraph lines.  If they're
2343     //   followed by two lines that look like body lines, make a paragraph
2344     //   model for that and see if that model applies throughout the text
2345     //   (that is, "smear" it).
2346     StrongEvidenceClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);
2347 
2348     // Pass 2b:
2349     //   If we had any luck in pass 2a, we got part of the page and didn't
2350     //   know how to classify a few runs of rows. Take the segments that
2351     //   didn't find a model and reprocess them individually.
2352     std::vector<Interval> leftovers2;
2353     LeftoverSegments(rows, &leftovers2, leftover.begin, leftover.end);
2354     bool pass2a_was_useful =
2355         leftovers2.size() > 1 ||
2356         (leftovers2.size() == 1 && (leftovers2[0].begin != 0 || static_cast<size_t>(leftovers2[0].end) != rows.size()));
2357     if (pass2a_was_useful) {
2358       for (auto &leftover2 : leftovers2) {
2359         StrongEvidenceClassify(debug_level, &rows, leftover2.begin, leftover2.end, &theory);
2360       }
2361     }
2362   }
2363 
2364   DebugDump(debug_level > 1, "End of Pass 2", theory, rows);
2365 
2366   // Pass 3:
2367   //   These are the dregs for which we didn't have enough strong textual
2368   //   and geometric clues to form matching models for.  Let's see if
2369   //   the geometric clues are simple enough that we could just use those.
2370   LeftoverSegments(rows, &leftovers, 0, rows.size());
2371   for (auto &leftover : leftovers) {
2372     GeometricClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);
2373   }
2374 
2375   // Undo any flush models for which there's little evidence.
2376   DowngradeWeakestToCrowns(debug_level, &theory, &rows);
2377 
2378   DebugDump(debug_level > 1, "End of Pass 3", theory, rows);
2379 
2380   // Pass 4:
2381   //   Take everything that's still not marked up well and clear all markings.
2382   LeftoverSegments(rows, &leftovers, 0, rows.size());
2383   for (auto &leftover : leftovers) {
2384     for (int j = leftover.begin; j < leftover.end; j++) {
2385       rows[j].SetUnknown();
2386     }
2387   }
2388 
2389   DebugDump(debug_level > 1, "End of Pass 4", theory, rows);
2390 
2391   // Convert all of the unique hypothesis runs to PARAs.
2392   ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners, &theory);
2393 
2394   DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);
2395 
2396   // Finally, clean up any dangling nullptr row paragraph parents.
2397   CanonicalizeDetectionResults(row_owners, paragraphs);
2398 }
2399 
2400 // ============ Code interfacing with the rest of Tesseract ==================
2401 
InitializeTextAndBoxesPreRecognition(const MutableIterator & it,RowInfo * info)2402 static void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, RowInfo *info) {
2403   // Set up text, lword_text, and rword_text (mostly for debug printing).
2404   std::string fake_text;
2405   PageIterator pit(static_cast<const PageIterator &>(it));
2406   bool first_word = true;
2407   if (!pit.Empty(RIL_WORD)) {
2408     do {
2409       fake_text += "x";
2410       if (first_word) {
2411         info->lword_text += "x";
2412       }
2413       info->rword_text += "x";
2414       if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
2415           !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {
2416         fake_text += " ";
2417         info->rword_text = "";
2418         first_word = false;
2419       }
2420     } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) && pit.Next(RIL_SYMBOL));
2421   }
2422   if (fake_text.empty()) {
2423     return;
2424   }
2425 
2426   int lspaces = info->pix_ldistance / info->average_interword_space;
2427   for (int i = 0; i < lspaces; i++) {
2428     info->text += ' ';
2429   }
2430   info->text += fake_text;
2431 
2432   // Set up lword_box, rword_box, and num_words.
2433   PAGE_RES_IT page_res_it = *it.PageResIt();
2434   WERD_RES *word_res = page_res_it.restart_row();
2435   ROW_RES *this_row = page_res_it.row();
2436 
2437   WERD_RES *lword = nullptr;
2438   WERD_RES *rword = nullptr;
2439   info->num_words = 0;
2440   do {
2441     if (word_res) {
2442       if (!lword) {
2443         lword = word_res;
2444       }
2445       if (rword != word_res) {
2446         info->num_words++;
2447       }
2448       rword = word_res;
2449     }
2450     word_res = page_res_it.forward();
2451   } while (page_res_it.row() == this_row);
2452 
2453   if (lword) {
2454     info->lword_box = lword->word->bounding_box();
2455   }
2456   if (rword) {
2457     info->rword_box = rword->word->bounding_box();
2458   }
2459 }
2460 
2461 // Given a Tesseract Iterator pointing to a text line, fill in the paragraph
2462 // detector RowInfo with all relevant information from the row.
InitializeRowInfo(bool after_recognition,const MutableIterator & it,RowInfo * info)2463 static void InitializeRowInfo(bool after_recognition, const MutableIterator &it, RowInfo *info) {
2464   if (it.PageResIt()->row() != nullptr) {
2465     ROW *row = it.PageResIt()->row()->row;
2466     info->pix_ldistance = row->lmargin();
2467     info->pix_rdistance = row->rmargin();
2468     info->average_interword_space =
2469         row->space() > 0 ? row->space() : std::max(static_cast<int>(row->x_height()), 1);
2470     info->pix_xheight = row->x_height();
2471     info->has_leaders = false;
2472     info->has_drop_cap = row->has_drop_cap();
2473     info->ltr = true; // set below depending on word scripts
2474   } else {
2475     info->pix_ldistance = info->pix_rdistance = 0;
2476     info->average_interword_space = 1;
2477     info->pix_xheight = 1.0;
2478     info->has_leaders = false;
2479     info->has_drop_cap = false;
2480     info->ltr = true;
2481   }
2482 
2483   info->num_words = 0;
2484   info->lword_indicates_list_item = false;
2485   info->lword_likely_starts_idea = false;
2486   info->lword_likely_ends_idea = false;
2487   info->rword_indicates_list_item = false;
2488   info->rword_likely_starts_idea = false;
2489   info->rword_likely_ends_idea = false;
2490   info->has_leaders = false;
2491   info->ltr = true;
2492 
2493   if (!after_recognition) {
2494     InitializeTextAndBoxesPreRecognition(it, info);
2495     return;
2496   }
2497   info->text = "";
2498   const std::unique_ptr<const char[]> text(it.GetUTF8Text(RIL_TEXTLINE));
2499   int trailing_ws_idx = strlen(text.get()); // strip trailing space
2500   while (trailing_ws_idx > 0 &&
2501          // isspace() only takes ASCII
2502          isascii(text[trailing_ws_idx - 1]) && isspace(text[trailing_ws_idx - 1])) {
2503     trailing_ws_idx--;
2504   }
2505   if (trailing_ws_idx > 0) {
2506     int lspaces = info->pix_ldistance / info->average_interword_space;
2507     for (int i = 0; i < lspaces; i++) {
2508       info->text += ' ';
2509     }
2510     for (int i = 0; i < trailing_ws_idx; i++) {
2511       info->text += text[i];
2512     }
2513   }
2514 
2515   if (info->text.empty()) {
2516     return;
2517   }
2518 
2519   PAGE_RES_IT page_res_it = *it.PageResIt();
2520   std::vector<WERD_RES *> werds;
2521   WERD_RES *word_res = page_res_it.restart_row();
2522   ROW_RES *this_row = page_res_it.row();
2523   int num_leaders = 0;
2524   int ltr = 0;
2525   int rtl = 0;
2526   do {
2527     if (word_res && word_res->best_choice->unichar_string().length() > 0) {
2528       werds.push_back(word_res);
2529       ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
2530       rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
2531       if (word_res->word->flag(W_REP_CHAR)) {
2532         num_leaders++;
2533       }
2534     }
2535     word_res = page_res_it.forward();
2536   } while (page_res_it.row() == this_row);
2537   info->ltr = ltr >= rtl;
2538   info->has_leaders = num_leaders > 3;
2539   info->num_words = werds.size();
2540   if (!werds.empty()) {
2541     WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
2542     info->lword_text = lword->best_choice->unichar_string().c_str();
2543     info->rword_text = rword->best_choice->unichar_string().c_str();
2544     info->lword_box = lword->word->bounding_box();
2545     info->rword_box = rword->word->bounding_box();
2546     LeftWordAttributes(lword->uch_set, lword->best_choice, info->lword_text,
2547                        &info->lword_indicates_list_item, &info->lword_likely_starts_idea,
2548                        &info->lword_likely_ends_idea);
2549     RightWordAttributes(rword->uch_set, rword->best_choice, info->rword_text,
2550                         &info->rword_indicates_list_item, &info->rword_likely_starts_idea,
2551                         &info->rword_likely_ends_idea);
2552   }
2553 }
2554 
2555 // This is called after rows have been identified and words are recognized.
2556 // Much of this could be implemented before word recognition, but text helps
2557 // to identify bulleted lists and gives good signals for sentence boundaries.
DetectParagraphs(int debug_level,bool after_text_recognition,const MutableIterator * block_start,std::vector<ParagraphModel * > * models)2558 void DetectParagraphs(int debug_level, bool after_text_recognition,
2559                       const MutableIterator *block_start, std::vector<ParagraphModel *> *models) {
2560   // Clear out any preconceived notions.
2561   if (block_start->Empty(RIL_TEXTLINE)) {
2562     return;
2563   }
2564   BLOCK *block = block_start->PageResIt()->block()->block;
2565   block->para_list()->clear();
2566   bool is_image_block = block->pdblk.poly_block() && !block->pdblk.poly_block()->IsText();
2567 
2568   // Convert the Tesseract structures to RowInfos
2569   // for the paragraph detection algorithm.
2570   MutableIterator row(*block_start);
2571   if (row.Empty(RIL_TEXTLINE)) {
2572     return; // end of input already.
2573   }
2574 
2575   std::vector<RowInfo> row_infos;
2576   do {
2577     if (!row.PageResIt()->row()) {
2578       continue; // empty row.
2579     }
2580     row.PageResIt()->row()->row->set_para(nullptr);
2581     row_infos.emplace_back();
2582     RowInfo &ri = row_infos.back();
2583     InitializeRowInfo(after_text_recognition, row, &ri);
2584   } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) && row.Next(RIL_TEXTLINE));
2585 
2586   // If we're called before text recognition, we might not have
2587   // tight block bounding boxes, so trim by the minimum on each side.
2588   if (!row_infos.empty()) {
2589     int min_lmargin = row_infos[0].pix_ldistance;
2590     int min_rmargin = row_infos[0].pix_rdistance;
2591     for (unsigned i = 1; i < row_infos.size(); i++) {
2592       if (row_infos[i].pix_ldistance < min_lmargin) {
2593         min_lmargin = row_infos[i].pix_ldistance;
2594       }
2595       if (row_infos[i].pix_rdistance < min_rmargin) {
2596         min_rmargin = row_infos[i].pix_rdistance;
2597       }
2598     }
2599     if (min_lmargin > 0 || min_rmargin > 0) {
2600       for (auto &row_info : row_infos) {
2601         row_info.pix_ldistance -= min_lmargin;
2602         row_info.pix_rdistance -= min_rmargin;
2603       }
2604     }
2605   }
2606 
2607   // Run the paragraph detection algorithm.
2608   std::vector<PARA *> row_owners;
2609   std::vector<PARA *> the_paragraphs;
2610   if (!is_image_block) {
2611     DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(), models);
2612   } else {
2613     row_owners.resize(row_infos.size());
2614     CanonicalizeDetectionResults(&row_owners, block->para_list());
2615   }
2616 
2617   // Now stitch in the row_owners into the rows.
2618   row = *block_start;
2619   for (auto &row_owner : row_owners) {
2620     while (!row.PageResIt()->row()) {
2621       row.Next(RIL_TEXTLINE);
2622     }
2623     row.PageResIt()->row()->row->set_para(row_owner);
2624     row.Next(RIL_TEXTLINE);
2625   }
2626 }
2627 
2628 } // namespace tesseract
2629