1 /**********************************************************************
2 * File: paragraphs.cpp
3 * Description: Paragraph detection for tesseract.
4 * Author: David Eger
5 *
6 * (C) Copyright 2011, Google Inc.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 #include "paragraphs.h"
20
21 #include "helpers.h" // for UpdateRange, ClipToRange
22 #include "host.h" // for NearlyEqual
23 #include "mutableiterator.h" // for MutableIterator
24 #include "ocrblock.h" // for BLOCK
25 #include "ocrpara.h" // for ParagraphModel, PARA, PARA_IT, PARA...
26 #include "ocrrow.h" // for ROW
27 #include "pageres.h" // for PAGE_RES_IT, WERD_RES, ROW_RES, BLO...
28 #include "paragraphs_internal.h" // for RowScratchRegisters, SetOfModels
29 #include "pdblock.h" // for PDBLK
30 #include "polyblk.h" // for POLY_BLOCK
31 #include "ratngs.h" // for WERD_CHOICE
32 #include "rect.h" // for TBOX
33 #include "statistc.h" // for STATS
34 #include "tprintf.h" // for tprintf
35 #include "unicharset.h" // for UNICHARSET
36 #include "werd.h" // for WERD, W_REP_CHAR
37
38 #include <tesseract/pageiterator.h> // for PageIterator
39 #include <tesseract/publictypes.h> // for JUSTIFICATION_LEFT, JUSTIFICATION_R...
40 #include <tesseract/unichar.h> // for UNICHAR, UNICHAR_ID
41
42 #include <algorithm> // for max
43 #include <cctype> // for isspace
44 #include <cmath> // for abs
45 #include <cstdio> // for snprintf
46 #include <cstdlib> // for abs
47 #include <cstring> // for strchr, strlen
48 #include <memory> // for unique_ptr
49
50 static const char *const kRLE = "\u202A"; // Right-to-Left Embedding
51 static const char *const kPDF = "\u202C"; // Pop Directional Formatting
52
53 namespace tesseract {
54
55 // Special "weak" ParagraphModels.
56 const ParagraphModel *kCrownLeft =
57 reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD111F));
58 const ParagraphModel *kCrownRight =
59 reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD888F));
60
61 // Do the text and geometry of two rows support a paragraph break between them?
62 static bool LikelyParagraphStart(const RowScratchRegisters &before,
63 const RowScratchRegisters &after,
64 tesseract::ParagraphJustification j);
65
66 // Given the width of a typical space between words, what is the threshold
67 // by which by which we think left and right alignments for paragraphs
68 // can vary and still be aligned.
Epsilon(int space_pix)69 static int Epsilon(int space_pix) {
70 return space_pix * 4 / 5;
71 }
72
AcceptableRowArgs(int debug_level,int min_num_rows,const char * function_name,const std::vector<RowScratchRegisters> * rows,int row_start,int row_end)73 static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name,
74 const std::vector<RowScratchRegisters> *rows, int row_start,
75 int row_end) {
76 if (row_start < 0 || static_cast<size_t>(row_end) > rows->size() || row_start > row_end) {
77 tprintf("Invalid arguments rows[%d, %d) while rows is of size %zu.\n", row_start, row_end,
78 rows->size());
79 return false;
80 }
81 if (row_end - row_start < min_num_rows) {
82 if (debug_level > 1) {
83 tprintf("# Too few rows[%d, %d) for %s.\n", row_start, row_end, function_name);
84 }
85 return false;
86 }
87 return true;
88 }
89
90 // =============================== Debug Code ================================
91
92 // Given a row-major matrix of unicode text and a column separator, print
93 // a formatted table. For ASCII, we get good column alignment.
PrintTable(const std::vector<std::vector<std::string>> & rows,const char * colsep)94 static void PrintTable(const std::vector<std::vector<std::string>> &rows, const char *colsep) {
95 std::vector<int> max_col_widths;
96 for (const auto &row : rows) {
97 auto num_columns = row.size();
98 for (size_t c = 0; c < num_columns; c++) {
99 int num_unicodes = 0;
100 for (char i : row[c]) {
101 if ((i & 0xC0) != 0x80) {
102 num_unicodes++;
103 }
104 }
105 if (c >= max_col_widths.size()) {
106 max_col_widths.push_back(num_unicodes);
107 } else {
108 if (num_unicodes > max_col_widths[c]) {
109 max_col_widths[c] = num_unicodes;
110 }
111 }
112 }
113 }
114
115 std::vector<std::string> col_width_patterns;
116 col_width_patterns.reserve(max_col_widths.size());
117 for (int max_col_width : max_col_widths) {
118 col_width_patterns.push_back(std::string("%-") + std::to_string(max_col_width) + "s");
119 }
120
121 for (const auto &row : rows) {
122 for (unsigned c = 0; c < row.size(); c++) {
123 if (c > 0) {
124 tprintf("%s", colsep);
125 }
126 tprintf(col_width_patterns[c].c_str(), row[c].c_str());
127 }
128 tprintf("\n");
129 }
130 }
131
RtlEmbed(const std::string & word,bool rtlify)132 static std::string RtlEmbed(const std::string &word, bool rtlify) {
133 if (rtlify) {
134 return std::string(kRLE) + word + std::string(kPDF);
135 }
136 return word;
137 }
138
139 // Print the current thoughts of the paragraph detector.
PrintDetectorState(const ParagraphTheory & theory,const std::vector<RowScratchRegisters> & rows)140 static void PrintDetectorState(const ParagraphTheory &theory,
141 const std::vector<RowScratchRegisters> &rows) {
142 std::vector<std::vector<std::string>> output;
143 output.emplace_back();
144 output.back().push_back("#row");
145 output.back().push_back("space");
146 output.back().push_back("..");
147 output.back().push_back("lword[widthSEL]");
148 output.back().push_back("rword[widthSEL]");
149 RowScratchRegisters::AppendDebugHeaderFields(output.back());
150 output.back().push_back("text");
151
152 for (unsigned i = 0; i < rows.size(); i++) {
153 output.emplace_back();
154 std::vector<std::string> &row = output.back();
155 const RowInfo &ri = *rows[i].ri_;
156 row.push_back(std::to_string(i));
157 row.push_back(std::to_string(ri.average_interword_space));
158 row.emplace_back(ri.has_leaders ? ".." : " ");
159 row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) + "[" + std::to_string(ri.lword_box.width()) +
160 (ri.lword_likely_starts_idea ? "S" : "s") +
161 (ri.lword_likely_ends_idea ? "E" : "e") +
162 (ri.lword_indicates_list_item ? "L" : "l") + "]");
163 row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) + "[" + std::to_string(ri.rword_box.width()) +
164 (ri.rword_likely_starts_idea ? "S" : "s") +
165 (ri.rword_likely_ends_idea ? "E" : "e") +
166 (ri.rword_indicates_list_item ? "L" : "l") + "]");
167 rows[i].AppendDebugInfo(theory, row);
168 row.push_back(RtlEmbed(ri.text, !ri.ltr));
169 }
170 PrintTable(output, " ");
171
172 tprintf("Active Paragraph Models:\n");
173 unsigned m = 0;
174 for (const auto &model : theory.models()) {
175 tprintf(" %d: %s\n", ++m, model->ToString().c_str());
176 }
177 }
178
DebugDump(bool should_print,const char * phase,const ParagraphTheory & theory,const std::vector<RowScratchRegisters> & rows)179 static void DebugDump(bool should_print, const char *phase, const ParagraphTheory &theory,
180 const std::vector<RowScratchRegisters> &rows) {
181 if (!should_print) {
182 return;
183 }
184 tprintf("# %s\n", phase);
185 PrintDetectorState(theory, rows);
186 }
187
188 // Print out the text for rows[row_start, row_end)
PrintRowRange(const std::vector<RowScratchRegisters> & rows,int row_start,int row_end)189 static void PrintRowRange(const std::vector<RowScratchRegisters> &rows, int row_start,
190 int row_end) {
191 tprintf("======================================\n");
192 for (int row = row_start; row < row_end; row++) {
193 tprintf("%s\n", rows[row].ri_->text.c_str());
194 }
195 tprintf("======================================\n");
196 }
197
198 // ============= Brain Dead Language Model (ASCII Version) ===================
199
IsLatinLetter(int ch)200 static bool IsLatinLetter(int ch) {
201 return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
202 }
203
IsDigitLike(int ch)204 static bool IsDigitLike(int ch) {
205 return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';
206 }
207
IsOpeningPunct(int ch)208 static bool IsOpeningPunct(int ch) {
209 return strchr("'\"({[", ch) != nullptr;
210 }
211
IsTerminalPunct(int ch)212 static bool IsTerminalPunct(int ch) {
213 return strchr(":'\".?!]})", ch) != nullptr;
214 }
215
216 // Return a pointer after consuming as much text as qualifies as roman numeral.
SkipChars(const char * str,const char * toskip)217 static const char *SkipChars(const char *str, const char *toskip) {
218 while (*str != '\0' && strchr(toskip, *str)) {
219 str++;
220 }
221 return str;
222 }
223
SkipChars(const char * str,bool (* skip)(int))224 static const char *SkipChars(const char *str, bool (*skip)(int)) {
225 while (*str != '\0' && skip(*str)) {
226 str++;
227 }
228 return str;
229 }
230
SkipOne(const char * str,const char * toskip)231 static const char *SkipOne(const char *str, const char *toskip) {
232 if (*str != '\0' && strchr(toskip, *str)) {
233 return str + 1;
234 }
235 return str;
236 }
237
238 // Return whether it is very likely that this is a numeral marker that could
239 // start a list item. Some examples include:
240 // A I iii. VI (2) 3.5. [C-4]
LikelyListNumeral(const std::string & word)241 static bool LikelyListNumeral(const std::string &word) {
242 const char *kRomans = "ivxlmdIVXLMD";
243 const char *kDigits = "012345789";
244 const char *kOpen = "[{(";
245 const char *kSep = ":;-.,";
246 const char *kClose = "]})";
247
248 int num_segments = 0;
249 const char *pos = word.c_str();
250 while (*pos != '\0' && num_segments < 3) {
251 // skip up to two open parens.
252 const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
253 const char *numeral_end = SkipChars(numeral_start, kRomans);
254 if (numeral_end != numeral_start) {
255 // Got Roman Numeral. Great.
256 } else {
257 numeral_end = SkipChars(numeral_start, kDigits);
258 if (numeral_end == numeral_start) {
259 // If there's a single latin letter, we can use that.
260 numeral_end = SkipChars(numeral_start, IsLatinLetter);
261 if (numeral_end - numeral_start != 1) {
262 break;
263 }
264 }
265 }
266 // We got some sort of numeral.
267 num_segments++;
268 // Skip any trailing parens or punctuation.
269 pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
270 if (pos == numeral_end) {
271 break;
272 }
273 }
274 return *pos == '\0';
275 }
276
LikelyListMark(const std::string & word)277 static bool LikelyListMark(const std::string &word) {
278 const char *kListMarks = "0Oo*.,+.";
279 return word.size() == 1 && strchr(kListMarks, word[0]) != nullptr;
280 }
281
AsciiLikelyListItem(const std::string & word)282 bool AsciiLikelyListItem(const std::string &word) {
283 return LikelyListMark(word) || LikelyListNumeral(word);
284 }
285
286 // ========== Brain Dead Language Model (Tesseract Version) ================
287
288 // Return the first Unicode Codepoint from werd[pos].
UnicodeFor(const UNICHARSET * u,const WERD_CHOICE * werd,unsigned pos)289 static int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, unsigned pos) {
290 if (!u || !werd || pos > werd->length()) {
291 return 0;
292 }
293 return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
294 }
295
296 // A useful helper class for finding the first j >= i so that word[j]
297 // does not have given character type.
298 class UnicodeSpanSkipper {
299 public:
UnicodeSpanSkipper(const UNICHARSET * unicharset,const WERD_CHOICE * word)300 UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
301 : u_(unicharset), word_(word), wordlen_(word->length()) {
302 }
303
304 // Given an input position, return the first position >= pos not punc.
305 unsigned SkipPunc(unsigned pos);
306 // Given an input position, return the first position >= pos not digit.
307 unsigned SkipDigits(unsigned pos);
308 // Given an input position, return the first position >= pos not roman.
309 unsigned SkipRomans(unsigned pos);
310 // Given an input position, return the first position >= pos not alpha.
311 unsigned SkipAlpha(unsigned pos);
312
313 private:
314 const UNICHARSET *u_;
315 const WERD_CHOICE *word_;
316 unsigned wordlen_;
317 };
318
SkipPunc(unsigned pos)319 unsigned UnicodeSpanSkipper::SkipPunc(unsigned pos) {
320 while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) {
321 pos++;
322 }
323 return pos;
324 }
325
SkipDigits(unsigned pos)326 unsigned UnicodeSpanSkipper::SkipDigits(unsigned pos) {
327 while (pos < wordlen_ &&
328 (u_->get_isdigit(word_->unichar_id(pos)) || IsDigitLike(UnicodeFor(u_, word_, pos)))) {
329 pos++;
330 }
331 return pos;
332 }
333
SkipRomans(unsigned pos)334 unsigned UnicodeSpanSkipper::SkipRomans(unsigned pos) {
335 const char *kRomans = "ivxlmdIVXLMD";
336 while (pos < wordlen_) {
337 int ch = UnicodeFor(u_, word_, pos);
338 if (ch >= 0xF0 || strchr(kRomans, ch) == nullptr) {
339 break;
340 }
341 pos++;
342 }
343 return pos;
344 }
345
SkipAlpha(unsigned pos)346 unsigned UnicodeSpanSkipper::SkipAlpha(unsigned pos) {
347 while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) {
348 pos++;
349 }
350 return pos;
351 }
352
LikelyListMarkUnicode(int ch)353 static bool LikelyListMarkUnicode(int ch) {
354 if (ch < 0x80) {
355 std::string single_ch;
356 single_ch += ch;
357 return LikelyListMark(single_ch);
358 }
359 switch (ch) {
360 // TODO(eger) expand this list of unicodes as needed.
361 case 0x00B0: // degree sign
362 case 0x2022: // bullet
363 case 0x25E6: // white bullet
364 case 0x00B7: // middle dot
365 case 0x25A1: // white square
366 case 0x25A0: // black square
367 case 0x25AA: // black small square
368 case 0x2B1D: // black very small square
369 case 0x25BA: // black right-pointing pointer
370 case 0x25CF: // black circle
371 case 0x25CB: // white circle
372 return true;
373 default:
374 break; // fall through
375 }
376 return false;
377 }
378
379 // Return whether it is very likely that this is a numeral marker that could
380 // start a list item. Some examples include:
381 // A I iii. VI (2) 3.5. [C-4]
UniLikelyListItem(const UNICHARSET * u,const WERD_CHOICE * werd)382 static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
383 if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0))) {
384 return true;
385 }
386
387 UnicodeSpanSkipper m(u, werd);
388 int num_segments = 0;
389 unsigned pos = 0;
390 while (pos < werd->length() && num_segments < 3) {
391 auto numeral_start = m.SkipPunc(pos);
392 if (numeral_start > pos + 1) {
393 break;
394 }
395 auto numeral_end = m.SkipRomans(numeral_start);
396 if (numeral_end == numeral_start) {
397 numeral_end = m.SkipDigits(numeral_start);
398 if (numeral_end == numeral_start) {
399 // If there's a single latin letter, we can use that.
400 numeral_end = m.SkipAlpha(numeral_start);
401 if (numeral_end - numeral_start != 1) {
402 break;
403 }
404 }
405 }
406 // We got some sort of numeral.
407 num_segments++;
408 // Skip any trailing punctuation.
409 pos = m.SkipPunc(numeral_end);
410 if (pos == numeral_end) {
411 break;
412 }
413 }
414 return pos == werd->length();
415 }
416
417 template<class T>
push_back_new(std::vector<T> & vector,const T & data)418 void push_back_new(std::vector<T> &vector, const T &data) {
419 if (std::find(vector.begin(), vector.end(), data) == vector.end()) {
420 vector.push_back(data);
421 }
422 }
423
424 // ========= Brain Dead Language Model (combined entry points) ================
425
426 // Given the leftmost word of a line either as a Tesseract unicharset + werd
427 // or a utf8 string, set the following attributes for it:
428 // is_list - this word might be a list number or bullet.
429 // starts_idea - this word is likely to start a sentence.
430 // ends_idea - this word is likely to end a sentence.
LeftWordAttributes(const UNICHARSET * unicharset,const WERD_CHOICE * werd,const std::string & utf8,bool * is_list,bool * starts_idea,bool * ends_idea)431 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
432 bool *is_list, bool *starts_idea, bool *ends_idea) {
433 *is_list = false;
434 *starts_idea = false;
435 *ends_idea = false;
436 if (utf8.empty() || (werd != nullptr && werd->empty())) { // Empty
437 *ends_idea = true;
438 return;
439 }
440
441 if (unicharset && werd) { // We have a proper werd and unicharset so use it.
442 if (UniLikelyListItem(unicharset, werd)) {
443 *is_list = true;
444 *starts_idea = true;
445 *ends_idea = true;
446 }
447 if (unicharset->get_isupper(werd->unichar_id(0))) {
448 *starts_idea = true;
449 }
450 if (unicharset->get_ispunctuation(werd->unichar_id(0))) {
451 *starts_idea = true;
452 *ends_idea = true;
453 }
454 } else { // Assume utf8 is mostly ASCII
455 if (AsciiLikelyListItem(utf8)) {
456 *is_list = true;
457 *starts_idea = true;
458 }
459 int start_letter = utf8[0];
460 if (IsOpeningPunct(start_letter)) {
461 *starts_idea = true;
462 }
463 if (IsTerminalPunct(start_letter)) {
464 *ends_idea = true;
465 }
466 if (start_letter >= 'A' && start_letter <= 'Z') {
467 *starts_idea = true;
468 }
469 }
470 }
471
472 // Given the rightmost word of a line either as a Tesseract unicharset + werd
473 // or a utf8 string, set the following attributes for it:
474 // is_list - this word might be a list number or bullet.
475 // starts_idea - this word is likely to start a sentence.
476 // ends_idea - this word is likely to end a sentence.
RightWordAttributes(const UNICHARSET * unicharset,const WERD_CHOICE * werd,const std::string & utf8,bool * is_list,bool * starts_idea,bool * ends_idea)477 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
478 bool *is_list, bool *starts_idea, bool *ends_idea) {
479 *is_list = false;
480 *starts_idea = false;
481 *ends_idea = false;
482 if (utf8.empty() || (werd != nullptr && werd->empty())) { // Empty
483 *ends_idea = true;
484 return;
485 }
486
487 if (unicharset && werd) { // We have a proper werd and unicharset so use it.
488 if (UniLikelyListItem(unicharset, werd)) {
489 *is_list = true;
490 *starts_idea = true;
491 }
492 UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);
493 if (unicharset->get_ispunctuation(last_letter)) {
494 *ends_idea = true;
495 }
496 } else { // Assume utf8 is mostly ASCII
497 if (AsciiLikelyListItem(utf8)) {
498 *is_list = true;
499 *starts_idea = true;
500 }
501 int last_letter = utf8[utf8.size() - 1];
502 if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
503 *ends_idea = true;
504 }
505 }
506 }
507
508 // =============== Implementation of RowScratchRegisters =====================
509 /* static */
AppendDebugHeaderFields(std::vector<std::string> & header)510 void RowScratchRegisters::AppendDebugHeaderFields(std::vector<std::string> &header) {
511 header.emplace_back("[lmarg,lind;rind,rmarg]");
512 header.emplace_back("model");
513 }
514
AppendDebugInfo(const ParagraphTheory & theory,std::vector<std::string> & dbg) const515 void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
516 std::vector<std::string> &dbg) const {
517 char s[30];
518 snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]", lmargin_, lindent_, rindent_, rmargin_);
519 dbg.emplace_back(s);
520 std::string model_string;
521 model_string += static_cast<char>(GetLineType());
522 model_string += ":";
523
524 int model_numbers = 0;
525 for (const auto &hypothese : hypotheses_) {
526 if (hypothese.model == nullptr) {
527 continue;
528 }
529 if (model_numbers > 0) {
530 model_string += ",";
531 }
532 if (StrongModel(hypothese.model)) {
533 model_string += std::to_string(1 + theory.IndexOf(hypothese.model));
534 } else if (hypothese.model == kCrownLeft) {
535 model_string += "CrL";
536 } else if (hypothese.model == kCrownRight) {
537 model_string += "CrR";
538 }
539 model_numbers++;
540 }
541 if (model_numbers == 0) {
542 model_string += "0";
543 }
544
545 dbg.push_back(model_string);
546 }
547
Init(const RowInfo & row)548 void RowScratchRegisters::Init(const RowInfo &row) {
549 ri_ = &row;
550 lmargin_ = 0;
551 lindent_ = row.pix_ldistance;
552 rmargin_ = 0;
553 rindent_ = row.pix_rdistance;
554 }
555
GetLineType() const556 LineType RowScratchRegisters::GetLineType() const {
557 if (hypotheses_.empty()) {
558 return LT_UNKNOWN;
559 }
560 bool has_start = false;
561 bool has_body = false;
562 for (const auto &hypothese : hypotheses_) {
563 switch (hypothese.ty) {
564 case LT_START:
565 has_start = true;
566 break;
567 case LT_BODY:
568 has_body = true;
569 break;
570 default:
571 tprintf("Encountered bad value in hypothesis list: %c\n", hypothese.ty);
572 break;
573 }
574 }
575 if (has_start && has_body) {
576 return LT_MULTIPLE;
577 }
578 return has_start ? LT_START : LT_BODY;
579 }
580
GetLineType(const ParagraphModel * model) const581 LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const {
582 if (hypotheses_.empty()) {
583 return LT_UNKNOWN;
584 }
585 bool has_start = false;
586 bool has_body = false;
587 for (const auto &hypothese : hypotheses_) {
588 if (hypothese.model != model) {
589 continue;
590 }
591 switch (hypothese.ty) {
592 case LT_START:
593 has_start = true;
594 break;
595 case LT_BODY:
596 has_body = true;
597 break;
598 default:
599 tprintf("Encountered bad value in hypothesis list: %c\n", hypothese.ty);
600 break;
601 }
602 }
603 if (has_start && has_body) {
604 return LT_MULTIPLE;
605 }
606 return has_start ? LT_START : LT_BODY;
607 }
608
SetStartLine()609 void RowScratchRegisters::SetStartLine() {
610 LineType current_lt = GetLineType();
611 if (current_lt != LT_UNKNOWN && current_lt != LT_START) {
612 tprintf("Trying to set a line to be START when it's already BODY.\n");
613 }
614 if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) {
615 push_back_new(hypotheses_, LineHypothesis(LT_START, nullptr));
616 }
617 }
618
SetBodyLine()619 void RowScratchRegisters::SetBodyLine() {
620 LineType current_lt = GetLineType();
621 if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) {
622 tprintf("Trying to set a line to be BODY when it's already START.\n");
623 }
624 if (current_lt == LT_UNKNOWN || current_lt == LT_START) {
625 push_back_new(hypotheses_, LineHypothesis(LT_BODY, nullptr));
626 }
627 }
628
AddStartLine(const ParagraphModel * model)629 void RowScratchRegisters::AddStartLine(const ParagraphModel *model) {
630 push_back_new(hypotheses_, LineHypothesis(LT_START, model));
631 auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_START, nullptr));
632 if (found != hypotheses_.end()) {
633 hypotheses_.erase(found);
634 }
635 }
636
AddBodyLine(const ParagraphModel * model)637 void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) {
638 push_back_new(hypotheses_, LineHypothesis(LT_BODY, model));
639 auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_BODY, nullptr));
640 if (found != hypotheses_.end()) {
641 hypotheses_.erase(found);
642 }
643 }
644
StartHypotheses(SetOfModels * models) const645 void RowScratchRegisters::StartHypotheses(SetOfModels *models) const {
646 for (const auto &hypothese : hypotheses_) {
647 if (hypothese.ty == LT_START && StrongModel(hypothese.model)) {
648 push_back_new(*models, hypothese.model);
649 }
650 }
651 }
652
StrongHypotheses(SetOfModels * models) const653 void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const {
654 for (const auto &hypothese : hypotheses_) {
655 if (StrongModel(hypothese.model)) {
656 push_back_new(*models, hypothese.model);
657 }
658 }
659 }
660
NonNullHypotheses(SetOfModels * models) const661 void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const {
662 for (const auto &hypothese : hypotheses_) {
663 if (hypothese.model != nullptr) {
664 push_back_new(*models, hypothese.model);
665 }
666 }
667 }
668
UniqueStartHypothesis() const669 const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const {
670 if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START) {
671 return nullptr;
672 }
673 return hypotheses_[0].model;
674 }
675
UniqueBodyHypothesis() const676 const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const {
677 if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY) {
678 return nullptr;
679 }
680 return hypotheses_[0].model;
681 }
682
683 // Discard any hypotheses whose model is not in the given list.
DiscardNonMatchingHypotheses(const SetOfModels & models)684 void RowScratchRegisters::DiscardNonMatchingHypotheses(const SetOfModels &models) {
685 if (models.empty()) {
686 return;
687 }
688 for (int h = hypotheses_.size() - 1; h >= 0; h--) {
689 if (!contains(models, hypotheses_[h].model)) {
690 hypotheses_.erase(hypotheses_.begin() + h);
691 }
692 }
693 }
694
695 // ============ Geometry based Paragraph Detection Algorithm =================
696
697 struct Cluster {
Clustertesseract::Cluster698 Cluster() : center(0), count(0) {}
Clustertesseract::Cluster699 Cluster(int cen, int num) : center(cen), count(num) {}
700
701 int center; // The center of the cluster.
702 int count; // The number of entries within the cluster.
703 };
704
705 class SimpleClusterer {
706 public:
SimpleClusterer(int max_cluster_width)707 explicit SimpleClusterer(int max_cluster_width) : max_cluster_width_(max_cluster_width) {}
Add(int value)708 void Add(int value) {
709 values_.push_back(value);
710 }
size() const711 size_t size() const {
712 return values_.size();
713 }
714 void GetClusters(std::vector<Cluster> *clusters);
715
716 private:
717 int max_cluster_width_;
718 std::vector<int> values_;
719 };
720
721 // Return the index of the cluster closest to value.
ClosestCluster(const std::vector<Cluster> & clusters,int value)722 static int ClosestCluster(const std::vector<Cluster> &clusters, int value) {
723 unsigned best_index = 0;
724 for (unsigned i = 0; i < clusters.size(); i++) {
725 if (abs(value - clusters[i].center) < abs(value - clusters[best_index].center)) {
726 best_index = i;
727 }
728 }
729 return best_index;
730 }
731
GetClusters(std::vector<Cluster> * clusters)732 void SimpleClusterer::GetClusters(std::vector<Cluster> *clusters) {
733 clusters->clear();
734 std::sort(values_.begin(), values_.end());
735 for (unsigned i = 0; i < values_.size();) {
736 int orig_i = i;
737 int lo = values_[i];
738 int hi = lo;
739 while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {
740 hi = values_[i];
741 }
742 clusters->push_back(Cluster((hi + lo) / 2, i - orig_i));
743 }
744 }
745
746 // Calculate left- and right-indent tab stop values seen in
747 // rows[row_start, row_end) given a tolerance of tolerance.
CalculateTabStops(std::vector<RowScratchRegisters> * rows,int row_start,int row_end,int tolerance,std::vector<Cluster> * left_tabs,std::vector<Cluster> * right_tabs)748 static void CalculateTabStops(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
749 int tolerance, std::vector<Cluster> *left_tabs,
750 std::vector<Cluster> *right_tabs) {
751 if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end)) {
752 return;
753 }
754 // First pass: toss all left and right indents into clusterers.
755 SimpleClusterer initial_lefts(tolerance);
756 SimpleClusterer initial_rights(tolerance);
757 std::vector<Cluster> initial_left_tabs;
758 std::vector<Cluster> initial_right_tabs;
759 for (int i = row_start; i < row_end; i++) {
760 initial_lefts.Add((*rows)[i].lindent_);
761 initial_rights.Add((*rows)[i].rindent_);
762 }
763 initial_lefts.GetClusters(&initial_left_tabs);
764 initial_rights.GetClusters(&initial_right_tabs);
765
766 // Second pass: cluster only lines that are not "stray"
767 // An example of a stray line is a page number -- a line whose start
768 // and end tab-stops are far outside the typical start and end tab-stops
769 // for the block.
770 // Put another way, we only cluster data from lines whose start or end
771 // tab stop is frequent.
772 SimpleClusterer lefts(tolerance);
773 SimpleClusterer rights(tolerance);
774
775 // Outlier elimination. We might want to switch this to test outlier-ness
776 // based on how strange a position an outlier is in instead of or in addition
777 // to how rare it is. These outliers get re-added if we end up having too
778 // few tab stops, to work with, however.
779 int infrequent_enough_to_ignore = 0;
780 if (row_end - row_start >= 8) {
781 infrequent_enough_to_ignore = 1;
782 }
783 if (row_end - row_start >= 20) {
784 infrequent_enough_to_ignore = 2;
785 }
786
787 for (int i = row_start; i < row_end; i++) {
788 int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
789 int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
790 if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
791 initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
792 lefts.Add((*rows)[i].lindent_);
793 rights.Add((*rows)[i].rindent_);
794 }
795 }
796 lefts.GetClusters(left_tabs);
797 rights.GetClusters(right_tabs);
798
799 if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
800 (right_tabs->size() == 1 && left_tabs->size() >= 4)) {
801 // One side is really ragged, and the other only has one tab stop,
802 // so those "insignificant outliers" are probably important, actually.
803 // This often happens on a page of an index. Add back in the ones
804 // we omitted in the first pass.
805 for (int i = row_start; i < row_end; i++) {
806 int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
807 int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
808 if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
809 initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
810 lefts.Add((*rows)[i].lindent_);
811 rights.Add((*rows)[i].rindent_);
812 }
813 }
814 }
815 lefts.GetClusters(left_tabs);
816 rights.GetClusters(right_tabs);
817
818 // If one side is almost a two-indent aligned side, and the other clearly
819 // isn't, try to prune out the least frequent tab stop from that side.
820 if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
821 int to_prune = -1;
822 for (int i = left_tabs->size() - 1; i >= 0; i--) {
823 if (to_prune < 0 || (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
824 to_prune = i;
825 }
826 }
827 if (to_prune >= 0 && (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
828 left_tabs->erase(left_tabs->begin() + to_prune);
829 }
830 }
831 if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
832 int to_prune = -1;
833 for (int i = right_tabs->size() - 1; i >= 0; i--) {
834 if (to_prune < 0 || (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
835 to_prune = i;
836 }
837 }
838 if (to_prune >= 0 && (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
839 right_tabs->erase(right_tabs->begin() + to_prune);
840 }
841 }
842 }
843
844 // Given a paragraph model mark rows[row_start, row_end) as said model
845 // start or body lines.
846 //
847 // Case 1: model->first_indent_ != model->body_indent_
848 // Differentiating the paragraph start lines from the paragraph body lines in
849 // this case is easy, we just see how far each line is indented.
850 //
851 // Case 2: model->first_indent_ == model->body_indent_
852 // Here, we find end-of-paragraph lines by looking for "short lines."
853 // What constitutes a "short line" changes depending on whether the text
854 // ragged-right[left] or fully justified (aligned left and right).
855 //
856 // Case 2a: Ragged Right (or Left) text. (eop_threshold == 0)
857 // We have a new paragraph it the first word would have at the end
858 // of the previous line.
859 //
860 // Case 2b: Fully Justified. (eop_threshold > 0)
861 // We mark a line as short (end of paragraph) if the offside indent
862 // is greater than eop_threshold.
MarkRowsWithModel(std::vector<RowScratchRegisters> * rows,int row_start,int row_end,const ParagraphModel * model,bool ltr,int eop_threshold)863 static void MarkRowsWithModel(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
864 const ParagraphModel *model, bool ltr, int eop_threshold) {
865 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
866 return;
867 }
868 for (int row = row_start; row < row_end; row++) {
869 bool valid_first = ValidFirstLine(rows, row, model);
870 bool valid_body = ValidBodyLine(rows, row, model);
871 if (valid_first && !valid_body) {
872 (*rows)[row].AddStartLine(model);
873 } else if (valid_body && !valid_first) {
874 (*rows)[row].AddBodyLine(model);
875 } else if (valid_body && valid_first) {
876 bool after_eop = (row == row_start);
877 if (row > row_start) {
878 if (eop_threshold > 0) {
879 if (model->justification() == JUSTIFICATION_LEFT) {
880 after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
881 } else {
882 after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
883 }
884 } else {
885 after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row], model->justification());
886 }
887 }
888 if (after_eop) {
889 (*rows)[row].AddStartLine(model);
890 } else {
891 (*rows)[row].AddBodyLine(model);
892 }
893 } else {
894 // Do nothing. Stray row.
895 }
896 }
897 }
898
899 // GeometricClassifierState holds all of the information we'll use while
900 // trying to determine a paragraph model for the text lines in a block of
901 // text:
902 // + the rows under consideration [row_start, row_end)
903 // + the common left- and right-indent tab stops
904 // + does the block start out left-to-right or right-to-left
905 // Further, this struct holds the data we amass for the (single) ParagraphModel
906 // we'll assign to the text lines (assuming we get that far).
907 struct GeometricClassifierState {
GeometricClassifierStatetesseract::GeometricClassifierState908 GeometricClassifierState(int dbg_level, std::vector<RowScratchRegisters> *r, int r_start,
909 int r_end)
910 : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) {
911 tolerance = InterwordSpace(*r, r_start, r_end);
912 CalculateTabStops(r, r_start, r_end, tolerance, &left_tabs, &right_tabs);
913 if (debug_level >= 3) {
914 tprintf(
915 "Geometry: TabStop cluster tolerance = %d; "
916 "%zu left tabs; %zu right tabs\n",
917 tolerance, left_tabs.size(), right_tabs.size());
918 }
919 ltr = (*r)[r_start].ri_->ltr;
920 }
921
AssumeLeftJustificationtesseract::GeometricClassifierState922 void AssumeLeftJustification() {
923 just = tesseract::JUSTIFICATION_LEFT;
924 margin = (*rows)[row_start].lmargin_;
925 }
926
AssumeRightJustificationtesseract::GeometricClassifierState927 void AssumeRightJustification() {
928 just = tesseract::JUSTIFICATION_RIGHT;
929 margin = (*rows)[row_start].rmargin_;
930 }
931
932 // Align tabs are the tab stops the text is aligned to.
AlignTabstesseract::GeometricClassifierState933 const std::vector<Cluster> &AlignTabs() const {
934 if (just == tesseract::JUSTIFICATION_RIGHT) {
935 return right_tabs;
936 }
937 return left_tabs;
938 }
939
940 // Offside tabs are the tab stops opposite the tabs used to align the text.
941 //
942 // Note that for a left-to-right text which is aligned to the right such as
943 // this function comment, the offside tabs are the horizontal tab stops
944 // marking the beginning of ("Note", "this" and "marking").
OffsideTabstesseract::GeometricClassifierState945 const std::vector<Cluster> &OffsideTabs() const {
946 if (just == tesseract::JUSTIFICATION_RIGHT) {
947 return left_tabs;
948 }
949 return right_tabs;
950 }
951
952 // Return whether the i'th row extends from the leftmost left tab stop
953 // to the right most right tab stop.
IsFullRowtesseract::GeometricClassifierState954 bool IsFullRow(int i) const {
955 return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 &&
956 ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0;
957 }
958
AlignsideTabIndextesseract::GeometricClassifierState959 int AlignsideTabIndex(int row_idx) const {
960 return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));
961 }
962
963 // Given what we know about the paragraph justification (just), would the
964 // first word of row_b have fit at the end of row_a?
FirstWordWouldHaveFittesseract::GeometricClassifierState965 bool FirstWordWouldHaveFit(int row_a, int row_b) {
966 return ::tesseract::FirstWordWouldHaveFit((*rows)[row_a], (*rows)[row_b], just);
967 }
968
PrintRowstesseract::GeometricClassifierState969 void PrintRows() const {
970 PrintRowRange(*rows, row_start, row_end);
971 }
972
Failtesseract::GeometricClassifierState973 void Fail(int min_debug_level, const char *why) const {
974 if (debug_level < min_debug_level) {
975 return;
976 }
977 tprintf("# %s\n", why);
978 PrintRows();
979 }
980
Modeltesseract::GeometricClassifierState981 ParagraphModel Model() const {
982 return ParagraphModel(just, margin, first_indent, body_indent, tolerance);
983 }
984
985 // We print out messages with a debug level at least as great as debug_level.
986 int debug_level = 0;
987
988 // The Geometric Classifier was asked to find a single paragraph model
989 // to fit the text rows (*rows)[row_start, row_end)
990 std::vector<RowScratchRegisters> *rows;
991 int row_start = 0;
992 int row_end = 0;
993
994 // The amount by which we expect the text edge can vary and still be aligned.
995 int tolerance = 0;
996
997 // Is the script in this text block left-to-right?
998 // HORRIBLE ROUGH APPROXIMATION. TODO(eger): Improve
999 bool ltr = false;
1000
1001 // These left and right tab stops were determined to be the common tab
1002 // stops for the given text.
1003 std::vector<Cluster> left_tabs;
1004 std::vector<Cluster> right_tabs;
1005
1006 // These are parameters we must determine to create a ParagraphModel.
1007 tesseract::ParagraphJustification just = JUSTIFICATION_UNKNOWN;
1008 int margin = 0;
1009 int first_indent = 0;
1010 int body_indent = 0;
1011
1012 // eop_threshold > 0 if the text is fully justified. See MarkRowsWithModel()
1013 int eop_threshold = 0;
1014 };
1015
1016 // Given a section of text where strong textual clues did not help identifying
1017 // paragraph breaks, and for which the left and right indents have exactly
1018 // three tab stops between them, attempt to find the paragraph breaks based
1019 // solely on the outline of the text and whether the script is left-to-right.
1020 //
1021 // Algorithm Detail:
1022 // The selected rows are in the form of a rectangle except
1023 // for some number of "short lines" of the same length:
1024 //
1025 // (A1) xxxxxxxxxxxxx (B1) xxxxxxxxxxxx
1026 // xxxxxxxxxxx xxxxxxxxxx # A "short" line.
1027 // xxxxxxxxxxxxx xxxxxxxxxxxx
1028 // xxxxxxxxxxxxx xxxxxxxxxxxx
1029 //
1030 // We have a slightly different situation if the only short
1031 // line is at the end of the excerpt.
1032 //
1033 // (A2) xxxxxxxxxxxxx (B2) xxxxxxxxxxxx
1034 // xxxxxxxxxxxxx xxxxxxxxxxxx
1035 // xxxxxxxxxxxxx xxxxxxxxxxxx
1036 // xxxxxxxxxxx xxxxxxxxxx # A "short" line.
1037 //
1038 // We'll interpret these as follows based on the reasoning in the comment for
1039 // GeometricClassify():
1040 // [script direction: first indent, body indent]
1041 // (A1) LtR: 2,0 RtL: 0,0 (B1) LtR: 0,0 RtL: 2,0
1042 // (A2) LtR: 2,0 RtL: CrR (B2) LtR: CrL RtL: 2,0
GeometricClassifyThreeTabStopTextBlock(int debug_level,GeometricClassifierState & s,ParagraphTheory * theory)1043 static void GeometricClassifyThreeTabStopTextBlock(int debug_level, GeometricClassifierState &s,
1044 ParagraphTheory *theory) {
1045 int num_rows = s.row_end - s.row_start;
1046 int num_full_rows = 0;
1047 int last_row_full = 0;
1048 for (int i = s.row_start; i < s.row_end; i++) {
1049 if (s.IsFullRow(i)) {
1050 num_full_rows++;
1051 if (i == s.row_end - 1) {
1052 last_row_full++;
1053 }
1054 }
1055 }
1056
1057 if (num_full_rows < 0.7 * num_rows) {
1058 s.Fail(1, "Not enough full lines to know which lines start paras.");
1059 return;
1060 }
1061
1062 // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
1063 s.eop_threshold = 0;
1064
1065 if (s.ltr) {
1066 s.AssumeLeftJustification();
1067 } else {
1068 s.AssumeRightJustification();
1069 }
1070
1071 if (debug_level > 0) {
1072 tprintf(
1073 "# Not enough variety for clear outline classification. "
1074 "Guessing these are %s aligned based on script.\n",
1075 s.ltr ? "left" : "right");
1076 s.PrintRows();
1077 }
1078
1079 if (s.AlignTabs().size() == 2) { // case A1 or A2
1080 s.first_indent = s.AlignTabs()[1].center;
1081 s.body_indent = s.AlignTabs()[0].center;
1082 } else { // case B1 or B2
1083 if (num_rows - 1 == num_full_rows - last_row_full) {
1084 // case B2
1085 const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
1086 (*s.rows)[s.row_start].AddStartLine(model);
1087 for (int i = s.row_start + 1; i < s.row_end; i++) {
1088 (*s.rows)[i].AddBodyLine(model);
1089 }
1090 return;
1091 } else {
1092 // case B1
1093 s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1094 s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
1095 }
1096 }
1097 const ParagraphModel *model = theory->AddModel(s.Model());
1098 MarkRowsWithModel(s.rows, s.row_start, s.row_end, model, s.ltr, s.eop_threshold);
1099 return;
1100 }
1101
1102 // This function is called if strong textual clues were not available, but
1103 // the caller hopes that the paragraph breaks will be super obvious just
1104 // by the outline of the text.
1105 //
1106 // The particularly difficult case is figuring out what's going on if you
1107 // don't have enough short paragraph end lines to tell us what's going on.
1108 //
1109 // For instance, let's say you have the following outline:
1110 //
1111 // (A1) xxxxxxxxxxxxxxxxxxxxxx
1112 // xxxxxxxxxxxxxxxxxxxx
1113 // xxxxxxxxxxxxxxxxxxxxxx
1114 // xxxxxxxxxxxxxxxxxxxxxx
1115 //
1116 // Even if we know that the text is left-to-right and so will probably be
1117 // left-aligned, both of the following are possible texts:
1118 //
1119 // (A1a) 1. Here our list item
1120 // with two full lines.
1121 // 2. Here a second item.
1122 // 3. Here our third one.
1123 //
1124 // (A1b) so ends paragraph one.
1125 // Here starts another
1126 // paragraph we want to
1127 // read. This continues
1128 //
1129 // These examples are obvious from the text and should have been caught
1130 // by the StrongEvidenceClassify pass. However, for languages where we don't
1131 // have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),
1132 // it's worth guessing that (A1b) is the correct interpretation if there are
1133 // far more "full" lines than "short" lines.
GeometricClassify(int debug_level,std::vector<RowScratchRegisters> * rows,int row_start,int row_end,ParagraphTheory * theory)1134 static void GeometricClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
1135 int row_start, int row_end, ParagraphTheory *theory) {
1136 if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end)) {
1137 return;
1138 }
1139 if (debug_level > 1) {
1140 tprintf("###############################################\n");
1141 tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n", row_start, row_end);
1142 tprintf("###############################################\n");
1143 }
1144 RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
1145
1146 GeometricClassifierState s(debug_level, rows, row_start, row_end);
1147 if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
1148 s.Fail(2, "Too much variety for simple outline classification.");
1149 return;
1150 }
1151 if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
1152 s.Fail(1, "Not enough variety for simple outline classification.");
1153 return;
1154 }
1155 if (s.left_tabs.size() + s.right_tabs.size() == 3) {
1156 GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
1157 return;
1158 }
1159
1160 // At this point, we know that one side has at least two tab stops, and the
1161 // other side has one or two tab stops.
1162 // Left to determine:
1163 // (1) Which is the body indent and which is the first line indent?
1164 // (2) Is the text fully justified?
1165
1166 // If one side happens to have three or more tab stops, assume that side
1167 // is opposite of the aligned side.
1168 if (s.right_tabs.size() > 2) {
1169 s.AssumeLeftJustification();
1170 } else if (s.left_tabs.size() > 2) {
1171 s.AssumeRightJustification();
1172 } else if (s.ltr) { // guess based on script direction
1173 s.AssumeLeftJustification();
1174 } else {
1175 s.AssumeRightJustification();
1176 }
1177
1178 if (s.AlignTabs().size() == 2) {
1179 // For each tab stop on the aligned side, how many of them appear
1180 // to be paragraph start lines? [first lines]
1181 int firsts[2] = {0, 0};
1182 // Count the first line as a likely paragraph start line.
1183 firsts[s.AlignsideTabIndex(s.row_start)]++;
1184 // For each line, if the first word would have fit on the previous
1185 // line count it as a likely paragraph start line.
1186 bool jam_packed = true;
1187 for (int i = s.row_start + 1; i < s.row_end; i++) {
1188 if (s.FirstWordWouldHaveFit(i - 1, i)) {
1189 firsts[s.AlignsideTabIndex(i)]++;
1190 jam_packed = false;
1191 }
1192 }
1193 // Make an extra accounting for the last line of the paragraph just
1194 // in case it's the only short line in the block. That is, take its
1195 // first word as typical and see if this looks like the *last* line
1196 // of a paragraph. If so, mark the *other* indent as probably a first.
1197 if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
1198 firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
1199 }
1200
1201 int percent0firsts, percent1firsts;
1202 percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
1203 percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;
1204
1205 // TODO(eger): Tune these constants if necessary.
1206 if ((percent0firsts < 20 && 30 < percent1firsts) || percent0firsts + 30 < percent1firsts) {
1207 s.first_indent = s.AlignTabs()[1].center;
1208 s.body_indent = s.AlignTabs()[0].center;
1209 } else if ((percent1firsts < 20 && 30 < percent0firsts) ||
1210 percent1firsts + 30 < percent0firsts) {
1211 s.first_indent = s.AlignTabs()[0].center;
1212 s.body_indent = s.AlignTabs()[1].center;
1213 } else {
1214 // Ambiguous! Probably lineated (poetry)
1215 if (debug_level > 1) {
1216 tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
1217 s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
1218 tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1219 s.AlignTabs()[0].center, percent0firsts);
1220 tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1221 s.AlignTabs()[1].center, percent1firsts);
1222 s.PrintRows();
1223 }
1224 return;
1225 }
1226 } else {
1227 // There's only one tab stop for the "aligned to" side.
1228 s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1229 }
1230
1231 // At this point, we have our model.
1232 const ParagraphModel *model = theory->AddModel(s.Model());
1233
1234 // Now all we have to do is figure out if the text is fully justified or not.
1235 // eop_threshold: default to fully justified unless we see evidence below.
1236 // See description on MarkRowsWithModel()
1237 s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
1238 // If the text is not fully justified, re-set the eop_threshold to 0.
1239 if (s.AlignTabs().size() == 2) {
1240 // Paragraphs with a paragraph-start indent.
1241 for (int i = s.row_start; i < s.row_end - 1; i++) {
1242 if (ValidFirstLine(s.rows, i + 1, model) &&
1243 !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just),
1244 s.tolerance)) {
1245 // We found a non-end-of-paragraph short line: not fully justified.
1246 s.eop_threshold = 0;
1247 break;
1248 }
1249 }
1250 } else {
1251 // Paragraphs with no paragraph-start indent.
1252 for (int i = s.row_start; i < s.row_end - 1; i++) {
1253 if (!s.FirstWordWouldHaveFit(i, i + 1) &&
1254 !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just),
1255 s.tolerance)) {
1256 // We found a non-end-of-paragraph short line: not fully justified.
1257 s.eop_threshold = 0;
1258 break;
1259 }
1260 }
1261 }
1262 MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
1263 }
1264
1265 // =============== Implementation of ParagraphTheory =====================
1266
AddModel(const ParagraphModel & model)1267 const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) {
1268 for (const auto &m : *models_) {
1269 if (m->Comparable(model)) {
1270 return m;
1271 }
1272 }
1273 auto *m = new ParagraphModel(model);
1274 models_->push_back(m);
1275 push_back_new(models_we_added_, m);
1276 return m;
1277 }
1278
DiscardUnusedModels(const SetOfModels & used_models)1279 void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {
1280 size_t w = 0;
1281 for (size_t r = 0; r < models_->size(); r++) {
1282 ParagraphModel *m = (*models_)[r];
1283 if (!contains(used_models, static_cast<const ParagraphModel *>(m)) && contains(models_we_added_, m)) {
1284 delete m;
1285 } else {
1286 if (r > w) {
1287 (*models_)[w] = m;
1288 }
1289 w++;
1290 }
1291 }
1292 models_->resize(w);
1293 }
1294
1295 // Examine rows[start, end) and try to determine if an existing non-centered
1296 // paragraph model would fit them perfectly. If so, return a pointer to it.
1297 // If not, return nullptr.
Fits(const std::vector<RowScratchRegisters> * rows,int start,int end) const1298 const ParagraphModel *ParagraphTheory::Fits(const std::vector<RowScratchRegisters> *rows,
1299 int start, int end) const {
1300 for (const auto *model : *models_) {
1301 if (model->justification() != JUSTIFICATION_CENTER && RowsFitModel(rows, start, end, model)) {
1302 return model;
1303 }
1304 }
1305 return nullptr;
1306 }
1307
NonCenteredModels(SetOfModels * models)1308 void ParagraphTheory::NonCenteredModels(SetOfModels *models) {
1309 for (const auto *model : *models_) {
1310 if (model->justification() != JUSTIFICATION_CENTER) {
1311 push_back_new(*models, model);
1312 }
1313 }
1314 }
1315
IndexOf(const ParagraphModel * model) const1316 int ParagraphTheory::IndexOf(const ParagraphModel *model) const {
1317 int i = 0;
1318 for (const auto *m : *models_) {
1319 if (m == model) {
1320 return i;
1321 }
1322 i++;
1323 }
1324 return -1;
1325 }
1326
ValidFirstLine(const std::vector<RowScratchRegisters> * rows,int row,const ParagraphModel * model)1327 bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
1328 const ParagraphModel *model) {
1329 if (!StrongModel(model)) {
1330 tprintf("ValidFirstLine() should only be called with strong models!\n");
1331 }
1332 return StrongModel(model) && model->ValidFirstLine((*rows)[row].lmargin_, (*rows)[row].lindent_,
1333 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1334 }
1335
ValidBodyLine(const std::vector<RowScratchRegisters> * rows,int row,const ParagraphModel * model)1336 bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
1337 const ParagraphModel *model) {
1338 if (!StrongModel(model)) {
1339 tprintf("ValidBodyLine() should only be called with strong models!\n");
1340 }
1341 return StrongModel(model) && model->ValidBodyLine((*rows)[row].lmargin_, (*rows)[row].lindent_,
1342 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1343 }
1344
CrownCompatible(const std::vector<RowScratchRegisters> * rows,int a,int b,const ParagraphModel * model)1345 bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
1346 const ParagraphModel *model) {
1347 if (model != kCrownRight && model != kCrownLeft) {
1348 tprintf("CrownCompatible() should only be called with crown models!\n");
1349 return false;
1350 }
1351 auto &row_a = (*rows)[a];
1352 auto &row_b = (*rows)[b];
1353 if (model == kCrownRight) {
1354 return NearlyEqual(row_a.rindent_ + row_a.rmargin_, row_b.rindent_ + row_b.rmargin_,
1355 Epsilon(row_a.ri_->average_interword_space));
1356 }
1357 return NearlyEqual(row_a.lindent_ + row_a.lmargin_, row_b.lindent_ + row_b.lmargin_,
1358 Epsilon(row_a.ri_->average_interword_space));
1359 }
1360
1361 // =============== Implementation of ParagraphModelSmearer ====================
1362
ParagraphModelSmearer(std::vector<RowScratchRegisters> * rows,int row_start,int row_end,ParagraphTheory * theory)1363 ParagraphModelSmearer::ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows,
1364 int row_start, int row_end, ParagraphTheory *theory)
1365 : theory_(theory), rows_(rows), row_start_(row_start), row_end_(row_end) {
1366 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
1367 row_start_ = 0;
1368 row_end_ = 0;
1369 return;
1370 }
1371 open_models_.resize(open_models_.size() + row_end - row_start + 2);
1372 }
1373
1374 // see paragraphs_internal.h
CalculateOpenModels(int row_start,int row_end)1375 void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
1376 SetOfModels no_models;
1377 if (row_start < row_start_) {
1378 row_start = row_start_;
1379 }
1380 if (row_end > row_end_) {
1381 row_end = row_end_;
1382 }
1383
1384 for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end; row++) {
1385 if ((*rows_)[row].ri_->num_words == 0) {
1386 OpenModels(row + 1) = no_models;
1387 } else {
1388 SetOfModels &opened = OpenModels(row);
1389 (*rows_)[row].StartHypotheses(&opened);
1390
1391 // Which models survive the transition from row to row + 1?
1392 SetOfModels still_open;
1393 for (auto &m : opened) {
1394 if (ValidFirstLine(rows_, row, m) || ValidBodyLine(rows_, row, m)) {
1395 // This is basic filtering; we check likely paragraph starty-ness down
1396 // below in Smear() -- you know, whether the first word would have fit
1397 // and such.
1398 push_back_new(still_open, m);
1399 }
1400 }
1401 OpenModels(row + 1) = still_open;
1402 }
1403 }
1404 }
1405
1406 // see paragraphs_internal.h
Smear()1407 void ParagraphModelSmearer::Smear() {
1408 CalculateOpenModels(row_start_, row_end_);
1409
1410 // For each row which we're unsure about (that is, it is LT_UNKNOWN or
1411 // we have multiple LT_START hypotheses), see if there's a model that
1412 // was recently used (an "open" model) which might model it well.
1413 for (int i = row_start_; i < row_end_; i++) {
1414 RowScratchRegisters &row = (*rows_)[i];
1415 if (row.ri_->num_words == 0) {
1416 continue;
1417 }
1418
1419 // Step One:
1420 // Figure out if there are "open" models which are left-alined or
1421 // right-aligned. This is important for determining whether the
1422 // "first" word in a row would fit at the "end" of the previous row.
1423 bool left_align_open = false;
1424 bool right_align_open = false;
1425 for (auto &m : OpenModels(i)) {
1426 switch (m->justification()) {
1427 case JUSTIFICATION_LEFT:
1428 left_align_open = true;
1429 break;
1430 case JUSTIFICATION_RIGHT:
1431 right_align_open = true;
1432 break;
1433 default:
1434 left_align_open = right_align_open = true;
1435 }
1436 }
1437 // Step Two:
1438 // Use that knowledge to figure out if this row is likely to
1439 // start a paragraph.
1440 bool likely_start;
1441 if (i == 0) {
1442 likely_start = true;
1443 } else {
1444 if ((left_align_open && right_align_open) || (!left_align_open && !right_align_open)) {
1445 likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT) ||
1446 LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT);
1447 } else if (left_align_open) {
1448 likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT);
1449 } else {
1450 likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT);
1451 }
1452 }
1453
1454 // Step Three:
1455 // If this text line seems like an obvious first line of an
1456 // open model, or an obvious continuation of an existing
1457 // modelled paragraph, mark it up.
1458 if (likely_start) {
1459 // Add Start Hypotheses for all Open models that fit.
1460 for (unsigned m = 0; m < OpenModels(i).size(); m++) {
1461 if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {
1462 row.AddStartLine(OpenModels(i)[m]);
1463 }
1464 }
1465 } else {
1466 // Add relevant body line hypotheses.
1467 SetOfModels last_line_models;
1468 if (i > 0) {
1469 (*rows_)[i - 1].StrongHypotheses(&last_line_models);
1470 } else {
1471 theory_->NonCenteredModels(&last_line_models);
1472 }
1473 for (auto model : last_line_models) {
1474 if (ValidBodyLine(rows_, i, model)) {
1475 row.AddBodyLine(model);
1476 }
1477 }
1478 }
1479
1480 // Step Four:
1481 // If we're still quite unsure about this line, go through all
1482 // models in our theory and see if this row could be the start
1483 // of any of our models.
1484 if (row.GetLineType() == LT_UNKNOWN ||
1485 (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) {
1486 SetOfModels all_models;
1487 theory_->NonCenteredModels(&all_models);
1488 for (auto &all_model : all_models) {
1489 if (ValidFirstLine(rows_, i, all_model)) {
1490 row.AddStartLine(all_model);
1491 }
1492 }
1493 }
1494 // Step Five:
1495 // Since we may have updated the hypotheses about this row, we need
1496 // to recalculate the Open models for the rest of rows[i + 1, row_end)
1497 if (row.GetLineType() != LT_UNKNOWN) {
1498 CalculateOpenModels(i + 1, row_end_);
1499 }
1500 }
1501 }
1502
1503 // ================ Main Paragraph Detection Algorithm =======================
1504
1505 // Find out what ParagraphModels are actually used, and discard any
1506 // that are not.
DiscardUnusedModels(const std::vector<RowScratchRegisters> & rows,ParagraphTheory * theory)1507 static void DiscardUnusedModels(const std::vector<RowScratchRegisters> &rows,
1508 ParagraphTheory *theory) {
1509 SetOfModels used_models;
1510 for (const auto &row : rows) {
1511 row.StrongHypotheses(&used_models);
1512 }
1513 theory->DiscardUnusedModels(used_models);
1514 }
1515
1516 // DowngradeWeakestToCrowns:
1517 // Forget any flush-{left, right} models unless we see two or more
1518 // of them in sequence.
1519 //
1520 // In pass 3, we start to classify even flush-left paragraphs (paragraphs
1521 // where the first line and body indent are the same) as having proper Models.
1522 // This is generally dangerous, since if you start imagining that flush-left
1523 // is a typical paragraph model when it is not, it will lead you to chop normal
1524 // indented paragraphs in the middle whenever a sentence happens to start on a
1525 // new line (see "This" above). What to do?
1526 // What we do is to take any paragraph which is flush left and is not
1527 // preceded by another paragraph of the same model and convert it to a "Crown"
1528 // paragraph. This is a weak pseudo-ParagraphModel which is a placeholder
1529 // for later. It means that the paragraph is flush, but it would be desirable
1530 // to mark it as the same model as following text if it fits. This downgrade
1531 // FlushLeft -> CrownLeft -> Model of following paragraph. Means that we
1532 // avoid making flush left Paragraph Models whenever we see a top-of-the-page
1533 // half-of-a-paragraph. and instead we mark it the same as normal body text.
1534 //
1535 // Implementation:
1536 //
1537 // Comb backwards through the row scratch registers, and turn any
1538 // sequences of body lines of equivalent type abutted against the beginning
1539 // or a body or start line of a different type into a crown paragraph.
DowngradeWeakestToCrowns(int debug_level,ParagraphTheory * theory,std::vector<RowScratchRegisters> * rows)1540 static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
1541 std::vector<RowScratchRegisters> *rows) {
1542 int start;
1543 for (int end = rows->size(); end > 0; end = start) {
1544 // Search back for a body line of a unique type.
1545 const ParagraphModel *model = nullptr;
1546 while (end > 0 && (model = (*rows)[end - 1].UniqueBodyHypothesis()) == nullptr) {
1547 end--;
1548 }
1549 if (end == 0) {
1550 break;
1551 }
1552 start = end - 1;
1553 while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
1554 start--; // walk back to the first line that is not the same body type.
1555 }
1556 if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model && StrongModel(model) &&
1557 NearlyEqual(model->first_indent(), model->body_indent(), model->tolerance())) {
1558 start--;
1559 }
1560 start++;
1561 // Now rows[start, end) is a sequence of unique body hypotheses of model.
1562 if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER) {
1563 continue;
1564 }
1565 if (!StrongModel(model)) {
1566 while (start > 0 && CrownCompatible(rows, start - 1, start, model)) {
1567 start--;
1568 }
1569 }
1570 if (start == 0 || (!StrongModel(model)) ||
1571 (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
1572 // crownify rows[start, end)
1573 const ParagraphModel *crown_model = model;
1574 if (StrongModel(model)) {
1575 if (model->justification() == JUSTIFICATION_LEFT) {
1576 crown_model = kCrownLeft;
1577 } else {
1578 crown_model = kCrownRight;
1579 }
1580 }
1581 (*rows)[start].SetUnknown();
1582 (*rows)[start].AddStartLine(crown_model);
1583 for (int row = start + 1; row < end; row++) {
1584 (*rows)[row].SetUnknown();
1585 (*rows)[row].AddBodyLine(crown_model);
1586 }
1587 }
1588 }
1589 DiscardUnusedModels(*rows, theory);
1590 }
1591
1592 // Clear all hypotheses about lines [start, end) and reset margins.
1593 //
1594 // The empty space between the left of a row and the block boundary (and
1595 // similarly for the right) is split into two pieces: margin and indent.
1596 // In initial processing, we assume the block is tight and the margin for
1597 // all lines is set to zero. However, if our first pass does not yield
1598 // models for everything, it may be due to an inset paragraph like a
1599 // block-quote. In that case, we make a second pass over that unmarked
1600 // section of the page and reset the "margin" portion of the empty space
1601 // to the common amount of space at the ends of the lines under consid-
1602 // eration. This would be equivalent to percentile set to 0. However,
1603 // sometimes we have a single character sticking out in the right margin
1604 // of a text block (like the 'r' in 'for' on line 3 above), and we can
1605 // really just ignore it as an outlier. To express this, we allow the
1606 // user to specify the percentile (0..100) of indent values to use as
1607 // the common margin for each row in the run of rows[start, end).
RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> * rows,int start,int end,int percentile)1608 void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
1609 int end, int percentile) {
1610 if (!AcceptableRowArgs(0, 0, __func__, rows, start, end)) {
1611 return;
1612 }
1613
1614 int lmin, lmax, rmin, rmax;
1615 lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
1616 rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
1617 for (int i = start; i < end; i++) {
1618 RowScratchRegisters &sr = (*rows)[i];
1619 sr.SetUnknown();
1620 if (sr.ri_->num_words == 0) {
1621 continue;
1622 }
1623 UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
1624 UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);
1625 }
1626 STATS lefts(lmin, lmax + 1);
1627 STATS rights(rmin, rmax + 1);
1628 for (int i = start; i < end; i++) {
1629 RowScratchRegisters &sr = (*rows)[i];
1630 if (sr.ri_->num_words == 0) {
1631 continue;
1632 }
1633 lefts.add(sr.lmargin_ + sr.lindent_, 1);
1634 rights.add(sr.rmargin_ + sr.rindent_, 1);
1635 }
1636 int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);
1637 int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);
1638 for (int i = start; i < end; i++) {
1639 RowScratchRegisters &sr = (*rows)[i];
1640 int ldelta = ignorable_left - sr.lmargin_;
1641 sr.lmargin_ += ldelta;
1642 sr.lindent_ -= ldelta;
1643 int rdelta = ignorable_right - sr.rmargin_;
1644 sr.rmargin_ += rdelta;
1645 sr.rindent_ -= rdelta;
1646 }
1647 }
1648
1649 // Return the median inter-word space in rows[row_start, row_end).
InterwordSpace(const std::vector<RowScratchRegisters> & rows,int row_start,int row_end)1650 int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end) {
1651 if (row_end < row_start + 1) {
1652 return 1;
1653 }
1654 int word_height =
1655 (rows[row_start].ri_->lword_box.height() + rows[row_end - 1].ri_->lword_box.height()) / 2;
1656 int word_width =
1657 (rows[row_start].ri_->lword_box.width() + rows[row_end - 1].ri_->lword_box.width()) / 2;
1658 STATS spacing_widths(0, 5 + word_width);
1659 for (int i = row_start; i < row_end; i++) {
1660 if (rows[i].ri_->num_words > 1) {
1661 spacing_widths.add(rows[i].ri_->average_interword_space, 1);
1662 }
1663 }
1664 int minimum_reasonable_space = word_height / 3;
1665 if (minimum_reasonable_space < 2) {
1666 minimum_reasonable_space = 2;
1667 }
1668 int median = spacing_widths.median();
1669 return (median > minimum_reasonable_space) ? median : minimum_reasonable_space;
1670 }
1671
1672 // Return whether the first word on the after line can fit in the space at
1673 // the end of the before line (knowing which way the text is aligned and read).
FirstWordWouldHaveFit(const RowScratchRegisters & before,const RowScratchRegisters & after,tesseract::ParagraphJustification justification)1674 bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after,
1675 tesseract::ParagraphJustification justification) {
1676 if (before.ri_->num_words == 0 || after.ri_->num_words == 0) {
1677 return true;
1678 }
1679
1680 if (justification == JUSTIFICATION_UNKNOWN) {
1681 tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
1682 }
1683 int available_space;
1684 if (justification == JUSTIFICATION_CENTER) {
1685 available_space = before.lindent_ + before.rindent_;
1686 } else {
1687 available_space = before.OffsideIndent(justification);
1688 }
1689 available_space -= before.ri_->average_interword_space;
1690
1691 if (before.ri_->ltr) {
1692 return after.ri_->lword_box.width() < available_space;
1693 }
1694 return after.ri_->rword_box.width() < available_space;
1695 }
1696
1697 // Return whether the first word on the after line can fit in the space at
1698 // the end of the before line (not knowing which way the text goes) in a left
1699 // or right alignment.
FirstWordWouldHaveFit(const RowScratchRegisters & before,const RowScratchRegisters & after)1700 bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after) {
1701 if (before.ri_->num_words == 0 || after.ri_->num_words == 0) {
1702 return true;
1703 }
1704
1705 int available_space = before.lindent_;
1706 if (before.rindent_ > available_space) {
1707 available_space = before.rindent_;
1708 }
1709 available_space -= before.ri_->average_interword_space;
1710
1711 if (before.ri_->ltr) {
1712 return after.ri_->lword_box.width() < available_space;
1713 }
1714 return after.ri_->rword_box.width() < available_space;
1715 }
1716
TextSupportsBreak(const RowScratchRegisters & before,const RowScratchRegisters & after)1717 static bool TextSupportsBreak(const RowScratchRegisters &before, const RowScratchRegisters &after) {
1718 if (before.ri_->ltr) {
1719 return before.ri_->rword_likely_ends_idea && after.ri_->lword_likely_starts_idea;
1720 } else {
1721 return before.ri_->lword_likely_ends_idea && after.ri_->rword_likely_starts_idea;
1722 }
1723 }
1724
LikelyParagraphStart(const RowScratchRegisters & before,const RowScratchRegisters & after,tesseract::ParagraphJustification j)1725 static bool LikelyParagraphStart(const RowScratchRegisters &before,
1726 const RowScratchRegisters &after,
1727 tesseract::ParagraphJustification j) {
1728 return before.ri_->num_words == 0 ||
1729 (FirstWordWouldHaveFit(before, after, j) && TextSupportsBreak(before, after));
1730 }
1731
1732 // Examine rows[start, end) and try to determine what sort of ParagraphModel
1733 // would fit them as a single paragraph.
1734 // If we can't produce a unique model justification_ = JUSTIFICATION_UNKNOWN.
1735 // If the rows given could be a consistent start to a paragraph, set *consistent
1736 // true.
InternalParagraphModelByOutline(const std::vector<RowScratchRegisters> * rows,int start,int end,int tolerance,bool * consistent)1737 static ParagraphModel InternalParagraphModelByOutline(
1738 const std::vector<RowScratchRegisters> *rows, int start, int end, int tolerance,
1739 bool *consistent) {
1740 int ltr_line_count = 0;
1741 for (int i = start; i < end; i++) {
1742 ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
1743 }
1744 bool ltr = (ltr_line_count >= (end - start) / 2);
1745
1746 *consistent = true;
1747 if (!AcceptableRowArgs(0, 2, __func__, rows, start, end)) {
1748 return ParagraphModel();
1749 }
1750
1751 // Ensure the caller only passed us a region with a common rmargin and
1752 // lmargin.
1753 int lmargin = (*rows)[start].lmargin_;
1754 int rmargin = (*rows)[start].rmargin_;
1755 int lmin, lmax, rmin, rmax, cmin, cmax;
1756 lmin = lmax = (*rows)[start + 1].lindent_;
1757 rmin = rmax = (*rows)[start + 1].rindent_;
1758 cmin = cmax = 0;
1759 for (int i = start + 1; i < end; i++) {
1760 if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
1761 tprintf("Margins don't match! Software error.\n");
1762 *consistent = false;
1763 return ParagraphModel();
1764 }
1765 UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
1766 UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
1767 UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
1768 }
1769 int ldiff = lmax - lmin;
1770 int rdiff = rmax - rmin;
1771 int cdiff = cmax - cmin;
1772 if (rdiff > tolerance && ldiff > tolerance) {
1773 if (cdiff < tolerance * 2) {
1774 if (end - start < 3) {
1775 return ParagraphModel();
1776 }
1777 return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
1778 }
1779 *consistent = false;
1780 return ParagraphModel();
1781 }
1782 if (end - start < 3) { // Don't return a model for two line paras.
1783 return ParagraphModel();
1784 }
1785
1786 // These booleans keep us from saying something is aligned left when the body
1787 // left variance is too large.
1788 bool body_admits_left_alignment = ldiff < tolerance;
1789 bool body_admits_right_alignment = rdiff < tolerance;
1790
1791 ParagraphModel left_model = ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
1792 (lmin + lmax) / 2, tolerance);
1793 ParagraphModel right_model = ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
1794 (rmin + rmax) / 2, tolerance);
1795
1796 // These booleans keep us from having an indent on the "wrong side" for the
1797 // first line.
1798 bool text_admits_left_alignment = ltr || left_model.is_flush();
1799 bool text_admits_right_alignment = !ltr || right_model.is_flush();
1800
1801 // At least one of the edges is less than tolerance in variance.
1802 // If the other is obviously ragged, it can't be the one aligned to.
1803 // [Note the last line is included in this raggedness.]
1804 if (tolerance < rdiff) {
1805 if (body_admits_left_alignment && text_admits_left_alignment) {
1806 return left_model;
1807 }
1808 *consistent = false;
1809 return ParagraphModel();
1810 }
1811 if (tolerance < ldiff) {
1812 if (body_admits_right_alignment && text_admits_right_alignment) {
1813 return right_model;
1814 }
1815 *consistent = false;
1816 return ParagraphModel();
1817 }
1818
1819 // At this point, we know the body text doesn't vary much on either side.
1820
1821 // If the first line juts out oddly in one direction or the other,
1822 // that likely indicates the side aligned to.
1823 int first_left = (*rows)[start].lindent_;
1824 int first_right = (*rows)[start].rindent_;
1825
1826 if (ltr && body_admits_left_alignment && (first_left < lmin || first_left > lmax)) {
1827 return left_model;
1828 }
1829 if (!ltr && body_admits_right_alignment && (first_right < rmin || first_right > rmax)) {
1830 return right_model;
1831 }
1832
1833 *consistent = false;
1834 return ParagraphModel();
1835 }
1836
1837 // Examine rows[start, end) and try to determine what sort of ParagraphModel
1838 // would fit them as a single paragraph. If nothing fits,
1839 // justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug
1840 // output if we're debugging.
ParagraphModelByOutline(int debug_level,const std::vector<RowScratchRegisters> * rows,int start,int end,int tolerance)1841 static ParagraphModel ParagraphModelByOutline(int debug_level,
1842 const std::vector<RowScratchRegisters> *rows,
1843 int start, int end, int tolerance) {
1844 bool unused_consistent;
1845 ParagraphModel retval =
1846 InternalParagraphModelByOutline(rows, start, end, tolerance, &unused_consistent);
1847 if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) {
1848 tprintf("Could not determine a model for this paragraph:\n");
1849 PrintRowRange(*rows, start, end);
1850 }
1851 return retval;
1852 }
1853
1854 // Do rows[start, end) form a single instance of the given paragraph model?
RowsFitModel(const std::vector<RowScratchRegisters> * rows,int start,int end,const ParagraphModel * model)1855 bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
1856 const ParagraphModel *model) {
1857 if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) {
1858 return false;
1859 }
1860 if (!ValidFirstLine(rows, start, model)) {
1861 return false;
1862 }
1863 for (int i = start + 1; i < end; i++) {
1864 if (!ValidBodyLine(rows, i, model)) {
1865 return false;
1866 }
1867 }
1868 return true;
1869 }
1870
1871 // Examine rows[row_start, row_end) as an independent section of text,
1872 // and mark rows that are exceptionally clear as start-of-paragraph
1873 // and paragraph-body lines.
1874 //
1875 // We presume that any lines surrounding rows[row_start, row_end) may
1876 // have wildly different paragraph models, so we don't key any data off
1877 // of those lines.
1878 //
1879 // We only take the very strongest signals, as we don't want to get
1880 // confused and marking up centered text, poetry, or source code as
1881 // clearly part of a typical paragraph.
MarkStrongEvidence(std::vector<RowScratchRegisters> * rows,int row_start,int row_end)1882 static void MarkStrongEvidence(std::vector<RowScratchRegisters> *rows, int row_start,
1883 int row_end) {
1884 // Record patently obvious body text.
1885 for (int i = row_start + 1; i < row_end; i++) {
1886 const RowScratchRegisters &prev = (*rows)[i - 1];
1887 RowScratchRegisters &curr = (*rows)[i];
1888 tesseract::ParagraphJustification typical_justification =
1889 prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1890 if (!curr.ri_->rword_likely_starts_idea && !curr.ri_->lword_likely_starts_idea &&
1891 !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
1892 curr.SetBodyLine();
1893 }
1894 }
1895
1896 // Record patently obvious start paragraph lines.
1897 //
1898 // It's an extremely good signal of the start of a paragraph that
1899 // the first word would have fit on the end of the previous line.
1900 // However, applying just that signal would have us mark random
1901 // start lines of lineated text (poetry and source code) and some
1902 // centered headings as paragraph start lines. Therefore, we use
1903 // a second qualification for a paragraph start: Not only should
1904 // the first word of this line have fit on the previous line,
1905 // but also, this line should go full to the right of the block,
1906 // disallowing a subsequent word from having fit on this line.
1907
1908 // First row:
1909 {
1910 RowScratchRegisters &curr = (*rows)[row_start];
1911 RowScratchRegisters &next = (*rows)[row_start + 1];
1912 tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1913 if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) &&
1914 (curr.ri_->lword_likely_starts_idea || curr.ri_->rword_likely_starts_idea)) {
1915 curr.SetStartLine();
1916 }
1917 }
1918 // Middle rows
1919 for (int i = row_start + 1; i < row_end - 1; i++) {
1920 RowScratchRegisters &prev = (*rows)[i - 1];
1921 RowScratchRegisters &curr = (*rows)[i];
1922 RowScratchRegisters &next = (*rows)[i + 1];
1923 tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1924 if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) &&
1925 LikelyParagraphStart(prev, curr, j)) {
1926 curr.SetStartLine();
1927 }
1928 }
1929 // Last row
1930 { // the short circuit at the top means we have at least two lines.
1931 RowScratchRegisters &prev = (*rows)[row_end - 2];
1932 RowScratchRegisters &curr = (*rows)[row_end - 1];
1933 tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1934 if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, curr, j) &&
1935 LikelyParagraphStart(prev, curr, j)) {
1936 curr.SetStartLine();
1937 }
1938 }
1939 }
1940
1941 // Look for sequences of a start line followed by some body lines in
1942 // rows[row_start, row_end) and create ParagraphModels for them if
1943 // they seem coherent.
ModelStrongEvidence(int debug_level,std::vector<RowScratchRegisters> * rows,int row_start,int row_end,bool allow_flush_models,ParagraphTheory * theory)1944 static void ModelStrongEvidence(int debug_level, std::vector<RowScratchRegisters> *rows,
1945 int row_start, int row_end, bool allow_flush_models,
1946 ParagraphTheory *theory) {
1947 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) {
1948 return;
1949 }
1950
1951 int start = row_start;
1952 while (start < row_end) {
1953 while (start < row_end && (*rows)[start].GetLineType() != LT_START) {
1954 start++;
1955 }
1956 if (start >= row_end - 1) {
1957 break;
1958 }
1959
1960 int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
1961 int end = start;
1962 ParagraphModel last_model;
1963 bool next_consistent;
1964 do {
1965 ++end;
1966 // rows[row, end) was consistent.
1967 // If rows[row, end + 1) is not consistent,
1968 // just model rows[row, end)
1969 if (end < row_end - 1) {
1970 RowScratchRegisters &next = (*rows)[end];
1971 LineType lt = next.GetLineType();
1972 next_consistent = lt == LT_BODY || (lt == LT_UNKNOWN &&
1973 !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
1974 } else {
1975 next_consistent = false;
1976 }
1977 if (next_consistent) {
1978 ParagraphModel next_model =
1979 InternalParagraphModelByOutline(rows, start, end + 1, tolerance, &next_consistent);
1980 if (((*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_LEFT &&
1981 next_model.justification() != JUSTIFICATION_LEFT) ||
1982 (!(*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_RIGHT &&
1983 next_model.justification() != JUSTIFICATION_RIGHT)) {
1984 next_consistent = false;
1985 }
1986 last_model = next_model;
1987 } else {
1988 next_consistent = false;
1989 }
1990 } while (next_consistent && end < row_end);
1991 // At this point, rows[start, end) looked like it could have been a
1992 // single paragraph. If we can make a good ParagraphModel for it,
1993 // do so and mark this sequence with that model.
1994 if (end > start + 1) {
1995 // emit a new paragraph if we have more than one line.
1996 const ParagraphModel *model = nullptr;
1997 ParagraphModel new_model = ParagraphModelByOutline(
1998 debug_level, rows, start, end, Epsilon(InterwordSpace(*rows, start, end)));
1999 if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
2000 // couldn't create a good model, oh well.
2001 } else if (new_model.is_flush()) {
2002 if (end == start + 2) {
2003 // It's very likely we just got two paragraph starts in a row.
2004 end = start + 1;
2005 } else if (start == row_start) {
2006 // Mark this as a Crown.
2007 if (new_model.justification() == JUSTIFICATION_LEFT) {
2008 model = kCrownLeft;
2009 } else {
2010 model = kCrownRight;
2011 }
2012 } else if (allow_flush_models) {
2013 model = theory->AddModel(new_model);
2014 }
2015 } else {
2016 model = theory->AddModel(new_model);
2017 }
2018 if (model) {
2019 (*rows)[start].AddStartLine(model);
2020 for (int i = start + 1; i < end; i++) {
2021 (*rows)[i].AddBodyLine(model);
2022 }
2023 }
2024 }
2025 start = end;
2026 }
2027 }
2028
2029 // We examine rows[row_start, row_end) and do the following:
2030 // (1) Clear all existing hypotheses for the rows being considered.
2031 // (2) Mark up any rows as exceptionally likely to be paragraph starts
2032 // or paragraph body lines as such using both geometric and textual
2033 // clues.
2034 // (3) Form models for any sequence of start + continuation lines.
2035 // (4) Smear the paragraph models to cover surrounding text.
StrongEvidenceClassify(int debug_level,std::vector<RowScratchRegisters> * rows,int row_start,int row_end,ParagraphTheory * theory)2036 static void StrongEvidenceClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
2037 int row_start, int row_end, ParagraphTheory *theory) {
2038 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) {
2039 return;
2040 }
2041
2042 if (debug_level > 1) {
2043 tprintf("#############################################\n");
2044 tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
2045 tprintf("#############################################\n");
2046 }
2047
2048 RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
2049 MarkStrongEvidence(rows, row_start, row_end);
2050
2051 DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);
2052
2053 // Create paragraph models.
2054 ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);
2055
2056 DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows);
2057
2058 // At this point, some rows are marked up as paragraphs with model numbers,
2059 // and some rows are marked up as either LT_START or LT_BODY. Now let's
2060 // smear any good paragraph hypotheses forward and backward.
2061 ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
2062 smearer.Smear();
2063 }
2064
SeparateSimpleLeaderLines(std::vector<RowScratchRegisters> * rows,int row_start,int row_end,ParagraphTheory * theory)2065 static void SeparateSimpleLeaderLines(std::vector<RowScratchRegisters> *rows, int row_start,
2066 int row_end, ParagraphTheory *theory) {
2067 for (int i = row_start + 1; i < row_end - 1; i++) {
2068 if ((*rows)[i - 1].ri_->has_leaders && (*rows)[i].ri_->has_leaders &&
2069 (*rows)[i + 1].ri_->has_leaders) {
2070 const ParagraphModel *model =
2071 theory->AddModel(ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
2072 (*rows)[i].AddStartLine(model);
2073 }
2074 }
2075 }
2076
2077 // Collect sequences of unique hypotheses in row registers and create proper
2078 // paragraphs for them, referencing the paragraphs in row_owners.
ConvertHypothesizedModelRunsToParagraphs(int debug_level,std::vector<RowScratchRegisters> & rows,std::vector<PARA * > * row_owners,ParagraphTheory * theory)2079 static void ConvertHypothesizedModelRunsToParagraphs(int debug_level,
2080 std::vector<RowScratchRegisters> &rows,
2081 std::vector<PARA *> *row_owners,
2082 ParagraphTheory *theory) {
2083 int end = rows.size();
2084 int start;
2085 for (; end > 0; end = start) {
2086 start = end - 1;
2087 const ParagraphModel *model = nullptr;
2088 // TODO(eger): Be smarter about dealing with multiple hypotheses.
2089 bool single_line_paragraph = false;
2090 SetOfModels models;
2091 rows[start].NonNullHypotheses(&models);
2092 if (!models.empty()) {
2093 model = models[0];
2094 if (rows[start].GetLineType(model) != LT_BODY) {
2095 single_line_paragraph = true;
2096 }
2097 }
2098 if (model && !single_line_paragraph) {
2099 // walk back looking for more body lines and then a start line.
2100 while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
2101 // do nothing
2102 }
2103 if (start < 0 || rows[start].GetLineType(model) != LT_START) {
2104 model = nullptr;
2105 }
2106 }
2107 if (model == nullptr) {
2108 continue;
2109 }
2110 // rows[start, end) should be a paragraph.
2111 PARA *p = new PARA();
2112 if (model == kCrownLeft || model == kCrownRight) {
2113 p->is_very_first_or_continuation = true;
2114 // Crown paragraph.
2115 // If we can find an existing ParagraphModel that fits, use it,
2116 // else create a new one.
2117 for (unsigned row = end; row < rows.size(); row++) {
2118 if ((*row_owners)[row] &&
2119 (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
2120 (start == 0 || ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
2121 model = (*row_owners)[row]->model;
2122 break;
2123 }
2124 }
2125 if (model == kCrownLeft) {
2126 // No subsequent model fits, so cons one up.
2127 model = theory->AddModel(ParagraphModel(JUSTIFICATION_LEFT,
2128 rows[start].lmargin_ + rows[start].lindent_, 0, 0,
2129 Epsilon(rows[start].ri_->average_interword_space)));
2130 } else if (model == kCrownRight) {
2131 // No subsequent model fits, so cons one up.
2132 model = theory->AddModel(ParagraphModel(JUSTIFICATION_RIGHT,
2133 rows[start].rmargin_ + rows[start].rmargin_, 0, 0,
2134 Epsilon(rows[start].ri_->average_interword_space)));
2135 }
2136 }
2137 rows[start].SetUnknown();
2138 rows[start].AddStartLine(model);
2139 for (int i = start + 1; i < end; i++) {
2140 rows[i].SetUnknown();
2141 rows[i].AddBodyLine(model);
2142 }
2143 p->model = model;
2144 p->has_drop_cap = rows[start].ri_->has_drop_cap;
2145 p->is_list_item = model->justification() == JUSTIFICATION_RIGHT
2146 ? rows[start].ri_->rword_indicates_list_item
2147 : rows[start].ri_->lword_indicates_list_item;
2148 for (int row = start; row < end; row++) {
2149 if ((*row_owners)[row] != nullptr) {
2150 tprintf(
2151 "Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "
2152 "more than once!\n");
2153 delete (*row_owners)[row];
2154 }
2155 (*row_owners)[row] = p;
2156 }
2157 }
2158 }
2159
2160 struct Interval {
Intervaltesseract::Interval2161 Interval() : begin(0), end(0) {}
Intervaltesseract::Interval2162 Interval(int b, int e) : begin(b), end(e) {}
2163
2164 int begin;
2165 int end;
2166 };
2167
2168 // Return whether rows[row] appears to be stranded, meaning that the evidence
2169 // for this row is very weak due to context. For instance, two lines of source
2170 // code may happen to be indented at the same tab vector as body text starts,
2171 // leading us to think they are two start-of-paragraph lines. This is not
2172 // optimal. However, we also don't want to mark a sequence of short dialog
2173 // as "weak," so our heuristic is:
2174 // (1) If a line is surrounded by lines of unknown type, it's weak.
2175 // (2) If two lines in a row are start lines for a given paragraph type, but
2176 // after that the same paragraph type does not continue, they're weak.
RowIsStranded(const std::vector<RowScratchRegisters> & rows,int row)2177 static bool RowIsStranded(const std::vector<RowScratchRegisters> &rows, int row) {
2178 SetOfModels row_models;
2179 rows[row].StrongHypotheses(&row_models);
2180
2181 for (auto &row_model : row_models) {
2182 bool all_starts = rows[row].GetLineType();
2183 int run_length = 1;
2184 bool continues = true;
2185 for (int i = row - 1; i >= 0 && continues; i--) {
2186 SetOfModels models;
2187 rows[i].NonNullHypotheses(&models);
2188 switch (rows[i].GetLineType(row_model)) {
2189 case LT_START:
2190 run_length++;
2191 break;
2192 case LT_MULTIPLE: // explicit fall-through
2193 case LT_BODY:
2194 run_length++;
2195 all_starts = false;
2196 break;
2197 case LT_UNKNOWN: // explicit fall-through
2198 default:
2199 continues = false;
2200 }
2201 }
2202 continues = true;
2203 for (unsigned i = row + 1; i < rows.size() && continues; i++) {
2204 SetOfModels models;
2205 rows[i].NonNullHypotheses(&models);
2206 switch (rows[i].GetLineType(row_model)) {
2207 case LT_START:
2208 run_length++;
2209 break;
2210 case LT_MULTIPLE: // explicit fall-through
2211 case LT_BODY:
2212 run_length++;
2213 all_starts = false;
2214 break;
2215 case LT_UNKNOWN: // explicit fall-through
2216 default:
2217 continues = false;
2218 }
2219 }
2220 if (run_length > 2 || (!all_starts && run_length > 1)) {
2221 return false;
2222 }
2223 }
2224 return true;
2225 }
2226
2227 // Go through rows[row_start, row_end) and gather up sequences that need better
2228 // classification.
2229 // + Sequences of non-empty rows without hypotheses.
2230 // + Crown paragraphs not immediately followed by a strongly modeled line.
2231 // + Single line paragraphs surrounded by text that doesn't match the
2232 // model.
LeftoverSegments(const std::vector<RowScratchRegisters> & rows,std::vector<Interval> * to_fix,int row_start,int row_end)2233 static void LeftoverSegments(const std::vector<RowScratchRegisters> &rows,
2234 std::vector<Interval> *to_fix, int row_start, int row_end) {
2235 to_fix->clear();
2236 for (int i = row_start; i < row_end; i++) {
2237 bool needs_fixing = false;
2238
2239 SetOfModels models;
2240 SetOfModels models_w_crowns;
2241 rows[i].StrongHypotheses(&models);
2242 rows[i].NonNullHypotheses(&models_w_crowns);
2243 if (models.empty() && !models_w_crowns.empty()) {
2244 // Crown paragraph. Is it followed by a modeled line?
2245 for (unsigned end = i + 1; end < rows.size(); end++) {
2246 SetOfModels end_models;
2247 SetOfModels strong_end_models;
2248 rows[end].NonNullHypotheses(&end_models);
2249 rows[end].StrongHypotheses(&strong_end_models);
2250 if (end_models.empty()) {
2251 needs_fixing = true;
2252 break;
2253 } else if (!strong_end_models.empty()) {
2254 needs_fixing = false;
2255 break;
2256 }
2257 }
2258 } else if (models.empty() && rows[i].ri_->num_words > 0) {
2259 // No models at all.
2260 needs_fixing = true;
2261 }
2262
2263 if (!needs_fixing && !models.empty()) {
2264 needs_fixing = RowIsStranded(rows, i);
2265 }
2266
2267 if (needs_fixing) {
2268 if (!to_fix->empty() && to_fix->back().end == i - 1) {
2269 to_fix->back().end = i;
2270 } else {
2271 to_fix->push_back(Interval(i, i));
2272 }
2273 }
2274 }
2275 // Convert inclusive intervals to half-open intervals.
2276 for (auto &i : *to_fix) {
2277 i.end = i.end + 1;
2278 }
2279 }
2280
2281 // Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
2282 // normalize each row_owner to point to an actual PARA, and output the
2283 // paragraphs in order onto paragraphs.
CanonicalizeDetectionResults(std::vector<PARA * > * row_owners,PARA_LIST * paragraphs)2284 void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs) {
2285 std::vector<PARA *> &rows = *row_owners;
2286 paragraphs->clear();
2287 PARA_IT out(paragraphs);
2288 PARA *formerly_null = nullptr;
2289 for (unsigned i = 0; i < rows.size(); i++) {
2290 if (rows[i] == nullptr) {
2291 if (i == 0 || rows[i - 1] != formerly_null) {
2292 rows[i] = formerly_null = new PARA();
2293 } else {
2294 rows[i] = formerly_null;
2295 continue;
2296 }
2297 } else if (i > 0 && rows[i - 1] == rows[i]) {
2298 continue;
2299 }
2300 out.add_after_then_move(rows[i]);
2301 }
2302 }
2303
2304 // Main entry point for Paragraph Detection Algorithm.
2305 //
2306 // Given a set of equally spaced textlines (described by row_infos),
2307 // Split them into paragraphs.
2308 //
2309 // Output:
2310 // row_owners - one pointer for each row, to the paragraph it belongs to.
2311 // paragraphs - this is the actual list of PARA objects.
2312 // models - the list of paragraph models referenced by the PARA objects.
2313 // caller is responsible for deleting the models.
DetectParagraphs(int debug_level,std::vector<RowInfo> * row_infos,std::vector<PARA * > * row_owners,PARA_LIST * paragraphs,std::vector<ParagraphModel * > * models)2314 void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
2315 std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
2316 std::vector<ParagraphModel *> *models) {
2317 ParagraphTheory theory(models);
2318
2319 // Initialize row_owners to be a bunch of nullptr pointers.
2320 row_owners->clear();
2321 row_owners->resize(row_infos->size());
2322
2323 // Set up row scratch registers for the main algorithm.
2324 std::vector<RowScratchRegisters> rows(row_infos->size());
2325 for (unsigned i = 0; i < row_infos->size(); i++) {
2326 rows[i].Init((*row_infos)[i]);
2327 }
2328
2329 // Pass 1:
2330 // Detect sequences of lines that all contain leader dots (.....)
2331 // These are likely Tables of Contents. If there are three text lines in
2332 // a row with leader dots, it's pretty safe to say the middle one should
2333 // be a paragraph of its own.
2334 SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);
2335
2336 DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
2337
2338 std::vector<Interval> leftovers;
2339 LeftoverSegments(rows, &leftovers, 0, rows.size());
2340 for (auto &leftover : leftovers) {
2341 // Pass 2a:
2342 // Find any strongly evidenced start-of-paragraph lines. If they're
2343 // followed by two lines that look like body lines, make a paragraph
2344 // model for that and see if that model applies throughout the text
2345 // (that is, "smear" it).
2346 StrongEvidenceClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);
2347
2348 // Pass 2b:
2349 // If we had any luck in pass 2a, we got part of the page and didn't
2350 // know how to classify a few runs of rows. Take the segments that
2351 // didn't find a model and reprocess them individually.
2352 std::vector<Interval> leftovers2;
2353 LeftoverSegments(rows, &leftovers2, leftover.begin, leftover.end);
2354 bool pass2a_was_useful =
2355 leftovers2.size() > 1 ||
2356 (leftovers2.size() == 1 && (leftovers2[0].begin != 0 || static_cast<size_t>(leftovers2[0].end) != rows.size()));
2357 if (pass2a_was_useful) {
2358 for (auto &leftover2 : leftovers2) {
2359 StrongEvidenceClassify(debug_level, &rows, leftover2.begin, leftover2.end, &theory);
2360 }
2361 }
2362 }
2363
2364 DebugDump(debug_level > 1, "End of Pass 2", theory, rows);
2365
2366 // Pass 3:
2367 // These are the dregs for which we didn't have enough strong textual
2368 // and geometric clues to form matching models for. Let's see if
2369 // the geometric clues are simple enough that we could just use those.
2370 LeftoverSegments(rows, &leftovers, 0, rows.size());
2371 for (auto &leftover : leftovers) {
2372 GeometricClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);
2373 }
2374
2375 // Undo any flush models for which there's little evidence.
2376 DowngradeWeakestToCrowns(debug_level, &theory, &rows);
2377
2378 DebugDump(debug_level > 1, "End of Pass 3", theory, rows);
2379
2380 // Pass 4:
2381 // Take everything that's still not marked up well and clear all markings.
2382 LeftoverSegments(rows, &leftovers, 0, rows.size());
2383 for (auto &leftover : leftovers) {
2384 for (int j = leftover.begin; j < leftover.end; j++) {
2385 rows[j].SetUnknown();
2386 }
2387 }
2388
2389 DebugDump(debug_level > 1, "End of Pass 4", theory, rows);
2390
2391 // Convert all of the unique hypothesis runs to PARAs.
2392 ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners, &theory);
2393
2394 DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);
2395
2396 // Finally, clean up any dangling nullptr row paragraph parents.
2397 CanonicalizeDetectionResults(row_owners, paragraphs);
2398 }
2399
2400 // ============ Code interfacing with the rest of Tesseract ==================
2401
InitializeTextAndBoxesPreRecognition(const MutableIterator & it,RowInfo * info)2402 static void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, RowInfo *info) {
2403 // Set up text, lword_text, and rword_text (mostly for debug printing).
2404 std::string fake_text;
2405 PageIterator pit(static_cast<const PageIterator &>(it));
2406 bool first_word = true;
2407 if (!pit.Empty(RIL_WORD)) {
2408 do {
2409 fake_text += "x";
2410 if (first_word) {
2411 info->lword_text += "x";
2412 }
2413 info->rword_text += "x";
2414 if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
2415 !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {
2416 fake_text += " ";
2417 info->rword_text = "";
2418 first_word = false;
2419 }
2420 } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) && pit.Next(RIL_SYMBOL));
2421 }
2422 if (fake_text.empty()) {
2423 return;
2424 }
2425
2426 int lspaces = info->pix_ldistance / info->average_interword_space;
2427 for (int i = 0; i < lspaces; i++) {
2428 info->text += ' ';
2429 }
2430 info->text += fake_text;
2431
2432 // Set up lword_box, rword_box, and num_words.
2433 PAGE_RES_IT page_res_it = *it.PageResIt();
2434 WERD_RES *word_res = page_res_it.restart_row();
2435 ROW_RES *this_row = page_res_it.row();
2436
2437 WERD_RES *lword = nullptr;
2438 WERD_RES *rword = nullptr;
2439 info->num_words = 0;
2440 do {
2441 if (word_res) {
2442 if (!lword) {
2443 lword = word_res;
2444 }
2445 if (rword != word_res) {
2446 info->num_words++;
2447 }
2448 rword = word_res;
2449 }
2450 word_res = page_res_it.forward();
2451 } while (page_res_it.row() == this_row);
2452
2453 if (lword) {
2454 info->lword_box = lword->word->bounding_box();
2455 }
2456 if (rword) {
2457 info->rword_box = rword->word->bounding_box();
2458 }
2459 }
2460
2461 // Given a Tesseract Iterator pointing to a text line, fill in the paragraph
2462 // detector RowInfo with all relevant information from the row.
InitializeRowInfo(bool after_recognition,const MutableIterator & it,RowInfo * info)2463 static void InitializeRowInfo(bool after_recognition, const MutableIterator &it, RowInfo *info) {
2464 if (it.PageResIt()->row() != nullptr) {
2465 ROW *row = it.PageResIt()->row()->row;
2466 info->pix_ldistance = row->lmargin();
2467 info->pix_rdistance = row->rmargin();
2468 info->average_interword_space =
2469 row->space() > 0 ? row->space() : std::max(static_cast<int>(row->x_height()), 1);
2470 info->pix_xheight = row->x_height();
2471 info->has_leaders = false;
2472 info->has_drop_cap = row->has_drop_cap();
2473 info->ltr = true; // set below depending on word scripts
2474 } else {
2475 info->pix_ldistance = info->pix_rdistance = 0;
2476 info->average_interword_space = 1;
2477 info->pix_xheight = 1.0;
2478 info->has_leaders = false;
2479 info->has_drop_cap = false;
2480 info->ltr = true;
2481 }
2482
2483 info->num_words = 0;
2484 info->lword_indicates_list_item = false;
2485 info->lword_likely_starts_idea = false;
2486 info->lword_likely_ends_idea = false;
2487 info->rword_indicates_list_item = false;
2488 info->rword_likely_starts_idea = false;
2489 info->rword_likely_ends_idea = false;
2490 info->has_leaders = false;
2491 info->ltr = true;
2492
2493 if (!after_recognition) {
2494 InitializeTextAndBoxesPreRecognition(it, info);
2495 return;
2496 }
2497 info->text = "";
2498 const std::unique_ptr<const char[]> text(it.GetUTF8Text(RIL_TEXTLINE));
2499 int trailing_ws_idx = strlen(text.get()); // strip trailing space
2500 while (trailing_ws_idx > 0 &&
2501 // isspace() only takes ASCII
2502 isascii(text[trailing_ws_idx - 1]) && isspace(text[trailing_ws_idx - 1])) {
2503 trailing_ws_idx--;
2504 }
2505 if (trailing_ws_idx > 0) {
2506 int lspaces = info->pix_ldistance / info->average_interword_space;
2507 for (int i = 0; i < lspaces; i++) {
2508 info->text += ' ';
2509 }
2510 for (int i = 0; i < trailing_ws_idx; i++) {
2511 info->text += text[i];
2512 }
2513 }
2514
2515 if (info->text.empty()) {
2516 return;
2517 }
2518
2519 PAGE_RES_IT page_res_it = *it.PageResIt();
2520 std::vector<WERD_RES *> werds;
2521 WERD_RES *word_res = page_res_it.restart_row();
2522 ROW_RES *this_row = page_res_it.row();
2523 int num_leaders = 0;
2524 int ltr = 0;
2525 int rtl = 0;
2526 do {
2527 if (word_res && word_res->best_choice->unichar_string().length() > 0) {
2528 werds.push_back(word_res);
2529 ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
2530 rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
2531 if (word_res->word->flag(W_REP_CHAR)) {
2532 num_leaders++;
2533 }
2534 }
2535 word_res = page_res_it.forward();
2536 } while (page_res_it.row() == this_row);
2537 info->ltr = ltr >= rtl;
2538 info->has_leaders = num_leaders > 3;
2539 info->num_words = werds.size();
2540 if (!werds.empty()) {
2541 WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
2542 info->lword_text = lword->best_choice->unichar_string().c_str();
2543 info->rword_text = rword->best_choice->unichar_string().c_str();
2544 info->lword_box = lword->word->bounding_box();
2545 info->rword_box = rword->word->bounding_box();
2546 LeftWordAttributes(lword->uch_set, lword->best_choice, info->lword_text,
2547 &info->lword_indicates_list_item, &info->lword_likely_starts_idea,
2548 &info->lword_likely_ends_idea);
2549 RightWordAttributes(rword->uch_set, rword->best_choice, info->rword_text,
2550 &info->rword_indicates_list_item, &info->rword_likely_starts_idea,
2551 &info->rword_likely_ends_idea);
2552 }
2553 }
2554
2555 // This is called after rows have been identified and words are recognized.
2556 // Much of this could be implemented before word recognition, but text helps
2557 // to identify bulleted lists and gives good signals for sentence boundaries.
DetectParagraphs(int debug_level,bool after_text_recognition,const MutableIterator * block_start,std::vector<ParagraphModel * > * models)2558 void DetectParagraphs(int debug_level, bool after_text_recognition,
2559 const MutableIterator *block_start, std::vector<ParagraphModel *> *models) {
2560 // Clear out any preconceived notions.
2561 if (block_start->Empty(RIL_TEXTLINE)) {
2562 return;
2563 }
2564 BLOCK *block = block_start->PageResIt()->block()->block;
2565 block->para_list()->clear();
2566 bool is_image_block = block->pdblk.poly_block() && !block->pdblk.poly_block()->IsText();
2567
2568 // Convert the Tesseract structures to RowInfos
2569 // for the paragraph detection algorithm.
2570 MutableIterator row(*block_start);
2571 if (row.Empty(RIL_TEXTLINE)) {
2572 return; // end of input already.
2573 }
2574
2575 std::vector<RowInfo> row_infos;
2576 do {
2577 if (!row.PageResIt()->row()) {
2578 continue; // empty row.
2579 }
2580 row.PageResIt()->row()->row->set_para(nullptr);
2581 row_infos.emplace_back();
2582 RowInfo &ri = row_infos.back();
2583 InitializeRowInfo(after_text_recognition, row, &ri);
2584 } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) && row.Next(RIL_TEXTLINE));
2585
2586 // If we're called before text recognition, we might not have
2587 // tight block bounding boxes, so trim by the minimum on each side.
2588 if (!row_infos.empty()) {
2589 int min_lmargin = row_infos[0].pix_ldistance;
2590 int min_rmargin = row_infos[0].pix_rdistance;
2591 for (unsigned i = 1; i < row_infos.size(); i++) {
2592 if (row_infos[i].pix_ldistance < min_lmargin) {
2593 min_lmargin = row_infos[i].pix_ldistance;
2594 }
2595 if (row_infos[i].pix_rdistance < min_rmargin) {
2596 min_rmargin = row_infos[i].pix_rdistance;
2597 }
2598 }
2599 if (min_lmargin > 0 || min_rmargin > 0) {
2600 for (auto &row_info : row_infos) {
2601 row_info.pix_ldistance -= min_lmargin;
2602 row_info.pix_rdistance -= min_rmargin;
2603 }
2604 }
2605 }
2606
2607 // Run the paragraph detection algorithm.
2608 std::vector<PARA *> row_owners;
2609 std::vector<PARA *> the_paragraphs;
2610 if (!is_image_block) {
2611 DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(), models);
2612 } else {
2613 row_owners.resize(row_infos.size());
2614 CanonicalizeDetectionResults(&row_owners, block->para_list());
2615 }
2616
2617 // Now stitch in the row_owners into the rows.
2618 row = *block_start;
2619 for (auto &row_owner : row_owners) {
2620 while (!row.PageResIt()->row()) {
2621 row.Next(RIL_TEXTLINE);
2622 }
2623 row.PageResIt()->row()->row->set_para(row_owner);
2624 row.Next(RIL_TEXTLINE);
2625 }
2626 }
2627
2628 } // namespace tesseract
2629