1 /**********************************************************************
2  * File:        wordseg.cpp  (Formerly wspace.c)
3  * Description: Code to segment the blobs into words.
4  * Author:      Ray Smith
5  *
6  * (C) Copyright 1992, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 #  include "config_auto.h"
22 #endif
23 
24 #include "wordseg.h"
25 
26 #include <cmath>
27 
28 #include "blobbox.h"
29 #include "cjkpitch.h"
30 #include "drawtord.h"
31 #include "fpchop.h"
32 #include "makerow.h"
33 #include "pitsync1.h"
34 #include "statistc.h"
35 #include "textord.h"
36 #include "topitch.h"
37 #include "tovars.h"
38 
39 namespace tesseract {
40 
41 BOOL_VAR(textord_force_make_prop_words, false, "Force proportional word segmentation on all rows");
42 BOOL_VAR(textord_chopper_test, false, "Chopper is being tested.");
43 
44 #define BLOCK_STATS_CLUSTERS 10
45 
46 /**
47  * @name make_single_word
48  *
49  * For each row, arrange the blobs into one word. There is no fixed
50  * pitch detection.
51  */
52 
make_single_word(bool one_blob,TO_ROW_LIST * rows,ROW_LIST * real_rows)53 void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST *real_rows) {
54   TO_ROW_IT to_row_it(rows);
55   ROW_IT row_it(real_rows);
56   for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list(); to_row_it.forward()) {
57     TO_ROW *row = to_row_it.data();
58     // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready
59     // to create the word.
60     C_BLOB_LIST cblobs;
61     C_BLOB_IT cblob_it(&cblobs);
62     BLOBNBOX_IT box_it(row->blob_list());
63     for (; !box_it.empty(); box_it.forward()) {
64       BLOBNBOX *bblob = box_it.extract();
65       if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) {
66         auto cblob = bblob->remove_cblob();
67         if (cblob != nullptr) {
68           C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
69           cout_it.move_to_last();
70           cout_it.add_list_after(cblob->out_list());
71           delete cblob;
72         }
73       } else {
74         auto cblob = bblob->remove_cblob();
75         if (cblob != nullptr) {
76           cblob_it.add_after_then_move(cblob);
77         }
78       }
79       delete bblob;
80     }
81     // Convert the TO_ROW to a ROW.
82     ROW *real_row =
83         new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
84     WERD_IT word_it(real_row->word_list());
85     WERD *word = new WERD(&cblobs, 0, nullptr);
86     word->set_flag(W_BOL, true);
87     word->set_flag(W_EOL, true);
88     word->set_flag(W_DONT_CHOP, one_blob);
89     word_it.add_after_then_move(word);
90     row_it.add_after_then_move(real_row);
91   }
92 }
93 
94 /**
95  * make_words
96  *
97  * Arrange the blobs into words.
98  */
make_words(tesseract::Textord * textord,ICOORD page_tr,float gradient,BLOCK_LIST * blocks,TO_BLOCK_LIST * port_blocks)99 void make_words(tesseract::Textord *textord,
100                 ICOORD page_tr,               // top right
101                 float gradient,               // page skew
102                 BLOCK_LIST *blocks,           // block list
103                 TO_BLOCK_LIST *port_blocks) { // output list
104   TO_BLOCK_IT block_it;                       // iterator
105   TO_BLOCK *block;                            // current block
106 
107   if (textord->use_cjk_fp_model()) {
108     compute_fixed_pitch_cjk(page_tr, port_blocks);
109   } else {
110     compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f),
111                         !bool(textord_test_landscape));
112   }
113   textord->to_spacing(page_tr, port_blocks);
114   block_it.set_to_list(port_blocks);
115   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
116     block = block_it.data();
117     make_real_words(textord, block, FCOORD(1.0f, 0.0f));
118   }
119 }
120 
121 /**
122  * @name set_row_spaces
123  *
124  * Set the min_space and max_nonspace members of the row so that
125  * the blobs can be arranged into words.
126  */
127 
set_row_spaces(TO_BLOCK * block,FCOORD rotation,bool testing_on)128 void set_row_spaces( // find space sizes
129     TO_BLOCK *block, // block to do
130     FCOORD rotation, // for drawing
131     bool testing_on  // correct orientation
132 ) {
133   TO_ROW *row; // current row
134   TO_ROW_IT row_it = block->get_rows();
135 
136   if (row_it.empty()) {
137     return; // empty block
138   }
139   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
140     row = row_it.data();
141     if (row->fixed_pitch == 0) {
142       row->min_space = static_cast<int32_t>(
143           ceil(row->pr_space - (row->pr_space - row->pr_nonsp) * textord_words_definite_spread));
144       row->max_nonspace = static_cast<int32_t>(
145           floor(row->pr_nonsp + (row->pr_space - row->pr_nonsp) * textord_words_definite_spread));
146       if (testing_on && textord_show_initial_words) {
147         tprintf("Assigning defaults %d non, %d space to row at %g\n", row->max_nonspace,
148                 row->min_space, row->intercept());
149       }
150       row->space_threshold = (row->max_nonspace + row->min_space) / 2;
151       row->space_size = row->pr_space;
152       row->kern_size = row->pr_nonsp;
153     }
154 #ifndef GRAPHICS_DISABLED
155     if (textord_show_initial_words && testing_on) {
156       plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row);
157     }
158 #endif
159   }
160 }
161 
162 /**
163  * @name row_words
164  *
165  * Compute the max nonspace and min space for the row.
166  */
167 
row_words(TO_BLOCK * block,TO_ROW * row,int32_t maxwidth,FCOORD rotation,bool testing_on)168 int32_t row_words(    // compute space size
169     TO_BLOCK *block,  // block it came from
170     TO_ROW *row,      // row to operate on
171     int32_t maxwidth, // max expected space size
172     FCOORD rotation,  // for drawing
173     bool testing_on   // for debug
174 ) {
175   bool testing_row;      // contains testpt
176   bool prev_valid;       // if decent size
177   int32_t prev_x;        // end of prev blob
178   int32_t cluster_count; // no of clusters
179   int32_t gap_index;     // which cluster
180   int32_t smooth_factor; // for smoothing stats
181   BLOBNBOX *blob;        // current blob
182   float lower, upper;    // clustering parameters
183   float gaps[3];         // gap clusers
184   ICOORD testpt;
185   TBOX blob_box; // bounding box
186                  // iterator
187   BLOBNBOX_IT blob_it = row->blob_list();
188   STATS gap_stats(0, maxwidth);
189   STATS cluster_stats[4]; // clusters
190 
191   testpt = ICOORD(textord_test_x, textord_test_y);
192   smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5);
193   //      if (testing_on)
194   //              tprintf("Row smooth factor=%d\n",smooth_factor);
195   prev_valid = false;
196   prev_x = -INT32_MAX;
197   testing_row = false;
198   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
199     blob = blob_it.data();
200     blob_box = blob->bounding_box();
201     if (blob_box.contains(testpt)) {
202       testing_row = true;
203     }
204     gap_stats.add(blob_box.width(), 1);
205   }
206   gap_stats.clear();
207   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
208     blob = blob_it.data();
209     if (!blob->joined_to_prev()) {
210       blob_box = blob->bounding_box();
211       if (prev_valid && blob_box.left() - prev_x < maxwidth) {
212         gap_stats.add(blob_box.left() - prev_x, 1);
213       }
214       prev_valid = true;
215       prev_x = blob_box.right();
216     }
217   }
218   if (gap_stats.get_total() == 0) {
219     row->min_space = 0; // no evidence
220     row->max_nonspace = 0;
221     return 0;
222   }
223   gap_stats.smooth(smooth_factor);
224   lower = row->xheight * textord_words_initial_lower;
225   upper = row->xheight * textord_words_initial_upper;
226   cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats);
227   while (cluster_count < 2 && std::ceil(lower) < std::floor(upper)) {
228     // shrink gap
229     upper = (upper * 3 + lower) / 4;
230     lower = (lower * 3 + upper) / 4;
231     cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats);
232   }
233   if (cluster_count < 2) {
234     row->min_space = 0; // no evidence
235     row->max_nonspace = 0;
236     return 0;
237   }
238   for (gap_index = 0; gap_index < cluster_count; gap_index++) {
239     gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
240   }
241   // get medians
242   if (cluster_count > 2) {
243     if (testing_on && textord_show_initial_words) {
244       tprintf("Row at %g has 3 sizes of gap:%g,%g,%g\n", row->intercept(),
245               cluster_stats[1].ile(0.5), cluster_stats[2].ile(0.5), cluster_stats[3].ile(0.5));
246     }
247     lower = gaps[0];
248     if (gaps[1] > lower) {
249       upper = gaps[1]; // prefer most frequent
250       if (upper < block->xheight * textord_words_min_minspace && gaps[2] > gaps[1]) {
251         upper = gaps[2];
252       }
253     } else if (gaps[2] > lower && gaps[2] >= block->xheight * textord_words_min_minspace) {
254       upper = gaps[2];
255     } else if (lower >= block->xheight * textord_words_min_minspace) {
256       upper = lower; // not nice
257       lower = gaps[1];
258       if (testing_on && textord_show_initial_words) {
259         tprintf("Had to switch most common from lower to upper!!\n");
260         gap_stats.print();
261       }
262     } else {
263       row->min_space = 0; // no evidence
264       row->max_nonspace = 0;
265       return 0;
266     }
267   } else {
268     if (gaps[1] < gaps[0]) {
269       if (testing_on && textord_show_initial_words) {
270         tprintf("Had to switch most common from lower to upper!!\n");
271         gap_stats.print();
272       }
273       lower = gaps[1];
274       upper = gaps[0];
275     } else {
276       upper = gaps[1];
277       lower = gaps[0];
278     }
279   }
280   if (upper < block->xheight * textord_words_min_minspace) {
281     row->min_space = 0; // no evidence
282     row->max_nonspace = 0;
283     return 0;
284   }
285   if (upper * 3 < block->min_space * 2 + block->max_nonspace ||
286       lower * 3 > block->min_space * 2 + block->max_nonspace) {
287     if (testing_on && textord_show_initial_words) {
288       tprintf("Disagreement between block and row at %g!!\n", row->intercept());
289       tprintf("Lower=%g, upper=%g, Stats:\n", lower, upper);
290       gap_stats.print();
291     }
292   }
293   row->min_space =
294       static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread));
295   row->max_nonspace =
296       static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread));
297   row->space_threshold = (row->max_nonspace + row->min_space) / 2;
298   row->space_size = upper;
299   row->kern_size = lower;
300   if (testing_on && textord_show_initial_words) {
301     if (testing_row) {
302       tprintf("GAP STATS\n");
303       gap_stats.print();
304       tprintf("SPACE stats\n");
305       cluster_stats[2].print_summary();
306       tprintf("NONSPACE stats\n");
307       cluster_stats[1].print_summary();
308     }
309     tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space,
310             upper, row->max_nonspace, lower);
311   }
312   return cluster_stats[2].get_total();
313 }
314 
315 /**
316  * @name row_words2
317  *
318  * Compute the max nonspace and min space for the row.
319  */
320 
row_words2(TO_BLOCK * block,TO_ROW * row,int32_t maxwidth,FCOORD rotation,bool testing_on)321 int32_t row_words2(   // compute space size
322     TO_BLOCK *block,  // block it came from
323     TO_ROW *row,      // row to operate on
324     int32_t maxwidth, // max expected space size
325     FCOORD rotation,  // for drawing
326     bool testing_on   // for debug
327 ) {
328   bool prev_valid;       // if decent size
329   bool this_valid;       // current blob big enough
330   int32_t prev_x;        // end of prev blob
331   int32_t min_width;     // min interesting width
332   int32_t valid_count;   // good gaps
333   int32_t total_count;   // total gaps
334   int32_t cluster_count; // no of clusters
335   int32_t prev_count;    // previous cluster_count
336   int32_t gap_index;     // which cluster
337   int32_t smooth_factor; // for smoothing stats
338   BLOBNBOX *blob;        // current blob
339   float lower, upper;    // clustering parameters
340   ICOORD testpt;
341   TBOX blob_box; // bounding box
342                  // iterator
343   BLOBNBOX_IT blob_it = row->blob_list();
344   STATS gap_stats(0, maxwidth);
345   // gap sizes
346   float gaps[BLOCK_STATS_CLUSTERS];
347   STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
348   // clusters
349 
350   testpt = ICOORD(textord_test_x, textord_test_y);
351   smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5);
352   //      if (testing_on)
353   //              tprintf("Row smooth factor=%d\n",smooth_factor);
354   prev_valid = false;
355   prev_x = -INT16_MAX;
356   const bool testing_row = false;
357   // min blob size
358   min_width = static_cast<int32_t>(block->pr_space);
359   total_count = 0;
360   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
361     blob = blob_it.data();
362     if (!blob->joined_to_prev()) {
363       blob_box = blob->bounding_box();
364       this_valid = blob_box.width() >= min_width;
365       if (this_valid && prev_valid && blob_box.left() - prev_x < maxwidth) {
366         gap_stats.add(blob_box.left() - prev_x, 1);
367       }
368       total_count++; // count possibles
369       prev_x = blob_box.right();
370       prev_valid = this_valid;
371     }
372   }
373   valid_count = gap_stats.get_total();
374   if (valid_count < total_count * textord_words_minlarge) {
375     gap_stats.clear();
376     prev_x = -INT16_MAX;
377     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
378       blob = blob_it.data();
379       if (!blob->joined_to_prev()) {
380         blob_box = blob->bounding_box();
381         if (blob_box.left() - prev_x < maxwidth) {
382           gap_stats.add(blob_box.left() - prev_x, 1);
383         }
384         prev_x = blob_box.right();
385       }
386     }
387   }
388   if (gap_stats.get_total() == 0) {
389     row->min_space = 0; // no evidence
390     row->max_nonspace = 0;
391     return 0;
392   }
393 
394   cluster_count = 0;
395   lower = block->xheight * words_initial_lower;
396   upper = block->xheight * words_initial_upper;
397   gap_stats.smooth(smooth_factor);
398   do {
399     prev_count = cluster_count;
400     cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop,
401                                       BLOCK_STATS_CLUSTERS, cluster_stats);
402   } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
403   if (cluster_count < 1) {
404     row->min_space = 0;
405     row->max_nonspace = 0;
406     return 0;
407   }
408   for (gap_index = 0; gap_index < cluster_count; gap_index++) {
409     gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
410   }
411   // get medians
412   if (testing_on) {
413     tprintf("cluster_count=%d:", cluster_count);
414     for (gap_index = 0; gap_index < cluster_count; gap_index++) {
415       tprintf(" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total());
416     }
417     tprintf("\n");
418   }
419 
420   // Try to find proportional non-space and space for row.
421   for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] > block->max_nonspace;
422        gap_index++) {
423     ;
424   }
425   if (gap_index < cluster_count) {
426     lower = gaps[gap_index]; // most frequent below
427   } else {
428     if (testing_on) {
429       tprintf("No cluster below block threshold!, using default=%g\n", block->pr_nonsp);
430     }
431     lower = block->pr_nonsp;
432   }
433   for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] <= block->max_nonspace;
434        gap_index++) {
435     ;
436   }
437   if (gap_index < cluster_count) {
438     upper = gaps[gap_index]; // most frequent above
439   } else {
440     if (testing_on) {
441       tprintf("No cluster above block threshold!, using default=%g\n", block->pr_space);
442     }
443     upper = block->pr_space;
444   }
445   row->min_space =
446       static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread));
447   row->max_nonspace =
448       static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread));
449   row->space_threshold = (row->max_nonspace + row->min_space) / 2;
450   row->space_size = upper;
451   row->kern_size = lower;
452   if (testing_on) {
453     if (testing_row) {
454       tprintf("GAP STATS\n");
455       gap_stats.print();
456       tprintf("SPACE stats\n");
457       cluster_stats[2].print_summary();
458       tprintf("NONSPACE stats\n");
459       cluster_stats[1].print_summary();
460     }
461     tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space,
462             upper, row->max_nonspace, lower);
463   }
464   return 1;
465 }
466 
467 /**
468  * @name make_real_words
469  *
470  * Convert a TO_BLOCK to a BLOCK.
471  */
472 
make_real_words(tesseract::Textord * textord,TO_BLOCK * block,FCOORD rotation)473 void make_real_words(tesseract::Textord *textord,
474                      TO_BLOCK *block, // block to do
475                      FCOORD rotation  // for drawing
476 ) {
477   TO_ROW *row; // current row
478   TO_ROW_IT row_it = block->get_rows();
479   ROW *real_row = nullptr; // output row
480   ROW_IT real_row_it = block->block->row_list();
481 
482   if (row_it.empty()) {
483     return; // empty block
484   }
485   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
486     row = row_it.data();
487     if (row->blob_list()->empty() && !row->rep_words.empty()) {
488       real_row = make_rep_words(row, block);
489     } else if (!row->blob_list()->empty()) {
490       // In a fixed pitch document, some lines may be detected as fixed pitch
491       // while others don't, and will go through different path.
492       // For non-space delimited language like CJK, fixed pitch chop always
493       // leave the entire line as one word.  We can force consistent chopping
494       // with force_make_prop_words flag.
495       POLY_BLOCK *pb = block->block->pdblk.poly_block();
496       if (textord_chopper_test) {
497         real_row = textord->make_blob_words(row, rotation);
498       } else if (textord_force_make_prop_words || (pb != nullptr && !pb->IsText()) ||
499                  row->pitch_decision == PITCH_DEF_PROP || row->pitch_decision == PITCH_CORR_PROP) {
500         real_row = textord->make_prop_words(row, rotation);
501       } else if (row->pitch_decision == PITCH_DEF_FIXED ||
502                  row->pitch_decision == PITCH_CORR_FIXED) {
503         real_row = fixed_pitch_words(row, rotation);
504       } else {
505         ASSERT_HOST(false);
506       }
507     }
508     if (real_row != nullptr) {
509       // put row in block
510       real_row_it.add_after_then_move(real_row);
511     }
512   }
513   block->block->set_stats(block->fixed_pitch == 0, static_cast<int16_t>(block->kern_size),
514                           static_cast<int16_t>(block->space_size),
515                           static_cast<int16_t>(block->fixed_pitch));
516   block->block->check_pitch();
517 }
518 
519 /**
520  * @name make_rep_words
521  *
522  * Fabricate a real row from only the repeated blob words.
523  * Get the xheight from the block as it may be more meaningful.
524  */
525 
make_rep_words(TO_ROW * row,TO_BLOCK * block)526 ROW *make_rep_words( // make a row
527     TO_ROW *row,     // row to convert
528     TO_BLOCK *block  // block it lives in
529 ) {
530   ROW *real_row; // output row
531   TBOX word_box; // bounding box
532                  // iterator
533   WERD_IT word_it = &row->rep_words;
534 
535   if (word_it.empty()) {
536     return nullptr;
537   }
538   word_box = word_it.data()->bounding_box();
539   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
540     word_box += word_it.data()->bounding_box();
541   }
542   row->xheight = block->xheight;
543   real_row =
544       new ROW(row, static_cast<int16_t>(block->kern_size), static_cast<int16_t>(block->space_size));
545   word_it.set_to_list(real_row->word_list());
546   // put words in row
547   word_it.add_list_after(&row->rep_words);
548   real_row->recalc_bounding_box();
549   return real_row;
550 }
551 
552 /**
553  * @name make_real_word
554  *
555  * Construct a WERD from a given number of adjacent entries in a
556  * list of BLOBNBOXs.
557  */
558 
make_real_word(BLOBNBOX_IT * box_it,int32_t blobcount,bool bol,uint8_t blanks)559 WERD *make_real_word(BLOBNBOX_IT *box_it, // iterator
560                      int32_t blobcount,   // no of blobs to use
561                      bool bol,            // start of line
562                      uint8_t blanks       // no of blanks
563 ) {
564   C_OUTLINE_IT cout_it;
565   C_BLOB_LIST cblobs;
566   C_BLOB_IT cblob_it = &cblobs;
567 
568   for (int blobindex = 0; blobindex < blobcount; blobindex++) {
569     auto bblob = box_it->extract();
570     if (bblob->joined_to_prev()) {
571       auto cblob = bblob->remove_cblob();
572       if (cblob != nullptr) {
573         cout_it.set_to_list(cblob_it.data()->out_list());
574         cout_it.move_to_last();
575         cout_it.add_list_after(cblob->out_list());
576         delete cblob;
577       }
578     } else {
579       auto cblob = bblob->remove_cblob();
580       if (cblob != nullptr) {
581         cblob_it.add_after_then_move(cblob);
582       }
583     }
584     delete bblob;
585     box_it->forward(); // next one
586   }
587 
588   if (blanks < 1) {
589     blanks = 1;
590   }
591 
592   auto word = new WERD(&cblobs, blanks, nullptr);
593 
594   if (bol) {
595     word->set_flag(W_BOL, true);
596   }
597   if (box_it->at_first()) {
598     word->set_flag(W_EOL, true); // at end of line
599   }
600 
601   return word;
602 }
603 
604 } // namespace tesseract
605