1 /**********************************************************************
2 * File: wordseg.cpp (Formerly wspace.c)
3 * Description: Code to segment the blobs into words.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1992, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23
24 #include "wordseg.h"
25
26 #include <cmath>
27
28 #include "blobbox.h"
29 #include "cjkpitch.h"
30 #include "drawtord.h"
31 #include "fpchop.h"
32 #include "makerow.h"
33 #include "pitsync1.h"
34 #include "statistc.h"
35 #include "textord.h"
36 #include "topitch.h"
37 #include "tovars.h"
38
39 namespace tesseract {
40
41 BOOL_VAR(textord_force_make_prop_words, false, "Force proportional word segmentation on all rows");
42 BOOL_VAR(textord_chopper_test, false, "Chopper is being tested.");
43
44 #define BLOCK_STATS_CLUSTERS 10
45
46 /**
47 * @name make_single_word
48 *
49 * For each row, arrange the blobs into one word. There is no fixed
50 * pitch detection.
51 */
52
make_single_word(bool one_blob,TO_ROW_LIST * rows,ROW_LIST * real_rows)53 void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST *real_rows) {
54 TO_ROW_IT to_row_it(rows);
55 ROW_IT row_it(real_rows);
56 for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list(); to_row_it.forward()) {
57 TO_ROW *row = to_row_it.data();
58 // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready
59 // to create the word.
60 C_BLOB_LIST cblobs;
61 C_BLOB_IT cblob_it(&cblobs);
62 BLOBNBOX_IT box_it(row->blob_list());
63 for (; !box_it.empty(); box_it.forward()) {
64 BLOBNBOX *bblob = box_it.extract();
65 if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) {
66 auto cblob = bblob->remove_cblob();
67 if (cblob != nullptr) {
68 C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
69 cout_it.move_to_last();
70 cout_it.add_list_after(cblob->out_list());
71 delete cblob;
72 }
73 } else {
74 auto cblob = bblob->remove_cblob();
75 if (cblob != nullptr) {
76 cblob_it.add_after_then_move(cblob);
77 }
78 }
79 delete bblob;
80 }
81 // Convert the TO_ROW to a ROW.
82 ROW *real_row =
83 new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
84 WERD_IT word_it(real_row->word_list());
85 WERD *word = new WERD(&cblobs, 0, nullptr);
86 word->set_flag(W_BOL, true);
87 word->set_flag(W_EOL, true);
88 word->set_flag(W_DONT_CHOP, one_blob);
89 word_it.add_after_then_move(word);
90 row_it.add_after_then_move(real_row);
91 }
92 }
93
94 /**
95 * make_words
96 *
97 * Arrange the blobs into words.
98 */
make_words(tesseract::Textord * textord,ICOORD page_tr,float gradient,BLOCK_LIST * blocks,TO_BLOCK_LIST * port_blocks)99 void make_words(tesseract::Textord *textord,
100 ICOORD page_tr, // top right
101 float gradient, // page skew
102 BLOCK_LIST *blocks, // block list
103 TO_BLOCK_LIST *port_blocks) { // output list
104 TO_BLOCK_IT block_it; // iterator
105 TO_BLOCK *block; // current block
106
107 if (textord->use_cjk_fp_model()) {
108 compute_fixed_pitch_cjk(page_tr, port_blocks);
109 } else {
110 compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f),
111 !bool(textord_test_landscape));
112 }
113 textord->to_spacing(page_tr, port_blocks);
114 block_it.set_to_list(port_blocks);
115 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
116 block = block_it.data();
117 make_real_words(textord, block, FCOORD(1.0f, 0.0f));
118 }
119 }
120
121 /**
122 * @name set_row_spaces
123 *
124 * Set the min_space and max_nonspace members of the row so that
125 * the blobs can be arranged into words.
126 */
127
set_row_spaces(TO_BLOCK * block,FCOORD rotation,bool testing_on)128 void set_row_spaces( // find space sizes
129 TO_BLOCK *block, // block to do
130 FCOORD rotation, // for drawing
131 bool testing_on // correct orientation
132 ) {
133 TO_ROW *row; // current row
134 TO_ROW_IT row_it = block->get_rows();
135
136 if (row_it.empty()) {
137 return; // empty block
138 }
139 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
140 row = row_it.data();
141 if (row->fixed_pitch == 0) {
142 row->min_space = static_cast<int32_t>(
143 ceil(row->pr_space - (row->pr_space - row->pr_nonsp) * textord_words_definite_spread));
144 row->max_nonspace = static_cast<int32_t>(
145 floor(row->pr_nonsp + (row->pr_space - row->pr_nonsp) * textord_words_definite_spread));
146 if (testing_on && textord_show_initial_words) {
147 tprintf("Assigning defaults %d non, %d space to row at %g\n", row->max_nonspace,
148 row->min_space, row->intercept());
149 }
150 row->space_threshold = (row->max_nonspace + row->min_space) / 2;
151 row->space_size = row->pr_space;
152 row->kern_size = row->pr_nonsp;
153 }
154 #ifndef GRAPHICS_DISABLED
155 if (textord_show_initial_words && testing_on) {
156 plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row);
157 }
158 #endif
159 }
160 }
161
162 /**
163 * @name row_words
164 *
165 * Compute the max nonspace and min space for the row.
166 */
167
row_words(TO_BLOCK * block,TO_ROW * row,int32_t maxwidth,FCOORD rotation,bool testing_on)168 int32_t row_words( // compute space size
169 TO_BLOCK *block, // block it came from
170 TO_ROW *row, // row to operate on
171 int32_t maxwidth, // max expected space size
172 FCOORD rotation, // for drawing
173 bool testing_on // for debug
174 ) {
175 bool testing_row; // contains testpt
176 bool prev_valid; // if decent size
177 int32_t prev_x; // end of prev blob
178 int32_t cluster_count; // no of clusters
179 int32_t gap_index; // which cluster
180 int32_t smooth_factor; // for smoothing stats
181 BLOBNBOX *blob; // current blob
182 float lower, upper; // clustering parameters
183 float gaps[3]; // gap clusers
184 ICOORD testpt;
185 TBOX blob_box; // bounding box
186 // iterator
187 BLOBNBOX_IT blob_it = row->blob_list();
188 STATS gap_stats(0, maxwidth);
189 STATS cluster_stats[4]; // clusters
190
191 testpt = ICOORD(textord_test_x, textord_test_y);
192 smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5);
193 // if (testing_on)
194 // tprintf("Row smooth factor=%d\n",smooth_factor);
195 prev_valid = false;
196 prev_x = -INT32_MAX;
197 testing_row = false;
198 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
199 blob = blob_it.data();
200 blob_box = blob->bounding_box();
201 if (blob_box.contains(testpt)) {
202 testing_row = true;
203 }
204 gap_stats.add(blob_box.width(), 1);
205 }
206 gap_stats.clear();
207 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
208 blob = blob_it.data();
209 if (!blob->joined_to_prev()) {
210 blob_box = blob->bounding_box();
211 if (prev_valid && blob_box.left() - prev_x < maxwidth) {
212 gap_stats.add(blob_box.left() - prev_x, 1);
213 }
214 prev_valid = true;
215 prev_x = blob_box.right();
216 }
217 }
218 if (gap_stats.get_total() == 0) {
219 row->min_space = 0; // no evidence
220 row->max_nonspace = 0;
221 return 0;
222 }
223 gap_stats.smooth(smooth_factor);
224 lower = row->xheight * textord_words_initial_lower;
225 upper = row->xheight * textord_words_initial_upper;
226 cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats);
227 while (cluster_count < 2 && std::ceil(lower) < std::floor(upper)) {
228 // shrink gap
229 upper = (upper * 3 + lower) / 4;
230 lower = (lower * 3 + upper) / 4;
231 cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats);
232 }
233 if (cluster_count < 2) {
234 row->min_space = 0; // no evidence
235 row->max_nonspace = 0;
236 return 0;
237 }
238 for (gap_index = 0; gap_index < cluster_count; gap_index++) {
239 gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
240 }
241 // get medians
242 if (cluster_count > 2) {
243 if (testing_on && textord_show_initial_words) {
244 tprintf("Row at %g has 3 sizes of gap:%g,%g,%g\n", row->intercept(),
245 cluster_stats[1].ile(0.5), cluster_stats[2].ile(0.5), cluster_stats[3].ile(0.5));
246 }
247 lower = gaps[0];
248 if (gaps[1] > lower) {
249 upper = gaps[1]; // prefer most frequent
250 if (upper < block->xheight * textord_words_min_minspace && gaps[2] > gaps[1]) {
251 upper = gaps[2];
252 }
253 } else if (gaps[2] > lower && gaps[2] >= block->xheight * textord_words_min_minspace) {
254 upper = gaps[2];
255 } else if (lower >= block->xheight * textord_words_min_minspace) {
256 upper = lower; // not nice
257 lower = gaps[1];
258 if (testing_on && textord_show_initial_words) {
259 tprintf("Had to switch most common from lower to upper!!\n");
260 gap_stats.print();
261 }
262 } else {
263 row->min_space = 0; // no evidence
264 row->max_nonspace = 0;
265 return 0;
266 }
267 } else {
268 if (gaps[1] < gaps[0]) {
269 if (testing_on && textord_show_initial_words) {
270 tprintf("Had to switch most common from lower to upper!!\n");
271 gap_stats.print();
272 }
273 lower = gaps[1];
274 upper = gaps[0];
275 } else {
276 upper = gaps[1];
277 lower = gaps[0];
278 }
279 }
280 if (upper < block->xheight * textord_words_min_minspace) {
281 row->min_space = 0; // no evidence
282 row->max_nonspace = 0;
283 return 0;
284 }
285 if (upper * 3 < block->min_space * 2 + block->max_nonspace ||
286 lower * 3 > block->min_space * 2 + block->max_nonspace) {
287 if (testing_on && textord_show_initial_words) {
288 tprintf("Disagreement between block and row at %g!!\n", row->intercept());
289 tprintf("Lower=%g, upper=%g, Stats:\n", lower, upper);
290 gap_stats.print();
291 }
292 }
293 row->min_space =
294 static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread));
295 row->max_nonspace =
296 static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread));
297 row->space_threshold = (row->max_nonspace + row->min_space) / 2;
298 row->space_size = upper;
299 row->kern_size = lower;
300 if (testing_on && textord_show_initial_words) {
301 if (testing_row) {
302 tprintf("GAP STATS\n");
303 gap_stats.print();
304 tprintf("SPACE stats\n");
305 cluster_stats[2].print_summary();
306 tprintf("NONSPACE stats\n");
307 cluster_stats[1].print_summary();
308 }
309 tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space,
310 upper, row->max_nonspace, lower);
311 }
312 return cluster_stats[2].get_total();
313 }
314
315 /**
316 * @name row_words2
317 *
318 * Compute the max nonspace and min space for the row.
319 */
320
row_words2(TO_BLOCK * block,TO_ROW * row,int32_t maxwidth,FCOORD rotation,bool testing_on)321 int32_t row_words2( // compute space size
322 TO_BLOCK *block, // block it came from
323 TO_ROW *row, // row to operate on
324 int32_t maxwidth, // max expected space size
325 FCOORD rotation, // for drawing
326 bool testing_on // for debug
327 ) {
328 bool prev_valid; // if decent size
329 bool this_valid; // current blob big enough
330 int32_t prev_x; // end of prev blob
331 int32_t min_width; // min interesting width
332 int32_t valid_count; // good gaps
333 int32_t total_count; // total gaps
334 int32_t cluster_count; // no of clusters
335 int32_t prev_count; // previous cluster_count
336 int32_t gap_index; // which cluster
337 int32_t smooth_factor; // for smoothing stats
338 BLOBNBOX *blob; // current blob
339 float lower, upper; // clustering parameters
340 ICOORD testpt;
341 TBOX blob_box; // bounding box
342 // iterator
343 BLOBNBOX_IT blob_it = row->blob_list();
344 STATS gap_stats(0, maxwidth);
345 // gap sizes
346 float gaps[BLOCK_STATS_CLUSTERS];
347 STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
348 // clusters
349
350 testpt = ICOORD(textord_test_x, textord_test_y);
351 smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5);
352 // if (testing_on)
353 // tprintf("Row smooth factor=%d\n",smooth_factor);
354 prev_valid = false;
355 prev_x = -INT16_MAX;
356 const bool testing_row = false;
357 // min blob size
358 min_width = static_cast<int32_t>(block->pr_space);
359 total_count = 0;
360 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
361 blob = blob_it.data();
362 if (!blob->joined_to_prev()) {
363 blob_box = blob->bounding_box();
364 this_valid = blob_box.width() >= min_width;
365 if (this_valid && prev_valid && blob_box.left() - prev_x < maxwidth) {
366 gap_stats.add(blob_box.left() - prev_x, 1);
367 }
368 total_count++; // count possibles
369 prev_x = blob_box.right();
370 prev_valid = this_valid;
371 }
372 }
373 valid_count = gap_stats.get_total();
374 if (valid_count < total_count * textord_words_minlarge) {
375 gap_stats.clear();
376 prev_x = -INT16_MAX;
377 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
378 blob = blob_it.data();
379 if (!blob->joined_to_prev()) {
380 blob_box = blob->bounding_box();
381 if (blob_box.left() - prev_x < maxwidth) {
382 gap_stats.add(blob_box.left() - prev_x, 1);
383 }
384 prev_x = blob_box.right();
385 }
386 }
387 }
388 if (gap_stats.get_total() == 0) {
389 row->min_space = 0; // no evidence
390 row->max_nonspace = 0;
391 return 0;
392 }
393
394 cluster_count = 0;
395 lower = block->xheight * words_initial_lower;
396 upper = block->xheight * words_initial_upper;
397 gap_stats.smooth(smooth_factor);
398 do {
399 prev_count = cluster_count;
400 cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop,
401 BLOCK_STATS_CLUSTERS, cluster_stats);
402 } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
403 if (cluster_count < 1) {
404 row->min_space = 0;
405 row->max_nonspace = 0;
406 return 0;
407 }
408 for (gap_index = 0; gap_index < cluster_count; gap_index++) {
409 gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
410 }
411 // get medians
412 if (testing_on) {
413 tprintf("cluster_count=%d:", cluster_count);
414 for (gap_index = 0; gap_index < cluster_count; gap_index++) {
415 tprintf(" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total());
416 }
417 tprintf("\n");
418 }
419
420 // Try to find proportional non-space and space for row.
421 for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] > block->max_nonspace;
422 gap_index++) {
423 ;
424 }
425 if (gap_index < cluster_count) {
426 lower = gaps[gap_index]; // most frequent below
427 } else {
428 if (testing_on) {
429 tprintf("No cluster below block threshold!, using default=%g\n", block->pr_nonsp);
430 }
431 lower = block->pr_nonsp;
432 }
433 for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] <= block->max_nonspace;
434 gap_index++) {
435 ;
436 }
437 if (gap_index < cluster_count) {
438 upper = gaps[gap_index]; // most frequent above
439 } else {
440 if (testing_on) {
441 tprintf("No cluster above block threshold!, using default=%g\n", block->pr_space);
442 }
443 upper = block->pr_space;
444 }
445 row->min_space =
446 static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread));
447 row->max_nonspace =
448 static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread));
449 row->space_threshold = (row->max_nonspace + row->min_space) / 2;
450 row->space_size = upper;
451 row->kern_size = lower;
452 if (testing_on) {
453 if (testing_row) {
454 tprintf("GAP STATS\n");
455 gap_stats.print();
456 tprintf("SPACE stats\n");
457 cluster_stats[2].print_summary();
458 tprintf("NONSPACE stats\n");
459 cluster_stats[1].print_summary();
460 }
461 tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space,
462 upper, row->max_nonspace, lower);
463 }
464 return 1;
465 }
466
467 /**
468 * @name make_real_words
469 *
470 * Convert a TO_BLOCK to a BLOCK.
471 */
472
make_real_words(tesseract::Textord * textord,TO_BLOCK * block,FCOORD rotation)473 void make_real_words(tesseract::Textord *textord,
474 TO_BLOCK *block, // block to do
475 FCOORD rotation // for drawing
476 ) {
477 TO_ROW *row; // current row
478 TO_ROW_IT row_it = block->get_rows();
479 ROW *real_row = nullptr; // output row
480 ROW_IT real_row_it = block->block->row_list();
481
482 if (row_it.empty()) {
483 return; // empty block
484 }
485 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
486 row = row_it.data();
487 if (row->blob_list()->empty() && !row->rep_words.empty()) {
488 real_row = make_rep_words(row, block);
489 } else if (!row->blob_list()->empty()) {
490 // In a fixed pitch document, some lines may be detected as fixed pitch
491 // while others don't, and will go through different path.
492 // For non-space delimited language like CJK, fixed pitch chop always
493 // leave the entire line as one word. We can force consistent chopping
494 // with force_make_prop_words flag.
495 POLY_BLOCK *pb = block->block->pdblk.poly_block();
496 if (textord_chopper_test) {
497 real_row = textord->make_blob_words(row, rotation);
498 } else if (textord_force_make_prop_words || (pb != nullptr && !pb->IsText()) ||
499 row->pitch_decision == PITCH_DEF_PROP || row->pitch_decision == PITCH_CORR_PROP) {
500 real_row = textord->make_prop_words(row, rotation);
501 } else if (row->pitch_decision == PITCH_DEF_FIXED ||
502 row->pitch_decision == PITCH_CORR_FIXED) {
503 real_row = fixed_pitch_words(row, rotation);
504 } else {
505 ASSERT_HOST(false);
506 }
507 }
508 if (real_row != nullptr) {
509 // put row in block
510 real_row_it.add_after_then_move(real_row);
511 }
512 }
513 block->block->set_stats(block->fixed_pitch == 0, static_cast<int16_t>(block->kern_size),
514 static_cast<int16_t>(block->space_size),
515 static_cast<int16_t>(block->fixed_pitch));
516 block->block->check_pitch();
517 }
518
519 /**
520 * @name make_rep_words
521 *
522 * Fabricate a real row from only the repeated blob words.
523 * Get the xheight from the block as it may be more meaningful.
524 */
525
make_rep_words(TO_ROW * row,TO_BLOCK * block)526 ROW *make_rep_words( // make a row
527 TO_ROW *row, // row to convert
528 TO_BLOCK *block // block it lives in
529 ) {
530 ROW *real_row; // output row
531 TBOX word_box; // bounding box
532 // iterator
533 WERD_IT word_it = &row->rep_words;
534
535 if (word_it.empty()) {
536 return nullptr;
537 }
538 word_box = word_it.data()->bounding_box();
539 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
540 word_box += word_it.data()->bounding_box();
541 }
542 row->xheight = block->xheight;
543 real_row =
544 new ROW(row, static_cast<int16_t>(block->kern_size), static_cast<int16_t>(block->space_size));
545 word_it.set_to_list(real_row->word_list());
546 // put words in row
547 word_it.add_list_after(&row->rep_words);
548 real_row->recalc_bounding_box();
549 return real_row;
550 }
551
552 /**
553 * @name make_real_word
554 *
555 * Construct a WERD from a given number of adjacent entries in a
556 * list of BLOBNBOXs.
557 */
558
make_real_word(BLOBNBOX_IT * box_it,int32_t blobcount,bool bol,uint8_t blanks)559 WERD *make_real_word(BLOBNBOX_IT *box_it, // iterator
560 int32_t blobcount, // no of blobs to use
561 bool bol, // start of line
562 uint8_t blanks // no of blanks
563 ) {
564 C_OUTLINE_IT cout_it;
565 C_BLOB_LIST cblobs;
566 C_BLOB_IT cblob_it = &cblobs;
567
568 for (int blobindex = 0; blobindex < blobcount; blobindex++) {
569 auto bblob = box_it->extract();
570 if (bblob->joined_to_prev()) {
571 auto cblob = bblob->remove_cblob();
572 if (cblob != nullptr) {
573 cout_it.set_to_list(cblob_it.data()->out_list());
574 cout_it.move_to_last();
575 cout_it.add_list_after(cblob->out_list());
576 delete cblob;
577 }
578 } else {
579 auto cblob = bblob->remove_cblob();
580 if (cblob != nullptr) {
581 cblob_it.add_after_then_move(cblob);
582 }
583 }
584 delete bblob;
585 box_it->forward(); // next one
586 }
587
588 if (blanks < 1) {
589 blanks = 1;
590 }
591
592 auto word = new WERD(&cblobs, blanks, nullptr);
593
594 if (bol) {
595 word->set_flag(W_BOL, true);
596 }
597 if (box_it->at_first()) {
598 word->set_flag(W_EOL, true); // at end of line
599 }
600
601 return word;
602 }
603
604 } // namespace tesseract
605