1 /**********************************************************************
2 * File: topitch.cpp (Formerly to_pitch.c)
3 * Description: Code to determine fixed pitchness and the pitch if fixed.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1993, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23
24 #include "topitch.h"
25
26 #include "blobbox.h"
27 #include "drawtord.h"
28 #include "makerow.h"
29 #include "pithsync.h"
30 #include "pitsync1.h"
31 #include "statistc.h"
32 #include "tovars.h"
33 #include "wordseg.h"
34
35 #include "helpers.h"
36
37 #include <memory>
38
39 namespace tesseract {
40
// Tunable parameters for the fixed-pitch code in this file.
// NOTE(review): BOOL_VAR/double_VAR are project macros defined elsewhere;
// the arguments look like (name, default value, help text) - confirm.
// The two `static` entries are local to this translation unit.
static BOOL_VAR(textord_all_prop, false, "All doc is proportial text");
// Enables tprintf debug output in compute_block_pitch, fix_row_pitch and
// try_rows_fixed.
BOOL_VAR(textord_debug_pitch_test, false, "Debug on fixed pitch test");
static BOOL_VAR(textord_disable_pitch_test, false, "Turn off dp fixed pitch algorithm");
BOOL_VAR(textord_fast_pitch_test, false, "Do even faster pitch algorithm");
// Enables the one-line metric dumps printed by fix_row_pitch, try_doc_fixed
// and find_row_pitch.
BOOL_VAR(textord_debug_pitch_metric, false, "Write full metric stuff");
BOOL_VAR(textord_show_row_cuts, false, "Draw row-level cuts");
// When set (and a debug window exists) try_doc_fixed plots the page
// projection and the per-row cells.
BOOL_VAR(textord_show_page_cuts, false, "Draw page-level cuts");
// try_doc_fixed returns immediately unless this is set.
BOOL_VAR(textord_blockndoc_fixed, false, "Attempt whole doc/block fixed pitch");
double_VAR(textord_projection_scale, 0.200, "Ding rate for mid-cuts");
double_VAR(textord_balance_factor, 1.0, "Ding rate for unbalanced char cells");

// Maximum number of gap clusters requested by row_pitch_stats; also sizes its
// gaps[] and cluster_stats[] arrays.
#define BLOCK_STATS_CLUSTERS 10
// Upper end of the range of the pitch histogram built in try_doc_fixed.
#define MAX_ALLOWED_PITCH 100 // max pixel pitch.
54
// Comparator for qsort over an array of floats (ascending order).
// Returns 1 if *arg1 > *arg2, -1 if *arg1 < *arg2 and 0 otherwise.
static int sort_floats(const void *arg1, const void *arg2) {
  const float lhs = *static_cast<const float *>(arg1);
  const float rhs = *static_cast<const float *>(arg2);
  const float delta = lhs - rhs;
  // Sign of the difference: each comparison contributes 0 or 1.
  return (delta > 0) - (delta < 0);
}
66
67 /**********************************************************************
68 * compute_fixed_pitch
69 *
70 * Decide whether each row is fixed pitch individually.
71 * Correlate definite and uncertain results to obtain an individual
72 * result for each row in the TO_ROW class.
73 **********************************************************************/
74
compute_fixed_pitch(ICOORD page_tr,TO_BLOCK_LIST * port_blocks,float gradient,FCOORD rotation,bool testing_on)75 void compute_fixed_pitch(ICOORD page_tr, // top right
76 TO_BLOCK_LIST *port_blocks, // input list
77 float gradient, // page skew
78 FCOORD rotation, // for drawing
79 bool testing_on) { // correct orientation
80 TO_BLOCK_IT block_it; // iterator
81 TO_BLOCK *block; // current block;
82 TO_ROW *row; // current row
83 int block_index; // block number
84 int row_index; // row number
85
86 #ifndef GRAPHICS_DISABLED
87 if (textord_show_initial_words && testing_on) {
88 if (to_win == nullptr) {
89 create_to_win(page_tr);
90 }
91 }
92 #endif
93
94 block_it.set_to_list(port_blocks);
95 block_index = 1;
96 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
97 block = block_it.data();
98 compute_block_pitch(block, rotation, block_index, testing_on);
99 block_index++;
100 }
101
102 if (!try_doc_fixed(page_tr, port_blocks, gradient)) {
103 block_index = 1;
104 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
105 block = block_it.data();
106 if (!try_block_fixed(block, block_index)) {
107 try_rows_fixed(block, block_index, testing_on);
108 }
109 block_index++;
110 }
111 }
112
113 block_index = 1;
114 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
115 block = block_it.data();
116 POLY_BLOCK *pb = block->block->pdblk.poly_block();
117 if (pb != nullptr && !pb->IsText()) {
118 continue; // Non-text doesn't exist!
119 }
120 // row iterator
121 TO_ROW_IT row_it(block->get_rows());
122 row_index = 1;
123 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
124 row = row_it.data();
125 fix_row_pitch(row, block, port_blocks, row_index, block_index);
126 row_index++;
127 }
128 block_index++;
129 }
130 #ifndef GRAPHICS_DISABLED
131 if (textord_show_initial_words && testing_on) {
132 ScrollView::Update();
133 }
134 #endif
135 }
136
137 /**********************************************************************
138 * fix_row_pitch
139 *
140 * Get a pitch_decision for this row by voting among similar rows in the
141 * block, then similar rows over all the page, or any other rows at all.
142 **********************************************************************/
143
fix_row_pitch(TO_ROW * bad_row,TO_BLOCK * bad_block,TO_BLOCK_LIST * blocks,int32_t row_target,int32_t block_target)144 void fix_row_pitch(TO_ROW *bad_row, // row to fix
145 TO_BLOCK *bad_block, // block of bad_row
146 TO_BLOCK_LIST *blocks, // blocks to scan
147 int32_t row_target, // number of row
148 int32_t block_target) { // number of block
149 int16_t mid_cuts;
150 int block_votes; // votes in block
151 int like_votes; // votes over page
152 int other_votes; // votes of unlike blocks
153 int block_index; // number of block
154 int row_index; // number of row
155 int maxwidth; // max pitch
156 TO_BLOCK_IT block_it = blocks; // block iterator
157 TO_BLOCK *block; // current block
158 TO_ROW *row; // current row
159 float sp_sd; // space deviation
160 STATS block_stats; // pitches in block
161 STATS like_stats; // pitches in page
162
163 block_votes = like_votes = other_votes = 0;
164 maxwidth = static_cast<int32_t>(ceil(bad_row->xheight * textord_words_maxspace));
165 if (bad_row->pitch_decision != PITCH_DEF_FIXED && bad_row->pitch_decision != PITCH_DEF_PROP) {
166 block_stats.set_range(0, maxwidth);
167 like_stats.set_range(0, maxwidth);
168 block_index = 1;
169 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
170 block = block_it.data();
171 POLY_BLOCK *pb = block->block->pdblk.poly_block();
172 if (pb != nullptr && !pb->IsText()) {
173 continue; // Non text doesn't exist!
174 }
175 row_index = 1;
176 TO_ROW_IT row_it(block->get_rows());
177 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
178 row = row_it.data();
179 if ((bad_row->all_caps &&
180 row->xheight + row->ascrise <
181 (bad_row->xheight + bad_row->ascrise) * (1 + textord_pitch_rowsimilarity) &&
182 row->xheight + row->ascrise >
183 (bad_row->xheight + bad_row->ascrise) * (1 - textord_pitch_rowsimilarity)) ||
184 (!bad_row->all_caps &&
185 row->xheight < bad_row->xheight * (1 + textord_pitch_rowsimilarity) &&
186 row->xheight > bad_row->xheight * (1 - textord_pitch_rowsimilarity))) {
187 if (block_index == block_target) {
188 if (row->pitch_decision == PITCH_DEF_FIXED) {
189 block_votes += textord_words_veto_power;
190 block_stats.add(static_cast<int32_t>(row->fixed_pitch), textord_words_veto_power);
191 } else if (row->pitch_decision == PITCH_MAYBE_FIXED ||
192 row->pitch_decision == PITCH_CORR_FIXED) {
193 block_votes++;
194 block_stats.add(static_cast<int32_t>(row->fixed_pitch), 1);
195 } else if (row->pitch_decision == PITCH_DEF_PROP) {
196 block_votes -= textord_words_veto_power;
197 } else if (row->pitch_decision == PITCH_MAYBE_PROP ||
198 row->pitch_decision == PITCH_CORR_PROP) {
199 block_votes--;
200 }
201 } else {
202 if (row->pitch_decision == PITCH_DEF_FIXED) {
203 like_votes += textord_words_veto_power;
204 like_stats.add(static_cast<int32_t>(row->fixed_pitch), textord_words_veto_power);
205 } else if (row->pitch_decision == PITCH_MAYBE_FIXED ||
206 row->pitch_decision == PITCH_CORR_FIXED) {
207 like_votes++;
208 like_stats.add(static_cast<int32_t>(row->fixed_pitch), 1);
209 } else if (row->pitch_decision == PITCH_DEF_PROP) {
210 like_votes -= textord_words_veto_power;
211 } else if (row->pitch_decision == PITCH_MAYBE_PROP ||
212 row->pitch_decision == PITCH_CORR_PROP) {
213 like_votes--;
214 }
215 }
216 } else {
217 if (row->pitch_decision == PITCH_DEF_FIXED) {
218 other_votes += textord_words_veto_power;
219 } else if (row->pitch_decision == PITCH_MAYBE_FIXED ||
220 row->pitch_decision == PITCH_CORR_FIXED) {
221 other_votes++;
222 } else if (row->pitch_decision == PITCH_DEF_PROP) {
223 other_votes -= textord_words_veto_power;
224 } else if (row->pitch_decision == PITCH_MAYBE_PROP ||
225 row->pitch_decision == PITCH_CORR_PROP) {
226 other_votes--;
227 }
228 }
229 row_index++;
230 }
231 block_index++;
232 }
233 if (block_votes > textord_words_veto_power) {
234 bad_row->fixed_pitch = block_stats.ile(0.5);
235 bad_row->pitch_decision = PITCH_CORR_FIXED;
236 } else if (block_votes <= textord_words_veto_power && like_votes > 0) {
237 bad_row->fixed_pitch = like_stats.ile(0.5);
238 bad_row->pitch_decision = PITCH_CORR_FIXED;
239 } else {
240 bad_row->pitch_decision = PITCH_CORR_PROP;
241 if (block_votes == 0 && like_votes == 0 && other_votes > 0 &&
242 (textord_debug_pitch_test || textord_debug_pitch_metric)) {
243 tprintf(
244 "Warning:row %d of block %d set prop with no like rows against "
245 "trend\n",
246 row_target, block_target);
247 }
248 }
249 }
250 if (textord_debug_pitch_metric) {
251 tprintf(":b_votes=%d:l_votes=%d:o_votes=%d", block_votes, like_votes, other_votes);
252 tprintf("x=%g:asc=%g\n", bad_row->xheight, bad_row->ascrise);
253 }
254 if (bad_row->pitch_decision == PITCH_CORR_FIXED) {
255 if (bad_row->fixed_pitch < textord_min_xheight) {
256 if (block_votes > 0) {
257 bad_row->fixed_pitch = block_stats.ile(0.5);
258 } else if (block_votes == 0 && like_votes > 0) {
259 bad_row->fixed_pitch = like_stats.ile(0.5);
260 } else {
261 tprintf("Warning:guessing pitch as xheight on row %d, block %d\n", row_target,
262 block_target);
263 bad_row->fixed_pitch = bad_row->xheight;
264 }
265 }
266 if (bad_row->fixed_pitch < textord_min_xheight) {
267 bad_row->fixed_pitch = (float)textord_min_xheight;
268 }
269 bad_row->kern_size = bad_row->fixed_pitch / 4;
270 bad_row->min_space = static_cast<int32_t>(bad_row->fixed_pitch * 0.6);
271 bad_row->max_nonspace = static_cast<int32_t>(bad_row->fixed_pitch * 0.4);
272 bad_row->space_threshold = (bad_row->min_space + bad_row->max_nonspace) / 2;
273 bad_row->space_size = bad_row->fixed_pitch;
274 if (bad_row->char_cells.empty() && !bad_row->blob_list()->empty()) {
275 tune_row_pitch(bad_row, &bad_row->projection, bad_row->projection_left,
276 bad_row->projection_right,
277 (bad_row->fixed_pitch + bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch,
278 sp_sd, mid_cuts, &bad_row->char_cells, false);
279 }
280 } else if (bad_row->pitch_decision == PITCH_CORR_PROP ||
281 bad_row->pitch_decision == PITCH_DEF_PROP) {
282 bad_row->fixed_pitch = 0.0f;
283 bad_row->char_cells.clear();
284 }
285 }
286
287 /**********************************************************************
288 * compute_block_pitch
289 *
290 * Decide whether each block is fixed pitch individually.
291 **********************************************************************/
292
compute_block_pitch(TO_BLOCK * block,FCOORD rotation,int32_t block_index,bool testing_on)293 void compute_block_pitch(TO_BLOCK *block, // input list
294 FCOORD rotation, // for drawing
295 int32_t block_index, // block number
296 bool testing_on) { // correct orientation
297 TBOX block_box; // bounding box
298
299 block_box = block->block->pdblk.bounding_box();
300 if (testing_on && textord_debug_pitch_test) {
301 tprintf("Block %d at (%d,%d)->(%d,%d)\n", block_index, block_box.left(), block_box.bottom(),
302 block_box.right(), block_box.top());
303 }
304 block->min_space = static_cast<int32_t>(floor(block->xheight * textord_words_default_minspace));
305 block->max_nonspace = static_cast<int32_t>(ceil(block->xheight * textord_words_default_nonspace));
306 block->fixed_pitch = 0.0f;
307 block->space_size = static_cast<float>(block->min_space);
308 block->kern_size = static_cast<float>(block->max_nonspace);
309 block->pr_nonsp = block->xheight * words_default_prop_nonspace;
310 block->pr_space = block->pr_nonsp * textord_spacesize_ratioprop;
311 if (!block->get_rows()->empty()) {
312 ASSERT_HOST(block->xheight > 0);
313 find_repeated_chars(block, textord_show_initial_words && testing_on);
314 #ifndef GRAPHICS_DISABLED
315 if (textord_show_initial_words && testing_on) {
316 // overlap_picture_ops(true);
317 ScrollView::Update();
318 }
319 #endif
320 compute_rows_pitch(block, block_index, textord_debug_pitch_test && testing_on);
321 }
322 }
323
324 /**********************************************************************
325 * compute_rows_pitch
326 *
327 * Decide whether each row is fixed pitch individually.
328 **********************************************************************/
329
compute_rows_pitch(TO_BLOCK * block,int32_t block_index,bool testing_on)330 bool compute_rows_pitch( // find line stats
331 TO_BLOCK *block, // block to do
332 int32_t block_index, // block number
333 bool testing_on // correct orientation
334 ) {
335 int32_t maxwidth; // of spaces
336 TO_ROW *row; // current row
337 int32_t row_index; // row number.
338 float lower, upper; // cluster thresholds
339 TO_ROW_IT row_it = block->get_rows();
340
341 row_index = 1;
342 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
343 row = row_it.data();
344 ASSERT_HOST(row->xheight > 0);
345 row->compute_vertical_projection();
346 maxwidth = static_cast<int32_t>(ceil(row->xheight * textord_words_maxspace));
347 if (row_pitch_stats(row, maxwidth, testing_on) &&
348 find_row_pitch(row, maxwidth, textord_dotmatrix_gap + 1, block, block_index, row_index,
349 testing_on)) {
350 if (row->fixed_pitch == 0) {
351 lower = row->pr_nonsp;
352 upper = row->pr_space;
353 row->space_size = upper;
354 row->kern_size = lower;
355 }
356 } else {
357 row->fixed_pitch = 0.0f; // insufficient data
358 row->pitch_decision = PITCH_DUNNO;
359 }
360 row_index++;
361 }
362 return false;
363 }
364
365 /**********************************************************************
366 * try_doc_fixed
367 *
368 * Attempt to call the entire document fixed pitch.
369 **********************************************************************/
370
try_doc_fixed(ICOORD page_tr,TO_BLOCK_LIST * port_blocks,float gradient)371 bool try_doc_fixed( // determine pitch
372 ICOORD page_tr, // top right
373 TO_BLOCK_LIST *port_blocks, // input list
374 float gradient // page skew
375 ) {
376 int16_t master_x; // uniform shifts
377 int16_t pitch; // median pitch.
378 int x; // profile coord
379 int prop_blocks; // correct counts
380 int fixed_blocks;
381 int total_row_count; // total in page
382 // iterator
383 TO_BLOCK_IT block_it = port_blocks;
384 TO_BLOCK *block; // current block;
385 TO_ROW *row; // current row
386 int16_t projection_left; // edges
387 int16_t projection_right;
388 int16_t row_left; // edges of row
389 int16_t row_right;
390 float master_y; // uniform shifts
391 float shift_factor; // page skew correction
392 float final_pitch; // output pitch
393 float row_y; // baseline
394 STATS projection; // entire page
395 STATS pitches(0, MAX_ALLOWED_PITCH);
396 // for median
397 float sp_sd; // space sd
398 int16_t mid_cuts; // no of cheap cuts
399 float pitch_sd; // sync rating
400
401 if (block_it.empty()
402 // || block_it.data()==block_it.data_relative(1)
403 || !textord_blockndoc_fixed) {
404 return false;
405 }
406 shift_factor = gradient / (gradient * gradient + 1);
407 // row iterator
408 TO_ROW_IT row_it(block_it.data()->get_rows());
409 master_x = row_it.data()->projection_left;
410 master_y = row_it.data()->baseline.y(master_x);
411 projection_left = INT16_MAX;
412 projection_right = -INT16_MAX;
413 prop_blocks = 0;
414 fixed_blocks = 0;
415 total_row_count = 0;
416
417 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
418 block = block_it.data();
419 row_it.set_to_list(block->get_rows());
420 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
421 row = row_it.data();
422 total_row_count++;
423 if (row->fixed_pitch > 0) {
424 pitches.add(static_cast<int32_t>(row->fixed_pitch), 1);
425 }
426 // find median
427 row_y = row->baseline.y(master_x);
428 row_left = static_cast<int16_t>(row->projection_left - shift_factor * (master_y - row_y));
429 row_right = static_cast<int16_t>(row->projection_right - shift_factor * (master_y - row_y));
430 if (row_left < projection_left) {
431 projection_left = row_left;
432 }
433 if (row_right > projection_right) {
434 projection_right = row_right;
435 }
436 }
437 }
438 if (pitches.get_total() == 0) {
439 return false;
440 }
441 projection.set_range(projection_left, projection_right);
442
443 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
444 block = block_it.data();
445 row_it.set_to_list(block->get_rows());
446 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
447 row = row_it.data();
448 row_y = row->baseline.y(master_x);
449 row_left = static_cast<int16_t>(row->projection_left - shift_factor * (master_y - row_y));
450 for (x = row->projection_left; x < row->projection_right; x++, row_left++) {
451 projection.add(row_left, row->projection.pile_count(x));
452 }
453 }
454 }
455
456 row_it.set_to_list(block_it.data()->get_rows());
457 row = row_it.data();
458 #ifndef GRAPHICS_DISABLED
459 if (textord_show_page_cuts && to_win != nullptr) {
460 projection.plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL);
461 }
462 #endif
463 final_pitch = pitches.ile(0.5);
464 pitch = static_cast<int16_t>(final_pitch);
465 pitch_sd = tune_row_pitch(row, &projection, projection_left, projection_right, pitch * 0.75,
466 final_pitch, sp_sd, mid_cuts, &row->char_cells, false);
467
468 if (textord_debug_pitch_metric) {
469 tprintf(
470 "try_doc:props=%d:fixed=%d:pitch=%d:final_pitch=%g:pitch_sd=%g:sp_sd=%"
471 "g:sd/trc=%g:sd/p=%g:sd/trc/p=%g\n",
472 prop_blocks, fixed_blocks, pitch, final_pitch, pitch_sd, sp_sd, pitch_sd / total_row_count,
473 pitch_sd / pitch, pitch_sd / total_row_count / pitch);
474 }
475
476 #ifndef GRAPHICS_DISABLED
477 if (textord_show_page_cuts && to_win != nullptr) {
478 float row_shift; // shift for row
479 ICOORDELT_LIST *master_cells; // cells for page
480 master_cells = &row->char_cells;
481 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
482 block = block_it.data();
483 row_it.set_to_list(block->get_rows());
484 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
485 row = row_it.data();
486 row_y = row->baseline.y(master_x);
487 row_shift = shift_factor * (master_y - row_y);
488 plot_row_cells(to_win, ScrollView::GOLDENROD, row, row_shift, master_cells);
489 }
490 }
491 }
492 #endif
493 row->char_cells.clear();
494 return false;
495 }
496
/**********************************************************************
 * try_block_fixed
 *
 * Try to call the entire block fixed.
 *
 * This is a stub: neither argument is examined and the result is
 * always false.
 **********************************************************************/

bool try_block_fixed(   // find line stats
    TO_BLOCK *block,    // block to do
    int32_t block_index // block number
) {
  // Returning false makes compute_fixed_pitch fall back to try_rows_fixed
  // for this block.
  return false;
}
509
510 /**********************************************************************
511 * try_rows_fixed
512 *
513 * Decide whether each row is fixed pitch individually.
514 **********************************************************************/
515
try_rows_fixed(TO_BLOCK * block,int32_t block_index,bool testing_on)516 bool try_rows_fixed( // find line stats
517 TO_BLOCK *block, // block to do
518 int32_t block_index, // block number
519 bool testing_on // correct orientation
520 ) {
521 TO_ROW *row; // current row
522 int32_t row_index; // row number.
523 int32_t def_fixed = 0; // counters
524 int32_t def_prop = 0;
525 int32_t maybe_fixed = 0;
526 int32_t maybe_prop = 0;
527 int32_t dunno = 0;
528 int32_t corr_fixed = 0;
529 int32_t corr_prop = 0;
530 float lower, upper; // cluster thresholds
531 TO_ROW_IT row_it = block->get_rows();
532
533 row_index = 1;
534 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
535 row = row_it.data();
536 ASSERT_HOST(row->xheight > 0);
537 if (row->fixed_pitch > 0 && fixed_pitch_row(row, block->block, block_index)) {
538 if (row->fixed_pitch == 0) {
539 lower = row->pr_nonsp;
540 upper = row->pr_space;
541 row->space_size = upper;
542 row->kern_size = lower;
543 }
544 }
545 row_index++;
546 }
547 count_block_votes(block, def_fixed, def_prop, maybe_fixed, maybe_prop, corr_fixed, corr_prop,
548 dunno);
549 if (testing_on &&
550 (textord_debug_pitch_test || textord_blocksall_prop || textord_blocksall_fixed)) {
551 tprintf("Initially:");
552 print_block_counts(block, block_index);
553 }
554 if (def_fixed > def_prop * textord_words_veto_power) {
555 block->pitch_decision = PITCH_DEF_FIXED;
556 } else if (def_prop > def_fixed * textord_words_veto_power) {
557 block->pitch_decision = PITCH_DEF_PROP;
558 } else if (def_fixed > 0 || def_prop > 0) {
559 block->pitch_decision = PITCH_DUNNO;
560 } else if (maybe_fixed > maybe_prop * textord_words_veto_power) {
561 block->pitch_decision = PITCH_MAYBE_FIXED;
562 } else if (maybe_prop > maybe_fixed * textord_words_veto_power) {
563 block->pitch_decision = PITCH_MAYBE_PROP;
564 } else {
565 block->pitch_decision = PITCH_DUNNO;
566 }
567 return false;
568 }
569
570 /**********************************************************************
571 * print_block_counts
572 *
573 * Count up how many rows have what decision and print the results.
574 **********************************************************************/
575
print_block_counts(TO_BLOCK * block,int32_t block_index)576 void print_block_counts( // find line stats
577 TO_BLOCK *block, // block to do
578 int32_t block_index // block number
579 ) {
580 int32_t def_fixed = 0; // counters
581 int32_t def_prop = 0;
582 int32_t maybe_fixed = 0;
583 int32_t maybe_prop = 0;
584 int32_t dunno = 0;
585 int32_t corr_fixed = 0;
586 int32_t corr_prop = 0;
587
588 count_block_votes(block, def_fixed, def_prop, maybe_fixed, maybe_prop, corr_fixed, corr_prop,
589 dunno);
590 tprintf("Block %d has (%d,%d,%d)", block_index, def_fixed, maybe_fixed, corr_fixed);
591 if (textord_blocksall_prop && (def_fixed || maybe_fixed || corr_fixed)) {
592 tprintf(" (Wrongly)");
593 }
594 tprintf(" fixed, (%d,%d,%d)", def_prop, maybe_prop, corr_prop);
595 if (textord_blocksall_fixed && (def_prop || maybe_prop || corr_prop)) {
596 tprintf(" (Wrongly)");
597 }
598 tprintf(" prop, %d dunno\n", dunno);
599 }
600
601 /**********************************************************************
602 * count_block_votes
603 *
604 * Count the number of rows in the block with each kind of pitch_decision.
605 **********************************************************************/
606
count_block_votes(TO_BLOCK * block,int32_t & def_fixed,int32_t & def_prop,int32_t & maybe_fixed,int32_t & maybe_prop,int32_t & corr_fixed,int32_t & corr_prop,int32_t & dunno)607 void count_block_votes( // find line stats
608 TO_BLOCK *block, // block to do
609 int32_t &def_fixed, // add to counts
610 int32_t &def_prop, int32_t &maybe_fixed, int32_t &maybe_prop, int32_t &corr_fixed,
611 int32_t &corr_prop, int32_t &dunno) {
612 TO_ROW *row; // current row
613 TO_ROW_IT row_it = block->get_rows();
614
615 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
616 row = row_it.data();
617 switch (row->pitch_decision) {
618 case PITCH_DUNNO:
619 dunno++;
620 break;
621 case PITCH_DEF_PROP:
622 def_prop++;
623 break;
624 case PITCH_MAYBE_PROP:
625 maybe_prop++;
626 break;
627 case PITCH_DEF_FIXED:
628 def_fixed++;
629 break;
630 case PITCH_MAYBE_FIXED:
631 maybe_fixed++;
632 break;
633 case PITCH_CORR_PROP:
634 corr_prop++;
635 break;
636 case PITCH_CORR_FIXED:
637 corr_fixed++;
638 break;
639 }
640 }
641 }
642
643 /**********************************************************************
644 * row_pitch_stats
645 *
646 * Decide whether each row is fixed pitch individually.
647 **********************************************************************/
648
row_pitch_stats(TO_ROW * row,int32_t maxwidth,bool testing_on)649 bool row_pitch_stats( // find line stats
650 TO_ROW *row, // current row
651 int32_t maxwidth, // of spaces
652 bool testing_on // correct orientation
653 ) {
654 BLOBNBOX *blob; // current blob
655 int gap_index; // current gap
656 int32_t prev_x; // end of prev blob
657 int32_t cluster_count; // no of clusters
658 int32_t prev_count; // of clusters
659 int32_t smooth_factor; // for smoothing stats
660 TBOX blob_box; // bounding box
661 float lower, upper; // cluster thresholds
662 // gap sizes
663 float gaps[BLOCK_STATS_CLUSTERS];
664 // blobs
665 BLOBNBOX_IT blob_it = row->blob_list();
666 STATS gap_stats(0, maxwidth);
667 STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
668 // clusters
669
670 smooth_factor = static_cast<int32_t>(row->xheight * textord_wordstats_smooth_factor + 1.5);
671 if (!blob_it.empty()) {
672 prev_x = blob_it.data()->bounding_box().right();
673 blob_it.forward();
674 while (!blob_it.at_first()) {
675 blob = blob_it.data();
676 if (!blob->joined_to_prev()) {
677 blob_box = blob->bounding_box();
678 if (blob_box.left() - prev_x < maxwidth) {
679 gap_stats.add(blob_box.left() - prev_x, 1);
680 }
681 prev_x = blob_box.right();
682 }
683 blob_it.forward();
684 }
685 }
686 if (gap_stats.get_total() == 0) {
687 return false;
688 }
689 cluster_count = 0;
690 lower = row->xheight * words_initial_lower;
691 upper = row->xheight * words_initial_upper;
692 gap_stats.smooth(smooth_factor);
693 do {
694 prev_count = cluster_count;
695 cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop,
696 BLOCK_STATS_CLUSTERS, cluster_stats);
697 } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
698 if (cluster_count < 1) {
699 return false;
700 }
701 for (gap_index = 0; gap_index < cluster_count; gap_index++) {
702 gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
703 }
704 // get medians
705 if (testing_on) {
706 tprintf("cluster_count=%d:", cluster_count);
707 for (gap_index = 0; gap_index < cluster_count; gap_index++) {
708 tprintf(" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total());
709 }
710 tprintf("\n");
711 }
712 qsort(gaps, cluster_count, sizeof(float), sort_floats);
713
714 // Try to find proportional non-space and space for row.
715 lower = row->xheight * words_default_prop_nonspace;
716 upper = row->xheight * textord_words_min_minspace;
717 for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] < lower; gap_index++) {
718 ;
719 }
720 if (gap_index == 0) {
721 if (testing_on) {
722 tprintf("No clusters below nonspace threshold!!\n");
723 }
724 if (cluster_count > 1) {
725 row->pr_nonsp = gaps[0];
726 row->pr_space = gaps[1];
727 } else {
728 row->pr_nonsp = lower;
729 row->pr_space = gaps[0];
730 }
731 } else {
732 row->pr_nonsp = gaps[gap_index - 1];
733 while (gap_index < cluster_count && gaps[gap_index] < upper) {
734 gap_index++;
735 }
736 if (gap_index == cluster_count) {
737 if (testing_on) {
738 tprintf("No clusters above nonspace threshold!!\n");
739 }
740 row->pr_space = lower * textord_spacesize_ratioprop;
741 } else {
742 row->pr_space = gaps[gap_index];
743 }
744 }
745
746 // Now try to find the fixed pitch space and non-space.
747 upper = row->xheight * words_default_fixed_space;
748 for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] < upper; gap_index++) {
749 ;
750 }
751 if (gap_index == 0) {
752 if (testing_on) {
753 tprintf("No clusters below space threshold!!\n");
754 }
755 row->fp_nonsp = upper;
756 row->fp_space = gaps[0];
757 } else {
758 row->fp_nonsp = gaps[gap_index - 1];
759 if (gap_index == cluster_count) {
760 if (testing_on) {
761 tprintf("No clusters above space threshold!!\n");
762 }
763 row->fp_space = row->xheight;
764 } else {
765 row->fp_space = gaps[gap_index];
766 }
767 }
768 if (testing_on) {
769 tprintf(
770 "Initial estimates:pr_nonsp=%g, pr_space=%g, fp_nonsp=%g, "
771 "fp_space=%g\n",
772 row->pr_nonsp, row->pr_space, row->fp_nonsp, row->fp_space);
773 }
774 return true; // computed some stats
775 }
776
777 /**********************************************************************
778 * find_row_pitch
779 *
780 * Check to see if this row could be fixed pitch using the given spacings.
781 * Blobs with gaps smaller than the lower threshold are assumed to be one.
782 * The larger threshold is the word gap threshold.
783 **********************************************************************/
784
find_row_pitch(TO_ROW * row,int32_t maxwidth,int32_t dm_gap,TO_BLOCK * block,int32_t block_index,int32_t row_index,bool testing_on)785 bool find_row_pitch( // find lines
786 TO_ROW *row, // row to do
787 int32_t maxwidth, // max permitted space
788 int32_t dm_gap, // ignorable gaps
789 TO_BLOCK *block, // block of row
790 int32_t block_index, // block_number
791 int32_t row_index, // number of row
792 bool testing_on // correct orientation
793 ) {
794 bool used_dm_model; // looks like dot matrix
795 float min_space; // estimate threshold
796 float non_space; // gap size
797 float gap_iqr; // interquartile range
798 float pitch_iqr;
799 float dm_gap_iqr; // interquartile range
800 float dm_pitch_iqr;
801 float dm_pitch; // pitch with dm on
802 float pitch; // revised estimate
803 float initial_pitch; // guess at pitch
804 STATS gap_stats(0, maxwidth);
805 // centre-centre
806 STATS pitch_stats(0, maxwidth);
807
808 row->fixed_pitch = 0.0f;
809 initial_pitch = row->fp_space;
810 if (initial_pitch > row->xheight * (1 + words_default_fixed_limit)) {
811 initial_pitch = row->xheight; // keep pitch decent
812 }
813 non_space = row->fp_nonsp;
814 if (non_space > initial_pitch) {
815 non_space = initial_pitch;
816 }
817 min_space = (initial_pitch + non_space) / 2;
818
819 if (!count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch, min_space, true, false,
820 dm_gap)) {
821 dm_gap_iqr = 0.0001f;
822 dm_pitch_iqr = maxwidth * 2.0f;
823 dm_pitch = initial_pitch;
824 } else {
825 dm_gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);
826 dm_pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25);
827 dm_pitch = pitch_stats.ile(0.5);
828 }
829 gap_stats.clear();
830 pitch_stats.clear();
831 if (!count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch, min_space, true, false, 0)) {
832 gap_iqr = 0.0001f;
833 pitch_iqr = maxwidth * 3.0f;
834 } else {
835 gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);
836 pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25);
837 if (testing_on) {
838 tprintf(
839 "First fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, "
840 "pitch=%g\n",
841 initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile(0.5));
842 }
843 initial_pitch = pitch_stats.ile(0.5);
844 if (min_space > initial_pitch && count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch,
845 initial_pitch, true, false, 0)) {
846 min_space = initial_pitch;
847 gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);
848 pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25);
849 if (testing_on) {
850 tprintf(
851 "Revised fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, "
852 "pitch=%g\n",
853 initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile(0.5));
854 }
855 initial_pitch = pitch_stats.ile(0.5);
856 }
857 }
858 if (textord_debug_pitch_metric) {
859 tprintf("Blk=%d:Row=%d:%c:p_iqr=%g:g_iqr=%g:dm_p_iqr=%g:dm_g_iqr=%g:%c:", block_index,
860 row_index, 'X', pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr,
861 pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth
862 ? 'D'
863 : (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr ? 'S' : 'M'));
864 }
865 if (pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth) {
866 row->pitch_decision = PITCH_DUNNO;
867 if (textord_debug_pitch_metric) {
868 tprintf("\n");
869 }
870 return false; // insufficient data
871 }
872 if (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr) {
873 if (testing_on) {
874 tprintf(
875 "Choosing non dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, "
876 "dm_gap_iqr=%g\n",
877 pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
878 }
879 gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);
880 pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25);
881 pitch = pitch_stats.ile(0.5);
882 used_dm_model = false;
883 } else {
884 if (testing_on) {
885 tprintf(
886 "Choosing dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, "
887 "dm_gap_iqr=%g\n",
888 pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
889 }
890 gap_iqr = dm_gap_iqr;
891 pitch_iqr = dm_pitch_iqr;
892 pitch = dm_pitch;
893 used_dm_model = true;
894 }
895 if (textord_debug_pitch_metric) {
896 tprintf("rev_p_iqr=%g:rev_g_iqr=%g:pitch=%g:", pitch_iqr, gap_iqr, pitch);
897 tprintf("p_iqr/g=%g:p_iqr/x=%g:iqr_res=%c:", pitch_iqr / gap_iqr, pitch_iqr / block->xheight,
898 pitch_iqr < gap_iqr * textord_fpiqr_ratio &&
899 pitch_iqr < block->xheight * textord_max_pitch_iqr &&
900 pitch < block->xheight * textord_words_default_maxspace
901 ? 'F'
902 : 'P');
903 }
904 if (pitch_iqr < gap_iqr * textord_fpiqr_ratio &&
905 pitch_iqr < block->xheight * textord_max_pitch_iqr &&
906 pitch < block->xheight * textord_words_default_maxspace) {
907 row->pitch_decision = PITCH_MAYBE_FIXED;
908 } else {
909 row->pitch_decision = PITCH_MAYBE_PROP;
910 }
911 row->fixed_pitch = pitch;
912 row->kern_size = gap_stats.ile(0.5);
913 row->min_space = static_cast<int32_t>(row->fixed_pitch + non_space) / 2;
914 if (row->min_space > row->fixed_pitch) {
915 row->min_space = static_cast<int32_t>(row->fixed_pitch);
916 }
917 row->max_nonspace = row->min_space;
918 row->space_size = row->fixed_pitch;
919 row->space_threshold = (row->max_nonspace + row->min_space) / 2;
920 row->used_dm_model = used_dm_model;
921 return true;
922 }
923
924 /**********************************************************************
925 * fixed_pitch_row
926 *
927 * Check to see if this row could be fixed pitch using the given spacings.
928 * Blobs with gaps smaller than the lower threshold are assumed to be one.
929 * The larger threshold is the word gap threshold.
930 **********************************************************************/
931
fixed_pitch_row(TO_ROW * row,BLOCK * block,int32_t block_index)932 bool fixed_pitch_row(TO_ROW *row, // row to do
933 BLOCK *block,
934 int32_t block_index // block_number
935 ) {
936 const char *res_string; // pitch result
937 int16_t mid_cuts; // no of cheap cuts
938 float non_space; // gap size
939 float pitch_sd; // error on pitch
940 float sp_sd = 0.0f; // space sd
941
942 non_space = row->fp_nonsp;
943 if (non_space > row->fixed_pitch) {
944 non_space = row->fixed_pitch;
945 }
946 POLY_BLOCK *pb = block != nullptr ? block->pdblk.poly_block() : nullptr;
947 if (textord_all_prop || (pb != nullptr && !pb->IsText())) {
948 // Set the decision to definitely proportional.
949 pitch_sd = textord_words_def_prop * row->fixed_pitch;
950 row->pitch_decision = PITCH_DEF_PROP;
951 } else {
952 pitch_sd = tune_row_pitch(row, &row->projection, row->projection_left, row->projection_right,
953 (row->fixed_pitch + non_space * 3) / 4, row->fixed_pitch, sp_sd,
954 mid_cuts, &row->char_cells, block_index == textord_debug_block);
955 if (pitch_sd < textord_words_pitchsd_threshold * row->fixed_pitch &&
956 ((pitsync_linear_version & 3) < 3 ||
957 ((pitsync_linear_version & 3) >= 3 &&
958 (row->used_dm_model || sp_sd > 20 || (pitch_sd == 0 && sp_sd > 10))))) {
959 if (pitch_sd < textord_words_def_fixed * row->fixed_pitch && !row->all_caps &&
960 ((pitsync_linear_version & 3) < 3 || sp_sd > 20)) {
961 row->pitch_decision = PITCH_DEF_FIXED;
962 } else {
963 row->pitch_decision = PITCH_MAYBE_FIXED;
964 }
965 } else if ((pitsync_linear_version & 3) < 3 || sp_sd > 20 || mid_cuts > 0 ||
966 pitch_sd >= textord_words_pitchsd_threshold * row->fixed_pitch) {
967 if (pitch_sd < textord_words_def_prop * row->fixed_pitch) {
968 row->pitch_decision = PITCH_MAYBE_PROP;
969 } else {
970 row->pitch_decision = PITCH_DEF_PROP;
971 }
972 } else {
973 row->pitch_decision = PITCH_DUNNO;
974 }
975 }
976
977 if (textord_debug_pitch_metric) {
978 res_string = "??";
979 switch (row->pitch_decision) {
980 case PITCH_DEF_PROP:
981 res_string = "DP";
982 break;
983 case PITCH_MAYBE_PROP:
984 res_string = "MP";
985 break;
986 case PITCH_DEF_FIXED:
987 res_string = "DF";
988 break;
989 case PITCH_MAYBE_FIXED:
990 res_string = "MF";
991 break;
992 default:
993 res_string = "??";
994 }
995 tprintf(":sd/p=%g:occ=%g:init_res=%s\n", pitch_sd / row->fixed_pitch, sp_sd, res_string);
996 }
997 return true;
998 }
999
1000 /**********************************************************************
1001 * count_pitch_stats
1002 *
1003 * Count up the gap and pitch stats on the block to see if it is fixed pitch.
1004 * Blobs with gaps smaller than the lower threshold are assumed to be one.
1005 * The larger threshold is the word gap threshold.
1006 * The return value indicates whether there were any decent values to use.
1007 **********************************************************************/
1008
count_pitch_stats(TO_ROW * row,STATS * gap_stats,STATS * pitch_stats,float initial_pitch,float min_space,bool ignore_outsize,bool split_outsize,int32_t dm_gap)1009 bool count_pitch_stats( // find lines
1010 TO_ROW *row, // row to do
1011 STATS *gap_stats, // blob gaps
1012 STATS *pitch_stats, // centre-centre stats
1013 float initial_pitch, // guess at pitch
1014 float min_space, // estimate space size
1015 bool ignore_outsize, // discard big objects
1016 bool split_outsize, // split big objects
1017 int32_t dm_gap // ignorable gaps
1018 ) {
1019 bool prev_valid; // not word broken
1020 BLOBNBOX *blob; // current blob
1021 // blobs
1022 BLOBNBOX_IT blob_it = row->blob_list();
1023 int32_t prev_right; // end of prev blob
1024 int32_t prev_centre; // centre of previous blob
1025 int32_t x_centre; // centre of this blob
1026 int32_t blob_width; // width of blob
1027 int32_t width_units; // no of widths in blob
1028 float width; // blob width
1029 TBOX blob_box; // bounding box
1030 TBOX joined_box; // of super blob
1031
1032 gap_stats->clear();
1033 pitch_stats->clear();
1034 if (blob_it.empty()) {
1035 return false;
1036 }
1037 prev_valid = false;
1038 prev_centre = 0;
1039 prev_right = 0; // stop compiler warning
1040 joined_box = blob_it.data()->bounding_box();
1041 do {
1042 blob_it.forward();
1043 blob = blob_it.data();
1044 if (!blob->joined_to_prev()) {
1045 blob_box = blob->bounding_box();
1046 if ((blob_box.left() - joined_box.right() < dm_gap && !blob_it.at_first()) ||
1047 blob->cblob() == nullptr) {
1048 joined_box += blob_box; // merge blobs
1049 } else {
1050 blob_width = joined_box.width();
1051 if (split_outsize) {
1052 width_units =
1053 static_cast<int32_t>(floor(static_cast<float>(blob_width) / initial_pitch + 0.5));
1054 if (width_units < 1) {
1055 width_units = 1;
1056 }
1057 width_units--;
1058 } else if (ignore_outsize) {
1059 width = static_cast<float>(blob_width) / initial_pitch;
1060 width_units =
1061 width < 1 + words_default_fixed_limit && width > 1 - words_default_fixed_limit ? 0
1062 : -1;
1063 } else {
1064 width_units = 0; // everything in
1065 }
1066 x_centre = static_cast<int32_t>(joined_box.left() +
1067 (blob_width - width_units * initial_pitch) / 2);
1068 if (prev_valid && width_units >= 0) {
1069 // if (width_units>0)
1070 // {
1071 // tprintf("wu=%d,
1072 // width=%d,
1073 // xc=%d, adding
1074 // %d\n",
1075 // width_units,blob_width,x_centre,x_centre-prev_centre);
1076 // }
1077 gap_stats->add(joined_box.left() - prev_right, 1);
1078 pitch_stats->add(x_centre - prev_centre, 1);
1079 }
1080 prev_centre = static_cast<int32_t>(x_centre + width_units * initial_pitch);
1081 prev_right = joined_box.right();
1082 prev_valid = blob_box.left() - joined_box.right() < min_space;
1083 prev_valid = prev_valid && width_units >= 0;
1084 joined_box = blob_box;
1085 }
1086 }
1087 } while (!blob_it.at_first());
1088 return gap_stats->get_total() >= 3;
1089 }
1090
1091 /**********************************************************************
1092 * tune_row_pitch
1093 *
1094 * Use a dp algorithm to fit the character cells and return the sd of
1095 * the cell size over the row.
1096 **********************************************************************/
1097
tune_row_pitch(TO_ROW * row,STATS * projection,int16_t projection_left,int16_t projection_right,float space_size,float & initial_pitch,float & best_sp_sd,int16_t & best_mid_cuts,ICOORDELT_LIST * best_cells,bool testing_on)1098 float tune_row_pitch( // find fp cells
1099 TO_ROW *row, // row to do
1100 STATS *projection, // vertical projection
1101 int16_t projection_left, // edge of projection
1102 int16_t projection_right, // edge of projection
1103 float space_size, // size of blank
1104 float &initial_pitch, // guess at pitch
1105 float &best_sp_sd, // space sd
1106 int16_t &best_mid_cuts, // no of cheap cuts
1107 ICOORDELT_LIST *best_cells, // row cells
1108 bool testing_on // inidividual words
1109 ) {
1110 int pitch_delta; // offset pitch
1111 int16_t mid_cuts; // cheap cuts
1112 float pitch_sd; // current sd
1113 float best_sd; // best result
1114 float best_pitch; // pitch for best result
1115 float initial_sd; // starting error
1116 float sp_sd; // space sd
1117 ICOORDELT_LIST test_cells; // row cells
1118 ICOORDELT_IT best_it; // start of best list
1119
1120 if (textord_fast_pitch_test) {
1121 return tune_row_pitch2(row, projection, projection_left, projection_right, space_size,
1122 initial_pitch, best_sp_sd,
1123 // space sd
1124 best_mid_cuts, best_cells, testing_on);
1125 }
1126 if (textord_disable_pitch_test) {
1127 best_sp_sd = initial_pitch;
1128 return initial_pitch;
1129 }
1130 initial_sd = compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1131 initial_pitch, best_sp_sd, best_mid_cuts, best_cells, testing_on);
1132 best_sd = initial_sd;
1133 best_pitch = initial_pitch;
1134 if (testing_on) {
1135 tprintf("tune_row_pitch:start pitch=%g, sd=%g\n", best_pitch, best_sd);
1136 }
1137 for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {
1138 pitch_sd =
1139 compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1140 initial_pitch + pitch_delta, sp_sd, mid_cuts, &test_cells, testing_on);
1141 if (testing_on) {
1142 tprintf("testing pitch at %g, sd=%g\n", initial_pitch + pitch_delta, pitch_sd);
1143 }
1144 if (pitch_sd < best_sd) {
1145 best_sd = pitch_sd;
1146 best_mid_cuts = mid_cuts;
1147 best_sp_sd = sp_sd;
1148 best_pitch = initial_pitch + pitch_delta;
1149 best_cells->clear();
1150 best_it.set_to_list(best_cells);
1151 best_it.add_list_after(&test_cells);
1152 } else {
1153 test_cells.clear();
1154 }
1155 if (pitch_sd > initial_sd) {
1156 break; // getting worse
1157 }
1158 }
1159 for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {
1160 pitch_sd =
1161 compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1162 initial_pitch - pitch_delta, sp_sd, mid_cuts, &test_cells, testing_on);
1163 if (testing_on) {
1164 tprintf("testing pitch at %g, sd=%g\n", initial_pitch - pitch_delta, pitch_sd);
1165 }
1166 if (pitch_sd < best_sd) {
1167 best_sd = pitch_sd;
1168 best_mid_cuts = mid_cuts;
1169 best_sp_sd = sp_sd;
1170 best_pitch = initial_pitch - pitch_delta;
1171 best_cells->clear();
1172 best_it.set_to_list(best_cells);
1173 best_it.add_list_after(&test_cells);
1174 } else {
1175 test_cells.clear();
1176 }
1177 if (pitch_sd > initial_sd) {
1178 break;
1179 }
1180 }
1181 initial_pitch = best_pitch;
1182
1183 if (textord_debug_pitch_metric) {
1184 print_pitch_sd(row, projection, projection_left, projection_right, space_size, best_pitch);
1185 }
1186
1187 return best_sd;
1188 }
1189
1190 /**********************************************************************
1191 * tune_row_pitch
1192 *
1193 * Use a dp algorithm to fit the character cells and return the sd of
1194 * the cell size over the row.
1195 **********************************************************************/
1196
tune_row_pitch2(TO_ROW * row,STATS * projection,int16_t projection_left,int16_t projection_right,float space_size,float & initial_pitch,float & best_sp_sd,int16_t & best_mid_cuts,ICOORDELT_LIST * best_cells,bool testing_on)1197 float tune_row_pitch2( // find fp cells
1198 TO_ROW *row, // row to do
1199 STATS *projection, // vertical projection
1200 int16_t projection_left, // edge of projection
1201 int16_t projection_right, // edge of projection
1202 float space_size, // size of blank
1203 float &initial_pitch, // guess at pitch
1204 float &best_sp_sd, // space sd
1205 int16_t &best_mid_cuts, // no of cheap cuts
1206 ICOORDELT_LIST *best_cells, // row cells
1207 bool testing_on // inidividual words
1208 ) {
1209 int pitch_delta; // offset pitch
1210 int16_t pixel; // pixel coord
1211 int16_t best_pixel; // pixel coord
1212 int16_t best_delta; // best pitch
1213 int16_t best_pitch; // best pitch
1214 int16_t start; // of good range
1215 int16_t end; // of good range
1216 int32_t best_count; // lowest sum
1217 float best_sd; // best result
1218
1219 best_sp_sd = initial_pitch;
1220
1221 best_pitch = static_cast<int>(initial_pitch);
1222 if (textord_disable_pitch_test || best_pitch <= textord_pitch_range) {
1223 return initial_pitch;
1224 }
1225 std::unique_ptr<STATS[]> sum_proj(new STATS[textord_pitch_range * 2 + 1]); // summed projection
1226
1227 for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) {
1228 sum_proj[textord_pitch_range + pitch_delta].set_range(0, best_pitch + pitch_delta + 1);
1229 }
1230 for (pixel = projection_left; pixel <= projection_right; pixel++) {
1231 for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) {
1232 sum_proj[textord_pitch_range + pitch_delta].add(
1233 (pixel - projection_left) % (best_pitch + pitch_delta), projection->pile_count(pixel));
1234 }
1235 }
1236 best_count = sum_proj[textord_pitch_range].pile_count(0);
1237 best_delta = 0;
1238 best_pixel = 0;
1239 for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) {
1240 for (pixel = 0; pixel < best_pitch + pitch_delta; pixel++) {
1241 if (sum_proj[textord_pitch_range + pitch_delta].pile_count(pixel) < best_count) {
1242 best_count = sum_proj[textord_pitch_range + pitch_delta].pile_count(pixel);
1243 best_delta = pitch_delta;
1244 best_pixel = pixel;
1245 }
1246 }
1247 }
1248 if (testing_on) {
1249 tprintf("tune_row_pitch:start pitch=%g, best_delta=%d, count=%d\n", initial_pitch, best_delta,
1250 best_count);
1251 }
1252 best_pitch += best_delta;
1253 initial_pitch = best_pitch;
1254 best_count++;
1255 best_count += best_count;
1256 for (start = best_pixel - 2;
1257 start > best_pixel - best_pitch &&
1258 sum_proj[textord_pitch_range + best_delta].pile_count(start % best_pitch) <= best_count;
1259 start--) {
1260 ;
1261 }
1262 for (end = best_pixel + 2;
1263 end < best_pixel + best_pitch &&
1264 sum_proj[textord_pitch_range + best_delta].pile_count(end % best_pitch) <= best_count;
1265 end++) {
1266 ;
1267 }
1268
1269 best_sd = compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1270 initial_pitch, best_sp_sd, best_mid_cuts, best_cells, testing_on,
1271 start, end);
1272 if (testing_on) {
1273 tprintf("tune_row_pitch:output pitch=%g, sd=%g\n", initial_pitch, best_sd);
1274 }
1275
1276 if (textord_debug_pitch_metric) {
1277 print_pitch_sd(row, projection, projection_left, projection_right, space_size, initial_pitch);
1278 }
1279
1280 return best_sd;
1281 }
1282
/**********************************************************************
 * compute_pitch_sd
 *
 * Fit character cells of width initial_pitch to the row and return the
 * sd of the fit.
 *
 * If (pitsync_linear_version & 3) > 1 the whole row is handed to
 * compute_pitch_sd2() and sp_sd receives the occupation it reports.
 *
 * Otherwise the row is cut into words wherever the gap between
 * consecutive blob boxes is at least space_size. Each word is fitted
 * with check_pitch_sync2() or check_pitch_sync(), its segment positions
 * are added to (or averaged into) row_cells, and its sync result is
 * accumulated. The return value is sqrt(sum of results / total count),
 * or space_size * 10 if the row has no blobs or the count is zero.
 * sp_sd receives the rms of the inter-word spacing error modulo the
 * pitch (0 if there is only one word). mid_cuts is set to 0 on this path.
 *
 * NOTE(review): on the empty-row early return sp_sd is left untouched.
 **********************************************************************/

float compute_pitch_sd(        // find fp cells
    TO_ROW *row,               // row to do
    STATS *projection,         // vertical projection
    int16_t projection_left,   // edge
    int16_t projection_right,  // edge
    float space_size,          // size of blank
    float initial_pitch,       // guess at pitch
    float &sp_sd,              // space sd
    int16_t &mid_cuts,         // no of free cuts
    ICOORDELT_LIST *row_cells, // list of chop pts
    bool testing_on,           // enable debug output
    int16_t start,             // start of good range
    int16_t end                // end of good range
) {
  int16_t occupation; // no of cells in word.
  // iterator over the blobs of the row
  BLOBNBOX_IT blob_it = row->blob_list();
  BLOBNBOX_IT start_it;  // start of word
  BLOBNBOX_IT plot_it;   // for plotting (assigned below but never read)
  int16_t blob_count;    // no of blobs
  TBOX blob_box;         // bounding box
  TBOX prev_box;         // of super blob
  int32_t prev_right;    // of word sync
  int scale_factor;      // on scores for big words
  int32_t sp_count;      // spaces
  FPSEGPT_LIST seg_list; // char cells
  FPSEGPT_IT seg_it;     // iterator
  int16_t segpos;        // position of segment
  int16_t cellpos;       // previous cell boundary
  // iterator over the output cell list
  ICOORDELT_IT cell_it = row_cells;
  ICOORDELT *cell;     // new cell
  double sqsum;        // sum of squares
  double spsum;        // of spaces
  double sp_var;       // space error
  double word_sync;    // result for word
  int32_t total_count; // total blobs

  // Versions 2 and 3 fit the whole row in one go.
  if ((pitsync_linear_version & 3) > 1) {
    word_sync = compute_pitch_sd2(row, projection, projection_left, projection_right, initial_pitch,
                                  occupation, mid_cuts, row_cells, testing_on, start, end);
    sp_sd = occupation;
    return word_sync;
  }
  mid_cuts = 0;
  cellpos = 0;
  total_count = 0;
  sqsum = 0;
  sp_count = 0;
  spsum = 0;
  prev_right = -1; // negative means no previous word yet
  if (blob_it.empty()) {
    return space_size * 10;
  }
#ifndef GRAPHICS_DISABLED
  if (testing_on && to_win != nullptr) {
    // NOTE(review): this value of blob_box is overwritten below before
    // anything reads it.
    blob_box = blob_it.data()->bounding_box();
    projection->plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL);
  }
#endif
  start_it = blob_it;
  blob_count = 0;
  blob_box = box_next(&blob_it); // first blob
  blob_it.mark_cycle_pt();
  do {
    // Advance start_it past the blobs of the previous word.
    for (; blob_count > 0; blob_count--) {
      box_next(&start_it);
    }
    // Collect blobs until the list wraps or a gap of at least space_size
    // is found; prev_box ends up as the last box of the word.
    do {
      prev_box = blob_box;
      blob_count++;
      blob_box = box_next(&blob_it);
    } while (!blob_it.cycled_list() && blob_box.left() - prev_box.right() < space_size);
    plot_it = start_it;
    // Version 1 uses the projection-aware fit, version 0 the plain one.
    if (pitsync_linear_version & 3) {
      word_sync = check_pitch_sync2(&start_it, blob_count, static_cast<int16_t>(initial_pitch), 2,
                                    projection, projection_left, projection_right,
                                    row->xheight * textord_projection_scale, occupation, &seg_list,
                                    start, end);
    } else {
      word_sync = check_pitch_sync(&start_it, blob_count, static_cast<int16_t>(initial_pitch), 2,
                                   projection, &seg_list);
    }
    if (testing_on) {
      tprintf("Word ending at (%d,%d), len=%d, sync rating=%g, ", prev_box.right(), prev_box.top(),
              seg_list.length() - 1, word_sync);
      seg_it.set_to_list(&seg_list);
      for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
        if (seg_it.data()->faked) {
          tprintf("(F)");
        }
        tprintf("%d, ", seg_it.data()->position());
        //                              tprintf("C=%g, s=%g, sq=%g\n",
        //                                      seg_it.data()->cost_function(),
        //                                      seg_it.data()->sum(),
        //                                      seg_it.data()->squares());
      }
      tprintf("\n");
    }
#ifndef GRAPHICS_DISABLED
    if (textord_show_fixed_cuts && blob_count > 0 && to_win != nullptr) {
      plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
    }
#endif
    seg_it.set_to_list(&seg_list);
    if (prev_right >= 0) {
      // Distance from the last segment position of the previous word to
      // the first of this one, reduced to its offset from the nearest
      // multiple of the pitch, then squared.
      sp_var = seg_it.data()->position() - prev_right;
      sp_var -= floor(sp_var / initial_pitch + 0.5) * initial_pitch;
      sp_var *= sp_var;
      spsum += sp_var;
      sp_count++;
    }
    // Fold the segment positions of this word into the row cell list.
    for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
      segpos = seg_it.data()->position();
      if (cell_it.empty() || segpos > cellpos + initial_pitch / 2) {
        // big gap: pad with cells one pitch apart until within 1.5 pitches
        while (!cell_it.empty() && segpos > cellpos + initial_pitch * 3 / 2) {
          cell = new ICOORDELT(cellpos + static_cast<int16_t>(initial_pitch), 0);
          cell_it.add_after_then_move(cell);
          cellpos += static_cast<int16_t>(initial_pitch);
        }
        // make new one
        cell = new ICOORDELT(segpos, 0);
        cell_it.add_after_then_move(cell);
        cellpos = segpos;
      } else if (segpos > cellpos - initial_pitch / 2) {
        // within half a pitch of the previous cell: merge into it
        cell = cell_it.data();
        // average positions
        cell->set_x((cellpos + segpos) / 2);
        cellpos = cell->x();
      }
    }
    seg_it.move_to_last();
    prev_right = seg_it.data()->position();
    // Optionally weight long words more heavily (weight is at least 1).
    if (textord_pitch_scalebigwords) {
      scale_factor = (seg_list.length() - 2) / 2;
      if (scale_factor < 1) {
        scale_factor = 1;
      }
    } else {
      scale_factor = 1;
    }
    sqsum += word_sync * scale_factor;
    total_count += (seg_list.length() - 1) * scale_factor;
    seg_list.clear();
  } while (!blob_it.cycled_list());
  sp_sd = sp_count > 0 ? sqrt(spsum / sp_count) : 0;
  return total_count > 0 ? sqrt(sqsum / total_count) : space_size * 10;
}
1439
1440 /**********************************************************************
1441 * compute_pitch_sd2
1442 *
1443 * Use a dp algorithm to fit the character cells and return the sd of
1444 * the cell size over the row.
1445 **********************************************************************/
1446
compute_pitch_sd2(TO_ROW * row,STATS * projection,int16_t projection_left,int16_t projection_right,float initial_pitch,int16_t & occupation,int16_t & mid_cuts,ICOORDELT_LIST * row_cells,bool testing_on,int16_t start,int16_t end)1447 float compute_pitch_sd2( // find fp cells
1448 TO_ROW *row, // row to do
1449 STATS *projection, // vertical projection
1450 int16_t projection_left, // edge
1451 int16_t projection_right, // edge
1452 float initial_pitch, // guess at pitch
1453 int16_t &occupation, // no of occupied cells
1454 int16_t &mid_cuts, // no of free cuts
1455 ICOORDELT_LIST *row_cells, // list of chop pts
1456 bool testing_on, // inidividual words
1457 int16_t start, // start of good range
1458 int16_t end // end of good range
1459 ) {
1460 // blobs
1461 BLOBNBOX_IT blob_it = row->blob_list();
1462 BLOBNBOX_IT plot_it;
1463 int16_t blob_count; // no of blobs
1464 TBOX blob_box; // bounding box
1465 FPSEGPT_LIST seg_list; // char cells
1466 FPSEGPT_IT seg_it; // iterator
1467 int16_t segpos; // position of segment
1468 // iterator
1469 ICOORDELT_IT cell_it = row_cells;
1470 ICOORDELT *cell; // new cell
1471 double word_sync; // result for word
1472
1473 mid_cuts = 0;
1474 if (blob_it.empty()) {
1475 occupation = 0;
1476 return initial_pitch * 10;
1477 }
1478 #ifndef GRAPHICS_DISABLED
1479 if (testing_on && to_win != nullptr) {
1480 projection->plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL);
1481 }
1482 #endif
1483 blob_count = 0;
1484 blob_it.mark_cycle_pt();
1485 do {
1486 // first blob
1487 blob_box = box_next(&blob_it);
1488 blob_count++;
1489 } while (!blob_it.cycled_list());
1490 plot_it = blob_it;
1491 word_sync = check_pitch_sync2(
1492 &blob_it, blob_count, static_cast<int16_t>(initial_pitch), 2, projection, projection_left,
1493 projection_right, row->xheight * textord_projection_scale, occupation, &seg_list, start, end);
1494 if (testing_on) {
1495 tprintf("Row ending at (%d,%d), len=%d, sync rating=%g, ", blob_box.right(), blob_box.top(),
1496 seg_list.length() - 1, word_sync);
1497 seg_it.set_to_list(&seg_list);
1498 for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
1499 if (seg_it.data()->faked) {
1500 tprintf("(F)");
1501 }
1502 tprintf("%d, ", seg_it.data()->position());
1503 // tprintf("C=%g, s=%g, sq=%g\n",
1504 // seg_it.data()->cost_function(),
1505 // seg_it.data()->sum(),
1506 // seg_it.data()->squares());
1507 }
1508 tprintf("\n");
1509 }
1510 #ifndef GRAPHICS_DISABLED
1511 if (textord_show_fixed_cuts && blob_count > 0 && to_win != nullptr) {
1512 plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
1513 }
1514 #endif
1515 seg_it.set_to_list(&seg_list);
1516 for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
1517 segpos = seg_it.data()->position();
1518 // make new one
1519 cell = new ICOORDELT(segpos, 0);
1520 cell_it.add_after_then_move(cell);
1521 if (seg_it.at_last()) {
1522 mid_cuts = seg_it.data()->cheap_cuts();
1523 }
1524 }
1525 seg_list.clear();
1526 return occupation > 0 ? sqrt(word_sync / occupation) : initial_pitch * 10;
1527 }
1528
1529 /**********************************************************************
1530 * print_pitch_sd
1531 *
1532 * Use a dp algorithm to fit the character cells and return the sd of
1533 * the cell size over the row.
1534 **********************************************************************/
1535
print_pitch_sd(TO_ROW * row,STATS * projection,int16_t projection_left,int16_t projection_right,float space_size,float initial_pitch)1536 void print_pitch_sd( // find fp cells
1537 TO_ROW *row, // row to do
1538 STATS *projection, // vertical projection
1539 int16_t projection_left, // edges //size of blank
1540 int16_t projection_right, float space_size,
1541 float initial_pitch // guess at pitch
1542 ) {
1543 const char *res2; // pitch result
1544 int16_t occupation; // used cells
1545 float sp_sd; // space sd
1546 // blobs
1547 BLOBNBOX_IT blob_it = row->blob_list();
1548 BLOBNBOX_IT start_it; // start of word
1549 BLOBNBOX_IT row_start; // start of row
1550 int16_t blob_count; // no of blobs
1551 int16_t total_blob_count; // total blobs in line
1552 TBOX blob_box; // bounding box
1553 TBOX prev_box; // of super blob
1554 int32_t prev_right; // of word sync
1555 int scale_factor; // on scores for big words
1556 int32_t sp_count; // spaces
1557 FPSEGPT_LIST seg_list; // char cells
1558 FPSEGPT_IT seg_it; // iterator
1559 double sqsum; // sum of squares
1560 double spsum; // of spaces
1561 double sp_var; // space error
1562 double word_sync; // result for word
1563 double total_count; // total cuts
1564
1565 if (blob_it.empty()) {
1566 return;
1567 }
1568 row_start = blob_it;
1569 total_blob_count = 0;
1570
1571 total_count = 0;
1572 sqsum = 0;
1573 sp_count = 0;
1574 spsum = 0;
1575 prev_right = -1;
1576 blob_it = row_start;
1577 start_it = blob_it;
1578 blob_count = 0;
1579 blob_box = box_next(&blob_it); // first blob
1580 blob_it.mark_cycle_pt();
1581 do {
1582 for (; blob_count > 0; blob_count--) {
1583 box_next(&start_it);
1584 }
1585 do {
1586 prev_box = blob_box;
1587 blob_count++;
1588 blob_box = box_next(&blob_it);
1589 } while (!blob_it.cycled_list() && blob_box.left() - prev_box.right() < space_size);
1590 word_sync = check_pitch_sync2(
1591 &start_it, blob_count, static_cast<int16_t>(initial_pitch), 2, projection, projection_left,
1592 projection_right, row->xheight * textord_projection_scale, occupation, &seg_list, 0, 0);
1593 total_blob_count += blob_count;
1594 seg_it.set_to_list(&seg_list);
1595 if (prev_right >= 0) {
1596 sp_var = seg_it.data()->position() - prev_right;
1597 sp_var -= floor(sp_var / initial_pitch + 0.5) * initial_pitch;
1598 sp_var *= sp_var;
1599 spsum += sp_var;
1600 sp_count++;
1601 }
1602 seg_it.move_to_last();
1603 prev_right = seg_it.data()->position();
1604 if (textord_pitch_scalebigwords) {
1605 scale_factor = (seg_list.length() - 2) / 2;
1606 if (scale_factor < 1) {
1607 scale_factor = 1;
1608 }
1609 } else {
1610 scale_factor = 1;
1611 }
1612 sqsum += word_sync * scale_factor;
1613 total_count += (seg_list.length() - 1) * scale_factor;
1614 seg_list.clear();
1615 } while (!blob_it.cycled_list());
1616 sp_sd = sp_count > 0 ? sqrt(spsum / sp_count) : 0;
1617 word_sync = total_count > 0 ? sqrt(sqsum / total_count) : space_size * 10;
1618 tprintf("new_sd=%g:sd/p=%g:new_sp_sd=%g:res=%c:", word_sync, word_sync / initial_pitch, sp_sd,
1619 word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P');
1620
1621 start_it = row_start;
1622 blob_it = row_start;
1623 word_sync =
1624 check_pitch_sync2(&blob_it, total_blob_count, static_cast<int16_t>(initial_pitch), 2,
1625 projection, projection_left, projection_right,
1626 row->xheight * textord_projection_scale, occupation, &seg_list, 0, 0);
1627 if (occupation > 1) {
1628 word_sync /= occupation;
1629 }
1630 word_sync = sqrt(word_sync);
1631
1632 #ifndef GRAPHICS_DISABLED
1633 if (textord_show_row_cuts && to_win != nullptr) {
1634 plot_fp_cells2(to_win, ScrollView::CORAL, row, &seg_list);
1635 }
1636 #endif
1637 seg_list.clear();
1638 if (word_sync < textord_words_pitchsd_threshold * initial_pitch) {
1639 if (word_sync < textord_words_def_fixed * initial_pitch && !row->all_caps) {
1640 res2 = "DF";
1641 } else {
1642 res2 = "MF";
1643 }
1644 } else {
1645 res2 = word_sync < textord_words_def_prop * initial_pitch ? "MP" : "DP";
1646 }
1647 tprintf(
1648 "row_sd=%g:sd/p=%g:res=%c:N=%d:res2=%s,init pitch=%g, row_pitch=%g, "
1649 "all_caps=%d\n",
1650 word_sync, word_sync / initial_pitch,
1651 word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P', occupation, res2,
1652 initial_pitch, row->fixed_pitch, row->all_caps);
1653 }
1654
1655 /**********************************************************************
1656 * find_repeated_chars
1657 *
1658 * Extract marked leader blobs and put them
1659 * into words in advance of fixed pitch checking and word generation.
1660 **********************************************************************/
find_repeated_chars(TO_BLOCK * block,bool testing_on)1661 void find_repeated_chars(TO_BLOCK *block, // Block to search.
1662 bool testing_on) { // Debug mode.
1663 POLY_BLOCK *pb = block->block->pdblk.poly_block();
1664 if (pb != nullptr && !pb->IsText()) {
1665 return; // Don't find repeated chars in non-text blocks.
1666 }
1667
1668 TO_ROW *row;
1669 BLOBNBOX_IT box_it;
1670 BLOBNBOX_IT search_it; // forward search
1671 WERD *word; // new word
1672 TBOX word_box; // for plotting
1673 int blobcount, repeated_set;
1674
1675 TO_ROW_IT row_it = block->get_rows();
1676 if (row_it.empty()) {
1677 return; // empty block
1678 }
1679 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1680 row = row_it.data();
1681 box_it.set_to_list(row->blob_list());
1682 if (box_it.empty()) {
1683 continue; // no blobs in this row
1684 }
1685 if (!row->rep_chars_marked()) {
1686 mark_repeated_chars(row);
1687 }
1688 if (row->num_repeated_sets() == 0) {
1689 continue; // nothing to do for this row
1690 }
1691 // new words
1692 WERD_IT word_it(&row->rep_words);
1693 do {
1694 if (box_it.data()->repeated_set() != 0 && !box_it.data()->joined_to_prev()) {
1695 blobcount = 1;
1696 repeated_set = box_it.data()->repeated_set();
1697 search_it = box_it;
1698 search_it.forward();
1699 while (!search_it.at_first() && search_it.data()->repeated_set() == repeated_set) {
1700 blobcount++;
1701 search_it.forward();
1702 }
1703 // After the call to make_real_word() all the blobs from this
1704 // repeated set will be removed from the blob list. box_it will be
1705 // set to point to the blob after the end of the extracted sequence.
1706 word = make_real_word(&box_it, blobcount, box_it.at_first(), 1);
1707 if (!box_it.empty() && box_it.data()->joined_to_prev()) {
1708 tprintf("Bad box joined to prev at");
1709 box_it.data()->bounding_box().print();
1710 tprintf("After repeated word:");
1711 word->bounding_box().print();
1712 }
1713 ASSERT_HOST(box_it.empty() || !box_it.data()->joined_to_prev());
1714 word->set_flag(W_REP_CHAR, true);
1715 word->set_flag(W_DONT_CHOP, true);
1716 word_it.add_after_then_move(word);
1717 } else {
1718 box_it.forward();
1719 }
1720 } while (!box_it.at_first());
1721 }
1722 }
1723
1724 /**********************************************************************
1725 * plot_fp_word
1726 *
1727 * Plot a block of words as if fixed pitch.
1728 **********************************************************************/
1729
1730 #ifndef GRAPHICS_DISABLED
plot_fp_word(TO_BLOCK * block,float pitch,float nonspace)1731 void plot_fp_word( // draw block of words
1732 TO_BLOCK *block, // block to draw
1733 float pitch, // pitch to draw with
1734 float nonspace // for space threshold
1735 ) {
1736 TO_ROW *row; // current row
1737 TO_ROW_IT row_it = block->get_rows();
1738
1739 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1740 row = row_it.data();
1741 row->min_space = static_cast<int32_t>((pitch + nonspace) / 2);
1742 row->max_nonspace = row->min_space;
1743 row->space_threshold = row->min_space;
1744 plot_word_decisions(to_win, static_cast<int16_t>(pitch), row);
1745 }
1746 }
1747 #endif
1748
1749 } // namespace tesseract
1750