1 /**********************************************************************
2  * File:        topitch.cpp  (Formerly to_pitch.c)
3  * Description: Code to determine fixed pitchness and the pitch if fixed.
4  * Author:      Ray Smith
5  *
6  * (C) Copyright 1993, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 #  include "config_auto.h"
22 #endif
23 
24 #include "topitch.h"
25 
26 #include "blobbox.h"
27 #include "drawtord.h"
28 #include "makerow.h"
29 #include "pithsync.h"
30 #include "pitsync1.h"
31 #include "statistc.h"
32 #include "tovars.h"
33 #include "wordseg.h"
34 
35 #include "helpers.h"
36 
37 #include <memory>
38 
39 namespace tesseract {
40 
// Tunable parameters for the fixed-pitch test. Each *_VAR line gives the
// parameter name, its default value and its help string. The ones marked
// `static` are presumably file-local (depends on how the macro expands).
41 static BOOL_VAR(textord_all_prop, false, "All doc is proportial text");
// Gates the tprintf diagnostics in compute_block_pitch and fix_row_pitch.
42 BOOL_VAR(textord_debug_pitch_test, false, "Debug on fixed pitch test");
43 static BOOL_VAR(textord_disable_pitch_test, false, "Turn off dp fixed pitch algorithm");
44 BOOL_VAR(textord_fast_pitch_test, false, "Do even faster pitch algorithm");
// Gates the vote/metric tprintf lines in fix_row_pitch and try_doc_fixed.
45 BOOL_VAR(textord_debug_pitch_metric, false, "Write full metric stuff");
46 BOOL_VAR(textord_show_row_cuts, false, "Draw row-level cuts");
// Gates the projection and cell plotting in try_doc_fixed.
47 BOOL_VAR(textord_show_page_cuts, false, "Draw page-level cuts");
// try_doc_fixed returns false immediately while this is false.
48 BOOL_VAR(textord_blockndoc_fixed, false, "Attempt whole doc/block fixed pitch");
49 double_VAR(textord_projection_scale, 0.200, "Ding rate for mid-cuts");
50 double_VAR(textord_balance_factor, 1.0, "Ding rate for unbalanced char cells");
51 

// Size of the gaps[] array in row_pitch_stats and the cluster limit it passes
// to STATS::cluster.
52 #define BLOCK_STATS_CLUSTERS 10
// Upper end of the range of the `pitches` histogram in try_doc_fixed.
53 #define MAX_ALLOWED_PITCH 100 // max pixel pitch.
54 
// Three-way float comparator with the signature qsort expects.
// Returns 1 when *arg1 - *arg2 is positive, -1 when it is negative and
// 0 otherwise (which includes the case where the difference is NaN).
static int sort_floats(const void *arg1, const void *arg2) {
  const float lhs = *static_cast<const float *>(arg1);
  const float rhs = *static_cast<const float *>(arg2);
  // Keep the subtraction: the sign of the difference is the ordering.
  const float delta = lhs - rhs;
  return static_cast<int>(delta > 0) - static_cast<int>(delta < 0);
}
66 
67 /**********************************************************************
68  * compute_fixed_pitch
69  *
70  * Decide whether each row is fixed pitch individually.
71  * Correlate definite and uncertain results to obtain an individual
72  * result for each row in the TO_ROW class.
73  **********************************************************************/
74 
compute_fixed_pitch(ICOORD page_tr,TO_BLOCK_LIST * port_blocks,float gradient,FCOORD rotation,bool testing_on)75 void compute_fixed_pitch(ICOORD page_tr,             // top right
76                          TO_BLOCK_LIST *port_blocks, // input list
77                          float gradient,             // page skew
78                          FCOORD rotation,            // for drawing
79                          bool testing_on) {          // correct orientation
80   TO_BLOCK_IT block_it;                              // iterator
81   TO_BLOCK *block;                                   // current block;
82   TO_ROW *row;                                       // current row
83   int block_index;                                   // block number
84   int row_index;                                     // row number
85 
86 #ifndef GRAPHICS_DISABLED
87   if (textord_show_initial_words && testing_on) {
88     if (to_win == nullptr) {
89       create_to_win(page_tr);
90     }
91   }
92 #endif
93 
94   block_it.set_to_list(port_blocks);
95   block_index = 1;
96   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
97     block = block_it.data();
98     compute_block_pitch(block, rotation, block_index, testing_on);
99     block_index++;
100   }
101 
102   if (!try_doc_fixed(page_tr, port_blocks, gradient)) {
103     block_index = 1;
104     for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
105       block = block_it.data();
106       if (!try_block_fixed(block, block_index)) {
107         try_rows_fixed(block, block_index, testing_on);
108       }
109       block_index++;
110     }
111   }
112 
113   block_index = 1;
114   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
115     block = block_it.data();
116     POLY_BLOCK *pb = block->block->pdblk.poly_block();
117     if (pb != nullptr && !pb->IsText()) {
118       continue; // Non-text doesn't exist!
119     }
120     // row iterator
121     TO_ROW_IT row_it(block->get_rows());
122     row_index = 1;
123     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
124       row = row_it.data();
125       fix_row_pitch(row, block, port_blocks, row_index, block_index);
126       row_index++;
127     }
128     block_index++;
129   }
130 #ifndef GRAPHICS_DISABLED
131   if (textord_show_initial_words && testing_on) {
132     ScrollView::Update();
133   }
134 #endif
135 }
136 
137 /**********************************************************************
138  * fix_row_pitch
139  *
140  * Get a pitch_decision for this row by voting among similar rows in the
141  * block, then similar rows over all the page, or any other rows at all.
142  **********************************************************************/
143 
fix_row_pitch(TO_ROW * bad_row,TO_BLOCK * bad_block,TO_BLOCK_LIST * blocks,int32_t row_target,int32_t block_target)144 void fix_row_pitch(TO_ROW *bad_row,        // row to fix
145                    TO_BLOCK *bad_block,    // block of bad_row
146                    TO_BLOCK_LIST *blocks,  // blocks to scan
147                    int32_t row_target,     // number of row
148                    int32_t block_target) { // number of block
149   int16_t mid_cuts;
150   int block_votes;               // votes in block
151   int like_votes;                // votes over page
152   int other_votes;               // votes of unlike blocks
153   int block_index;               // number of block
154   int row_index;                 // number of row
155   int maxwidth;                  // max pitch
156   TO_BLOCK_IT block_it = blocks; // block iterator
157   TO_BLOCK *block;               // current block
158   TO_ROW *row;                   // current row
159   float sp_sd;                   // space deviation
160   STATS block_stats;             // pitches in block
161   STATS like_stats;              // pitches in page
162 
163   block_votes = like_votes = other_votes = 0;
164   maxwidth = static_cast<int32_t>(ceil(bad_row->xheight * textord_words_maxspace));
165   if (bad_row->pitch_decision != PITCH_DEF_FIXED && bad_row->pitch_decision != PITCH_DEF_PROP) {
166     block_stats.set_range(0, maxwidth);
167     like_stats.set_range(0, maxwidth);
168     block_index = 1;
169     for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
170       block = block_it.data();
171       POLY_BLOCK *pb = block->block->pdblk.poly_block();
172       if (pb != nullptr && !pb->IsText()) {
173         continue; // Non text doesn't exist!
174       }
175       row_index = 1;
176       TO_ROW_IT row_it(block->get_rows());
177       for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
178         row = row_it.data();
179         if ((bad_row->all_caps &&
180              row->xheight + row->ascrise <
181                  (bad_row->xheight + bad_row->ascrise) * (1 + textord_pitch_rowsimilarity) &&
182              row->xheight + row->ascrise >
183                  (bad_row->xheight + bad_row->ascrise) * (1 - textord_pitch_rowsimilarity)) ||
184             (!bad_row->all_caps &&
185              row->xheight < bad_row->xheight * (1 + textord_pitch_rowsimilarity) &&
186              row->xheight > bad_row->xheight * (1 - textord_pitch_rowsimilarity))) {
187           if (block_index == block_target) {
188             if (row->pitch_decision == PITCH_DEF_FIXED) {
189               block_votes += textord_words_veto_power;
190               block_stats.add(static_cast<int32_t>(row->fixed_pitch), textord_words_veto_power);
191             } else if (row->pitch_decision == PITCH_MAYBE_FIXED ||
192                        row->pitch_decision == PITCH_CORR_FIXED) {
193               block_votes++;
194               block_stats.add(static_cast<int32_t>(row->fixed_pitch), 1);
195             } else if (row->pitch_decision == PITCH_DEF_PROP) {
196               block_votes -= textord_words_veto_power;
197             } else if (row->pitch_decision == PITCH_MAYBE_PROP ||
198                        row->pitch_decision == PITCH_CORR_PROP) {
199               block_votes--;
200             }
201           } else {
202             if (row->pitch_decision == PITCH_DEF_FIXED) {
203               like_votes += textord_words_veto_power;
204               like_stats.add(static_cast<int32_t>(row->fixed_pitch), textord_words_veto_power);
205             } else if (row->pitch_decision == PITCH_MAYBE_FIXED ||
206                        row->pitch_decision == PITCH_CORR_FIXED) {
207               like_votes++;
208               like_stats.add(static_cast<int32_t>(row->fixed_pitch), 1);
209             } else if (row->pitch_decision == PITCH_DEF_PROP) {
210               like_votes -= textord_words_veto_power;
211             } else if (row->pitch_decision == PITCH_MAYBE_PROP ||
212                        row->pitch_decision == PITCH_CORR_PROP) {
213               like_votes--;
214             }
215           }
216         } else {
217           if (row->pitch_decision == PITCH_DEF_FIXED) {
218             other_votes += textord_words_veto_power;
219           } else if (row->pitch_decision == PITCH_MAYBE_FIXED ||
220                      row->pitch_decision == PITCH_CORR_FIXED) {
221             other_votes++;
222           } else if (row->pitch_decision == PITCH_DEF_PROP) {
223             other_votes -= textord_words_veto_power;
224           } else if (row->pitch_decision == PITCH_MAYBE_PROP ||
225                      row->pitch_decision == PITCH_CORR_PROP) {
226             other_votes--;
227           }
228         }
229         row_index++;
230       }
231       block_index++;
232     }
233     if (block_votes > textord_words_veto_power) {
234       bad_row->fixed_pitch = block_stats.ile(0.5);
235       bad_row->pitch_decision = PITCH_CORR_FIXED;
236     } else if (block_votes <= textord_words_veto_power && like_votes > 0) {
237       bad_row->fixed_pitch = like_stats.ile(0.5);
238       bad_row->pitch_decision = PITCH_CORR_FIXED;
239     } else {
240       bad_row->pitch_decision = PITCH_CORR_PROP;
241       if (block_votes == 0 && like_votes == 0 && other_votes > 0 &&
242           (textord_debug_pitch_test || textord_debug_pitch_metric)) {
243         tprintf(
244             "Warning:row %d of block %d set prop with no like rows against "
245             "trend\n",
246             row_target, block_target);
247       }
248     }
249   }
250   if (textord_debug_pitch_metric) {
251     tprintf(":b_votes=%d:l_votes=%d:o_votes=%d", block_votes, like_votes, other_votes);
252     tprintf("x=%g:asc=%g\n", bad_row->xheight, bad_row->ascrise);
253   }
254   if (bad_row->pitch_decision == PITCH_CORR_FIXED) {
255     if (bad_row->fixed_pitch < textord_min_xheight) {
256       if (block_votes > 0) {
257         bad_row->fixed_pitch = block_stats.ile(0.5);
258       } else if (block_votes == 0 && like_votes > 0) {
259         bad_row->fixed_pitch = like_stats.ile(0.5);
260       } else {
261         tprintf("Warning:guessing pitch as xheight on row %d, block %d\n", row_target,
262                 block_target);
263         bad_row->fixed_pitch = bad_row->xheight;
264       }
265     }
266     if (bad_row->fixed_pitch < textord_min_xheight) {
267       bad_row->fixed_pitch = (float)textord_min_xheight;
268     }
269     bad_row->kern_size = bad_row->fixed_pitch / 4;
270     bad_row->min_space = static_cast<int32_t>(bad_row->fixed_pitch * 0.6);
271     bad_row->max_nonspace = static_cast<int32_t>(bad_row->fixed_pitch * 0.4);
272     bad_row->space_threshold = (bad_row->min_space + bad_row->max_nonspace) / 2;
273     bad_row->space_size = bad_row->fixed_pitch;
274     if (bad_row->char_cells.empty() && !bad_row->blob_list()->empty()) {
275       tune_row_pitch(bad_row, &bad_row->projection, bad_row->projection_left,
276                      bad_row->projection_right,
277                      (bad_row->fixed_pitch + bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch,
278                      sp_sd, mid_cuts, &bad_row->char_cells, false);
279     }
280   } else if (bad_row->pitch_decision == PITCH_CORR_PROP ||
281              bad_row->pitch_decision == PITCH_DEF_PROP) {
282     bad_row->fixed_pitch = 0.0f;
283     bad_row->char_cells.clear();
284   }
285 }
286 
287 /**********************************************************************
288  * compute_block_pitch
289  *
290  * Decide whether each block is fixed pitch individually.
291  **********************************************************************/
292 
compute_block_pitch(TO_BLOCK * block,FCOORD rotation,int32_t block_index,bool testing_on)293 void compute_block_pitch(TO_BLOCK *block,     // input list
294                          FCOORD rotation,     // for drawing
295                          int32_t block_index, // block number
296                          bool testing_on) {   // correct orientation
297   TBOX block_box;                             // bounding box
298 
299   block_box = block->block->pdblk.bounding_box();
300   if (testing_on && textord_debug_pitch_test) {
301     tprintf("Block %d at (%d,%d)->(%d,%d)\n", block_index, block_box.left(), block_box.bottom(),
302             block_box.right(), block_box.top());
303   }
304   block->min_space = static_cast<int32_t>(floor(block->xheight * textord_words_default_minspace));
305   block->max_nonspace = static_cast<int32_t>(ceil(block->xheight * textord_words_default_nonspace));
306   block->fixed_pitch = 0.0f;
307   block->space_size = static_cast<float>(block->min_space);
308   block->kern_size = static_cast<float>(block->max_nonspace);
309   block->pr_nonsp = block->xheight * words_default_prop_nonspace;
310   block->pr_space = block->pr_nonsp * textord_spacesize_ratioprop;
311   if (!block->get_rows()->empty()) {
312     ASSERT_HOST(block->xheight > 0);
313     find_repeated_chars(block, textord_show_initial_words && testing_on);
314 #ifndef GRAPHICS_DISABLED
315     if (textord_show_initial_words && testing_on) {
316       // overlap_picture_ops(true);
317       ScrollView::Update();
318     }
319 #endif
320     compute_rows_pitch(block, block_index, textord_debug_pitch_test && testing_on);
321   }
322 }
323 
324 /**********************************************************************
325  * compute_rows_pitch
326  *
327  * Decide whether each row is fixed pitch individually.
328  **********************************************************************/
329 
compute_rows_pitch(TO_BLOCK * block,int32_t block_index,bool testing_on)330 bool compute_rows_pitch( // find line stats
331     TO_BLOCK *block,     // block to do
332     int32_t block_index, // block number
333     bool testing_on      // correct orientation
334 ) {
335   int32_t maxwidth;   // of spaces
336   TO_ROW *row;        // current row
337   int32_t row_index;  // row number.
338   float lower, upper; // cluster thresholds
339   TO_ROW_IT row_it = block->get_rows();
340 
341   row_index = 1;
342   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
343     row = row_it.data();
344     ASSERT_HOST(row->xheight > 0);
345     row->compute_vertical_projection();
346     maxwidth = static_cast<int32_t>(ceil(row->xheight * textord_words_maxspace));
347     if (row_pitch_stats(row, maxwidth, testing_on) &&
348         find_row_pitch(row, maxwidth, textord_dotmatrix_gap + 1, block, block_index, row_index,
349                        testing_on)) {
350       if (row->fixed_pitch == 0) {
351         lower = row->pr_nonsp;
352         upper = row->pr_space;
353         row->space_size = upper;
354         row->kern_size = lower;
355       }
356     } else {
357       row->fixed_pitch = 0.0f; // insufficient data
358       row->pitch_decision = PITCH_DUNNO;
359     }
360     row_index++;
361   }
362   return false;
363 }
364 
365 /**********************************************************************
366  * try_doc_fixed
367  *
368  * Attempt to call the entire document fixed pitch.
369  **********************************************************************/
370 
try_doc_fixed(ICOORD page_tr,TO_BLOCK_LIST * port_blocks,float gradient)371 bool try_doc_fixed(             // determine pitch
372     ICOORD page_tr,             // top right
373     TO_BLOCK_LIST *port_blocks, // input list
374     float gradient              // page skew
375 ) {
376   int16_t master_x; // uniform shifts
377   int16_t pitch;    // median pitch.
378   int x;            // profile coord
379   int prop_blocks;  // correct counts
380   int fixed_blocks;
381   int total_row_count; // total in page
382                        // iterator
383   TO_BLOCK_IT block_it = port_blocks;
384   TO_BLOCK *block;         // current block;
385   TO_ROW *row;             // current row
386   int16_t projection_left; // edges
387   int16_t projection_right;
388   int16_t row_left; // edges of row
389   int16_t row_right;
390   float master_y;     // uniform shifts
391   float shift_factor; // page skew correction
392   float final_pitch;  // output pitch
393   float row_y;        // baseline
394   STATS projection;   // entire page
395   STATS pitches(0, MAX_ALLOWED_PITCH);
396   // for median
397   float sp_sd;      // space sd
398   int16_t mid_cuts; // no of cheap cuts
399   float pitch_sd;   // sync rating
400 
401   if (block_it.empty()
402       //      || block_it.data()==block_it.data_relative(1)
403       || !textord_blockndoc_fixed) {
404     return false;
405   }
406   shift_factor = gradient / (gradient * gradient + 1);
407   // row iterator
408   TO_ROW_IT row_it(block_it.data()->get_rows());
409   master_x = row_it.data()->projection_left;
410   master_y = row_it.data()->baseline.y(master_x);
411   projection_left = INT16_MAX;
412   projection_right = -INT16_MAX;
413   prop_blocks = 0;
414   fixed_blocks = 0;
415   total_row_count = 0;
416 
417   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
418     block = block_it.data();
419     row_it.set_to_list(block->get_rows());
420     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
421       row = row_it.data();
422       total_row_count++;
423       if (row->fixed_pitch > 0) {
424         pitches.add(static_cast<int32_t>(row->fixed_pitch), 1);
425       }
426       // find median
427       row_y = row->baseline.y(master_x);
428       row_left = static_cast<int16_t>(row->projection_left - shift_factor * (master_y - row_y));
429       row_right = static_cast<int16_t>(row->projection_right - shift_factor * (master_y - row_y));
430       if (row_left < projection_left) {
431         projection_left = row_left;
432       }
433       if (row_right > projection_right) {
434         projection_right = row_right;
435       }
436     }
437   }
438   if (pitches.get_total() == 0) {
439     return false;
440   }
441   projection.set_range(projection_left, projection_right);
442 
443   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
444     block = block_it.data();
445     row_it.set_to_list(block->get_rows());
446     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
447       row = row_it.data();
448       row_y = row->baseline.y(master_x);
449       row_left = static_cast<int16_t>(row->projection_left - shift_factor * (master_y - row_y));
450       for (x = row->projection_left; x < row->projection_right; x++, row_left++) {
451         projection.add(row_left, row->projection.pile_count(x));
452       }
453     }
454   }
455 
456   row_it.set_to_list(block_it.data()->get_rows());
457   row = row_it.data();
458 #ifndef GRAPHICS_DISABLED
459   if (textord_show_page_cuts && to_win != nullptr) {
460     projection.plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL);
461   }
462 #endif
463   final_pitch = pitches.ile(0.5);
464   pitch = static_cast<int16_t>(final_pitch);
465   pitch_sd = tune_row_pitch(row, &projection, projection_left, projection_right, pitch * 0.75,
466                             final_pitch, sp_sd, mid_cuts, &row->char_cells, false);
467 
468   if (textord_debug_pitch_metric) {
469     tprintf(
470         "try_doc:props=%d:fixed=%d:pitch=%d:final_pitch=%g:pitch_sd=%g:sp_sd=%"
471         "g:sd/trc=%g:sd/p=%g:sd/trc/p=%g\n",
472         prop_blocks, fixed_blocks, pitch, final_pitch, pitch_sd, sp_sd, pitch_sd / total_row_count,
473         pitch_sd / pitch, pitch_sd / total_row_count / pitch);
474   }
475 
476 #ifndef GRAPHICS_DISABLED
477   if (textord_show_page_cuts && to_win != nullptr) {
478     float row_shift;              // shift for row
479     ICOORDELT_LIST *master_cells; // cells for page
480     master_cells = &row->char_cells;
481     for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
482       block = block_it.data();
483       row_it.set_to_list(block->get_rows());
484       for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
485         row = row_it.data();
486         row_y = row->baseline.y(master_x);
487         row_shift = shift_factor * (master_y - row_y);
488         plot_row_cells(to_win, ScrollView::GOLDENROD, row, row_shift, master_cells);
489       }
490     }
491   }
492 #endif
493   row->char_cells.clear();
494   return false;
495 }
496 
497 /**********************************************************************
498  * try_block_fixed
499  *
500  * Try to call the entire block fixed.
501  **********************************************************************/
502 
try_block_fixed(TO_BLOCK * block,int32_t block_index)503 bool try_block_fixed(   // find line stats
504     TO_BLOCK *block,    // block to do
505     int32_t block_index // block number
506 ) {
507   return false;
508 }
509 
510 /**********************************************************************
511  * try_rows_fixed
512  *
513  * Decide whether each row is fixed pitch individually.
514  **********************************************************************/
515 
try_rows_fixed(TO_BLOCK * block,int32_t block_index,bool testing_on)516 bool try_rows_fixed(     // find line stats
517     TO_BLOCK *block,     // block to do
518     int32_t block_index, // block number
519     bool testing_on      // correct orientation
520 ) {
521   TO_ROW *row;           // current row
522   int32_t row_index;     // row number.
523   int32_t def_fixed = 0; // counters
524   int32_t def_prop = 0;
525   int32_t maybe_fixed = 0;
526   int32_t maybe_prop = 0;
527   int32_t dunno = 0;
528   int32_t corr_fixed = 0;
529   int32_t corr_prop = 0;
530   float lower, upper; // cluster thresholds
531   TO_ROW_IT row_it = block->get_rows();
532 
533   row_index = 1;
534   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
535     row = row_it.data();
536     ASSERT_HOST(row->xheight > 0);
537     if (row->fixed_pitch > 0 && fixed_pitch_row(row, block->block, block_index)) {
538       if (row->fixed_pitch == 0) {
539         lower = row->pr_nonsp;
540         upper = row->pr_space;
541         row->space_size = upper;
542         row->kern_size = lower;
543       }
544     }
545     row_index++;
546   }
547   count_block_votes(block, def_fixed, def_prop, maybe_fixed, maybe_prop, corr_fixed, corr_prop,
548                     dunno);
549   if (testing_on &&
550       (textord_debug_pitch_test || textord_blocksall_prop || textord_blocksall_fixed)) {
551     tprintf("Initially:");
552     print_block_counts(block, block_index);
553   }
554   if (def_fixed > def_prop * textord_words_veto_power) {
555     block->pitch_decision = PITCH_DEF_FIXED;
556   } else if (def_prop > def_fixed * textord_words_veto_power) {
557     block->pitch_decision = PITCH_DEF_PROP;
558   } else if (def_fixed > 0 || def_prop > 0) {
559     block->pitch_decision = PITCH_DUNNO;
560   } else if (maybe_fixed > maybe_prop * textord_words_veto_power) {
561     block->pitch_decision = PITCH_MAYBE_FIXED;
562   } else if (maybe_prop > maybe_fixed * textord_words_veto_power) {
563     block->pitch_decision = PITCH_MAYBE_PROP;
564   } else {
565     block->pitch_decision = PITCH_DUNNO;
566   }
567   return false;
568 }
569 
570 /**********************************************************************
571  * print_block_counts
572  *
573  * Count up how many rows have what decision and print the results.
574  **********************************************************************/
575 
print_block_counts(TO_BLOCK * block,int32_t block_index)576 void print_block_counts( // find line stats
577     TO_BLOCK *block,     // block to do
578     int32_t block_index  // block number
579 ) {
580   int32_t def_fixed = 0; // counters
581   int32_t def_prop = 0;
582   int32_t maybe_fixed = 0;
583   int32_t maybe_prop = 0;
584   int32_t dunno = 0;
585   int32_t corr_fixed = 0;
586   int32_t corr_prop = 0;
587 
588   count_block_votes(block, def_fixed, def_prop, maybe_fixed, maybe_prop, corr_fixed, corr_prop,
589                     dunno);
590   tprintf("Block %d has (%d,%d,%d)", block_index, def_fixed, maybe_fixed, corr_fixed);
591   if (textord_blocksall_prop && (def_fixed || maybe_fixed || corr_fixed)) {
592     tprintf(" (Wrongly)");
593   }
594   tprintf(" fixed, (%d,%d,%d)", def_prop, maybe_prop, corr_prop);
595   if (textord_blocksall_fixed && (def_prop || maybe_prop || corr_prop)) {
596     tprintf(" (Wrongly)");
597   }
598   tprintf(" prop, %d dunno\n", dunno);
599 }
600 
601 /**********************************************************************
602  * count_block_votes
603  *
604  * Count the number of rows in the block with each kind of pitch_decision.
605  **********************************************************************/
606 
count_block_votes(TO_BLOCK * block,int32_t & def_fixed,int32_t & def_prop,int32_t & maybe_fixed,int32_t & maybe_prop,int32_t & corr_fixed,int32_t & corr_prop,int32_t & dunno)607 void count_block_votes( // find line stats
608     TO_BLOCK *block,    // block to do
609     int32_t &def_fixed, // add to counts
610     int32_t &def_prop, int32_t &maybe_fixed, int32_t &maybe_prop, int32_t &corr_fixed,
611     int32_t &corr_prop, int32_t &dunno) {
612   TO_ROW *row; // current row
613   TO_ROW_IT row_it = block->get_rows();
614 
615   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
616     row = row_it.data();
617     switch (row->pitch_decision) {
618       case PITCH_DUNNO:
619         dunno++;
620         break;
621       case PITCH_DEF_PROP:
622         def_prop++;
623         break;
624       case PITCH_MAYBE_PROP:
625         maybe_prop++;
626         break;
627       case PITCH_DEF_FIXED:
628         def_fixed++;
629         break;
630       case PITCH_MAYBE_FIXED:
631         maybe_fixed++;
632         break;
633       case PITCH_CORR_PROP:
634         corr_prop++;
635         break;
636       case PITCH_CORR_FIXED:
637         corr_fixed++;
638         break;
639     }
640   }
641 }
642 
643 /**********************************************************************
644  * row_pitch_stats
645  *
646  * Decide whether each row is fixed pitch individually.
647  **********************************************************************/
648 
row_pitch_stats(TO_ROW * row,int32_t maxwidth,bool testing_on)649 bool row_pitch_stats( // find line stats
650     TO_ROW *row,      // current row
651     int32_t maxwidth, // of spaces
652     bool testing_on   // correct orientation
653 ) {
654   BLOBNBOX *blob;        // current blob
655   int gap_index;         // current gap
656   int32_t prev_x;        // end of prev blob
657   int32_t cluster_count; // no of clusters
658   int32_t prev_count;    // of clusters
659   int32_t smooth_factor; // for smoothing stats
660   TBOX blob_box;         // bounding box
661   float lower, upper;    // cluster thresholds
662                          // gap sizes
663   float gaps[BLOCK_STATS_CLUSTERS];
664   // blobs
665   BLOBNBOX_IT blob_it = row->blob_list();
666   STATS gap_stats(0, maxwidth);
667   STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
668   // clusters
669 
670   smooth_factor = static_cast<int32_t>(row->xheight * textord_wordstats_smooth_factor + 1.5);
671   if (!blob_it.empty()) {
672     prev_x = blob_it.data()->bounding_box().right();
673     blob_it.forward();
674     while (!blob_it.at_first()) {
675       blob = blob_it.data();
676       if (!blob->joined_to_prev()) {
677         blob_box = blob->bounding_box();
678         if (blob_box.left() - prev_x < maxwidth) {
679           gap_stats.add(blob_box.left() - prev_x, 1);
680         }
681         prev_x = blob_box.right();
682       }
683       blob_it.forward();
684     }
685   }
686   if (gap_stats.get_total() == 0) {
687     return false;
688   }
689   cluster_count = 0;
690   lower = row->xheight * words_initial_lower;
691   upper = row->xheight * words_initial_upper;
692   gap_stats.smooth(smooth_factor);
693   do {
694     prev_count = cluster_count;
695     cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop,
696                                       BLOCK_STATS_CLUSTERS, cluster_stats);
697   } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
698   if (cluster_count < 1) {
699     return false;
700   }
701   for (gap_index = 0; gap_index < cluster_count; gap_index++) {
702     gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
703   }
704   // get medians
705   if (testing_on) {
706     tprintf("cluster_count=%d:", cluster_count);
707     for (gap_index = 0; gap_index < cluster_count; gap_index++) {
708       tprintf(" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total());
709     }
710     tprintf("\n");
711   }
712   qsort(gaps, cluster_count, sizeof(float), sort_floats);
713 
714   // Try to find proportional non-space and space for row.
715   lower = row->xheight * words_default_prop_nonspace;
716   upper = row->xheight * textord_words_min_minspace;
717   for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] < lower; gap_index++) {
718     ;
719   }
720   if (gap_index == 0) {
721     if (testing_on) {
722       tprintf("No clusters below nonspace threshold!!\n");
723     }
724     if (cluster_count > 1) {
725       row->pr_nonsp = gaps[0];
726       row->pr_space = gaps[1];
727     } else {
728       row->pr_nonsp = lower;
729       row->pr_space = gaps[0];
730     }
731   } else {
732     row->pr_nonsp = gaps[gap_index - 1];
733     while (gap_index < cluster_count && gaps[gap_index] < upper) {
734       gap_index++;
735     }
736     if (gap_index == cluster_count) {
737       if (testing_on) {
738         tprintf("No clusters above nonspace threshold!!\n");
739       }
740       row->pr_space = lower * textord_spacesize_ratioprop;
741     } else {
742       row->pr_space = gaps[gap_index];
743     }
744   }
745 
746   // Now try to find the fixed pitch space and non-space.
747   upper = row->xheight * words_default_fixed_space;
748   for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] < upper; gap_index++) {
749     ;
750   }
751   if (gap_index == 0) {
752     if (testing_on) {
753       tprintf("No clusters below space threshold!!\n");
754     }
755     row->fp_nonsp = upper;
756     row->fp_space = gaps[0];
757   } else {
758     row->fp_nonsp = gaps[gap_index - 1];
759     if (gap_index == cluster_count) {
760       if (testing_on) {
761         tprintf("No clusters above space threshold!!\n");
762       }
763       row->fp_space = row->xheight;
764     } else {
765       row->fp_space = gaps[gap_index];
766     }
767   }
768   if (testing_on) {
769     tprintf(
770         "Initial estimates:pr_nonsp=%g, pr_space=%g, fp_nonsp=%g, "
771         "fp_space=%g\n",
772         row->pr_nonsp, row->pr_space, row->fp_nonsp, row->fp_space);
773   }
774   return true; // computed some stats
775 }
776 
777 /**********************************************************************
778  * find_row_pitch
779  *
780  * Check to see if this row could be fixed pitch using the given spacings.
781  * Blobs with gaps smaller than the lower threshold are assumed to be one.
782  * The larger threshold is the word gap threshold.
783  **********************************************************************/
784 
find_row_pitch(TO_ROW * row,int32_t maxwidth,int32_t dm_gap,TO_BLOCK * block,int32_t block_index,int32_t row_index,bool testing_on)785 bool find_row_pitch(     // find lines
786     TO_ROW *row,         // row to do
787     int32_t maxwidth,    // max permitted space
788     int32_t dm_gap,      // ignorable gaps
789     TO_BLOCK *block,     // block of row
790     int32_t block_index, // block_number
791     int32_t row_index,   // number of row
792     bool testing_on      // correct orientation
793 ) {
794   bool used_dm_model; // looks like dot matrix
795   float min_space;    // estimate threshold
796   float non_space;    // gap size
797   float gap_iqr;      // interquartile range
798   float pitch_iqr;
799   float dm_gap_iqr; // interquartile range
800   float dm_pitch_iqr;
801   float dm_pitch;      // pitch with dm on
802   float pitch;         // revised estimate
803   float initial_pitch; // guess at pitch
804   STATS gap_stats(0, maxwidth);
805   // centre-centre
806   STATS pitch_stats(0, maxwidth);
807 
808   row->fixed_pitch = 0.0f;
809   initial_pitch = row->fp_space;
810   if (initial_pitch > row->xheight * (1 + words_default_fixed_limit)) {
811     initial_pitch = row->xheight; // keep pitch decent
812   }
813   non_space = row->fp_nonsp;
814   if (non_space > initial_pitch) {
815     non_space = initial_pitch;
816   }
817   min_space = (initial_pitch + non_space) / 2;
818 
819   if (!count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch, min_space, true, false,
820                          dm_gap)) {
821     dm_gap_iqr = 0.0001f;
822     dm_pitch_iqr = maxwidth * 2.0f;
823     dm_pitch = initial_pitch;
824   } else {
825     dm_gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);
826     dm_pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25);
827     dm_pitch = pitch_stats.ile(0.5);
828   }
829   gap_stats.clear();
830   pitch_stats.clear();
831   if (!count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch, min_space, true, false, 0)) {
832     gap_iqr = 0.0001f;
833     pitch_iqr = maxwidth * 3.0f;
834   } else {
835     gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);
836     pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25);
837     if (testing_on) {
838       tprintf(
839           "First fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, "
840           "pitch=%g\n",
841           initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile(0.5));
842     }
843     initial_pitch = pitch_stats.ile(0.5);
844     if (min_space > initial_pitch && count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch,
845                                                        initial_pitch, true, false, 0)) {
846       min_space = initial_pitch;
847       gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);
848       pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25);
849       if (testing_on) {
850         tprintf(
851             "Revised fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, "
852             "pitch=%g\n",
853             initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile(0.5));
854       }
855       initial_pitch = pitch_stats.ile(0.5);
856     }
857   }
858   if (textord_debug_pitch_metric) {
859     tprintf("Blk=%d:Row=%d:%c:p_iqr=%g:g_iqr=%g:dm_p_iqr=%g:dm_g_iqr=%g:%c:", block_index,
860             row_index, 'X', pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr,
861             pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth
862                 ? 'D'
863                 : (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr ? 'S' : 'M'));
864   }
865   if (pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth) {
866     row->pitch_decision = PITCH_DUNNO;
867     if (textord_debug_pitch_metric) {
868       tprintf("\n");
869     }
870     return false; // insufficient data
871   }
872   if (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr) {
873     if (testing_on) {
874       tprintf(
875           "Choosing non dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, "
876           "dm_gap_iqr=%g\n",
877           pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
878     }
879     gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);
880     pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25);
881     pitch = pitch_stats.ile(0.5);
882     used_dm_model = false;
883   } else {
884     if (testing_on) {
885       tprintf(
886           "Choosing dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, "
887           "dm_gap_iqr=%g\n",
888           pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
889     }
890     gap_iqr = dm_gap_iqr;
891     pitch_iqr = dm_pitch_iqr;
892     pitch = dm_pitch;
893     used_dm_model = true;
894   }
895   if (textord_debug_pitch_metric) {
896     tprintf("rev_p_iqr=%g:rev_g_iqr=%g:pitch=%g:", pitch_iqr, gap_iqr, pitch);
897     tprintf("p_iqr/g=%g:p_iqr/x=%g:iqr_res=%c:", pitch_iqr / gap_iqr, pitch_iqr / block->xheight,
898             pitch_iqr < gap_iqr * textord_fpiqr_ratio &&
899                     pitch_iqr < block->xheight * textord_max_pitch_iqr &&
900                     pitch < block->xheight * textord_words_default_maxspace
901                 ? 'F'
902                 : 'P');
903   }
904   if (pitch_iqr < gap_iqr * textord_fpiqr_ratio &&
905       pitch_iqr < block->xheight * textord_max_pitch_iqr &&
906       pitch < block->xheight * textord_words_default_maxspace) {
907     row->pitch_decision = PITCH_MAYBE_FIXED;
908   } else {
909     row->pitch_decision = PITCH_MAYBE_PROP;
910   }
911   row->fixed_pitch = pitch;
912   row->kern_size = gap_stats.ile(0.5);
913   row->min_space = static_cast<int32_t>(row->fixed_pitch + non_space) / 2;
914   if (row->min_space > row->fixed_pitch) {
915     row->min_space = static_cast<int32_t>(row->fixed_pitch);
916   }
917   row->max_nonspace = row->min_space;
918   row->space_size = row->fixed_pitch;
919   row->space_threshold = (row->max_nonspace + row->min_space) / 2;
920   row->used_dm_model = used_dm_model;
921   return true;
922 }
923 
924 /**********************************************************************
925  * fixed_pitch_row
926  *
927  * Check to see if this row could be fixed pitch using the given spacings.
928  * Blobs with gaps smaller than the lower threshold are assumed to be one.
929  * The larger threshold is the word gap threshold.
930  **********************************************************************/
931 
fixed_pitch_row(TO_ROW * row,BLOCK * block,int32_t block_index)932 bool fixed_pitch_row(TO_ROW *row, // row to do
933                      BLOCK *block,
934                      int32_t block_index // block_number
935 ) {
936   const char *res_string; // pitch result
937   int16_t mid_cuts;       // no of cheap cuts
938   float non_space;        // gap size
939   float pitch_sd;         // error on pitch
940   float sp_sd = 0.0f;     // space sd
941 
942   non_space = row->fp_nonsp;
943   if (non_space > row->fixed_pitch) {
944     non_space = row->fixed_pitch;
945   }
946   POLY_BLOCK *pb = block != nullptr ? block->pdblk.poly_block() : nullptr;
947   if (textord_all_prop || (pb != nullptr && !pb->IsText())) {
948     // Set the decision to definitely proportional.
949     pitch_sd = textord_words_def_prop * row->fixed_pitch;
950     row->pitch_decision = PITCH_DEF_PROP;
951   } else {
952     pitch_sd = tune_row_pitch(row, &row->projection, row->projection_left, row->projection_right,
953                               (row->fixed_pitch + non_space * 3) / 4, row->fixed_pitch, sp_sd,
954                               mid_cuts, &row->char_cells, block_index == textord_debug_block);
955     if (pitch_sd < textord_words_pitchsd_threshold * row->fixed_pitch &&
956         ((pitsync_linear_version & 3) < 3 ||
957          ((pitsync_linear_version & 3) >= 3 &&
958           (row->used_dm_model || sp_sd > 20 || (pitch_sd == 0 && sp_sd > 10))))) {
959       if (pitch_sd < textord_words_def_fixed * row->fixed_pitch && !row->all_caps &&
960           ((pitsync_linear_version & 3) < 3 || sp_sd > 20)) {
961         row->pitch_decision = PITCH_DEF_FIXED;
962       } else {
963         row->pitch_decision = PITCH_MAYBE_FIXED;
964       }
965     } else if ((pitsync_linear_version & 3) < 3 || sp_sd > 20 || mid_cuts > 0 ||
966                pitch_sd >= textord_words_pitchsd_threshold * row->fixed_pitch) {
967       if (pitch_sd < textord_words_def_prop * row->fixed_pitch) {
968         row->pitch_decision = PITCH_MAYBE_PROP;
969       } else {
970         row->pitch_decision = PITCH_DEF_PROP;
971       }
972     } else {
973       row->pitch_decision = PITCH_DUNNO;
974     }
975   }
976 
977   if (textord_debug_pitch_metric) {
978     res_string = "??";
979     switch (row->pitch_decision) {
980       case PITCH_DEF_PROP:
981         res_string = "DP";
982         break;
983       case PITCH_MAYBE_PROP:
984         res_string = "MP";
985         break;
986       case PITCH_DEF_FIXED:
987         res_string = "DF";
988         break;
989       case PITCH_MAYBE_FIXED:
990         res_string = "MF";
991         break;
992       default:
993         res_string = "??";
994     }
995     tprintf(":sd/p=%g:occ=%g:init_res=%s\n", pitch_sd / row->fixed_pitch, sp_sd, res_string);
996   }
997   return true;
998 }
999 
1000 /**********************************************************************
1001  * count_pitch_stats
1002  *
1003  * Count up the gap and pitch stats on the block to see if it is fixed pitch.
1004  * Blobs with gaps smaller than the lower threshold are assumed to be one.
1005  * The larger threshold is the word gap threshold.
1006  * The return value indicates whether there were any decent values to use.
1007  **********************************************************************/
1008 
count_pitch_stats(TO_ROW * row,STATS * gap_stats,STATS * pitch_stats,float initial_pitch,float min_space,bool ignore_outsize,bool split_outsize,int32_t dm_gap)1009 bool count_pitch_stats(  // find lines
1010     TO_ROW *row,         // row to do
1011     STATS *gap_stats,    // blob gaps
1012     STATS *pitch_stats,  // centre-centre stats
1013     float initial_pitch, // guess at pitch
1014     float min_space,     // estimate space size
1015     bool ignore_outsize, // discard big objects
1016     bool split_outsize,  // split big objects
1017     int32_t dm_gap       // ignorable gaps
1018 ) {
1019   bool prev_valid; // not word broken
1020   BLOBNBOX *blob;  // current blob
1021                    // blobs
1022   BLOBNBOX_IT blob_it = row->blob_list();
1023   int32_t prev_right;  // end of prev blob
1024   int32_t prev_centre; // centre of previous blob
1025   int32_t x_centre;    // centre of this blob
1026   int32_t blob_width;  // width of blob
1027   int32_t width_units; // no of widths in blob
1028   float width;         // blob width
1029   TBOX blob_box;       // bounding box
1030   TBOX joined_box;     // of super blob
1031 
1032   gap_stats->clear();
1033   pitch_stats->clear();
1034   if (blob_it.empty()) {
1035     return false;
1036   }
1037   prev_valid = false;
1038   prev_centre = 0;
1039   prev_right = 0; // stop compiler warning
1040   joined_box = blob_it.data()->bounding_box();
1041   do {
1042     blob_it.forward();
1043     blob = blob_it.data();
1044     if (!blob->joined_to_prev()) {
1045       blob_box = blob->bounding_box();
1046       if ((blob_box.left() - joined_box.right() < dm_gap && !blob_it.at_first()) ||
1047           blob->cblob() == nullptr) {
1048         joined_box += blob_box; // merge blobs
1049       } else {
1050         blob_width = joined_box.width();
1051         if (split_outsize) {
1052           width_units =
1053               static_cast<int32_t>(floor(static_cast<float>(blob_width) / initial_pitch + 0.5));
1054           if (width_units < 1) {
1055             width_units = 1;
1056           }
1057           width_units--;
1058         } else if (ignore_outsize) {
1059           width = static_cast<float>(blob_width) / initial_pitch;
1060           width_units =
1061               width < 1 + words_default_fixed_limit && width > 1 - words_default_fixed_limit ? 0
1062                                                                                              : -1;
1063         } else {
1064           width_units = 0; // everything in
1065         }
1066         x_centre = static_cast<int32_t>(joined_box.left() +
1067                                         (blob_width - width_units * initial_pitch) / 2);
1068         if (prev_valid && width_units >= 0) {
1069           //                                              if (width_units>0)
1070           //                                              {
1071           //                                                      tprintf("wu=%d,
1072           //                                                      width=%d,
1073           //                                                      xc=%d, adding
1074           //                                                      %d\n",
1075           //                                                              width_units,blob_width,x_centre,x_centre-prev_centre);
1076           //                                              }
1077           gap_stats->add(joined_box.left() - prev_right, 1);
1078           pitch_stats->add(x_centre - prev_centre, 1);
1079         }
1080         prev_centre = static_cast<int32_t>(x_centre + width_units * initial_pitch);
1081         prev_right = joined_box.right();
1082         prev_valid = blob_box.left() - joined_box.right() < min_space;
1083         prev_valid = prev_valid && width_units >= 0;
1084         joined_box = blob_box;
1085       }
1086     }
1087   } while (!blob_it.at_first());
1088   return gap_stats->get_total() >= 3;
1089 }
1090 
1091 /**********************************************************************
1092  * tune_row_pitch
1093  *
1094  * Use a dp algorithm to fit the character cells and return the sd of
1095  * the cell size over the row.
1096  **********************************************************************/
1097 
tune_row_pitch(TO_ROW * row,STATS * projection,int16_t projection_left,int16_t projection_right,float space_size,float & initial_pitch,float & best_sp_sd,int16_t & best_mid_cuts,ICOORDELT_LIST * best_cells,bool testing_on)1098 float tune_row_pitch(           // find fp cells
1099     TO_ROW *row,                // row to do
1100     STATS *projection,          // vertical projection
1101     int16_t projection_left,    // edge of projection
1102     int16_t projection_right,   // edge of projection
1103     float space_size,           // size of blank
1104     float &initial_pitch,       // guess at pitch
1105     float &best_sp_sd,          // space sd
1106     int16_t &best_mid_cuts,     // no of cheap cuts
1107     ICOORDELT_LIST *best_cells, // row cells
1108     bool testing_on             // inidividual words
1109 ) {
1110   int pitch_delta;           // offset pitch
1111   int16_t mid_cuts;          // cheap cuts
1112   float pitch_sd;            // current sd
1113   float best_sd;             // best result
1114   float best_pitch;          // pitch for best result
1115   float initial_sd;          // starting error
1116   float sp_sd;               // space sd
1117   ICOORDELT_LIST test_cells; // row cells
1118   ICOORDELT_IT best_it;      // start of best list
1119 
1120   if (textord_fast_pitch_test) {
1121     return tune_row_pitch2(row, projection, projection_left, projection_right, space_size,
1122                            initial_pitch, best_sp_sd,
1123                            // space sd
1124                            best_mid_cuts, best_cells, testing_on);
1125   }
1126   if (textord_disable_pitch_test) {
1127     best_sp_sd = initial_pitch;
1128     return initial_pitch;
1129   }
1130   initial_sd = compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1131                                 initial_pitch, best_sp_sd, best_mid_cuts, best_cells, testing_on);
1132   best_sd = initial_sd;
1133   best_pitch = initial_pitch;
1134   if (testing_on) {
1135     tprintf("tune_row_pitch:start pitch=%g, sd=%g\n", best_pitch, best_sd);
1136   }
1137   for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {
1138     pitch_sd =
1139         compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1140                          initial_pitch + pitch_delta, sp_sd, mid_cuts, &test_cells, testing_on);
1141     if (testing_on) {
1142       tprintf("testing pitch at %g, sd=%g\n", initial_pitch + pitch_delta, pitch_sd);
1143     }
1144     if (pitch_sd < best_sd) {
1145       best_sd = pitch_sd;
1146       best_mid_cuts = mid_cuts;
1147       best_sp_sd = sp_sd;
1148       best_pitch = initial_pitch + pitch_delta;
1149       best_cells->clear();
1150       best_it.set_to_list(best_cells);
1151       best_it.add_list_after(&test_cells);
1152     } else {
1153       test_cells.clear();
1154     }
1155     if (pitch_sd > initial_sd) {
1156       break; // getting worse
1157     }
1158   }
1159   for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {
1160     pitch_sd =
1161         compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1162                          initial_pitch - pitch_delta, sp_sd, mid_cuts, &test_cells, testing_on);
1163     if (testing_on) {
1164       tprintf("testing pitch at %g, sd=%g\n", initial_pitch - pitch_delta, pitch_sd);
1165     }
1166     if (pitch_sd < best_sd) {
1167       best_sd = pitch_sd;
1168       best_mid_cuts = mid_cuts;
1169       best_sp_sd = sp_sd;
1170       best_pitch = initial_pitch - pitch_delta;
1171       best_cells->clear();
1172       best_it.set_to_list(best_cells);
1173       best_it.add_list_after(&test_cells);
1174     } else {
1175       test_cells.clear();
1176     }
1177     if (pitch_sd > initial_sd) {
1178       break;
1179     }
1180   }
1181   initial_pitch = best_pitch;
1182 
1183   if (textord_debug_pitch_metric) {
1184     print_pitch_sd(row, projection, projection_left, projection_right, space_size, best_pitch);
1185   }
1186 
1187   return best_sd;
1188 }
1189 
1190 /**********************************************************************
1191  * tune_row_pitch
1192  *
1193  * Use a dp algorithm to fit the character cells and return the sd of
1194  * the cell size over the row.
1195  **********************************************************************/
1196 
tune_row_pitch2(TO_ROW * row,STATS * projection,int16_t projection_left,int16_t projection_right,float space_size,float & initial_pitch,float & best_sp_sd,int16_t & best_mid_cuts,ICOORDELT_LIST * best_cells,bool testing_on)1197 float tune_row_pitch2(          // find fp cells
1198     TO_ROW *row,                // row to do
1199     STATS *projection,          // vertical projection
1200     int16_t projection_left,    // edge of projection
1201     int16_t projection_right,   // edge of projection
1202     float space_size,           // size of blank
1203     float &initial_pitch,       // guess at pitch
1204     float &best_sp_sd,          // space sd
1205     int16_t &best_mid_cuts,     // no of cheap cuts
1206     ICOORDELT_LIST *best_cells, // row cells
1207     bool testing_on             // inidividual words
1208 ) {
1209   int pitch_delta;    // offset pitch
1210   int16_t pixel;      // pixel coord
1211   int16_t best_pixel; // pixel coord
1212   int16_t best_delta; // best pitch
1213   int16_t best_pitch; // best pitch
1214   int16_t start;      // of good range
1215   int16_t end;        // of good range
1216   int32_t best_count; // lowest sum
1217   float best_sd;      // best result
1218 
1219   best_sp_sd = initial_pitch;
1220 
1221   best_pitch = static_cast<int>(initial_pitch);
1222   if (textord_disable_pitch_test || best_pitch <= textord_pitch_range) {
1223     return initial_pitch;
1224   }
1225   std::unique_ptr<STATS[]> sum_proj(new STATS[textord_pitch_range * 2 + 1]); // summed projection
1226 
1227   for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) {
1228     sum_proj[textord_pitch_range + pitch_delta].set_range(0, best_pitch + pitch_delta + 1);
1229   }
1230   for (pixel = projection_left; pixel <= projection_right; pixel++) {
1231     for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) {
1232       sum_proj[textord_pitch_range + pitch_delta].add(
1233           (pixel - projection_left) % (best_pitch + pitch_delta), projection->pile_count(pixel));
1234     }
1235   }
1236   best_count = sum_proj[textord_pitch_range].pile_count(0);
1237   best_delta = 0;
1238   best_pixel = 0;
1239   for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) {
1240     for (pixel = 0; pixel < best_pitch + pitch_delta; pixel++) {
1241       if (sum_proj[textord_pitch_range + pitch_delta].pile_count(pixel) < best_count) {
1242         best_count = sum_proj[textord_pitch_range + pitch_delta].pile_count(pixel);
1243         best_delta = pitch_delta;
1244         best_pixel = pixel;
1245       }
1246     }
1247   }
1248   if (testing_on) {
1249     tprintf("tune_row_pitch:start pitch=%g, best_delta=%d, count=%d\n", initial_pitch, best_delta,
1250             best_count);
1251   }
1252   best_pitch += best_delta;
1253   initial_pitch = best_pitch;
1254   best_count++;
1255   best_count += best_count;
1256   for (start = best_pixel - 2;
1257        start > best_pixel - best_pitch &&
1258        sum_proj[textord_pitch_range + best_delta].pile_count(start % best_pitch) <= best_count;
1259        start--) {
1260     ;
1261   }
1262   for (end = best_pixel + 2;
1263        end < best_pixel + best_pitch &&
1264        sum_proj[textord_pitch_range + best_delta].pile_count(end % best_pitch) <= best_count;
1265        end++) {
1266     ;
1267   }
1268 
1269   best_sd = compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1270                              initial_pitch, best_sp_sd, best_mid_cuts, best_cells, testing_on,
1271                              start, end);
1272   if (testing_on) {
1273     tprintf("tune_row_pitch:output pitch=%g, sd=%g\n", initial_pitch, best_sd);
1274   }
1275 
1276   if (textord_debug_pitch_metric) {
1277     print_pitch_sd(row, projection, projection_left, projection_right, space_size, initial_pitch);
1278   }
1279 
1280   return best_sd;
1281 }
1282 
1283 /**********************************************************************
1284  * compute_pitch_sd
1285  *
1286  * Use a dp algorithm to fit the character cells and return the sd of
1287  * the cell size over the row.
1288  **********************************************************************/
1289 
compute_pitch_sd(TO_ROW * row,STATS * projection,int16_t projection_left,int16_t projection_right,float space_size,float initial_pitch,float & sp_sd,int16_t & mid_cuts,ICOORDELT_LIST * row_cells,bool testing_on,int16_t start,int16_t end)1290 float compute_pitch_sd(        // find fp cells
1291     TO_ROW *row,               // row to do
1292     STATS *projection,         // vertical projection
1293     int16_t projection_left,   // edge
1294     int16_t projection_right,  // edge
1295     float space_size,          // size of blank
1296     float initial_pitch,       // guess at pitch
1297     float &sp_sd,              // space sd
1298     int16_t &mid_cuts,         // no of free cuts
1299     ICOORDELT_LIST *row_cells, // list of chop pts
1300     bool testing_on,           // inidividual words
1301     int16_t start,             // start of good range
1302     int16_t end                // end of good range
1303 ) {
1304   int16_t occupation; // no of cells in word.
1305                       // blobs
1306   BLOBNBOX_IT blob_it = row->blob_list();
1307   BLOBNBOX_IT start_it;  // start of word
1308   BLOBNBOX_IT plot_it;   // for plotting
1309   int16_t blob_count;    // no of blobs
1310   TBOX blob_box;         // bounding box
1311   TBOX prev_box;         // of super blob
1312   int32_t prev_right;    // of word sync
1313   int scale_factor;      // on scores for big words
1314   int32_t sp_count;      // spaces
1315   FPSEGPT_LIST seg_list; // char cells
1316   FPSEGPT_IT seg_it;     // iterator
1317   int16_t segpos;        // position of segment
1318   int16_t cellpos;       // previous cell boundary
1319                          // iterator
1320   ICOORDELT_IT cell_it = row_cells;
1321   ICOORDELT *cell;     // new cell
1322   double sqsum;        // sum of squares
1323   double spsum;        // of spaces
1324   double sp_var;       // space error
1325   double word_sync;    // result for word
1326   int32_t total_count; // total blobs
1327 
1328   if ((pitsync_linear_version & 3) > 1) {
1329     word_sync = compute_pitch_sd2(row, projection, projection_left, projection_right, initial_pitch,
1330                                   occupation, mid_cuts, row_cells, testing_on, start, end);
1331     sp_sd = occupation;
1332     return word_sync;
1333   }
1334   mid_cuts = 0;
1335   cellpos = 0;
1336   total_count = 0;
1337   sqsum = 0;
1338   sp_count = 0;
1339   spsum = 0;
1340   prev_right = -1;
1341   if (blob_it.empty()) {
1342     return space_size * 10;
1343   }
1344 #ifndef GRAPHICS_DISABLED
1345   if (testing_on && to_win != nullptr) {
1346     blob_box = blob_it.data()->bounding_box();
1347     projection->plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL);
1348   }
1349 #endif
1350   start_it = blob_it;
1351   blob_count = 0;
1352   blob_box = box_next(&blob_it); // first blob
1353   blob_it.mark_cycle_pt();
1354   do {
1355     for (; blob_count > 0; blob_count--) {
1356       box_next(&start_it);
1357     }
1358     do {
1359       prev_box = blob_box;
1360       blob_count++;
1361       blob_box = box_next(&blob_it);
1362     } while (!blob_it.cycled_list() && blob_box.left() - prev_box.right() < space_size);
1363     plot_it = start_it;
1364     if (pitsync_linear_version & 3) {
1365       word_sync = check_pitch_sync2(&start_it, blob_count, static_cast<int16_t>(initial_pitch), 2,
1366                                     projection, projection_left, projection_right,
1367                                     row->xheight * textord_projection_scale, occupation, &seg_list,
1368                                     start, end);
1369     } else {
1370       word_sync = check_pitch_sync(&start_it, blob_count, static_cast<int16_t>(initial_pitch), 2,
1371                                    projection, &seg_list);
1372     }
1373     if (testing_on) {
1374       tprintf("Word ending at (%d,%d), len=%d, sync rating=%g, ", prev_box.right(), prev_box.top(),
1375               seg_list.length() - 1, word_sync);
1376       seg_it.set_to_list(&seg_list);
1377       for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
1378         if (seg_it.data()->faked) {
1379           tprintf("(F)");
1380         }
1381         tprintf("%d, ", seg_it.data()->position());
1382         //                              tprintf("C=%g, s=%g, sq=%g\n",
1383         //                                      seg_it.data()->cost_function(),
1384         //                                      seg_it.data()->sum(),
1385         //                                      seg_it.data()->squares());
1386       }
1387       tprintf("\n");
1388     }
1389 #ifndef GRAPHICS_DISABLED
1390     if (textord_show_fixed_cuts && blob_count > 0 && to_win != nullptr) {
1391       plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
1392     }
1393 #endif
1394     seg_it.set_to_list(&seg_list);
1395     if (prev_right >= 0) {
1396       sp_var = seg_it.data()->position() - prev_right;
1397       sp_var -= floor(sp_var / initial_pitch + 0.5) * initial_pitch;
1398       sp_var *= sp_var;
1399       spsum += sp_var;
1400       sp_count++;
1401     }
1402     for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
1403       segpos = seg_it.data()->position();
1404       if (cell_it.empty() || segpos > cellpos + initial_pitch / 2) {
1405         // big gap
1406         while (!cell_it.empty() && segpos > cellpos + initial_pitch * 3 / 2) {
1407           cell = new ICOORDELT(cellpos + static_cast<int16_t>(initial_pitch), 0);
1408           cell_it.add_after_then_move(cell);
1409           cellpos += static_cast<int16_t>(initial_pitch);
1410         }
1411         // make new one
1412         cell = new ICOORDELT(segpos, 0);
1413         cell_it.add_after_then_move(cell);
1414         cellpos = segpos;
1415       } else if (segpos > cellpos - initial_pitch / 2) {
1416         cell = cell_it.data();
1417         // average positions
1418         cell->set_x((cellpos + segpos) / 2);
1419         cellpos = cell->x();
1420       }
1421     }
1422     seg_it.move_to_last();
1423     prev_right = seg_it.data()->position();
1424     if (textord_pitch_scalebigwords) {
1425       scale_factor = (seg_list.length() - 2) / 2;
1426       if (scale_factor < 1) {
1427         scale_factor = 1;
1428       }
1429     } else {
1430       scale_factor = 1;
1431     }
1432     sqsum += word_sync * scale_factor;
1433     total_count += (seg_list.length() - 1) * scale_factor;
1434     seg_list.clear();
1435   } while (!blob_it.cycled_list());
1436   sp_sd = sp_count > 0 ? sqrt(spsum / sp_count) : 0;
1437   return total_count > 0 ? sqrt(sqsum / total_count) : space_size * 10;
1438 }
1439 
1440 /**********************************************************************
1441  * compute_pitch_sd2
1442  *
1443  * Use a dp algorithm to fit the character cells and return the sd of
1444  * the cell size over the row.
1445  **********************************************************************/
1446 
compute_pitch_sd2(TO_ROW * row,STATS * projection,int16_t projection_left,int16_t projection_right,float initial_pitch,int16_t & occupation,int16_t & mid_cuts,ICOORDELT_LIST * row_cells,bool testing_on,int16_t start,int16_t end)1447 float compute_pitch_sd2(       // find fp cells
1448     TO_ROW *row,               // row to do
1449     STATS *projection,         // vertical projection
1450     int16_t projection_left,   // edge
1451     int16_t projection_right,  // edge
1452     float initial_pitch,       // guess at pitch
1453     int16_t &occupation,       // no of occupied cells
1454     int16_t &mid_cuts,         // no of free cuts
1455     ICOORDELT_LIST *row_cells, // list of chop pts
1456     bool testing_on,           // inidividual words
1457     int16_t start,             // start of good range
1458     int16_t end                // end of good range
1459 ) {
1460   // blobs
1461   BLOBNBOX_IT blob_it = row->blob_list();
1462   BLOBNBOX_IT plot_it;
1463   int16_t blob_count;    // no of blobs
1464   TBOX blob_box;         // bounding box
1465   FPSEGPT_LIST seg_list; // char cells
1466   FPSEGPT_IT seg_it;     // iterator
1467   int16_t segpos;        // position of segment
1468                          // iterator
1469   ICOORDELT_IT cell_it = row_cells;
1470   ICOORDELT *cell;  // new cell
1471   double word_sync; // result for word
1472 
1473   mid_cuts = 0;
1474   if (blob_it.empty()) {
1475     occupation = 0;
1476     return initial_pitch * 10;
1477   }
1478 #ifndef GRAPHICS_DISABLED
1479   if (testing_on && to_win != nullptr) {
1480     projection->plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL);
1481   }
1482 #endif
1483   blob_count = 0;
1484   blob_it.mark_cycle_pt();
1485   do {
1486     // first blob
1487     blob_box = box_next(&blob_it);
1488     blob_count++;
1489   } while (!blob_it.cycled_list());
1490   plot_it = blob_it;
1491   word_sync = check_pitch_sync2(
1492       &blob_it, blob_count, static_cast<int16_t>(initial_pitch), 2, projection, projection_left,
1493       projection_right, row->xheight * textord_projection_scale, occupation, &seg_list, start, end);
1494   if (testing_on) {
1495     tprintf("Row ending at (%d,%d), len=%d, sync rating=%g, ", blob_box.right(), blob_box.top(),
1496             seg_list.length() - 1, word_sync);
1497     seg_it.set_to_list(&seg_list);
1498     for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
1499       if (seg_it.data()->faked) {
1500         tprintf("(F)");
1501       }
1502       tprintf("%d, ", seg_it.data()->position());
1503       //                              tprintf("C=%g, s=%g, sq=%g\n",
1504       //                                      seg_it.data()->cost_function(),
1505       //                                      seg_it.data()->sum(),
1506       //                                      seg_it.data()->squares());
1507     }
1508     tprintf("\n");
1509   }
1510 #ifndef GRAPHICS_DISABLED
1511   if (textord_show_fixed_cuts && blob_count > 0 && to_win != nullptr) {
1512     plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
1513   }
1514 #endif
1515   seg_it.set_to_list(&seg_list);
1516   for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
1517     segpos = seg_it.data()->position();
1518     // make new one
1519     cell = new ICOORDELT(segpos, 0);
1520     cell_it.add_after_then_move(cell);
1521     if (seg_it.at_last()) {
1522       mid_cuts = seg_it.data()->cheap_cuts();
1523     }
1524   }
1525   seg_list.clear();
1526   return occupation > 0 ? sqrt(word_sync / occupation) : initial_pitch * 10;
1527 }
1528 
1529 /**********************************************************************
1530  * print_pitch_sd
1531  *
 * Debug aid. Use a dp algorithm to fit the character cells, first word by
 * word and then over the whole row, and print the resulting sd figures.
 * Nothing is returned.
1534  **********************************************************************/
1535 
print_pitch_sd(TO_ROW * row,STATS * projection,int16_t projection_left,int16_t projection_right,float space_size,float initial_pitch)1536 void print_pitch_sd(         // find fp cells
1537     TO_ROW *row,             // row to do
1538     STATS *projection,       // vertical projection
1539     int16_t projection_left, // edges //size of blank
1540     int16_t projection_right, float space_size,
1541     float initial_pitch // guess at pitch
1542 ) {
1543   const char *res2;   // pitch result
1544   int16_t occupation; // used cells
1545   float sp_sd;        // space sd
1546                       // blobs
1547   BLOBNBOX_IT blob_it = row->blob_list();
1548   BLOBNBOX_IT start_it;     // start of word
1549   BLOBNBOX_IT row_start;    // start of row
1550   int16_t blob_count;       // no of blobs
1551   int16_t total_blob_count; // total blobs in line
1552   TBOX blob_box;            // bounding box
1553   TBOX prev_box;            // of super blob
1554   int32_t prev_right;       // of word sync
1555   int scale_factor;         // on scores for big words
1556   int32_t sp_count;         // spaces
1557   FPSEGPT_LIST seg_list;    // char cells
1558   FPSEGPT_IT seg_it;        // iterator
1559   double sqsum;             // sum of squares
1560   double spsum;             // of spaces
1561   double sp_var;            // space error
1562   double word_sync;         // result for word
1563   double total_count;       // total cuts
1564 
1565   if (blob_it.empty()) {
1566     return;
1567   }
1568   row_start = blob_it;
1569   total_blob_count = 0;
1570 
1571   total_count = 0;
1572   sqsum = 0;
1573   sp_count = 0;
1574   spsum = 0;
1575   prev_right = -1;
1576   blob_it = row_start;
1577   start_it = blob_it;
1578   blob_count = 0;
1579   blob_box = box_next(&blob_it); // first blob
1580   blob_it.mark_cycle_pt();
1581   do {
1582     for (; blob_count > 0; blob_count--) {
1583       box_next(&start_it);
1584     }
1585     do {
1586       prev_box = blob_box;
1587       blob_count++;
1588       blob_box = box_next(&blob_it);
1589     } while (!blob_it.cycled_list() && blob_box.left() - prev_box.right() < space_size);
1590     word_sync = check_pitch_sync2(
1591         &start_it, blob_count, static_cast<int16_t>(initial_pitch), 2, projection, projection_left,
1592         projection_right, row->xheight * textord_projection_scale, occupation, &seg_list, 0, 0);
1593     total_blob_count += blob_count;
1594     seg_it.set_to_list(&seg_list);
1595     if (prev_right >= 0) {
1596       sp_var = seg_it.data()->position() - prev_right;
1597       sp_var -= floor(sp_var / initial_pitch + 0.5) * initial_pitch;
1598       sp_var *= sp_var;
1599       spsum += sp_var;
1600       sp_count++;
1601     }
1602     seg_it.move_to_last();
1603     prev_right = seg_it.data()->position();
1604     if (textord_pitch_scalebigwords) {
1605       scale_factor = (seg_list.length() - 2) / 2;
1606       if (scale_factor < 1) {
1607         scale_factor = 1;
1608       }
1609     } else {
1610       scale_factor = 1;
1611     }
1612     sqsum += word_sync * scale_factor;
1613     total_count += (seg_list.length() - 1) * scale_factor;
1614     seg_list.clear();
1615   } while (!blob_it.cycled_list());
1616   sp_sd = sp_count > 0 ? sqrt(spsum / sp_count) : 0;
1617   word_sync = total_count > 0 ? sqrt(sqsum / total_count) : space_size * 10;
1618   tprintf("new_sd=%g:sd/p=%g:new_sp_sd=%g:res=%c:", word_sync, word_sync / initial_pitch, sp_sd,
1619           word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P');
1620 
1621   start_it = row_start;
1622   blob_it = row_start;
1623   word_sync =
1624       check_pitch_sync2(&blob_it, total_blob_count, static_cast<int16_t>(initial_pitch), 2,
1625                         projection, projection_left, projection_right,
1626                         row->xheight * textord_projection_scale, occupation, &seg_list, 0, 0);
1627   if (occupation > 1) {
1628     word_sync /= occupation;
1629   }
1630   word_sync = sqrt(word_sync);
1631 
1632 #ifndef GRAPHICS_DISABLED
1633   if (textord_show_row_cuts && to_win != nullptr) {
1634     plot_fp_cells2(to_win, ScrollView::CORAL, row, &seg_list);
1635   }
1636 #endif
1637   seg_list.clear();
1638   if (word_sync < textord_words_pitchsd_threshold * initial_pitch) {
1639     if (word_sync < textord_words_def_fixed * initial_pitch && !row->all_caps) {
1640       res2 = "DF";
1641     } else {
1642       res2 = "MF";
1643     }
1644   } else {
1645     res2 = word_sync < textord_words_def_prop * initial_pitch ? "MP" : "DP";
1646   }
1647   tprintf(
1648       "row_sd=%g:sd/p=%g:res=%c:N=%d:res2=%s,init pitch=%g, row_pitch=%g, "
1649       "all_caps=%d\n",
1650       word_sync, word_sync / initial_pitch,
1651       word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P', occupation, res2,
1652       initial_pitch, row->fixed_pitch, row->all_caps);
1653 }
1654 
1655 /**********************************************************************
1656  * find_repeated_chars
1657  *
1658  * Extract marked leader blobs and put them
1659  * into words in advance of fixed pitch checking and word generation.
1660  **********************************************************************/
find_repeated_chars(TO_BLOCK * block,bool testing_on)1661 void find_repeated_chars(TO_BLOCK *block,   // Block to search.
1662                          bool testing_on) { // Debug mode.
1663   POLY_BLOCK *pb = block->block->pdblk.poly_block();
1664   if (pb != nullptr && !pb->IsText()) {
1665     return; // Don't find repeated chars in non-text blocks.
1666   }
1667 
1668   TO_ROW *row;
1669   BLOBNBOX_IT box_it;
1670   BLOBNBOX_IT search_it; // forward search
1671   WERD *word;            // new word
1672   TBOX word_box;         // for plotting
1673   int blobcount, repeated_set;
1674 
1675   TO_ROW_IT row_it = block->get_rows();
1676   if (row_it.empty()) {
1677     return; // empty block
1678   }
1679   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1680     row = row_it.data();
1681     box_it.set_to_list(row->blob_list());
1682     if (box_it.empty()) {
1683       continue; // no blobs in this row
1684     }
1685     if (!row->rep_chars_marked()) {
1686       mark_repeated_chars(row);
1687     }
1688     if (row->num_repeated_sets() == 0) {
1689       continue; // nothing to do for this row
1690     }
1691     // new words
1692     WERD_IT word_it(&row->rep_words);
1693     do {
1694       if (box_it.data()->repeated_set() != 0 && !box_it.data()->joined_to_prev()) {
1695         blobcount = 1;
1696         repeated_set = box_it.data()->repeated_set();
1697         search_it = box_it;
1698         search_it.forward();
1699         while (!search_it.at_first() && search_it.data()->repeated_set() == repeated_set) {
1700           blobcount++;
1701           search_it.forward();
1702         }
1703         // After the call to make_real_word() all the blobs from this
1704         // repeated set will be removed from the blob list. box_it will be
1705         // set to point to the blob after the end of the extracted sequence.
1706         word = make_real_word(&box_it, blobcount, box_it.at_first(), 1);
1707         if (!box_it.empty() && box_it.data()->joined_to_prev()) {
1708           tprintf("Bad box joined to prev at");
1709           box_it.data()->bounding_box().print();
1710           tprintf("After repeated word:");
1711           word->bounding_box().print();
1712         }
1713         ASSERT_HOST(box_it.empty() || !box_it.data()->joined_to_prev());
1714         word->set_flag(W_REP_CHAR, true);
1715         word->set_flag(W_DONT_CHOP, true);
1716         word_it.add_after_then_move(word);
1717       } else {
1718         box_it.forward();
1719       }
1720     } while (!box_it.at_first());
1721   }
1722 }
1723 
1724 /**********************************************************************
1725  * plot_fp_word
1726  *
1727  * Plot a block of words as if fixed pitch.
1728  **********************************************************************/
1729 
1730 #ifndef GRAPHICS_DISABLED
plot_fp_word(TO_BLOCK * block,float pitch,float nonspace)1731 void plot_fp_word(   // draw block of words
1732     TO_BLOCK *block, // block to draw
1733     float pitch,     // pitch to draw with
1734     float nonspace   // for space threshold
1735 ) {
1736   TO_ROW *row; // current row
1737   TO_ROW_IT row_it = block->get_rows();
1738 
1739   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1740     row = row_it.data();
1741     row->min_space = static_cast<int32_t>((pitch + nonspace) / 2);
1742     row->max_nonspace = row->min_space;
1743     row->space_threshold = row->min_space;
1744     plot_word_decisions(to_win, static_cast<int16_t>(pitch), row);
1745   }
1746 }
1747 #endif
1748 
1749 } // namespace tesseract
1750