1 /*
2 This is a Optical-Character-Recognition program
3 Copyright (C) 2000-2018  Joerg Schulenburg
4 
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License
7 as published by the Free Software Foundation; either version 2
8 of the License, or (at your option) any later version.
9 
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 GNU General Public License for more details.
14 
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
18 
19  see README for EMAIL-address
20 
21   sometimes I have written comments in german language, sorry for that
22 
23  - look for ??? for preliminary code
24  - space: avX=22 11-13 (empirical estimated)
25           avX=16  5-7
26           avX= 7  5-6
27 
28  ToDo: - add filter (r/s mismatch) g300c1
29        - better get_line2 function (problems on high resolution)
30        - write parallelizable code!
31        - learnmode (optimize filter)
32        - use ispell for final control or if unsure
33        - better line scanning (if not even)
34        - step 5: same chars differ? => expert mode
35        - chars dx>dy and above 50% hor-crossing > 4 is char-group ?
36        - detect color of chars and background
37        - better word space calculation (look at the examples)
38           (distance: left-left, middle-middle, left-right, thickness of e *0.75)
39 
40    GLOBAL DATA (mostly structures)
41    - pix   : image - one byte per pixel  bits0-2=working
42    - lines : rows of the text (points to pix)
43    - box   : list of bounding box for character
44    - obj   : objects (lines, splines, etc. building a character)
45  */
46 
47 
48 #include <stdlib.h>
49 #include <stdio.h>
50 #include <assert.h>
51 #include <string.h>
52 #include <ctype.h>
53 #include "config.h"
54 #ifdef HAVE_WCHAR_H
55 #include <wchar.h>
56 #endif
57 
58 #include "amiga.h"
59 #include "list.h"
60 #include "pgm2asc.h"
61 // #include "pcx.h"        /* needed for writebmp (removed later) */
62 /* ocr1 is the test-engine - remember: this is development version */
63 #include "ocr1.h"
64 /* first engine */
65 #include "ocr0.h"
66 #include "otsu.h"
67 #include "barcode.h"
68 #include "progress.h"
69 #include "unicode_defs.h" /* UNKNOWN + PICTURES + ... */
70 
71 #include "gocr.h"
72 
73 #include "ocr0_dbg.h" /* added 2017-07 */
74 
/* wew: will be exceeded by capitals at 1200dpi */
/* MaxBox: buffer size reserved for the largest possible letter */
#define MaxBox (100*200)	// largest possible letter (buffersize)
/* MAX: the larger of a and b.  Function-like macro: the selected argument
 * is evaluated twice, so do not pass expressions with side effects. */
#define MAX(a,b)			((a) >= (b) ? (a) : (b))
78 
79 /* if the system does not know about wchar.h, define functions here */
80 #ifndef HAVE_WCHAR_H
81 /* typedef unsigned wchar_t; */
/* Find the first occurrence of WC in WCS (fallback for systems without
 * <wchar.h>).  As in ISO C, the terminating null wide character counts as
 * part of the string, so searching for 0 returns a pointer to the
 * terminator.  Returns NULL if WC does not occur in WCS. */
wchar_t *wcschr (wchar_t *wcs, wchar_t wc) {
  size_t i;
  for (i = 0; ; i++) {
    if (wcs[i] == wc) return wcs + i;   /* also matches the terminator */
    if (wcs[i] == 0) return NULL;       /* end of string, not found */
  }
}
/* Copy the wide string SRC, including its terminating 0, to DEST.
 * Returns DEST.  The caller must provide enough room in DEST. */
wchar_t *wcscpy (wchar_t *dest, const wchar_t *src) {
  wchar_t *out = dest;
  while (*src) *out++ = *src++;
  *out = 0;
  return dest;
}
/* Number of wide characters in S, not counting the terminating 0. */
size_t wcslen (const wchar_t *s){
  const wchar_t *end = s;
  while (*end) end++;
  return (size_t)(end - s);
}
92 #endif
93 #ifndef HAVE_WCSDUP
/* Duplicate the wide string WS into freshly malloc'ed memory (wcsdup is
 * a gnu extension).  Returns NULL if the allocation fails; otherwise the
 * caller owns the copy and has to free() it. */
wchar_t * wcsdup (const wchar_t *WS) {
  size_t count = wcslen(WS) + 1;          /* characters incl. terminator */
  wchar_t *dup = (wchar_t *) malloc(count * sizeof(wchar_t));
  if (dup == NULL) return NULL;
  return wcscpy(dup, WS);                 /* wcscpy returns its destination */
}
101 #endif
102 
// ------------------------ feature extraction -----------------
// -------------------------------------------------------------
// detect maxima of line overlaps (return in %) and line coordinates
// this is for future use
// line orientation codes:
#define HOR 1    // horizontal
#define VER 2    // vertical
#define RIS 3    // rising
#define FAL 4    // falling
111 
/* exchange the values of the two ints a and b point to */
static void swap(int *a, int *b) {
  const int first = *a, second = *b;
  *a = second;
  *b = first;
}
118 
119 // calculate the overlapping of the line (0-1) with black points
120 // by recursive bisection
121 // line: y=dy/dx*x+b, implicit form: d=F(x,y)=dy*x-dx*y+b*dx=0
122 // incremental y(i+1)=m*(x(i)+1)+b, F(x+1,y+1)=f(F(x,y))
123 // ret & 1 => inverse pixel!
124 // d=2*F(x,y) integer numbers
get_line(int x0,int y0,int x1,int y1,pix * p,int cs,int ret)125 int get_line(int x0, int y0, int x1, int y1, pix *p, int cs, int ret){
126    int dx,dy,incrE,incrNE,d,x,y,r0,r1,ty,tx,
127        *px,*py,*pdx,*pdy,*ptx,*pty,*px1;
128    dx=abs(x1-x0); tx=((x1>x0)?1:-1);    // tx=x-spiegelung (new)
129    dy=abs(y1-y0); ty=((y1>y0)?1:-1);	// ty=y-spiegelung (new)
130    // rotate coordinate system if dy>dx
131 /*bbg: can be faster if instead of pointers we use the variables and swaps? */
132 /*js: Do not know, I am happy that the current code is working and is small */
133    if(dx>dy){ pdx=&dx;pdy=&dy;px=&x;py=&y;ptx=&tx;pty=&ty;px1=&x1; }
134    else     { pdx=&dy;pdy=&dx;px=&y;py=&x;ptx=&ty;pty=&tx;px1=&y1; }
135    if( *ptx<0 ){ swap(&x0,&x1);swap(&y0,&y1);tx=-tx;ty=-ty; }
136    d=((*pdy)<<1)-(*pdx); incrE=(*pdy)<<1; incrNE=((*pdy)-(*pdx))<<1;
137    x=x0; y=y0; r0=r1=0; /* dd=tolerance (store max drift) */
138    while( (*px)<=(*px1) ){
139      if( ((getpixel(p,x,y)<cs)?1:0)^(ret&1) ) r0++; else r1++;
140      (*px)++; if( d<=0 ){ d+=incrE; } else { d+=incrNE; (*py)+=(*pty); }
141    }
142    return (r0*(ret&~1))/(r0+r1); // ret==100 => percentage %
143 }
144 
145 // this function should detect whether a direct connection between points
146 //   exists or not, not finally implemented
147 // ret & 1 => inverse pixel!
148 // d=2*F(x,y) integer numbers, ideal line: ,I pixel: I@
149 //   ..@  @@@  .@.  ...,@2@. +1..+3 floodfill around line ???
150 //   ..@  .@@  .@.  ...,.@@@ +2..+4 <= that's not implemented yet
151 //   ..@  ..@  .@.  ...,.@@@ +2..+4
152 //   @.@  @..  .@.  ...,@@@. +1..+3
153 //   @.@  @@.  .@.  ...I@@@.  0..+3
154 //   @@@  @@@  .@.  ..@1@@..  0..+2
155 //   90%   0%  100%   90%     r1-r2
156 // I am not satisfied with it
get_line2(int x0,int y0,int x1,int y1,pix * p,int cs,int ret)157 int get_line2(int x0, int y0, int x1, int y1, pix *p, int cs, int ret){
158    int dx,dy,incrE,incrNE,d,x,y,r0,r1,ty,tx,q,ddy,rx,ry,
159        *px,*py,*pdx,*pdy,*ptx,*pty,*px1;
160    dx=abs(x1-x0); tx=((x1>x0)?1:-1);    // tx=x-spiegelung (new)
161    dy=abs(y1-y0); ty=((y1>y0)?1:-1);	// ty=y-spiegelung (new)
162    // rotate coordinate system if dy>dx
163    if(dx>dy){ pdx=&dx;pdy=&dy;px=&x;py=&y;ptx=&tx;pty=&ty;px1=&x1;rx=1;ry=0; }
164    else     { pdx=&dy;pdy=&dx;px=&y;py=&x;ptx=&ty;pty=&tx;px1=&y1;rx=0;ry=1; }
165    if( *ptx<0 ){ swap(&x0,&x1);swap(&y0,&y1);tx=-tx;ty=-ty; }
166    d=((*pdy)<<1)-(*pdx); incrE=(*pdy)<<1; incrNE=((*pdy)-(*pdx))<<1;
167    x=x0; y=y0; r0=r1=0; ddy=3; // tolerance = bit 1 + bit 0 = left+right
168    // int t=(*pdx)/16,tl,tr;  // tolerance, left-,right delimiter
169    while( (*px)<=(*px1) ){  // not finaly implemented
170      q=((getpixel(p,x,y)<cs)?1:0)^(ret&1);
171      if ( !q ){		// tolerance one pixel perpenticular to the line
172                         // what about 2 or more pixels tolerance???
173        ddy&=(~1)|(((getpixel(p,x+ry,y+rx)<cs)?1:0)^(ret&1));
174        ddy&=(~2)|(((getpixel(p,x-ry,y-rx)<cs)?1:0)^(ret&1))*2;
175      } else ddy=3;
176      if( ddy ) r0++; else r1++;
177      (*px)++; if( d<=0 ){ d+=incrE; } else { d+=incrNE; (*py)+=(*pty); }
178    }
179    return (r0*(ret&~1))/(r0+r1); // ret==100 => percentage %
180 }
181 
182 /* Look for dots in the rectangular region x0 <= x <= x1 and y0 <= y
183  <= y1 in pixmap p.  The two low order bits in mask indicate the color
184  of dots to look for: If mask==1 then look for black dots (where a
185  pixel value less than cs is considered black).  If mask==2 then look
186  for white dots.  If mask==3 then look for both black and white dots.
187  If the dots are found, the corresponding bits are set in the returned
188  value.  Heavily used by the engine ocr0*.cc */
get_bw(int x0,int x1,int y0,int y1,pix * p,int cs,int mask)189 char get_bw(int x0, int x1, int y0, int y1, pix * p, int cs, int mask) {
190   char rc = 0;			// later with error < 2% (1 dot)
191   int x, y;
192 
193   if (x0 < 0)        x0 = 0;
194   if (x1 >= p->x)    x1 = p->x - 1;
195   if (y0 < 0)        y0 = 0;
196   if (y1 >= p->y)    y1 = p->y - 1;
197 
198   for ( y = y0; y <= y1; y++)
199     for ( x = x0; x <= x1; x++) {
200       rc |= ((getpixel(p, x, y) < cs) ? 1 : 2);	// break if rc==3
201       if ((rc & mask) == mask)
202 	return mask;		// break loop
203     }
204   return (rc & mask);
205 }
206 
207 /* more general Mar2000 (x0,x1,y0,y1 instead of x0,y0,x1,y1! (history))
208  * look for black crossings throw a line from x0,y0 to x1,y1 and count them
209  * follow line and count crossings ([white]-black-transitions)
210  *  ex: horizontal num_cross of 'm' would return 3
211  *
212  * fail for:  .a... a-to-b counts no transitions, but there is
213  *            ...#.
214  *            ..#..
215  *            .#..b
216  *  ToDo18: make it tolerant against noise on big chars, +cross-width
217  *            ......#.#########.#...... should count as 1 cross
218  */
num_cross(int x0,int x1,int y0,int y1,pix * p,int cs)219 int num_cross(int x0, int x1, int y0, int y1, pix *p, int cs) {
220   int rc = 0, col = 0, k, x, y, i, d;	// rc=crossings  col=0=white
221   int dx = x1 - x0, dy = y1 - y0;
222   int w2_cross=0, w1_cross=0, w1_white=0; // last + 2nd-last cross-width
223 
224   d = MAX(abs(dx), abs(dy));
225   for (i = 0, x = x0, y = y0; i <= d; i++) {
226     if (d) {
227       x = x0 + i * dx / d;
228       y = y0 + i * dy / d;
229     }
230     k = ((getpixel(p, x, y) < cs) ? 1 : 0);	// 0=white 1=black
231     if (col == 0 && k == 1) rc++; // found a white-black transition
232     if (col == 1 && k == 1) w1_cross++; // 1810 add line-width
233     if (col == 1 && k == 0) {
234       if ((w2_cross<=1 && w1_white<=1 && w1_cross>7)
235        || (w1_cross<=1 && w1_white<=1 && w2_cross>7)) if (rc>1) rc--; // 1810 remove noise
236       if (w1_cross > w2_cross) { w2_cross=w1_cross; }
237       w1_cross=0;
238     }
239     if (col == 0 && k == 0) w1_white++; // 1810 add line-width
240     if (col == 0 && k == 1) w1_white=0;
241     col = k;        // last color
242   }
243   return rc;
244 }
245 
/* Count the white-black transitions along the line from x0,y0 to x1,y1
 * (argument order x0,x1,y0,y1!).  Pixels with a value less than cs count
 * as black.  Unlike num_cross() no noise filtering is done. */
int num_cross_fine(int x0, int x1, int y0, int y1, pix *p, int cs) {
  const int dx = x1 - x0, dy = y1 - y0;
  const int steps = MAX(abs(dx), abs(dy));
  int crossings = 0;
  int last = 0;                 /* previous color, 0=white 1=black */
  int step;

  for (step = 0; step <= steps; step++) {
    const int x = steps ? x0 + step * dx / steps : x0;
    const int y = steps ? y0 + step * dy / steps : y0;
    const int now = (getpixel(p, x, y) < cs) ? 1 : 0;
    if (now == 1 && last == 0)
      crossings++;              /* found a white-black transition */
    last = now;
  }
  return crossings;
}
262 
263 /* check if test matches pattern
264  *   possible pattern: "a-zA-Z0-9+--\\"  (x-y dont work for c>127)
265  *  return: 0 means dont fit, 1 means found
266  *   ToDo: wchar_t cc + matching UTF-8 pattern for nonASCII
267  */
my_strchr(char * pattern,wchar_t cc)268 int my_strchr( char *pattern, wchar_t cc ) {
269   char *s1;
270   if (pattern==(char *)NULL) return 0;
271 
272   /* if (!(cc&0x80)) s1=strchr(pattern,(char)cc);  else */
273   switch (cc) {
274     case '-':                  /* used as a special character */
275       s1=strstr(pattern,"--"); /* search string -- in pattern */
276       if (s1) return 1; break;
277     default:
278       s1=strstr(pattern,decode(cc, UTF8)); /* search string cc in pattern */
279       if (s1) return 1; /* cc simply matches */
280       /* single char not found, now check the ranges */
281       s1=pattern;
282       while (s1) {
283         s1=strchr(s1+1,'-');  /* look for next '-' */
284         if ((!s1) || (!s1[0]) || (!s1[1])) return 0; /* nothing found or end */
285         if (*(s1-1)=='-' || *(s1+1)=='-') continue; /* skip -- pattern */
286         if (*(s1-1)<=cc && *(s1+1)>=cc) return 1;  /* within range */
287       }
288   }
289   return 0;
290 }
291 
292 /* set alternate chars and its weight, called from the engine
293     if a char is recognized to (weight) percent
294    can be used for filtering (only numbers etc)
295    often usefull if Il1 are looking very similar
296    should this function stay in box.c ???
297    weight is between 0 and 100 in percent, 100 means absolutely sure
298    - not final, not time critical (js)
299    - replace it by a string-function setaobj(*b,"string",weight)
300      and let call setac the setas function
301  */
302 
setas(struct box * b,char * as,int weight)303 int setas(struct box *b, char *as, int weight){
304   job_t *job=OCR_JOB;
305   int i,j;
306   if (b->num_ac > NumAlt || b->num_ac<0) {
307     fprintf(stderr,"\nDBG: There is something wrong with setas()!");
308     b->num_ac=0;
309   }
310   if (as==NULL) {
311     fprintf(stderr,"\nDBG: setas(NULL) makes no sense!"); return 0; }
312   if (as[0]==0) {
313     fprintf(stderr,"\nDBG: setas(\"\") makes no sense!"
314                    " x= %d %d", b->x0, b->y0);
315     // out_x(b);
316     return 0;
317   }
318 
319   /* char filter (ex: only numbers) ToDo: cfilter as UTF-8 */
320   if (job->cfg.cfilter) {
321     /* do not accept chars which are not in the cfilter string */
322     if ( as[0]>0 && as[1]==0 )
323     if ( !my_strchr(job->cfg.cfilter,as[0]) ) return 0;
324   }
325 #if 0 /* obsolete, done in setac */
326   /* not sure that this is the right place, but where else? */
327   if ( as[0]>0 && as[1]==0 )
328   if (b->modifier != SPACE  &&  b->modifier != 0) {
329     wchar_t newac;
330     newac = compose(as[0], b->modifier);
331     as = (char *)decode(newac, UTF8); /* was (const char *) */
332     if (newac == as[0]) { /* nothing composed */
333       fprintf(stderr, "\nDBG setas compose was useless %d %d",b->x0,b->y0);
334       // out_x(b);
335     }
336   }
337 #endif
338 
339   /* only the first run gets the full weight */
340   weight=(100-job->tmp.n_run)*weight/100;
341 
342   /* remove same entries from table */
343   for (i=0;i<b->num_ac;i++)
344     if (b->tas[i])
345       if (strcmp(as,b->tas[i])==0) break;
346   if (b->num_ac>0 && i<b->num_ac){
347     if (weight<=b->wac[i]) return 0; /* if found + less weight ignore it */
348     /* to insert the new weigth on the right place, we remove it first */
349     if (b->tas[i]) free(b->tas[i]);
350     for (j=i;j<b->num_ac-1;j++){  /* shift lower entries */
351       b->tac[j]=b->tac[j+1]; /* copy the char   */
352       b->tas[j]=b->tas[j+1]; /* copy the pointer to the string */
353       b->wac[j]=b->wac[j+1]; /* copy the weight */
354     }
355     b->num_ac--; /* shrink table */
356   }
357   /* sorting and add it to the table */
358   for (i=0;i<b->num_ac;i++) if (weight>b->wac[i]) break;
359   if (b->num_ac<NumAlt-1) b->num_ac++; /* enlarge table */
360   for (j=b->num_ac-1;j>i;j--){  /* shift lower entries */
361     b->tac[j]=b->tac[j-1]; /* copy the char   */
362     b->tas[j]=b->tas[j-1]; /* copy the pointer to the string */
363     b->wac[j]=b->wac[j-1]; /* copy the weight */
364   }
365   if (i<b->num_ac) {    /* insert new entry */
366     b->tac[i]=0;        /* insert the char=0 ... */
367     b->tas[i]=(char *)malloc(strlen(as)+1);     /* ... string */
368     if (b->tas[i]) memcpy(b->tas[i],as,strlen(as)+1);
369     b->wac[i]=weight;   /* ... and its weight  */
370   }
371   if (i==0) b->c=b->tac[0];  /* char or 0 for string */
372   return 0;
373 }
374 
375 /* ToDo: this function will be replaced by a call of setas() later */
setac(struct box * b,wchar_t ac,int weight)376 int setac(struct box *b, wchar_t ac, int weight){
377   int i,j;
378   job_t *job=OCR_JOB;
379   if ((!b) || b->num_ac > NumAlt || b->num_ac<0) {
380     fprintf(stderr,"\nDBG: This is a bad call to setac()!");
381     if(b && (job->cfg.verbose & 6)) out_x(b);
382     b->num_ac=0;
383   }
384   if (ac==0 || ac==UNKNOWN) {
385     fprintf(stderr,"\nDBG: setac(0) makes no sense!");
386     return 0;
387   }
388   /* char filter (ex: only numbers) ToDo: cfilter as UTF-8 */
389   if (job->cfg.cfilter) {
390     /* do not accept chars which are not in the cfilter string */
391     /* if ( ac>255 || !strchr(job->cfg.cfilter,(char)ac) ) return 0; */
392     if ( !my_strchr(job->cfg.cfilter,ac) ) return 0;
393   }
394   /* not sure that this is the right place, but where else? */
395   if (b->modifier != SPACE  &&  b->modifier != 0) {
396     wchar_t newac;
397     newac = compose(ac, b->modifier);
398     if (newac == ac) { /* nothing composed */
399       if(job->cfg.verbose & 7)
400         fprintf(stderr, "\nDBG %s setac (%d,%d): compose was useless, wac=%d",
401                 decode(ac,ASCII), b->x0, b->y0, weight);
402       /* if(job->cfg.verbose & 6) out_x(b); */
403     }
404     ac = newac;
405   }
406 
407   /* only the first run gets the full weight */
408   weight=(100-job->tmp.n_run)*weight/100;
409 
410   /* remove same entries from table */
411   for (i=0;i<b->num_ac;i++) if (ac==b->tac[i]) break;
412   if (b->num_ac>0 && i<b->num_ac){
413     if (weight<=b->wac[i]) return 0;
414     if (b->tas[i]) free(b->tas[i]);
415     for (j=i;j<b->num_ac-1;j++){  /* shift lower entries */
416       b->tac[j]=b->tac[j+1]; /* copy the char   */
417       b->tas[j]=b->tas[j+1]; /* copy the pointer to the string */
418       b->wac[j]=b->wac[j+1]; /* copy the weight */
419     }
420     b->num_ac--; /* shrink table */
421   }
422   /* sorting it to the table */
423   for (i=0;i<b->num_ac;i++) if (weight>b->wac[i]) break;
424   if (b->num_ac<NumAlt-1) b->num_ac++; /* enlarge table */
425   for (j=b->num_ac-1;j>i;j--){  /* shift lower entries */
426     b->tac[j]=b->tac[j-1]; /* copy the char   */
427     b->tas[j]=b->tas[j-1]; /* copy the pointer to the string */
428     b->wac[j]=b->wac[j-1]; /* copy the weight */
429   }
430   if (i<b->num_ac) {    /* insert new entry */
431     b->tac[i]=ac;       /* insert the char ... */
432     b->tas[i]=NULL;     /* ... no string (?) 2018-09 fix ji */
433     b->wac[i]=weight;   /* ... and its weight  */
434   }
435   if (i==0) b->c=ac; /* store best result to b->c (will be obsolete) */
436 
437   return 0;
438 }
439 
440 /* test if ac in wac-table
441    usefull for contextcorrection and box-splitting
442    return 0 if not found
443    return wac if found (wac>0)
444  */
testac(struct box * b,wchar_t ac)445 int testac(struct box *b, wchar_t ac){
446   int i;
447   if (b->num_ac > NumAlt || b->num_ac<0) {
448     fprintf(stderr,"\n#DEBUG: There is something wrong with testac()!");
449     b->num_ac=0;
450   }
451   /* search entries in table */
452   for (i=0;i<b->num_ac;i++) if (ac==b->tac[i]) return b->wac[i];
453   return 0;
454 }
455 
456 
457 /* look for edges: follow a line from x0,y0 to x1,y1, record the
458  * location of each transition, and return their number.
459  * ex: horizontal num_cross of 'm' would return 6
460  * remark: this function is not used, obsolete? ToDo: remove?
461  */
follow_path(int x0,int x1,int y0,int y1,pix * p,int cs,path_t * path)462 int follow_path(int x0, int x1, int y0, int y1, pix *p, int cs, path_t *path) {
463   int rc = 0, prev, x, y, i, d, color; // rc=crossings  col=0=white
464   int dx = x1 - x0, dy = y1 - y0;
465 
466   d = MAX(abs(dx), abs(dy));
467   prev = getpixel(p, x0, y0) < cs;	// 0=white 1=black
468   path->start = prev;
469   for (i = 1, x = x0, y = y0; i <= d; i++) {
470     if (d) {
471       x = x0 + i * dx / d;
472       y = y0 + i * dy / d;
473     }
474     color = getpixel(p, x, y) < cs; // 0=white 1=black
475     if (color != prev){
476       if (rc>=path->max){
477 	int n=path->max*2+10;
478 	path->x = (int *) xrealloc(path->x, n*sizeof(int));
479 	path->y = (int *) xrealloc(path->y, n*sizeof(int));
480 	path->max = n;
481       }
482       path->x[rc]=x;
483       path->y[rc]=y;
484       rc++;
485     }
486     prev = color;
487   }
488   path->num=rc;
489   return rc;
490 }
491 
/* realloc which never returns a failure to the caller: if memory of a
 * size>0 can not be provided, a message is written to stderr and the
 * program is terminated via exit(1).
 * ToDo: only used in follow_path, which is obsolete, remove? */
void *xrealloc(void *ptr, size_t size){
  void *fresh = realloc(ptr, size);
  if (fresh != NULL || size == 0)
    return fresh;
  fprintf(stderr, "insufficient memory");
  exit(1);
  return NULL;  /* not reached */
}
502 
503 /*
504  *  -------------------------------------------------------------
505  *  mark edge-points
506  *   - first move forward until b/w-edge
507  *   - more than 2 pixel?
508  *   - loop around
509  *     - if forward    pixel : go up, rotate right
510  *     - if forward no pixel : rotate left
511  *   - stop if found first 2 pixel in same order
512  *  go_along_the_right_wall strategy is very similar and used otherwhere
513  *  --------------------------------------------------------------
514  *  turmite game: inp: start-x,y, regel r_black=UP,r_white=RIght until border
515  * 	       out: last-position
516  *
517  *  could be used to extract more features:
518  *   by counting stepps, dead-end streets ,xmax,ymax,ro-,ru-,lo-,lu-edges
519  *
520  *   use this little animal to find features, I first was happy about it
521  *    but now I prefer the loop() function
522  */
523 
turmite(pix * p,int * x,int * y,int x0,int x1,int y0,int y1,int cs,int rw,int rb)524 void turmite(pix *p, int *x, int *y,
525 	     int x0, int x1, int y0, int y1, int cs, int rw, int rb) {
526   int r;
527   if (outbounds(p, x0, y0))	// out of pixmap
528     return;
529   while (*x >= x0 && *y >= y0 && *x <= x1 && *y <= y1) {
530     r = ((getpixel(p, *x, *y) < cs) ? rb : rw);	// select rule
531     switch (r) {
532       case UP: (*y)--; break;
533       case DO: (*y)++; break;
534       case RI: (*x)++; break;
535       case LE: (*x)--; break;
536       case ST:       break;
537       default:       assert(0);
538     }
539     if( r==ST ) break;	/* leave the while-loop */
540   }
541 }
542 
543 /* search a way from p0 to p1 without crossing pixels of type t
544  *  only two directions, useful to test if there is a gap 's'
545  * labyrinth algorithm - do you know a faster way? */
joined(pix * p,int x0,int y0,int x1,int y1,int cs)546 int joined(pix *p, int x0, int y0, int x1, int y1, int cs){
547   int t,r,x,y,dx,dy,xa,ya,xb,yb;
548   x=x0;y=y0;dx=1;dy=0;
549   if(x1>x0){xa=x0;xb=x1;} else {xb=x0;xa=x1;}
550   if(y1>y0){ya=y0;yb=y1;} else {yb=y0;ya=y1;}
551   t=((getpixel(p,x,y)<cs)?1:0);
552   for(;;){
553     if( t==((getpixel(p,x+dy,y-dx)<cs)?1:0)	// right free?
554      && x+dy>=xa && x+dy<=xb && y-dx>=ya && y-dx<=yb) // wall
555          { r=dy;dy=-dx;dx=r;x+=dx;y+=dy; } // rotate right and step forward
556     else { r=dx;dx=-dy;dy=r; } // rotate left
557     // fprintf(stderr," path xy %d-%d %d-%d %d %d  %d %d\n",xa,xb,ya,yb,x,y,dx,dy);
558     if( x==x1 && y==y1 ) return 1;
559     if( x==x0 && y==y0 && dx==1) return 0;
560   }
561   // return 0; // endless loop ?
562 }
563 
564 /* move from x,y to direction r until pixel of color col is found
565  *   or maximum of l steps
566  * return the number of steps done */
loop(pix * p,int x,int y,int l,int cs,int col,DIRECTION r)567 int loop(pix *p,int x,int y,int l,int cs,int col, DIRECTION r){
568   int i=0;
569   if(x>=0 && y>=0 && x<p->x && y<p->y){
570     switch (r) {
571     case UP:
572       for( ;i<l && y>=0;i++,y--)
573 	if( (getpixel(p,x,y)<cs)^col )
574 	  break;
575       break;
576     case DO:
577       for( ;i<l && y<p->y;i++,y++)
578 	if( (getpixel(p,x,y)<cs)^col )
579 	  break;
580       break;
581     case LE:
582       for( ;i<l && x>=0;i++,x--)
583 	if( (getpixel(p,x,y)<cs)^col )
584 	  break;
585       break;
586     case RI:
587       for( ;i<l && x<p->x;i++,x++)
588 	if( (getpixel(p,x,y)<cs)^col )
589 	  break;
590       break;
591     default:;
592     }
593   }
594   return i;
595 }
596 
597 /* Given a point, frames a rectangle containing all points of the same
598  * color surrounding it, and mark these points.
599  *  ToDo:  obsolate and replaced by frame_vector
600  *
601  * looking for better algo: go horizontally and look for upper/lower non_marked_pixel/nopixel
602  * use lowest three bits for mark
603  *   - recursive version removed! AmigaOS has no Stack-OVL-Event
604  * run around the chape using laby-robot
605  * bad changes can lead to endless loop!
606  *  - this is not absolutely sure but mostly works well
607  *  diag - 0: only pi/2 direction, 1: pi/4 directions (diagonal)
608  *  mark - 3 bit marker, mark each valid pixel with it
609  */
frame_nn(pix * p,int x,int y,int * x0,int * x1,int * y0,int * y1,int cs,int mark,int diag)610 int frame_nn(pix *p, int  x,  int  y,
611              int *x0, int *x1, int *y0, int *y1,	// enlarge frame
612              int cs, int mark,int diag){
613 #if 1 /* flood-fill to detect black objects, simple and faster? */
614   int rc = 0, dx, col, maxstack=0; static int overflow=0;
615   int bmax=1024, blen=0, *buf;  /* buffer as replacement for recursion stack */
616 
617   /* check bounds */
618   if (outbounds(p, x, y))  return 0;
619   /* check if already marked (with mark since v0.4) */
620   if ((marked(p,x,y)&mark)==mark) return 0;
621 
622   col = ((getpixel(p, x, y) < cs) ? 0 : 1);
623   buf=(int *)malloc(bmax*sizeof(int)*2);
624   if (!buf) { fprintf(stderr,"malloc failed (frame_nn)\n");return 0;}
625   buf[0]=x;
626   buf[1]=y;
627   blen=1;
628 
629   g_debug(fprintf(stderr,"\nframe_nn   x=%4d y=%4d",x,y);)
630   for ( ; blen ; ) {
631     /* max stack depth is complexity of the object */
632     if (blen>maxstack) maxstack=blen;
633     blen--;             /* reduce the stack */
634     x=buf[blen*2+0];
635     y=buf[blen*2+1];
636     if (y < *y0) *y0 = y;
637     if (y > *y1) *y1 = y;
638     /* first go to leftmost pixel */
639     for ( ; x>0 && (col == ((getpixel(p, x-1, y) < cs) ? 0 : 1)) ; x--);
640     if ((marked(p,x,y)&mark)==mark) continue; /* already scanned */
641     for (dx=-1;dx<2;dx+=2) /* look at upper and lower line, left */
642     if (    diag && x<p->x && x-1>0 && y+dx >=0 && y+dx < p->y
643          &&  col != ((getpixel(p, x  , y+dx) < cs) ? 0 : 1)
644          &&  col == ((getpixel(p, x-1, y+dx) < cs) ? 0 : 1)
645          && !((marked(p,x-1,y+dx)&mark)==mark)
646        ) {
647        if (blen+1>=bmax) { overflow|=1; continue; }
648        buf[blen*2+0]=x-1;
649        buf[blen*2+1]=y+dx;
650        blen++;
651     }
652     if (x < *x0) *x0 = x;
653     /* second go right, mark and get new starting points */
654     for ( ; x<p->x && (col == ((getpixel(p, x  , y) < cs) ? 0 : 1)) ; x++) {
655       p->p[x + y * p->x] |= (mark & 7);    rc++;  /* mark pixel */
656       /* enlarge frame */
657       if (x > *x1) *x1 = x;
658       for (dx=-1;dx<2;dx+=2) /* look at upper and lower line */
659       if (     col == ((getpixel(p, x  , y+dx) < cs) ? 0 : 1)
660         && (
661                col != ((getpixel(p, x-1, y   ) < cs) ? 0 : 1)
662             || col != ((getpixel(p, x-1, y+dx) < cs) ? 0 : 1) )
663         && !((marked(p,x,y+dx)&mark)==mark) && y+dx<p->y && y+dx>=0
664          ) {
665          if (blen+1>=bmax) { overflow|=1; continue; }
666          buf[blen*2+0]=x;
667          buf[blen*2+1]=y+dx;
668          blen++;
669       }
670     }
671     for (dx=-1;dx<2;dx+=2) /* look at upper and lower line, right */
672     if (    diag  && x<p->x && x-1>0 && y+dx >=0 && y+dx < p->y
673          &&  col == ((getpixel(p, x-1, y   ) < cs) ? 0 : 1)
674          &&  col != ((getpixel(p, x  , y   ) < cs) ? 0 : 1)
675          &&  col != ((getpixel(p, x-1, y+dx) < cs) ? 0 : 1)
676          &&  col == ((getpixel(p, x  , y+dx) < cs) ? 0 : 1)
677          && !((marked(p,x,y+dx)&mark)==mark)
678        ) {
679        if (blen+1>=bmax) { overflow|=1; continue; }
680        buf[blen*2+0]=x;
681        buf[blen*2+1]=y+dx;
682        blen++;
683     }
684   }
685 
686   /* debug, ToDo: use info maxstack and pixels for image classification */
687   g_debug(fprintf(stderr," maxstack= %4d pixels= %6d",maxstack,rc);)
688   if (overflow==1){
689     overflow|=2;
690     fprintf(stderr,"# Warning: frame_nn stack oerflow\n");
691   }
692   free(buf);
693 #else /* old version, ToDo: improve it for tmp04/005*.pgm.gz */
694   int i, j, d, dx, ox, oy, od, nx, ny, rc = 0, rot = 0, x2 = x, y2 = y, ln;
695 
696   static const int d0[8][2] = { { 0, -1} /* up    */, {-1, -1},
697 				{-1,  0} /* left  */, {-1,  1},
698 				{ 0,  1} /* down  */, { 1,  1},
699 				{ 1,  0} /* right */, { 1, -1}};
700 
701   /* check bounds */
702   if (outbounds(p, x, y))
703     return 0;
704   /* check if already marked */
705   if ((marked(p,x,y)&mark)==mark)
706     return 0;
707 
708   i = ((getpixel(p, x, y) < cs) ? 0 : 1);
709   rc = 0;
710 
711   g_debug(fprintf(stderr," start frame:");)
712 
713   for (ln = 0; ln < 2 && rot >= 0; ln++) {  // repeat if right-loop
714     g_debug(fprintf(stderr," ln=%d diag=%d cs=%d x=%d y=%d - go to border\n",ln,diag,cs,x,y);)
715 
716     od=d=(8+4*ln-diag)&7; // start robot looks up, right is a wall
717     // go to right (left) border
718     if (ln==1) {
719       x=x2;	y=y2;
720     }
721     /* start on leftmost position */
722     for (dx = 1 - 2*ln; x + dx < p->x && x + dx >= 0 /* bounds */ &&
723       	      	       i == ((getpixel(p, x + dx, y) < cs) ? 0 : 1) /* color */;
724 	      	       x += dx);
725 
726     g_debug(fprintf(stderr," ln=%d diag=%d cs=%d x=%d y=%d\n",ln,diag,cs,x,y);)
727 
728     /* robot stores start-position */
729     ox = x;	oy = y;
730     for (rot = 0; abs(rot) <= 64; ) {	/* for sure max. 8 spirals */
731       /* leftmost position */
732       if (ln == 0 && x < x2) {
733 	x2 = x; 	y2 = y;
734       }
735 
736       g_debug(fprintf(stderr," xy %3d %3d d=%d i=%d p=%3d rc=%d\n",x,y,d,i,getpixel(p,x,y),rc);)
737 
738       if ( abs(d0[d][1]) ) {	/* mark left (right) pixels */
739 	for (j = 0, dx = d0[d][1]; x + j >= 0 && x + j < p->x
740 	              	&& i == ((getpixel(p, x + j, y) < cs) ? 0 : 1); j += dx) {
741 	  if (!((marked(p, x + j, y)&mark)==mark))
742 	    rc++;
743 	  p->p[x + j + y * p->x] |= (mark & 7);
744 	}
745       }
746       /* look to the front of robot */
747       nx = x + d0[d][0];
748       ny = y + d0[d][1];
749       /* if right is a wall */
750       if ( outbounds(p, nx, ny) || i != ((getpixel(p,nx,ny)<cs) ? 0 : 1) ) {
751 	/* rotate left */
752         d=(d+2-diag) & 7; rot-=2-diag;
753       }
754       else {	/* if no wall, go, turn back and rotate left */
755         x=nx; y=ny; d=(d+4+2-diag) & 7; rot+=2-diag+4;
756 	/* enlarge frame */
757         if (x < *x0)      *x0 = x;
758 	if (x > *x1)	  *x1 = x;
759 	if (y < *y0)	  *y0 = y;
760 	if (y > *y1)	  *y1 = y;
761       }
762       if(x==ox && y==oy && d==od) break;	// round trip finished
763     }
764   }
765   g_debug(fprintf(stderr," rot=%d\n",rot);)
766 #endif
767   return rc;
768 }
769 
770 /*   obsolete! replaced by vectors
771  * mark neighbouring pixel of same color, return number
772  * better with neighbours of same color (more general) ???
773  * parameters: (&~7)-pixmap, start-point, critical_value, mark
774  *  recursion is removed */
mark_nn(pix * p,int x,int y,int cs,int r)775 int mark_nn(pix * p, int x, int y, int cs, int r) {
776   /* out of bounds or already marked? */
777   if (outbounds(p, x, y) || (marked(p, x, y)&r)==r)
778     return 0;
779   {
780     int x0, x1, y0, y1;
781     x0 = x1 = x;
782     y0 = y1 = y;			// not used
783     return frame_nn(p, x, y, &x0, &x1, &y0, &y1, cs, r, OCR_JOB->tmp.n_run & 1);
784     // using same scheme
785   }
786 }
787 
788 /* ToDo: finish to replace old frame by this new one
789  *
790  *   @...........#@@@@@@@.  # = marked as already scanned black pixels
791  *   @........@@@@@@@@@@@#      only left and right border
792  *   .......#@@@@@@@@@@@@@        left side on even y
793  *   ......@@@@@@@@#.@@@@#        right side on odd y
794  *   .....#@@@@@......#@@@   no border is marked twice
795  *   ....@@@@@#......@@@#.   works also for thinn lines
796  *   ...#@@@@........#@@@. - outer loop is stored as first
797  *   ..@@@@#........@@@#.. - inner loop is stored as second
798  *   .#@@@@........#@@@@..    1st in an extra box (think on white chars)
799  *   @@@@#.......@@@@#....    2nd merge in an extra step
800  *   #@@@@@....#@@@@@.....
801  *   @@@@@@@@@@@@@@#......
802  *   .#@@@@@@@@@@@@.......
803  *
804  * run around the chape using laby-robot
805  *  - used for scanning boxes, look for horizontal b/w transitions
806  *    with unmarked black pixels and call this routine
807  *  - stop if crossing a marked box in same direction (left=up, right=down)
808  *  box  - char box, store frame_vectors and box
809  *  x,y  - starting point
810  *  mark - 3 bit marker, mark each valid pixel with it
811  *  diag - 0: only pi/2 direction, 1: pi/4 directions (diagonal)
812  *  ds   - start direction, 6=right of right border, 2=left of left border
813  *  ret  - 0=ok, -1=already marked, -2=max_num_frames_exceeded
814  *               -7=no border in direction ds
815  */
816 #if 0
817 #undef g_debug
818 #define g_debug(x) x
819 #endif
820 /* grep keywords: scan_vectors frame_vector */
frame_vector(struct box * box1,int x,int y,int cs,int mark,int diag,int ds)821 int frame_vector(struct box *box1, int  x,  int  y,
822              int cs, int mark, int diag, int ds) {
823   int i1, i2, i2o,
824       new_x=1,    /* flag for storing the vector x,y */
825       steps=1,    /* steps between stored vectors, speedup for big frames */
826       d,          /* direction */
827       ox, oy,     /* starting point */
828       nx, ny, mx, my, /* used for simplification */
829       /* ToDo: add periphery to box (german: Umfang?) */
830       rc  = 1,    /* return code, circumference, sum vector lengths */
831       rot = 0,    /* memory for rotation, rot=8 means one full rotation */
832       vol = 0;    /* volume inside frame, negative for white inside black */
833   pix *p=box1->p;
834 
835   /* translate the 8 directions to (x,y) pairs,
836    * if only four directions are used, only every 2nd vector is accessed,
837    * +1 turn left, -1 turn right
838    */
839   static const int d0[8][2] =
840     { { 0, -1}, /* up    */  {-1, -1},   /* up-le */
841       {-1,  0}, /* left  */  {-1,  1},   /* do-le */
842       { 0,  1}, /* down  */  { 1,  1},   /* do-ri */
843       { 1,  0}, /* right */  { 1, -1} }; /* up-ri */
844 
845   /* check bounds */
846   if (outbounds(p, x, y))
847     return 0;
848 
849   /* pixel color we are looking for, 0=black, 1=white */
850   d = ds;
851   i1 = ((getpixel(p, x,            y           ) < cs) ? 0 : 1);
852   i2 = ((getpixel(p, x + d0[d][0], y + d0[d][1]) < cs) ? 0 : 1);
853 
854   g_debug(fprintf(stderr,"\nLEV2 frame_vector @ %3d %3d  d%d %2d %2d"
855     "  %d-%d pix=%3d mark=%d cs=%d",\
856     x,y,ds,d0[ds][0],d0[ds][1],i1,i2,getpixel(p,x,y),mark,cs);)
857 
858   if (i1==i2){
859     fprintf(stderr,"ERROR frame_vector: no border\n");
860     return -7; /* no border detected */
861   }
862 
863   /* initialize boxframe outside this function
864      box1->x0=box1->x1=x;
865      box1->y0=box1->y1=y;
866   */
867 
868   /* initialize boxvector outside this function
869      box1->num_frames=0
870      num_frame_vectors[0]=0 ???
871      and store start value
872    */
873   if (box1->num_frames >= MaxNumFrames) return -2;
874   /* index to next (x,y) */
875   i2o=i2=( (box1->num_frames==0)?0:
876             box1->num_frame_vectors[ box1->num_frames ] );
877 #if 0 // obsolete v0.43
878   box1->frame_vector[i2][0]=x;
879   box1->frame_vector[i2][1]=y;
880   i2++;
881   box1->num_frame_vectors[ box1->num_frames ]=i2;
882 #endif
883   box1->num_frames++;
884 
885   /* robot stores start-position */
886   ox = x;  oy = y; /* look forward to white pixel */
887 
888   for (;;) {	/* stop if same marked pixel touched */
889 
890     g_debug(fprintf(stderr,"\nLEV3: xy %3d %3d d= %d rot= %2d  %3d",x,y,d,rot,i2);)
891 
892     /* ToDo: store max. abs(rot) ??? for better recognition */
893     if (new_x) {
894       g_debug(fprintf(stderr,"\nLEV2: markB xy= %3d %3d ", x, y);)
895       p->p[x + y * p->x] |= (mark & 7); /* mark black pixel */
896     }
897 
898     /* store a new vector or enlarge the predecessor */
899     if (new_x && (rc%steps)==0) { /* dont store everything on big chars */
900       if (i2>=MaxFrameVectors) {
901         box1->num_frame_vectors[ box1->num_frames-1 ]=i2;
902         reduce_vectors(box1,1);    /* simplify loop */
903         i2=box1->num_frame_vectors[ box1->num_frames-1 ];
904         /* enlarge steps on big chars getting speedup */
905         steps=(box1->y1-box1->y0+box1->x1-box1->x0)/32+1;
906       }
907       /* store frame-vector */
908       if (i2<MaxFrameVectors) {
909         box1->frame_vector[i2][0]=x;
910         box1->frame_vector[i2][1]=y;
911         /* test if older vector points to the same direction */
912         if (i2>1) {
913           /* get predecessor */
914           nx=box1->frame_vector[i2-1][0]-box1->frame_vector[i2-2][0];
915           ny=box1->frame_vector[i2-1][1]-box1->frame_vector[i2-2][1];
916           mx=x                          -box1->frame_vector[i2-1][0];
917           my=y                          -box1->frame_vector[i2-1][1];
918           /* same direction? */
919           if (nx*my-ny*mx==0 && nx*mx>=0 && ny*my>=0) {
920             /* simplify by removing predecessor */
921             i2--;
922             box1->frame_vector[i2][0]=x;
923             box1->frame_vector[i2][1]=y;
924           } /* do not simplify */
925         }
926         i2++;
927         box1->num_frame_vectors[ box1->num_frames-1 ]=i2;
928       }
929       g_debug(fprintf(stderr," stored @ %3d steps= %d", i2-1, steps);)
930     }
931     new_x=0; /* work for new pixel (x,y) done */
932 
933     /* check if round trip is finished */
934     if (x==ox && y==oy && abs(rot)>=8) break;
935 
936     /* look to the front of robot (turtle or ant) */
937     nx = x + d0[d][0];
938     ny = y + d0[d][1];
939 
940     /* next step, if right is a wall turn the turtle left */
941     if ( outbounds(p, nx, ny) || i1 != ((getpixel(p,nx,ny)<cs) ? 0 : 1) ) {
942       if (y==ny && nx>=0 && nx<p->x) { /* if inbound */
943         g_debug(fprintf(stderr,"\nLEV2: markW xy= %3d %3d ", nx, ny);)
944         p->p[nx + ny * p->x] |= (mark & 7); /* mark white pixel */
945       }
946       /* rotate left 90 or 45 degrees */
947       d=(d+2-diag) & 7; rot+=2-diag;
948       /* calculate volume inside frame */
949       switch (d+diag) {
950         case 2+2: vol-=x-1; break;
951         case 6+2: vol+=x;   break;
952       }
953     }
954     else { /* if no wall, go forward and turn right (90 or 45 degrees) */
955       x=nx; y=ny;
956       /* turn back and rotate left */
957       d=(d+4+2-diag) & 7; rot+=2-diag-4;
958       rc++; /* counting steps, used for speedup */
959 
960       /* enlarge frame */
961       if (x < box1->x0) box1->x0 = x;
962       if (x > box1->x1) box1->x1 = x;
963       if (y < box1->y0) box1->y0 = y;
964       if (y > box1->y1) box1->y1 = y;
965 
966       new_x=1;
967     }
968   }
969 
970   /* to distinguish inner and outer frames, store volume as +v or -v */
971   box1->frame_vol[ box1->num_frames-1 ] = vol;
972   box1->frame_per[ box1->num_frames-1 ] = rc-1;
973 
974   /* dont count and store the first vector twice */
975   if (i2-i2o>1) {
976     i2--; rc--; box1->num_frame_vectors[ box1->num_frames-1 ]=i2;
977   }
978   /* output break conditions */
979   g_debug(fprintf(stderr,"\nLEV2 o= %3d %3d  xy %3d %3d  r=%d v=%d",ox,oy,x,y,rot,vol);)
980   /* rc=1 for a single point, rc=2 for a two pixel sized point */
981   g_debug(fprintf(stderr," steps= %3d vectors= %3d",rc,i2);)
982   /* out_x(box1); ToDo: output only the first thousend */
983   return rc; /* return number of bordering pixels = periphery? */
984 }
985 
986 
987 
988 /* clear lowest 3 (marked) bits (they are used for marking) */
clr_bits(pix * p,int x0,int x1,int y0,int y1)989 void clr_bits(pix * p, int x0, int x1, int y0, int y1) {
990   int x, y;
991   for ( y=y0; y <= y1; y++)
992     for ( x=x0; x <= x1; x++)
993       p->p[x+y*p->x] &= ~7;
994 }
995 
996 /* look for white holes surrounded by black points
997  * at the moment look for white point with black in all four directions
998  *  - store position of hole in coordinates relativ to box!
999  *  ToDo: count only holes with vol>10% ???
1000  * ToDo: rewrite for frame vectors (faster, no malloc)
1001  *       holes are frames rotating left hand
1002  *  obsolete, do it with vectors
1003  */
num_hole(int x0,int x1,int y0,int y1,pix * p,int cs,holes_t * holes)1004 int num_hole(int x0, int x1, int y0, int y1, pix * p, int cs, holes_t *holes) {
1005   int num_holes = 0, x, y, hole_size;
1006   pix b;			// temporary mini-page
1007   int dx = x1 - x0 + 1, dy = y1 - y0 + 1;
1008   unsigned char *buf;	//  2nd copy of picture, for working
1009 
1010   if (holes) holes->num=0;
1011   if(dx<3 || dy<3) return 0;
1012   b.p = buf = (unsigned char *) malloc( dx * dy );
1013   if( !buf ){
1014     fprintf( stderr, "\nFATAL: malloc(%d) failed, skip num_hole", dx*dy );
1015     return 0;
1016   }
1017   if (copybox(p, x0, y0, dx, dy, &b, dx * dy))
1018     { free(b.p); return -1;}
1019 
1020   // printf(" num_hole(");
1021   /* --- mark white-points connected with border */
1022   for (x = 0; x < b.x; x++) {
1023     if (getpixel(&b, x, 0) >= cs)
1024       mark_nn(&b, x, 0, cs, AT);
1025     if (getpixel(&b, x, b.y - 1) >= cs)
1026       mark_nn(&b, x, b.y - 1, cs, AT);
1027   }
1028   for (y = 0; y < b.y; y++) {
1029     if (getpixel(&b, 0, y) >= cs)
1030       mark_nn(&b, 0, y, cs, AT);
1031     if (getpixel(&b, b.x - 1, y) >= cs)
1032       mark_nn(&b, b.x - 1, y, cs, AT);
1033   }
1034 
1035   g_debug(out_b(NULL,&b,0,0,b.x,b.y,cs);)
1036   // --- look for unmarked white points => hole
1037   for (x = 0; x < b.x; x++)
1038     for (y = 0; y < b.y; y++)
1039       if (!((marked(&b, x, y)&AT)==AT))	// unmarked
1040 	if (getpixel(&b, x, y) >= cs) {	// hole found
1041 #if 0
1042 	  hole_size=mark_nn(&b, x, y, cs, AT);  /* old version */
1043 	  if (hole_size > 1 || dx * dy <= 40)
1044 	    num_holes++;
1045 #else
1046           {    /* new version, for future store of hole characteristics */
1047             int x0, x1, y0, y1, i, j;
1048             x0 = x1 = x;
1049             y0 = y1 = y;			// not used
1050             hole_size=frame_nn(&b, x, y, &x0, &x1, &y0, &y1, cs, AT, OCR_JOB->tmp.n_run & 1);
1051             // store hole for future use, num is initialized with 0
1052  	    if (hole_size > 1 || dx * dy <= 40){
1053  	      num_holes++;
1054               if (holes) {
1055                 // sort in table
1056                 for (i=0;i<holes->num && i<MAX_HOLES;i++)
1057                   if (holes->hole[i].size < hole_size) break;
1058                 for (j=MAX_HOLES-2;j>=i;j--)
1059                   holes->hole[j+1]=holes->hole[j];
1060                 if (i<MAX_HOLES) {
1061                   // printf("  i=%d size=%d\n",i,hole_size);
1062                   holes->hole[i].size=hole_size;
1063                   holes->hole[i].x=x;
1064                   holes->hole[i].y=y;
1065                   holes->hole[i].x0=x0;
1066                   holes->hole[i].y0=y0;
1067                   holes->hole[i].x1=x1;
1068                   holes->hole[i].y1=y1;
1069                 }
1070                 holes->num++;
1071               }
1072             }
1073           }
1074 #endif
1075 	}
1076   free(b.p);
1077   // printf(")=%d",num_holes);
1078   return num_holes;
1079 }
1080 
1081 /* count for black nonconnected objects --- used for i,auml,ouml,etc. */
1082 /* ToDo: obsolete, replaced by vectors and box.num_boxes */
num_obj(int x0,int x1,int y0,int y1,pix * p,int cs)1083 int num_obj(int x0, int x1, int y0, int y1, pix * p, int cs) {
1084   int x, y, rc = 0;		// rc=num_obj
1085   unsigned char *buf; // 2nd copy of picture, for working
1086   pix b;
1087 
1088   if(x1<x0 || y1<y0) return 0;
1089   b.p = buf = (unsigned char *) malloc( (x1-x0+1) * (y1-y0+1) );
1090   if( !buf ){
1091     fprintf( stderr, "\nFATAL: malloc(%d) failed, skip num_obj",(x1-x0+1)*(y1-y0+1) );
1092     return 0;
1093   }
1094   if (copybox(p, x0, y0, x1 - x0 + 1, y1 - y0 + 1, &b, (x1-x0+1) * (y1-y0+1)))
1095     { free(b.p); return -1; }
1096   // --- mark black-points connected with neighbours
1097   for (x = 0; x < b.x; x++)
1098     for (y = 0; y < b.y; y++)
1099       if (getpixel(&b, x, y) < cs)
1100 	if (!((marked(&b, x, y)&AT)==AT)) {
1101 	  rc++;
1102 	  mark_nn(&b, x, y, cs, AT);
1103 	}
1104   free(b.p);
1105   return rc;
1106 }
1107 
#if 0
// ----------------------------------------------------------------------
// first idea for making recognition based on probability
//  - start with a list of all possible chars
//  - call recognition_of_char(box *)
//    - remove chars from list which could clearly excluded
//    - reduce probability of chars which have wrong features
//  - font types list could also build
// at the moment it is only an idea, I should put it to the todo list
//
// list of candidate chars; the umlauts are given as hex escapes
// (\xe4 \xf6 \xfc = 228 246 252).  The former spelling "\0xe4" was a NUL
// byte followed by the text "xe4", which ended the list after the '.'
char *list="0123456789,.\xe4\xf6\xfc"	// "a=228 o=246 u=252
           "abcdefghijklmnopqrstuvwxyz"
           "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
int  wert[100];		// exclusion counter per list entry (0 = still possible)
int  listlen=0,numrest=0;	// length of list, number of entries not excluded
// initialize a new character list (for future)
void ini_list(){ int i;
    for(i=0;list[i]!=0 && i<100;i++) wert[i]=0;
    numrest=listlen=i; }
// exclude every char of filt from the list (for future)
void exclude(char *filt){ int i,j;
    for(j=0;filt[j]!=0 && j<100;j++)
    for(i=0;list[i]!=0 && i<100;i++)
    if( filt[j]==list[i] ) { if(!wert[i])numrest--; wert[i]++; } }
// get the result after all the work (for future):
// the only char left, or '_' if none or more than one is left
char getresult(){ int i;
    if( numrest==1 )
    for(i=0;list[i]!=0 && i<100;i++) if(!wert[i]) return list[i];
    return '_';
 }
#endif
1139 
1140 //  look at the environment of the pixel too (contrast etc.)
1141 //   detailed analysis only of diff pixels!
1142 //
1143 // 100% * "distance", 0 is ideal fit
1144 // = similarity of two chars for recognition of garbled (verstuemmelter) chars
1145 //   weight of pixels with only one same neighbour set to 0
1146 //   look at contours too! v0.2.4: B==H
1147 // changed for v0.41, Mar06
distance(pix * p1,struct box * box1,pix * p2,struct box * box2,int cs)1148 int distance( pix *p1, struct box *box1,
1149               pix *p2, struct box *box2, int cs){
1150    int rc=0,x,y,v1,v2,i1,i2,rgood=0,rbad=0,x1,y1,x2,y2,dx,dy,dx1,dy1,dx2,dy2;
1151    x1=box1->x0;y1=box1->y0;x2=box2->x0;y2=box2->y0;
1152    dx1=box1->x1-box1->x0+1; dx2=box2->x1-box2->x0+1; dx=((dx1>dx2)?dx1:dx2);
1153    dy1=box1->y1-box1->y0+1; dy2=box2->y1-box2->y0+1; dy=((dy1>dy2)?dy1:dy2);
1154    if(abs(dx1-dx2)>1+dx/16 || abs(dy1-dy2)>1+dy/16) return 100;
1155    // compare relations to baseline and upper line
1156    if(2*box1->y1>box1->m3+box1->m4 && 2*box2->y1<box2->m3+box2->m4) rbad+=128;
1157    if(2*box1->y0>box1->m1+box1->m2 && 2*box2->y0<box2->m1+box2->m2) rbad+=128;
1158    // compare pixels
1159    for( y=0;y<dy;y++ )
1160    for( x=0;x<dx;x++ ) {	// try global shift too ???
1161      v1     =((getpixel(p1,x1+x  ,y1+y  )<cs)?1:0); i1=8;	// better gray?
1162      v2     =((getpixel(p2,x2+x  ,y2+y  )<cs)?1:0); i2=8;	// better gray?
1163      if(v1==v2) { rgood+=8; continue; } // all things are right!
1164      // what about different pixel???
1165      // test overlap of 8 surounding pixels ??? bad if two nb. are bad
1166      v1=-1;
1167      for(i1=-1;i1<2;i1++)
1168      for(i2=-1;i2<2;i2++)if(i1!=0 || i2!=0){
1169        if( ((getpixel(p1,x1+x+i1*(1+dx/32),y1+y+i2*(1+dy/32))<cs)?1:0)
1170          !=((getpixel(p2,x2+x+i1*(1+dx/32),y2+y+i2*(1+dy/32))<cs)?1:0) ) v1++;
1171      }
1172      if (v1>0) rbad+=16*v1;
1173      else      rbad++;
1174    }
1175    if(rgood+rbad) rc= (100*rbad+(rgood+rbad-1))/(rgood+rbad); else rc=99;
1176    if(rc<10 && OCR_JOB->cfg.verbose & 7){
1177      fprintf(stderr,"\n#  distance rc=%d good=%d bad=%d",rc,rgood,rbad);
1178 //     out_x(box1);out_x(box2);
1179    }
1180    return rc;
1181 }
1182 
1183 
1184 
1185 // ============================= call OCR engine ================== ;)
1186 //  nrun=0 from outside, nrun=1 from inside (allows modifications, oobsolete)
whatletter(struct box * box1,int cs,int nrun)1187 wchar_t whatletter(struct box *box1, int cs, int nrun){
1188    wchar_t bc=UNKNOWN;			// best letter
1189    wchar_t um=SPACE;			// umlaut? '" => modifier
1190    pix *p=box1->p;   // whole image
1191    int	x,y,dots,xa,ya,x0,x1,y0,y1,dx,dy,i;
1192    pix b;            // box
1193    struct box bbuf=*box1;  // restore after modifikation!
1194 
1195    if (box1->num_ac>0 && box1->wac[0]>=OCR_JOB->cfg.certainty && bc==UNKNOWN) {
1196       bc=box1->tac[0];
1197    }
1198    // if (bc!=UNKNOWN) return bc;
1199    // if whatletter() called again, only unknown chars are processed
1200    // bad for splitting!
1201 
1202    // store box data, which can be modified for modified chars in 2nd run
1203    bbuf.x0=box1->x0; bbuf.y0=box1->y0;
1204    bbuf.x1=box1->x1; bbuf.y1=box1->y1;
1205 
1206    xa=box1->x;  ya=box1->y;
1207    x0=box1->x0; y0=box1->y0;
1208    x1=box1->x1; y1=box1->y1;
1209    // int vol=(y1-y0+1)*(x1-x0+1);	// volume
1210    // crossed l-m , divided chars
1211    while( get_bw(x0,x1,y0,y0,p,cs,1)!=1 && y0+1<y1) y0++;
1212    while( get_bw(x0,x1,y1,y1,p,cs,1)!=1 && y0+1<y1) y1--;
1213    dx=x1-x0+1;
1214    dy=y1-y0+1;	// size
1215 
1216    // better to proof the white frame too!!! ????
1217    // --- test for german umlaut and points above, not robust enough???
1218    // if three chars are connected i-dots (ari) sometimes were not detected
1219    //  - therefore after division a test could be useful
1220    // modify y0 only in second run!?
1221    // we need it here to have the right copybox
1222    if (um==SPACE && dy>5 && box1->num_boxes>1)
1223      testumlaut(box1,cs,2,&um); /* set box1->modifier + new y0 */
1224 
1225    dots=box1->dots;
1226    y0  =box1->y0;	// dots==2 => y0 below double dots
1227    dy  =y1-y0+1;
1228 
1229    // move upper and lower border (for divided letters)
1230    while( get_bw(x0,x1,y0,y0,p,cs,1)==0  &&  y0+1<y1) y0++;
1231    while( get_bw(x0,x1,y1,y1,p,cs,1)==0  &&  y0+1<y1) y1--;
1232    while( get_bw(x0,x0,y0,y1,p,cs,1)==0  &&  x0+1<x1) x0++;
1233    while( get_bw(x1,x1,y0,y1,p,cs,1)==0  &&  x0+1<x1) x1--;
1234    dx=x1-x0+1;
1235    dy=y1-y0+1;	// size
1236    box1->x0=x0; box1->y0=y0;	// set reduced frame
1237    box1->x1=x1; box1->y1=y1;
1238 
1239    // set good startpoint (probably bad from division)?
1240    if( xa<x0 || xa>x1 || ya<y0 || ya>y1
1241      || getpixel(p,xa,ya)>=cs /* || 2*ya<y0+y1 */ || dots>0 ){
1242      // subfunction? also called after division of two glued chars?
1243      for(y=y1;y>=y0;y--) // low to high (not i-dot)
1244      for(x=(x0+x1)/2,i=0;x>=x0 && x<=x1;i++,x+=((2*i&2)-1)*i) /* is that ok? */
1245      if (getpixel(p,x,y)<cs && (getpixel(p,x+1,y)<cs
1246                              || getpixel(p,x,y+1)<cs)){ xa=x;ya=y;y=-1;break; }
1247      /* should box1->x,y be set? */
1248    }
1249 
1250    // ----- create char-only-box -------------------------------------
1251    // ToDo: this will be obsolete if vectors are used only
1252    if(dx<1 || dy<1) return bc; /* should not happen */
1253    b.p = (unsigned char *) malloc( dx * dy );
1254    if (!b.p) fprintf(stderr,"Warning: malloc failed L%d\n",__LINE__);
1255    if( copybox(p,x0,y0,dx,dy,&b,dx*dy) )
1256      { free(b.p); return bc; }
1257    // clr_bits(&b,0,b.x-1,0,b.y-1);
1258    // ------ use diagonal too (only 2nd run?)
1259    /* following code failes on ! and ?  obsolete if vectors are used
1260       ToDo:
1261        - mark pixels neighoured to pixels outside and remove them from &b
1262          v0.40
1263          will be replaced by list of edge vectors
1264        - mark accents, dots and remove them from &b
1265     */
1266 #if 1 /* becomes obsolate by vector code */
1267    if (y0>0)  // mark upper overlap
1268    for ( x=x0; x<=x1; x++) {
1269      if (getpixel(p,x,y0-1)<cs
1270       && getpixel(p,x,y0  )<cs && (marked(&b,x-x0,0)&1)!=1)
1271      mark_nn(&b,x-x0,0,cs,1);
1272    }
1273    if (x0>0)  // mark left overlap
1274    for ( y=y0; y<=y1; y++) {
1275      if (getpixel(p,x0-1,y)<cs
1276       && getpixel(p,x0  ,y)<cs && (marked(&b,0,y-y0 )&1)!=1)
1277      mark_nn(&b,0,y-y0,cs,1);
1278    }
1279    if (x1<p->x-1)  // mark right overlap
1280    for ( y=y0; y<=y1; y++) {
1281      if (getpixel(p,x1+1,y)<cs
1282       && getpixel(p,x1  ,y)<cs && (marked(&b,x1-x0,y-y0)&1)!=1)
1283      mark_nn(&b,x1-x0,y-y0,cs,1);
1284    }
1285    mark_nn(&b,xa-x0,ya-y0,cs,2); // not glued chars
1286    for(x=0;x<b.x;x++)
1287    for(y=0;y<b.y;y++){
1288      if (  (marked(&b,x,y  )&3)==1 && getpixel(&b,x,y  )<cs )
1289      b.p[x+y*b.x] = 255&~7;  /* reset pixel */
1290    }
1291 #endif
1292 
1293    // if (bc == UNKNOWN)   // cause split to fail
1294    bc=ocr0(box1,&b,cs);
1295 
1296    /* ToDo: try to change pixels near cs?? or melt? */
1297    if (box1->num_ac>0 && box1->wac[0]>=OCR_JOB->cfg.certainty && bc==UNKNOWN) {
1298      bc=box1->tac[0];
1299    }
1300 
1301    if (um!=0 && um!=SPACE && bc<127) {  /* ToDo: is that obsolete now? */
1302      wchar_t newbc;
1303      newbc = compose(bc, um );
1304      if (newbc == bc) { /* nothing composed */
1305        if(OCR_JOB->cfg.verbose & 7)
1306          fprintf(stderr, "\nDBG whatletter: compose(%s) was useless (%d,%d)",
1307            decode(bc,ASCII), box1->x0, box1->y0);
1308        // if(OCR_JOB->cfg.verbose & 6) out_x(box1);
1309      }
1310      bc = newbc;
1311    }
1312    // restore modified boxes
1313    box1->x0=bbuf.x0; box1->y0=bbuf.y0;
1314    box1->x1=bbuf.x1; box1->y1=bbuf.y1;
1315 //   if (box1->c==UNKNOWN) out_b(box1,&b,0,0,dx,dy,cs); // test
1316 
1317    free(b.p);
1318    return bc;
1319 }
1320 
1321 /*
1322 ** creates a list of boxes/frames around objects detected
1323 ** on the pixmap p for further work
1324 ** returns number of boxes created.
1325 ** - by the way: get average X, Y (avX=sumX/numC,..)
1326 **  ToDo18?: do not put diagonal touched fat objects? easier to melt than to
1327 **        divide boxes? or for bold fonts (min-xpixels bigger 1?)
1328 */
scan_boxes(job_t * job,pix * p)1329 int scan_boxes( job_t *job, pix *p ){
1330   int x, y, nx, cs, rc, ds;
1331   struct box *box3;
1332   //  job_t *job=OCR_JOB; /* fixme */
1333 
1334   if (job->cfg.verbose)
1335     fprintf(stderr,"# scan_boxes");
1336 
1337   cs = job->cfg.cs;
1338   job->res.sumX = job->res.sumY = job->res.numC = 0;
1339 
1340   /* clear the lowest bits of each pixel, later used as "scanned"-marker */
1341   /* so boxes can overlap like bold "To" (proportional-font) */
1342   clr_bits( p, 0, p->x - 1, 0, p->y - 1);
1343 
1344   for (y=0; y < p->y; y++)
1345     for (x=0; x < p->x; x++)  // ds = direction to go 2=left 6=right
1346     for (ds=2; ds<7; ds+=4) { // NO - dust of size 1 is not removed !!!
1347       nx=x+((ds==2)?-1:+1);
1348       if (nx<0 || nx>=p->x) continue; /* out of image, ex: recframe */
1349       if ( getpixel(p, x,y)>=cs || getpixel(p,nx,y)< cs)  // b/w transition?
1350 	continue;
1351       if ((marked(p, x,y) & 1)&&(marked(p, nx, y) & 1))
1352 	continue;
1353       /* non-marked b/w-transition found, start boxing connected pixels */
1354       /* check (and mark) only horizontal b/w transitions */
1355       // --- insert new box in list
1356       box3 = (struct box *)malloc_box(NULL);
1357       box3->x0=box3->x1=box3->x=x;
1358       box3->y0=box3->y1=box3->y=y;
1359       box3->num_frames=0;
1360       box3->dots=0;
1361       box3->num_boxes=1;
1362       box3->num_subboxes=0;
1363       box3->modifier='\0';
1364       box3->num=job->res.numC;
1365       box3->line=0;	// not used here
1366       box3->m1=0; box3->m2=0; box3->m3=0; box3->m4=0;
1367       box3->p=p;
1368       box3->num_ac=0;   // for future use
1369 
1370 /*  frame, vectorize and mark only odd/even horizontal b/w transitions
1371  *  args: box, x,y, cs, mark, diag={0,1}, ds={2,6}
1372  *  ds   - start direction, 6=right of right border, 2=left of left border
1373  *  ret  - 0=ok, -1=already marked, -2=max_num_frames_exceeded
1374  *               -7=no border in direction ds
1375  *  ToDo: count errors and print out for debugging
1376  */
1377       rc=frame_vector(box3, x, y, cs, 1, 1, ds);
1378       g_debug(fprintf(stderr,"\n# ... scan xy= %3d %3d rc= %2d", x, y, rc);)
1379       if (rc<0) { free_box(box3); continue; }
1380       if (box3->num_frames && !box3->num_frame_vectors[0])
1381         fprintf(stderr,"\nERROR scan_boxes: no vector in frame (%d,%d)",x,y);
1382 
1383       job->res.numC++;
1384       job->res.sumX += box3->x1 - box3->x0 + 1;
1385       job->res.sumY += box3->y1 - box3->y0 + 1;
1386 
1387       box3->c=(((box3->y1-box3->y0+1)
1388                *(box3->x1-box3->x0+1)>=MaxBox)? PICTURE : UNKNOWN);
1389       list_app(&(job->res.boxlist), box3); 	// append to list
1390       // ToDo: debug
1391       // if (job->cfg.verbose && box3->y0==29) out_x(box3);
1392   }
1393   if(job->res.numC){
1394     if (job->cfg.verbose)
1395       fprintf(stderr," nC= %3d avD= %2d %2d\n",job->res.numC,
1396                (job->res.sumX+job->res.numC/2)/job->res.numC,
1397                (job->res.sumY+job->res.numC/2)/job->res.numC);
1398   }
1399   return job->res.numC;
1400 }
1401 
/* Comparison callback for sorting ints (qsort style).
   Returns -1, 0, or 1 according to whether
   *vr < *vs, *vr == *vs, or *vr > *vs */
int
intcompare (const void *vr, const void *vs)
{
  const int lhs = *(const int *)vr;
  const int rhs = *(const int *)vs;

  /* difference of two 0/1 flags, no overflow as with lhs-rhs */
  return (lhs > rhs) - (lhs < rhs);
}
1414 
1415 /*
1416  * measure_pitch - detect monospaced font and measure the pitch
1417  * measure overall pitch for difficult lines,
1418  *  after that measure pitch per line
1419  * dists arrays are limited to 1024 elements to reduce
1420  *  cpu usage for qsort on images with extreme high number of objects
1421  * insert space if dist>=pitch in list_insert_spaces()
1422  *  ToDo: ???
1423  *   - min/max distance-matrix  a-a,a-b,a-c,a-d ... etc;  td,rd > ie,el,es
1424  *   - OR measuring distance as min. pixel distance instead of box distance
1425  *        especially useful for italic font!
1426  *   - Kerning detection? minspace<=0 ???
1427  *   - iterate minMono+maxMonoWidth and count fitting and misfitting pairs
1428  * Lit:
1429  *  http://en.wikibooks.org/wiki/LaTeX/Formatting
1430  *         #The_Space_between_Words_and_Sentences
1431  *   \frenchspacing == no extra space after periods (word vs. sentences)
1432  *   \sloppypar     == some spaces between words may be to large
1433  *   inter word space
1434  *  http://en.wikipedia.org/wiki/Space_(punctuation)
1435  *   Variable-width general-purpose space == 1/5-em to 1/3-em
1436  *  http://en.wikipedia.org/wiki/Em_(typography)
1437  *   em = absolute maximum high,
1438  *       median cap height=0.70em,
1439  *       x-height=1ex=0.45..0.48..0.5em
1440  *  http://en.wikipedia.org/wiki/En_(typography) = n-width=0.5em
1441  *  http://pfaedit.sourceforge.net/glossary.html#overshoot
1442  *   i: left + right side bearing (character specifique, may be negative: VA)
1443  *  http://en.wikipedia.org/wiki/Typeface
1444  *  http://en.wikipedia.org/wiki/Letter-spacing
1445  *  http://en.wikipedia.org/wiki/Tracking_(typography) # Overlap VA
1446  *  http://en.wikipedia.org/wiki/Kerning  # Overlap VA AT Tx etc.
1447  *    similar blank 2D-area between pairs of characters
1448  *     Helvetica: ry=+30 AV=-80 units?
1449  * ToDo18: better mono detection
1450  *   1st round min. mono_width = max char_width (except melted chars)
1451  *   2nd round max. mono_width = min x0-pre.x0, x1-pre.x1 (+check against min_mono_em)
 *   3rd round: if something does not fit, mono=0
1453  *
1454  */
measure_pitch(job_t * job)1455 void measure_pitch( job_t *job ){     /* word spacing */
1456   int numdists=0, spc=0,              /* number of stored distances */
1457       pitch_p=2, pdist, pdists[1024], /* proportional distances */
1458       pitch_m=10, /* monospaced em width */
1459       monospaced=1, l1, char_width_min=1023, char_width_max=0,
1460       mono_em_min=0,    // maximum monospace char width + 1   2010-09-25
1461       mono_em_max=2047, // minimum distance left side of two chars
1462       d1l, d1r; // left-left and right-right distance of 2 chars
1463   int d1, d2; // temporary vars, d1l + d1r sorted
1464   struct box *box2, *pre1=NULL, *pre2=NULL;
1465 
1466   if(job->cfg.verbose){ fprintf(stderr,"# check for word pitch"); }
1467   for (l1=0; l1<job->res.lines.num; l1++)
1468   { /* 0 means all lines */
1469     if(job->cfg.verbose){ fprintf(stderr,"\n#  line %2d\n# ...",l1); }
1470     numdists = 0;  /* clear distance lists */
1471     monospaced=1; mono_em_min=0;  mono_em_max=2047; // reset, 2010-09-28
1472     char_width_min=1023; char_width_max=0; // reset, 2010-09-28
1473     for_each_data(&(job->res.boxlist)) {
1474       box2 = (struct box *)list_get_current(&(job->res.boxlist));
1475       if (l1>0 && box2->line!=l1) continue; /* ignore other lines */
1476       /* ignore dots and pictures (min. font is 4x6) */
1477       if (box2->y1 - box2->y0 + 1 < 4 || box2->c==PICTURE) pre2=pre1=NULL;
1478       if (!pre1) { pre1=box2; continue; } /* we need a predecessor */
1479       if (pre1 && pre1->line != box2->line) { pre1=box2; continue; } /* 201809 */
1480       /* use gap for proportional fonts */
1481       pdist = box2->x0 - pre1->x1 - 1; /* do not add 1, subtract 1 ! */
1482       if (pdist<0) { // new line
1483         pre2=NULL; pre1=box2; continue; }
1484       if ((box2->x1 - box2->x0 + 1)
1485        >2*(box2->y1 - box2->y0 + 1)) { // skip long object
1486         continue; }
1487       if ((pre1->x1 - pre1->x0 + 1)
1488        >2*(pre1->y1 - pre1->y0 + 1)) { // skip long object
1489         pre1=box2; continue; }
1490       // JS-2010-09 sample spaces20100910.jpg 7 chars, fix bad auto space
1491       if (char_width_min > box2->x1 - box2->x0 + 1)
1492           char_width_min = box2->x1 - box2->x0 + 1;
1493       if (box2->x1 - box2->x0 < 4*(pre1->x1 - pre1->x0)) // ~ big lines
1494       if (char_width_max < box2->x1 - box2->x0 + 1)
1495           char_width_max = box2->x1 - box2->x0 + 1;
1496       // may cause problems if "_" is of width em (not em-1 like mwMW etc.)
1497       if (mono_em_min < char_width_max + 1)
1498           mono_em_min = char_width_max + 1; // minimum monospaced width
1499 
1500       // will fail on monospaced fonts where chars are not centered
1501       if (pre1) { // 2010-09-28
1502         d1l = box2->x0 - pre1->x0; // left to left distance
1503         d1r = box2->x1 - pre1->x1; // right to right distance
1504         if (d1l > d1r) { d1=d1r; d2=d1l; } // thinner char on the right
1505         else           { d1=d1l; d2=d1r; } // thicker char on the right
1506         /* d1 < 2*width && d2 < 2*width, may fail for "IIIM" d2<2*max OK */
1507         if (d1>0 && d1 < 2*char_width_max && d2 < 2*mono_em_max) {
1508           if (mono_em_min<d1-1) mono_em_min = d1; }
1509         if (d1>0) {
1510           if (mono_em_max>d2+2) mono_em_max = d2; } // not best, shifted ()
1511         // 2010-10-06 examples/ocr-b add -1 +2, bad for "()"
1512 #if 1
1513         if ((48 & job->cfg.verbose) == 48)
1514         if (monospaced && l1)  // debugging until monospaced=0
1515           fprintf(stderr," L%02d DBG1 x %3d %+4d %3d %+4d  d %3d %3d"
1516             "  em %2d %2d  ex %2d\n# ...",
1517             l1, pre1->x0, pre1->x1-pre1->x0+1,
1518                 box2->x0, box2->x1-box2->x0+1, d1, d2,
1519             mono_em_min, mono_em_max, char_width_max);
1520 #endif
1521       }
1522 #if 1 // needed for correct spacing of last line of tmp08/0810CSchulze_crop
1523       if (pre2) {
1524         d1l = box2->x0 - pre2->x0; // left to left distance
1525         d1r = box2->x1 - pre2->x1; // right to right distance
1526         if (d1l > d1r) { d1=d1r; d2=d1l; } // thinner char on the right
1527         else           { d1=d1l; d2=d1r; } // thicker char on the right
1528         if (d1>0 && d1 < 3*char_width_max && d2 < 3*mono_em_max) {
1529           if (2*mono_em_min<d1) mono_em_min = (d1+1)/2; }
1530         if (d1>0) {
1531           if (2*mono_em_max>d2) mono_em_max = (d2+1)/2; }
1532 #if 1
1533         if ((48 & job->cfg.verbose) == 48)
1534         if (monospaced && l1)  // debugging until monospaced=0
1535           fprintf(stderr," L%02d DBG2 x %3d %+4d %3d %+4d  d %3d %3d"
1536             "  em %2d %2d  ex %2d\n# ...",
1537             l1, pre2->x0, pre2->x1-pre2->x0+1,
1538                 box2->x0, box2->x1-box2->x0+1, d1, d2,
1539             mono_em_min, mono_em_max, char_width_max);
1540 #endif
1541       }
1542 #endif
1543 
1544 // the upper part does good work, we do not need this stuff ... ???
1545 #if 0
1546       // min distance between next neighbours of pre
1547       if (pre2  &&  1 < box2->x0 - pre2->x1)
1548       if (mono_em_max > box2->x0 - pre2->x1)
1549           mono_em_max = box2->x0 - pre2->x1;
1550       // ToDo: could be a problem for " ???
1551       if (pre2)
1552       if (pre1->x1 - pre1->x0 >= mono_em_min) // best max mono_dx
1553       if (pre1->x1 - pre1->x0 == box2->x1 - box2->x0) // best max mono_dx
1554       if (mono_em_max > box2->x0 - pre1->x0)
1555           mono_em_max = box2->x0 - pre1->x0;
1556       /* ToDo: better take 3 instead of 2 neighbours?, smallest font 4x6 */
1557       /* tmp08/gocr0801_bad5.jpg was not mono, need 2 to 3 chars */
1558       /* 2010-09-27 gives precise range! 16..22 to 16..17 */
1559       /* ToDo: no 2 char variant? */
1560       if (pre2  &&  1 < box2->x0 - pre2->x1)
1561       if (box2->x0-pre1->x1+1 < mono_em_min) // no spc between char + pre1
1562       if (pre1->x0-pre2->x1+1 < mono_em_min) // no spc between pre1 + pre2
1563       {
1564         if (3*mono_em_min <  box2->x1 - pre2->x0)
1565               mono_em_min = (box2->x1 - pre2->x0 + 2)/3;
1566       }
1567 #endif
1568 //#  tmp09/oebb_teletext_836_0001_sw.png
1569 //#  line 4 12 - 12  pre2 134 142           181 190
1570 //#                         0   8            47  56
1571 //#                         0    12  24  36   48
1572       // n=2: (n-1)*min < d1 <= (n  )*max  &&  (2*n+1)*max < (2*n+2)*min
1573       //      (n  )*min < d2 <= (n+1)*max  &&  (2*n+2)*max < (2*n+3)*min
1574       if (monospaced && pre1) { // check 2 chars for non mono space within
1575         d1l = box2->x0 - pre1->x0; // left to left distance  (do not + 1!)
1576         d1r = box2->x1 - pre1->x1; // right to right distance
1577         if (d1l > d1r) { d1=d1r; d2=d1l; } // thinner char on the right
1578         else           { d1=d1l; d2=d1r; } // thicker char on the right
1579         if ( mono_em_max < 2*mono_em_min
1580           && mono_em_min < 2*mono_em_max)  // 2018-10 valid range?
1581         if ((box2->x0 - pre1->x1 <=   mono_em_max  // no space between
1582 //        && box2->x1 - pre1->x0 >  2*mono_em_max) // crossing 1 em border?
1583           && box2->x1 - pre1->x0 >  2*mono_em_min+mono_em_min/8) // 2018-09 rnd80
1584          || (box2->x0 - pre1->x1 >    mono_em_min  // space between
1585           && box2->x0 - pre1->x1 <= 2*mono_em_max-mono_em_max/16
1586 //        && box2->x1 - pre1->x0 >  3*mono_em_max)) { // crossing 2 em border?
1587           && box2->x1 - pre1->x0 >  3*mono_em_min+mono_em_min/8)) { // 2018-09 rnd80
1588           monospaced = 0; // can not be monospaced in that case 2010-09-25
1589          if (job->cfg.verbose)
1590            fprintf(stderr, " L%02d mono:=0  %d - %d  pre1 %d %d  %d %d y %d DBG%d\n# ...",
1591             l1, mono_em_min, mono_em_max,
1592             pre1->x0, pre1->x1, box2->x0, box2->x1,box2->y0,__LINE__);
1593         }
1594       }
1595       // n=3: (n-1)*min < d1 <= (n  )*max  &&  (2*n+1)*max < (2*n+2)*min
1596       //      (n  )*min < d2 <= (n+1)*max  &&  (2*n+2)*max < (2*n+3)*min
1597       if (monospaced && pre2 && (2*2+2)*mono_em_max < (2*2+3)*mono_em_min)
1598       { // check 2 chars for non mono space within
1599         d1l = box2->x0 - pre2->x0; // left to left distance
1600         d1r = box2->x1 - pre2->x1; // right to right distance
1601         if (d1l > d1r) { d1=d1r; d2=d1l; } // thinner char on the right
1602         else           { d1=d1l; d2=d1r; } // thicker char on the right
1603         if ((box2->x0 - pre2->x1 >    mono_em_min+mono_em_min/16 // min. 1 char between
1604           && box2->x0 - pre2->x1 <= 2*mono_em_max-mono_em_max/8 // max. 2 chars
1605 //        && box2->x1 - pre2->x0 >  3*mono_em_max) // crossing 2 em border?
1606           && box2->x1 - pre2->x0 >  3*mono_em_min+mono_em_min/8) // 2018-09 rnd80
1607        || 0*(box2->x0 - pre2->x1 >  2*mono_em_min+mono_em_min/8  // min. 2 ex between
1608           && box2->x0 - pre2->x1 <= 3*mono_em_max-mono_em_max/4  // ?????? ToDo oebb
1609 //        && box2->x1 - pre2->x0 >  4*mono_em_max)) { // crossing 3 em border?
1610           && box2->x1 - pre2->x0 >  4*mono_em_min+mono_em_min/2)) { // 2018-09 rnd80
1611           monospaced = 0; // can not be monospaced in that case 2010-09-25
1612          if (job->cfg.verbose)
1613            fprintf(stderr, " L%02d mono:=0  %d - %d  pre2 %d %d  %d %d DBG%d\n# ...",
1614             l1, mono_em_min, mono_em_max,
1615             pre2->x0, pre2->x1, box2->x0, box2->x1, __LINE__);
1616         }
1617       }
1618       /* fonts are expected to be 6 to 60 pixels high, which is about
1619          4 to 50 pixels wide.  We allow some extra margin.
1620          space > 0 2010-09-27
1621          ToDo: compare left and right gap (or additional nearest 4 gaps)
1622          similar to mono space detection, check min distance
1623           between upper, middle and lower rightmost vector of prev char and
1624           leftmost vector of right char (hight is defined by the lower char)
1625           (if overlapping chars are detected! WAV,Te,...)
1626       */
1627       if (0 < pdist && pdist < 140) { /* better mdist < 3*Xaverage ? */
1628         // ignore extra wide spaces, tmp09/gocr_screen_capture* 2010-09-28
1629         if (2*pdist<5*char_width_max)
1630         /* two options for overflow: 1) ignore, 2) store randomly */
1631         if (numdists<1024) {   /* we do ignore here */
1632           pdists[numdists] = pdist;
1633           numdists++;
1634         }
1635       }
1636       pre2 = pre1; pre1 = box2;
1637     } end_for_each(&(job->res.boxlist));
1638 
1639     if (job->cfg.verbose)
1640       fprintf(stderr, " L%02d num_gaps= %2d x_width= %2d - %2d"
1641         " mono_em= %2d - %2d  mono= %d",
1642         l1, numdists, char_width_min, char_width_max,
1643         mono_em_min, mono_em_max, monospaced);
1644     if (numdists<8) {
1645       if (job->cfg.verbose && l1==0) /* only for all lines */
1646         fprintf(stderr," (WARNING num_gaps<8)");
1647     }
1648 #if 1 /* debugging */
1649     if ((job->cfg.verbose&(32+16))==48) {
1650       int i;
1651       fprintf(stderr,"\n# ...");
1652       for (i=0;i<numdists;i++) fprintf(stderr," %2d",pdists[i]);
1653       fprintf(stderr," <- pdist[%d]\n# ...",l1);
1654     }
1655 #endif
1656     if (numdists>0) {
1657       int i, diff, ni_min, max, best_p, ni;
1658       /* aware: takes long time for big data sets */
1659       /* dilute? (german: ausduennen?) */
1660       qsort (pdists, numdists, sizeof (int), intcompare);
1661       /* the new method, div0? */
1662       best_p=4*numdists/5;
1663       /* try to find better pitch for monospaced font (ok for prop) */
1664       // tolerant to 090729num*  tmp09/barcodes090916_interleaved*
1665       if (mono_em_min  > mono_em_max+(mono_em_min+4)/9+1 // rnd80 52 45
1666        || mono_em_max>=2*mono_em_min) {
1667         monospaced = 0;
1668         if (job->cfg.verbose)
1669           fprintf(stderr, "\n# ... L%02d mono:=0  %d - %d DBG%d",
1670           l1, mono_em_min, mono_em_max, __LINE__);
1671       } else
1672         pitch_m=((mono_em_max<3*mono_em_min)?
1673                  (mono_em_max+3*mono_em_min)/4:mono_em_min);
1674       /* try to find better pitch for proportional font */
1675       // the largest diff could be the best, if diff is always 1,
1676       //  take the diff with the lowest weight
1677       // JS-2010-09 add numdists<8 sample spaces20100908.jpg
1678       // todo: search most offen biggest gapdiff (ignore big table gaps)
1679       //   mean gapdiff? gap[n-1-i]-gap[0+i] until gapdiff=0, skip table gaps
1680       // 2010-09-28 check until end of table, because old bad wide gaps are
1681       //     no more added to the table
1682       for (ni=ni_min=1024,max=0,i=((numdists<8)?0:numdists/2+1);
1683                                 i<numdists;i++) {
1684         if (pdists[i]<=char_width_min/3) continue; // JS-2010-09
1685         if (pdists[i]> char_width_max*2) {
1686          /* set 2nd best which is numdists as default */; break; } // JS-2010-27 table gaps
1687         if (numdists<16)  // single word?
1688         if (pdists[i]<=char_width_max/3) continue; // JS-2010-09
1689         diff=pdists[i]-pdists[i-1];
1690         if (diff>max) {
1691           max=diff; best_p=i-1;
1692           if ((job->cfg.verbose&(32+16))==48)
1693             fprintf(stderr," L%02d best_p= %3d + maxdiff=%3d\n# ...",
1694               l1, pdists[best_p], max);
1695           if (max>3 &&                   3*pdists[i]>=4*pdists[i-1]) { break; }
1696           if (max>1 && 3*i>numdists*2 && 3*pdists[i]>=4*pdists[i-1]) { break; }
1697         }
1698         if (diff) {
1699           if (ni<ni_min) {
1700             // do not try to divide one word per line
1701             ni_min=ni; if (max<=1 && numdists>16) best_p=i-1;
1702             if ((job->cfg.verbose&(32+16))==48)
1703               fprintf(stderr," L%02d best_p=%3d ni_min=%3d\n# ...",
1704                 l1, pdists[best_p], ni_min);
1705           }
1706           ni=1;
1707         } else ni++;
1708       }
1709       if (numdists<16 && max<=1 && ni_min>1) best_p=numdists-1; // one word
1710 #if 1 /* debugging */
1711       if ((job->cfg.verbose&(32+16))==48) {
1712         // fprintf(stderr,"\n# ...");
1713         for (i=0;i<numdists;i++) fprintf(stderr," %2d",pdists[i]);
1714         fprintf(stderr," <- pdist[%d] sorted\n# ...",l1);
1715         fprintf(stderr," L%02d maxdiff=%d min_samediffs=%d", l1, max, ni_min);
1716       }
1717 #endif
1718       /* we measure spaces in two different ways (mono, prop) */
1719       /* prop: gap between boxes,   mono: distance of middle */
1720       if (best_p<numdists-1) pitch_p = ((pdists[best_p]+pdists[best_p+1])/2+1);
1721       else                   pitch_p = (pdists[best_p]+1  );
1722       if (numdists)
1723         if (   pdists[numdists-1]*2 <= pdists[0]*3
1724             || pdists[numdists-1]   <= pdists[0]+3) {
1725         /* line is just a single word */
1726           pitch_p = pdists[numdists-1]+10;
1727         }
1728       if (l1>0 && job->cfg.spc==0) {
1729         job->res.lines.pitch[l1]=(monospaced?pitch_m:pitch_p);
1730         job->res.lines.mono[l1]=monospaced;
1731       }
1732       if (job->cfg.verbose) {
1733         fprintf(stderr,"\n# ...");
1734         fprintf(stderr," L%02d mono: num=%3d min=%3d max=%3d pitch=%3d\n# ...",
1735           l1, numdists, mono_em_min,mono_em_max,pitch_m);
1736         fprintf(stderr," L%02d prop: num=%3d min=%3d max=%3d pitch=%3d @ %2d%%\n# ...",
1737           l1, numdists, pdists[0],pdists[numdists-1],pitch_p,best_p*100/numdists);
1738         fprintf(stderr," L%02d result: mono=%d  distance >= %d considered as space\n# ...",
1739           l1, monospaced, job->res.lines.pitch[l1]);
1740       }
1741     } /* if (not) enough spaces */
1742     if (l1==0) {  /* set default spaces to each line */
1743       int l2;
1744       spc = job->cfg.spc;
1745       if (spc==0) /* set only if not set by option */
1746         spc = ((monospaced)?pitch_m:pitch_p);
1747       for  (l2=0; l2<job->res.lines.num; l2++ )
1748         job->res.lines.pitch[l2]=spc;
1749     }
1750   }  /* each line */
1751   if (job->cfg.spc==0)
1752     job->cfg.spc = spc;
1753   if (job->cfg.verbose)
1754     fprintf(stderr," overall space width is %d %s\n",
1755         spc, ((monospaced)?"monospaced":"proportional"));
1756 
1757 
1758 }
1759 
1760 /* ---- count subboxes (white holes within black area) --------
1761  *  new: count boxes lying inside another box (usually holes, ex: "aeobdg")
1762  *  needed for glue_boxes, dont joining textboxes, tables and other complex
1763  *    objects
1764  * ToDo: count only frames of invers spin? do we need sorted list here? -> no
1765  */
count_subboxes(pix * pp)1766 int count_subboxes( pix *pp ){
1767   int ii=0, num_mini=0, num_same=0, cnt=0;
1768   struct box *box2,*box4;
1769   job_t *job=OCR_JOB; /* fixme */
1770   progress_counter_t *pc = NULL;
1771   if (job->cfg.verbose) { fprintf(stderr,"# count subboxes\n# ..."); }
1772 
1773   pc = open_progress(job->res.boxlist.n,"count_subboxes");
1774   for_each_data(&(job->res.boxlist)) {
1775     box2 = (struct box *)list_get_current(&(job->res.boxlist));
1776     box2->num_subboxes=0;
1777     progress(cnt++,pc);
1778     if (   (box2->x1 - box2->x0)<2
1779         || (box2->y1 - box2->y0)<2) continue; /* speedup for dotted bg */
1780     // holes inside box2 char, aoebdqg, 0.41
1781     for_each_data(&(job->res.boxlist)) {
1782       box4=(struct box *)list_get_current(&(job->res.boxlist));
1783       if (box4->y0 > box2->y1) break; // faster, but boxes need to be sorted
1784       // ToDo: better use binary tree (above/below x) to find near boxes?
1785       if (box4==box2) continue;
1786       if( box4->x0==box2->x0 && box4->x1==box2->x1
1787        && box4->y0==box2->y0 && box4->y1==box2->y1)
1788          num_same++; /* erroneous!? */
1789       if ( box4->x0 >= box2->x0  &&  box4->x1 <= box2->x1
1790         && box4->y0 >= box2->y0  &&  box4->y1 <= box2->y1
1791         && box4->num_subboxes==0 ) /* box4 inside box2? */
1792       {
1793         box2->num_subboxes++; ii++;
1794         if ((box4->x1 - box4->x0 + 1)
1795            *(box4->y1 - box4->y0 + 1)<17) num_mini++;
1796       }
1797     } end_for_each(&(job->res.boxlist));
1798 #if 0
1799     if (cnt < 1000 && job->cfg.verbose)
1800       fprintf(stderr," %4d box %4d %4d %+3d %+3d  subboxes %4d\n# ...",
1801         cnt, box2->x0, box2->y0, box2->x1-box2->x0,
1802                                  box2->y1-box2->y0, box2->num_subboxes);
1803 #endif
1804   }   end_for_each(&(job->res.boxlist));
1805   close_progress(pc);
1806   if (job->cfg.verbose)
1807     fprintf(stderr," %3d subboxes counted (mini=%d, same=%d) nC= %d\n",
1808       ii, num_mini, num_same/2 /* counted twice */, cnt);
1809   return 0;
1810 }
1811 
1812 /* ---- join holes to chars( before step1 ) v0.42  -----------------------
1813    join boxes lying inside another box (usually holes, ex: "aeobdg46890")
1814    Dont add dust to a char!  (ij-dots later)
1815    lines are not detected yet
1816 */
glue_holes_inside_chars(pix * pp)1817 int glue_holes_inside_chars( pix *pp ){
1818   int ii, x0, y0, x1, y1, cnt=0,
1819       glued_same=0, glued_holes=0;
1820   struct box *box2, *box4;
1821   job_t *job=OCR_JOB; /* fixme */
1822   progress_counter_t *pc = NULL;
1823   // int cs=job->cfg.cs;
1824   {
1825     count_subboxes( pp ); /* move to pgm2asc() later */
1826 
1827     pc = open_progress(job->res.boxlist.n,"glue_holes_inside_chars");
1828     if (job->cfg.verbose)
1829         fprintf(stderr,"# glue_holes to chars nC= %d\n# ...",job->res.numC);
1830     ii=0;
1831     for_each_data(&(job->res.boxlist)) {
1832       // get the smaller box which may be extended by bigger boxes around it
1833       box2 = (struct box *)list_get_current(&(job->res.boxlist));
1834       x0 = box2->x0;  x1 = box2->x1;
1835       y0 = box2->y0;  y1 = box2->y1;
1836 
1837       progress(cnt++,pc);
1838 
1839       // would it better than moving vectors to build a sub-box-tree?
1840 
1841       // do not remove chars inside pictures (car plates on photos)
1842       if( box2->c == PICTURE || box2->num_subboxes > 7) continue;
1843 
1844       // holes inside char, aoebdqg, 0.41
1845       // dont merge boxes which have subboxes by itself!
1846       // search boxes inside box2
1847       // if (x1-x0+1>2 || y1-y0+1>2) /* skip tiny boxes, bad for 4x6 */
1848       for_each_data(&(job->res.boxlist)) {
1849 	box4=(struct box *)list_get_current(&(job->res.boxlist));
1850         if(box4!=box2 && box4->c != PICTURE )
1851 	{
1852 	  // ToDo: dont glue, if size differs by big factors (>16?)
1853 	  //  box4 is of same size or smaller
1854 	  //if ((job->cfg.verbose & 48)==48
1855 	  //   && abs(box4->x0-x0)<4 && abs(box4->y0-y0)<8)
1856           //  { fprintf(stderr,"\n# DBG_glue");out_x(box2);out_x(box4); }
1857           if (abs(box4->frame_vol[0])
1858             >=abs(box2->frame_vol[0])/512) // 2010-10 bad invalid_ogv.jpg
1859           if (   (    box4->x0==x0 && box4->x1==x1
1860                    && box4->y0==y0 && box4->y1==y1 ) /* do not happen !? */
1861               || (    box4->x0>=x0 && box4->x1<=x1
1862                    && box4->y0>=y0 && box4->y1<=y1
1863                    // 2010-09 subboxes==0 to subboxes<4 for 0 with dot in it
1864                    && box4->num_subboxes<2 ) )  /* no or very small subboxes? */
1865           {  // fkt melt(box2,box4)
1866             // same box, if very small but hollow char (4x5 o)
1867             if( box4->x0==x0 && box4->x1==x1
1868              && box4->y0==y0 && box4->y1==y1) glued_same++; else glued_holes++;
1869             // fprintf(stderr,"\n# DEBUG merge:");
1870             // out_x(box2);  // small
1871             // out_x(box4);  // big
1872             if ((job->cfg.verbose & 7)==7) // LEV3
1873               fprintf(stderr," join hole %4d %4d %+4d %+4d %+6d"
1874                                      " + %4d %4d %+4d %+4d %+6d %d\n# ...",
1875                 x0, y0, x1-x0+1, y1-y0+1, box2->frame_vol[0],
1876                 box4->x0, box4->y0,
1877                 box4->x1-box4->x0+1, box4->y1-box4->y0+1,
1878                 box4->frame_vol[0], glued_same);
1879             if ((box4->x1-box4->x0+1)< 8*(x1-x0+1)
1880              || (box4->y1-box4->y0+1)<12*(y1-y0+1)) // skip dust
1881             merge_boxes( box2, box4 ); // add box4 to bigger box2
1882   	    //if ((job->cfg.verbose & 48)==48)
1883             //  { fprintf(stderr,"\n# DBG_glue_result");out_x(box2); }
1884             x0 = box2->x0; x1 = box2->x1;
1885             y0 = box2->y0; y1 = box2->y1;
1886             job->res.numC--;  // dont count fragments as chars
1887             ii++;	// count removed
1888 	    list_del(&(job->res.boxlist), box4); // remove box4
1889 	    free_box(box4);
1890 	    // now search another hole inside box2
1891           }
1892         }
1893       } end_for_each(&(job->res.boxlist));
1894 
1895     } end_for_each(&(job->res.boxlist));
1896 
1897     if (job->cfg.verbose)
1898       fprintf(stderr," joined: %3d holes, %3d same, nC= %d\n",
1899         glued_holes, glued_same, job->res.numC);
1900     close_progress(pc);
1901   }
1902   return 0;
1903 }
1904 
1905 
/* ---- join broken chars ( before step1 ??? )  -----------------------
    Walks over all boxes of job->res.boxlist (box2) and melts box2 with a
    box of the same text line (box4) if one of the geometric tests below
    matches.  The joined box4 is removed from the list and freed.
    Per box2 three independent attempts are made:
     1) box2 is a small fragment lying completely above or below the
        middle between m2 and m3: it is joined with the x-nearest box of
        the same line for one of the reasons 1..4 (see join_reason[])
     2) "w' K'": box2 ends above the middle between m2 and m3 and box4
        ends exactly one pixel left of box2
     3) "n h": box2 reaches from about m2 to about m3 and box4 ends next
        to the left border of box2 at about m3
    NOTE(review): m1..m4 are presumably the four reference heights of the
      text line stored in each box - confirm against the box struct.

    job    : configuration (cfg.cs, cfg.verbose) and results (boxlist,
             numC, avX)
    pp     : image, passed to count_subboxes(), get_bw(), num_cross(),
             loop() and modified via put() in attempts 2 and 3
    return : always 0

    Original notes:
    use this carefully, do not destroy previous detection ~fi, broken K=k' g
    join if boxes are near or diagonally connected
    other strategy: mark boxes for deleting and delete in extra loop at end
    faster: check only next two following boxes because list is sorted!
    ToDo: store m4 of upper line to m4_of_prev_line, and check that "-points are below
    done: join boxes lying inside another box (usually holes, ex: "aeobdg")
    Dont add dust to a char!
    lines should be detected already (Test it for m1-m4 unknown)
    ToDo: divide in glue_idots, glue_thin_chars etc. and optimize it
*/
int glue_broken_chars( job_t *job, pix *pp ){
  /* ii is only initialized and incremented in this function, never read */
  int ii, y, cs, x0, y0, x1, y1, cnt=0,
      num_frags=0, glued_frags=0, glued_hor=0,
      do_join=0; /* 1..n means we have a reason to join two objects to one */
//  for better debugging:        upper_dots(umlauts) lower_dots ...
// char *(join_reason)[5]={"no","\"A\"Uij\%","!?;\%","=:;","'',,"}; 2018-09
  // NOTE(review): the strings are printed via "%s" below, so "%%" shows up
  //   as two percent characters in the verbose output
  char *(join_reason)[5]={"no", "\"A\"Uij%%", "!?;%%", "=:;", "'',,"};
//             do_join:    0      1            2        3       4
  struct box *box2, *box4;
  // job_t *job=OCR_JOB; /* fixme */
  progress_counter_t *pc = NULL;
  cs=job->cfg.cs; // passed to get_bw(), num_cross() and loop() below
  {
    count_subboxes( pp ); /* move to pgm2asc() later */

    pc = open_progress(job->res.boxlist.n,"glue_broken_chars");
    if (job->cfg.verbose)
        fprintf(stderr,"# glue broken chars nC= %d avX= %d\n# ...",
          job->res.numC, job->res.avX);
    ii=0;
    for_each_data(&(job->res.boxlist)) {
      // get the box which may be extended by boxes around it
      box2 = (struct box *)list_get_current(&(job->res.boxlist));
      // x0,y0,x1,y1 mirror the frame of box2, refreshed after every merge
      x0 = box2->x0;  x1 = box2->x1;
      y0 = box2->y0;  y1 = box2->y1;
      progress(cnt++,pc);
      do_join=0;
      // vertical broken (g965T umlauts etc.)
      // not: f,
      // would it better than moving vectors to build a sub-box-tree?
      // do not remove chars inside pictures (car plates on photos)
      if (box2->c == PICTURE || box2->num_subboxes > 7) continue;
      /* continue loop if box is below or above line = dust */
      if (box2->m4>0 && y0>box2->m4) continue; /* dust outside ? */
      if (box2->m1>0 && y0<box2->m1-(box2->m3-box2->m2)) continue;
      /* ToDo:
       *  - check that y0 is greater as m3 of the char/line above
       */
      // --- variant 1 = ij-dots umlaut-dots :;= ---
      // check small boxes (box2) whether they belong
      //       to near same size or bigger boxes (box4)
      if( 2*(y1-y0) < box2->m4 - box2->m1     // care for dots etc.
       && (   2*y1<=(box2->m3+box2->m2)       // upper fragments
           || 2*y0>=(box2->m3+box2->m2)) ) {  // lower fragments
        struct box *box5=NULL;    // nearest box
        box4=NULL;
        num_frags++;   /* count for debugging */
        // get the [2nd] next x-nearest box in the same line:
        // box5 becomes the box whose x-center is nearest to the left
        // border x0 of box2 (both sides of the comparison are doubled)
        for_each_data(&(job->res.boxlist)) {
  	  box4=(struct box *)list_get_current(&(job->res.boxlist));
          if (box4 == box2  ||  box4->c == PICTURE) continue;
          /* 0.42 speed up for background pixel pattern, box4 too small */
          if ( box4->x1 - box4->x0 + 1 < x1-x0+1
            && box4->y1 - box4->y0 + 1 < y1-y0+1 ) continue;
          // have in mind that line number may be wrong for dust
          if (box4->line>=0 && box2->line>=0 && box4->line==box2->line)
          {
             if (!box5) box5=box4;
             if ( abs(box4->x0 + box4->x1 - 2*box2->x0)
                 <abs(box5->x0 + box5->x1 - 2*box2->x0))
               { /* box6=box5; next-nearest box */ box5=box4; }
  	  }
        } end_for_each(&(job->res.boxlist));
	box4=box5; // next nearest box within the same line
      	if (box4) {
          // do not glue "%^" in 0811qemu2.png 2010-09-28
          // both boxes wider than avX/2 and no horizontal overlap: go on
          // with the next box2 (attempts 2 and 3 are skipped as well)
      	  if (box4->x1 - box4->x0 + 1 > job->res.avX / 2
      	   && box2->x1 - box2->x0 + 1 > job->res.avX / 2
      	   && (  box2->x0 > box4->x1
      	      || box4->x0 > box2->x1)) continue;
#if 0    /* set this to 1 for debugging of melting bugs */
          if (job->cfg.verbose & 7) {
            fprintf(stderr,"\n# next two boxes are candidates for joining");
            out_x(box2);
            out_x(box4); }
#endif
          if ( /* umlaut "a "o "u, ij; box2 is the small dot, box4 the body */
                       4*y1 <= 3*box2->m2 + box2->m3 // y1=box2->y1, ocr-a %
              && 4*box4->y1 >= 3*box2->m2 + box2->m3 // dont join 2 dots
              &&   2* y1 < box4->y1 + box4->y0  // box2 above box4
              &&   box4->x1 + job->res.avX/2 >= x0
              &&   box4->x0 - job->res.avX/2 <= x1
              && (y1 < box4->y0 || x0 < box4->x1) // dont melt "d'"
              &&   3* (      y1 - box4->y0)
                <= 2* (box4->y1 - box4->y0)  // too far away? dust!
              // ToDo mono-serif-i dot is 8x smaller char but stroke thickness?
              &&   9* (      x1 -       x0 + 1)  // rnd80.i 4x8 vs. 35x34
                >=    (box4->x1 - box4->x0 + 1)  // dot must have minimum size
              &&  10* (      y1 -       y0 + 1)
                >=    (box4->y1 - box4->y0 + 1)  // dot must have minimum size
            ) do_join=1;
          if ( (!do_join) /* !?; box2 is the dot, box4 the body */
              && 2*box4->x1>=x0+x1 	/* test if box4 is around box2 */
              && 2*box4->x0<=2*x1 /* +x0+1 Jan00 */
              && ( x1-x0 <= box4->x1-box4->x0+2 )
              &&   2*y0>=box2->m2+box2->m3
              &&   4*y1>=box2->m2+3*box2->m3
              &&   4*(y1-y0)<box2->m4-box2->m1
              &&   (8*box4->y1 < box4->m2+7*box4->m3
                   || box4->m4-box4->m1<16) /* Jan00 */
            ) do_join=2;
          if ( (!do_join) /* =;: box2 is the upper box, box4 the lower box */
              &&  2*box4->x1>=x0+x1 	/* test if box4 is around box2 */
              && 2*box4->x0<=2*x1 /* +x0+1 */
              && ( x1-x0  <=   box4->x1-box4->x0+4 )
              && (  4*x0  <= 3*box4->x1+box4->x0 )
              && (( box2->m2 && box4->m2
                &&   y1< box2->m3
                && 2*box4->y1 >    box4->m3+box4->m2  // can be bigger than m3
                && 4*box4->y0 >= 3*box4->m2+box4->m3
                && 2*box2->y0 <    box2->m3+box2->m2
                 )
               || ( (!box2->m2) || (!box4->m2) )
              )
            ) do_join=3;
          /* '' ,, tmp08/0811qemu2 2010-10-01 + rnd80.png=mono */
          /* this test is not guarded by (!do_join), a match replaces an
             earlier reason 1..3 */
          if (   abs(box2->y1 - box4->y1) <= (y1-y0)/8+1 // same y1
              && abs(box2->y0 - box4->y0) <= (y1-y0)/8+1 // same y0
              && abs((box4->x1 - box4->x0) - (x1-x0)) <= (x1-x0)/8+1 // same dx
              && x1-x0 <= job->res.avX/2               // small width
              && ( abs(box4->x0 - x1 - 1) <= job->res.avX/2  // small gap
                || abs(x0 - box4->x1 - 1) <= job->res.avX/2) // ocr-b
              && ( 4*y1 <= 3*box2->m2 +   box2->m3     // ''
                || 4*y0 >= 2*box2->m2 + 2*box2->m3 )   // ,,
            ) do_join=4;
          if (do_join>0) {  // function melt(box2,box4)
            if (job->cfg.verbose & 7) // space "( " for better " x"-searching
              fprintf(stderr," join objects  %4d %4d %+4d %+4d"
                                         " + %4d %4d %+4d %+4d %s\n# ...",
                x0, y0, x1-x0+1, y1-y0+1, box4->x0, box4->y0,
                box4->x1-box4->x0+1, box4->y1-box4->y0+1,join_reason[do_join]);
            // fprintf(stderr,"\n# DEBUG merge:");  // d=7x34 @ (109,51) ???
            // if (job->cfg.verbose & 4) out_x(box2);
            // if (job->cfg.verbose & 4) out_x(box4);
            merge_boxes( box2, box4 ); // add box4 to box2
            x0 = box2->x0; x1 = box2->x1;
            y0 = box2->y0; y1 = box2->y1;
#if 0
            if (job->cfg.verbose & 7) //
              fprintf(stderr," join objects  %3d %3d %+4d %+4d\n# ...",
                x0, y0, x1-x0+1, y1-y0+1);
#endif
            // if (job->cfg.verbose & 4) out_x(box2);
            // 2010-09-24 hmm, correct overall height here, later set bad???
            // unlike attempts 2 and 3 below, numC is not decremented here
            // job->res.numC--;  // dont count fragments as chars
            ii++; glued_frags++; // remove
            // output_list(job);
	    list_del(&(job->res.boxlist), box4); /* ret&1: error-message ??? */
            // output_list(job);
	    free_box(box4);
          }
	}
      }
//  continue;

      // horizontally broken w' K'
      // box4 must be in the same line and end exactly at x0-1
      if(     2*y1  <   (box2->m3+box2->m2) )
      if( 2*(y1-y0) <   (box2->m3+box2->m2) )	// fragment
      for_each_data(&(job->res.boxlist)) {
	box4=(struct box *)list_get_current(&(job->res.boxlist));
        if (box4!=box2 && box4->c != PICTURE)
	{
          if( box4->line>=0 && box4->line==box2->line
          && box4->x1>=x0-1 && box4->x1<x0  // do not glue 6-
          && box4->x0+3*box4->x1<4*x0)
          if( get_bw(x0  ,x0  ,y1,y1  ,pp,cs,1) == 1)
          if( get_bw(x0-2,x0-1,y1,y1+2,pp,cs,1) == 1)
          {  // function melt(box2,box4)
            if (job->cfg.verbose & 7)
              fprintf(stderr," join objects  %4d %4d %+4d %+4d"
                                         " + %4d %4d %+4d %+4d w'K'\n# ...",
                x0, y0, x1-x0+1, y1-y0+1, box4->x0, box4->y0,
                box4->x1-box4->x0+1, box4->y1-box4->y0+1);
            // NOTE(review): presumably clears bits 6+7 of the pixel below
            //   the lower left corner to connect both parts - confirm put()
            put(pp,x0,y1+1,~(128+64),0);
            merge_boxes( box2, box4 );
            x0 = box2->x0; x1 = box2->x1;
            y0 = box2->y0; y1 = box2->y1;
            job->res.numC--; ii++;	// remove
            glued_hor++;
	    list_del(&(job->res.boxlist), box4);
	    free_box(box4);
          }
        }
      } end_for_each(&(job->res.boxlist));

      // horizontally broken n h	(h=l_)		v0.2.5 Jun00
      // box2 must reach from about m2 to about m3 (tolerance (y1-y0)/8)
      if( abs(box2->m2-y0)<=(y1-y0)/8 )
      if( abs(box2->m3-y1)<=(y1-y0)/8 )
      if( num_cross(x0,         x1,(y0+  y1)/2,(y0+  y1)/2,pp,cs) == 1)
      if( num_cross(x0,         x1,(y0+3*y1)/4,(y0+3*y1)/4,pp,cs) == 1)
      if(    get_bw((3*x0+x1)/4,(3*x0+x1)/4,(3*y0+y1)/4,y1,pp,cs,1) == 0)
      if(    get_bw(x0,(3*x0+x1)/4,(3*y0+y1)/4,(y0+3*y1)/4,pp,cs,1) == 0)
      if(    get_bw(x0,         x0,         y0,(3*y0+y1)/4,pp,cs,1) == 1)
      for_each_data(&(job->res.boxlist)) {
	box4=(struct box *)list_get_current(&(job->res.boxlist));
      	if (box4!=box2 && box4->c != PICTURE)
	{
          // box4 ends at x0-2 .. x0+1 and at most 1 pixel off m3
          if( box4->line>=0 && box4->line==box2->line
          && box4->x1>x0-3 && box4->x1-2<x0
           && abs(box4->y1-box2->m3)<2)
      	  {  // function melt(box2,box4)
            if (job->cfg.verbose & 7)
              fprintf(stderr," join objects %4d %4d %+4d %+4d"
                                        " + %4d %4d %+4d %+4d nh\n# ...",
                x0, y0, x1-x0+1, y1-y0+1, box4->x0, box4->y0,
                box4->x1-box4->x0+1, box4->y1-box4->y0+1);
            // NOTE(review): the message above is already printed when the
            //   join is skipped here (continue = next box4)
      	    y=loop(pp,x0,y0,y1-y0,cs,0,DO);if(2*y>y1-y0) continue;
            put(pp,x0-1,y0+y  ,~(128+64),0);
            put(pp,x0-1,y0+y+1,~(128+64),0);
            merge_boxes( box2, box4 );  // add box4 to box2
            x0 = box2->x0; x1 = box2->x1;
            y0 = box2->y0; y1 = box2->y1;
            job->res.numC--; ii++;	// remove
            glued_hor++;
	    list_del(&(job->res.boxlist), box4);
	    free_box(box4);
          }
      	}
      } end_for_each(&(job->res.boxlist));
    } end_for_each(&(job->res.boxlist));
    // glued_frags = joins of variant 1, glued_hor = joins of both
    // horizontal variants ("rest")
    if (job->cfg.verbose)
      fprintf(stderr," joined: %3d fragments (found %3d), %3d rest, nC= %d\n",
        glued_frags, num_frags, glued_hor, job->res.numC);
    close_progress(pc);
  }
  return 0;
}
2144 
2145 /*
2146 ** this is a simple way to improve results on noisy images:
2147 ** - find similar chars (build cluster of same chars)
2148 ** - analyze clusters (could be used for generating unknown font-base)
2149 ** - the quality of the result depends mainly on the distance function
2150 */
2151   // ---- analyse boxes, compare chars, compress picture ------------
2152   // ToDo: - error-correction only on large chars!
find_same_chars(pix * pp)2153 int find_same_chars( pix *pp){
2154   int i,k,d,cs,dist,n1,dx; struct box *box2,*box3,/* *box4, */ *box5;
2155   pix p=(*pp);
2156   job_t *job=OCR_JOB; /* fixme */
2157   cs=job->cfg.cs;
2158   {
2159     if(job->cfg.verbose)fprintf(stderr,"# packing");
2160     i = list_total(&(job->res.boxlist));
2161     for_each_data(&(job->res.boxlist)) {
2162       box2 = (struct box *)list_get_current(&(job->res.boxlist));
2163       dist=1000;	// 100% maximum
2164       dx = box2->x1 - box2->x0 + 1;
2165 
2166       if(job->cfg.verbose)fprintf(stderr,"\r# packing %5d",i);
2167       if( dx>3 )
2168       for(box3=(struct box *)list_next(&(job->res.boxlist),box2);box3;
2169 	  box3=(struct box *)list_next(&(job->res.boxlist),box3)) {
2170         if(box2->num!=box3->num){
2171           int d=distance(&p,box2,&p,box3,cs);
2172           if ( d<dist ) { dist=d; /* box4=box3; */ }	// best fit
2173           if ( d<5 ){   // good limit = 5% ???
2174             i--;n1=box3->num;		// set all num==box2.num to box2.num
2175 	    for_each_data(&(job->res.boxlist)) {
2176 	      box5=(struct box *)(struct box *)list_get_current(&(job->res.boxlist));
2177 	      if(box5!=box2)
2178               if( box5->num==n1 ) box5->num=box2->num;
2179 	    } end_for_each(&(job->res.boxlist));
2180           // out_x2(box2,box5);
2181           // fprintf(stderr," dist=%d\n",d);
2182           }
2183       	}
2184       }
2185       // nearest dist to box2 has box4
2186       //    out_b2(box2,box4);
2187       //    fprintf(stderr," dist=%d\n",dist);
2188     } end_for_each(&(job->res.boxlist));
2189     k=0;
2190     if(job->cfg.verbose)fprintf(stderr," %d different chars",i);
2191     for_each_data(&(job->res.boxlist)) {
2192       struct box *box3,*box4;
2193       int j,dist;
2194       box2=(struct box *)list_get_current(&(job->res.boxlist));
2195       for(box3=(struct box *)list_get_header(&(job->res.boxlist));
2196           box3!=box2 && box3!=NULL;
2197 	  box3=(struct box *)list_next(&(job->res.boxlist), box3))
2198         if(box3->num==box2->num)break;
2199       if(box3!=box2 && box3!=NULL)continue;
2200       i++;
2201       // count number of same chars
2202       dist=0;box4=box2;
2203 
2204       for(box3=box2,j=0;box3;
2205           box3=(struct box *)list_next(&(job->res.boxlist), box3)) {
2206 	if(box3->num==box2->num){
2207           j++;
2208           d=distance(&p,box2,&p,box3,cs);
2209           if ( d>dist ) { dist=d; box4=box3; }	// worst fit
2210 	}
2211       }
2212       if(job->cfg.verbose&8){
2213         out_x2(box2,box4);
2214         fprintf(stderr," no %d char %4d %5d times maxdist=%d\n",i,box2->num,j,dist);
2215       }
2216       // calculate mean-char (error-correction)
2217       // ToDo: calculate maxdist in group
2218       k+=j;
2219   //    if(j>1)
2220   //    out_b(box1,NULL,0,0,0,0,cs);
2221       if(job->cfg.verbose&8)
2222       fprintf(stderr," no %d char %4d %5d times sum=%d\n",i,box2->num,j,k);
2223     } end_for_each(&(job->res.boxlist));
2224     if(job->cfg.verbose)fprintf(stderr," ok\n");
2225   }
2226   return 0;
2227 }
2228 
2229 /*
2230 ** call the first engine for all boxes and set box->c=result;
2231 **
2232 */
char_recognition(pix * pp,int mo)2233 int char_recognition( pix *pp, int mo){
2234   int i,ii,ni,cs,x0,y0,x1,y1;
2235   struct box *box2;
2236   progress_counter_t *pc;
2237   wchar_t cc;
2238   job_t *job=OCR_JOB; /* fixme */
2239   cs=job->cfg.cs;
2240   // ---- analyse boxes, find chars ---------------------------------
2241   if (job->cfg.verbose)
2242     fprintf(stderr,"# char recognition");
2243   i=ii=ni=0;
2244   for_each_data(&(job->res.boxlist)) { /* count boxes */
2245     box2 = (struct box *)list_get_current(&(job->res.boxlist));
2246     /* wew: isn't this just job->res.numC? */
2247     /* js: The program is very complex. I am not sure anymore
2248            wether numC is the number of boxes or the number of valid
2249            characters.
2250            Because its not time consuming I count the boxes here. */
2251     if (box2->c==UNKNOWN)  i++;
2252     if (box2->c==PICTURE) ii++;
2253     ni++;
2254   } end_for_each(&(job->res.boxlist));
2255   if(job->cfg.verbose)
2256     fprintf(stderr," unknown= %d picts= %d boxes= %d\n# ",i,ii,ni);
2257   if (!ni) return 0;
2258   i=ii=0;
2259   pc = open_progress(ni,"char_recognition");
2260   for_each_data(&(job->res.boxlist)) {
2261     box2 = (struct box *)list_get_current(&(job->res.boxlist));
2262     x0=box2->x0;x1=box2->x1;
2263     y0=box2->y0;y1=box2->y1;	// box
2264     cc=box2->c;
2265     if (cc==PICTURE) continue;
2266 
2267     if ((mo&256)==0) { /* this case should be default (main engine) */
2268       if(cc==UNKNOWN || box2->num_ac==0 || box2->wac[0]<job->cfg.certainty)
2269         cc=whatletter(box2,cs   ,0);
2270     }
2271 
2272     if(mo&2)
2273       if(cc==UNKNOWN || box2->num_ac==0 || box2->wac[0]<job->cfg.certainty)
2274 	cc=ocr_db(box2, job);
2275 
2276 
2277     // box2->c=cc; bad idea (May03 removed)
2278     // set(box2,cc,95); ToDo: is that better?
2279 
2280     if(cc==UNKNOWN)
2281 	i++;
2282     ii++;
2283 
2284     if(job->cfg.verbose&8) {
2285       fprintf(stderr,"\n# code= %04lx %c",(long)cc,(char)((cc<255)?cc:'_'));
2286       out_b(box2,pp,x0,y0,x1-x0+1,y1-y0+1,cs);
2287     }
2288     progress(ii,pc); /* ii = 0..ni */
2289 
2290   } end_for_each(&(job->res.boxlist));
2291   close_progress(pc);
2292   if(job->cfg.verbose)fprintf(stderr," %d of %d chars unidentified\n",i,ii);
2293   return 0;
2294 }
2295 
2296 
2297 /*
2298 ** compare unknown with known chars,
2299 ** very similar to the find_similar_char_function but here only to
2300 ** improve the result
2301 */
compare_unknown_with_known_chars(pix * pp,int mo)2302 int compare_unknown_with_known_chars(pix * pp, int mo) {
2303   job_t *job=OCR_JOB; /* fixme */
2304   int i, cs = job->cfg.cs, dist, d, ad, wac, ni, ii;
2305   struct box *box2, *box3, *box4;
2306   progress_counter_t *pc=NULL;
2307   wchar_t bc;
2308   i = ii = 0; // ---- -------------------------------
2309   if (job->cfg.verbose)
2310     fprintf(stderr, "# try to compare unknown with known chars !(mode&8)");
2311   if (!(mo & 8))
2312   {
2313     ii=ni=0;
2314     for_each_data(&(job->res.boxlist)) { ni++; } end_for_each(&(job->res.boxlist));
2315     pc = open_progress(ni,"compare_chars");
2316     for_each_data(&(job->res.boxlist)) {
2317       box2 = (struct box *)list_get_current(&(job->res.boxlist)); ii++;
2318       if (box2->c == UNKNOWN || (box2->num_ac>0 && box2->wac[0]<97))
2319 	if (box2->y1 - box2->y0 > 4 && box2->x1 - box2->x0 > 1) { // no dots!
2320 	  box4 = (struct box *)list_get_header(&(job->res.boxlist));;
2321 	  dist = 1000;		/* 100% maximum */
2322 	  bc = UNKNOWN;		/* best fit char */
2323 	  for_each_data(&(job->res.boxlist)) {
2324 	    box3 = (struct box *)list_get_current(&(job->res.boxlist));
2325             wac=((box3->num_ac>0)?box3->wac[0]:100);
2326 	    if (box3 == box2 || box3->c == UNKNOWN
2327                              || wac<job->cfg.certainty) continue;
2328 	    if (box2->y1 - box2->y0 < 5 || box2->x1 - box2->x0 < 3) continue;
2329 	    d = distance(pp, box2, pp, box3, cs);
2330 	    if (d < dist) {
2331 		dist = d;  bc = box3->c;  box4 = box3;
2332 	    }
2333 	  } end_for_each(&(job->res.boxlist));
2334 	  if (dist < 10) {
2335             /* sureness can be maximal of box3 */
2336 	    if (box4->num_ac>0) ad = box4->wac[0];
2337 	    else                ad = 97;
2338 	    ad-=dist; if(ad<1) ad=1;
2339 	    /* ToDo: ad should depend on ad of bestfit */
2340 	    setac(box2,(wchar_t)bc,ad);
2341 	    i++;
2342 	  }			// limit as option???
2343 	  //  => better max distance('e','e') ???
2344 	  if (dist < 50 && (job->cfg.verbose & 7)) {	// only for debugging
2345 	    fprintf(stderr,"\n#  L%02d xy= %4d %4d best fit was %04x=%c"
2346 	         " dist=%3d%% i=%d", box2->line, box2->x0, box2->y0,
2347 	         (int)bc, (char)((bc<128)?bc:'_'), dist, i);
2348 	    if (box4->num_ac>0) fprintf(stderr," w= %3d%%",box4->wac[0]);
2349 	    if ((job->cfg.verbose & 4) && dist < 10)
2350 	      out_x2(box2, box4);
2351 	  }
2352 	  progress(ii,pc);
2353 	}
2354     } end_for_each(&(job->res.boxlist));
2355     close_progress(pc);
2356   }
2357   if (job->cfg.verbose)
2358     fprintf(stderr, " - found %d (nC=%d)\n", i, ii);
2359   return 0;
2360 }
2361 
2362 /*
2363 // ---- divide overlapping chars which !strchr("_,.:;",c);
2364 // block-splitting (two ore three glued chars)
2365 // division if dots>0 does not work properly! ???
2366 //
2367 // ToDo: what about glued "be"? simply try vert. cut on fat vert. line?
2368 // what about recursive division?
2369 // ToDo: mark divided boxes to give the engine a chance to
2370 //       handle wrong divisions
2371 //   sample: tmp13/sslmozFP.png bold 8x9-to-9x9-overlapfont
2372 //    Todo: check min-x-neigbours_of_all_black_pixels if>1 erosion right ?
2373 //      also if two vectors between are same but reverse = cut, 'nt' 'To'
2374 //  Todo: tmp08/gocr0801_bad5.jpg double-touching-"ke"= 2 holes!
2375 //        middle hole must be splitted to left and right char, ToDo18
2376 */
try_to_divide_boxes(pix * pp,int mo)2377 int  try_to_divide_boxes( pix *pp, int mo){
2378   struct box *box2, boxa, boxb;
2379   job_t *job=OCR_JOB; /* fixme */
2380   int cs=job->cfg.cs, ad=100,
2381       a2[8], ar, // certainty of each part, ar = product of all certainties
2382       cbest;  // best certainty, skip search of certainty<cbest-1 for speed
2383   wchar_t ci[8],  // split max. 8 chars
2384           s1[]={ UNKNOWN, '_', '.', ',', '\'', '!', ';', '?', ':', '-',
2385       '=', '(', ')', '/', '\\', '\0' };	// not accepted chars, \0-terminated!
2386   int x0, x1, y0, y1,
2387       xi[8+1]; // cutting positions
2388   int i, ii, i1, i2, n1, dx; // dy, dx;
2389   // pix p=(*pp); // remove!
2390   if (job->cfg.verbose)
2391     fprintf(stderr,"# try to divide unknown chars !(mode&16)");
2392   if(!(mo&16))  // put this to the caller
2393   for_each_data(&(job->res.boxlist)) {
2394     box2 = (struct box *)list_get_current(&(job->res.boxlist));
2395     // don't try to split simple structures (ex: 400x30 square)
2396     if ((!box2->num_frames)
2397        || box2->num_frame_vectors[ box2->num_frames-1 ]<9) continue;
2398     if((box2->c==UNKNOWN || (box2->num_ac && box2->wac[0]<job->cfg.certainty))
2399       && box2->x1-box2->x0>5 && box2->y1-box2->y0>4){
2400       x0=box2->x0; x1=box2->x1; dx= x1-x0+1;
2401       y0=box2->y0; y1=box2->y1;
2402       ad=100;
2403       cbest=0;
2404 
2405       /* ocr1809_12minus "-5" */
2406       ii=loop(pp,x0+1,y0,y1-y0,cs,0,DO);
2407       i =loop(pp,x0+1,y1,y1-y0,cs,0,UP);
2408       if (ii+i >= 7*(y1-y0-1)/8
2409         && y0+ii>box2->m2 && y0+ii<box2->m3) { // check for "-5"
2410         for (i1=0;i1<(x1-x0)/2;i1++) { // check v-symmetry
2411           i2=loop(pp,x0+i1,y0,y1-y0,cs,0,DO);
2412           if (abs(i2 - ii) > (y1-y0)/16) break; // not or end of -
2413           i2=loop(pp,x0+i1,y1,y1-y0,cs,0,UP);
2414           if (abs(i2 - i ) > (y1-y0)/16) break; // not or end of -
2415         }
2416         if ((job->cfg.verbose&2) /* && i1>(x1-x0)/3*/){
2417           fprintf(stderr,
2418       "\n# try_to_divide_box(xy,dxy): %4d %4d %3d %3d as -5 xcut= %d-1",
2419              x0, y0, x1-x0+1, y1-y0+1, i1); }
2420         if (i1>(x1-x0-1)/4) {
2421           i=0; boxa=*box2;   // copy contents, ToDo: reset ac-list (in cut_box?)
2422           boxa.x=x0; boxa.y=y0;        // obsolete? mark pixel, overlap?
2423           boxa.x0=xi[i]=x0;boxa.x1=xi[i+1]=x0+i1-1;  // new horizontal box range
2424           cut_box(&boxa); boxa.num_ac=0;  // ToDo: add box2 as src argument?
2425           ci[i]=whatletter(&boxa,cs,0); /* get char */
2426           a2[i]=testac(&boxa,ci[i]); /* get certainty */
2427           if ((ci[i]=='-' || ci[i]=='_') && a2[i]>=97) // 2018-09 "-5"
2428           { setac(&boxa,ci[i],a2[i]=99);
2429               if ((job->cfg.verbose&2)) {
2430                DBG(fprintf(stderr,"\nDBG %s set split certainty 99",\
2431                decode(ci[0],ASCII))); }}
2432           i++; boxb=*box2;  // try rest if it has to be split again
2433           boxb.x=xi[i]+1; boxb.y=y0;
2434           boxb.x0=xi[i]+1;boxb.x1=xi[i+1]=box2->x1;
2435           cut_box(&boxb); boxb.num_ac=0;
2436           ci[i]=whatletter(&boxb,cs,0); a2[i]=testac(&boxb,ci[i]);
2437           if (a2[0]>=97 && a2[1]>=99) // 2018-09 "-5"
2438           { char buf[8]=""; setac(&boxb,ci[i],a2[i]=99);
2439               if ((job->cfg.verbose&2)) {
2440               DBG(fprintf(stderr,"\nDBG %s set split certainty 99",\
2441                decode(ci[1],ASCII)));}
2442             buf[0]=ci[0];buf[1]=ci[1];buf[2]=0;
2443             ar=a2[1]; // not final, just testing
2444             if (buf[0]) setas(box2,buf,ar); }
2445         }
2446       } /* check "-5" split */
2447 
2448       /* get minimum vertical lines, but fails on ocr1809_12minus "-5" */
2449       n1 = num_cross(x0,x1,(  y1+y0)/2,(  y1+y0)/2,pp,cs);
2450       ii = num_cross(x0,x1,(3*y1+y0)/4,(3*y1+y0)/4,pp,cs); if (ii<n1) n1=ii;
2451       if (box2->m2 && box2->m3 > box2->m2+2)
2452       for (i=box2->m2+1;i<=box2->m3-1;i++) {
2453         // 2017-07 patch from LLeroy2005
2454         if ((i<=y0) || (i>=y1)) continue; // box smaller than baseline
2455         if (loop(pp,x0+1,i,x1-x0,cs,1,RI) > (x1-x0-2)) continue; // ll
2456         ii = num_cross(x0,x1,i,i,pp,cs); if (ii<n1) n1=ii;
2457       } if (n1<2) continue;  // seems to make no sense to divide
2458       if (n1<4) ad=99*ad/100; // not to strong because m2+m3 could be wrong
2459       if (n1<3) ad=99*ad/100;
2460 
2461       if( 2*y1 < box2->m3+box2->m4    /* baseline char ? */
2462        && num_cross(x0,x1,y1-1,y1-1,pp,cs)==1  // -1 for slopes
2463        && num_cross((x0+2*x1)/3,(x0+3*x1)/4,y0,y1,pp,cs)<3  // not exclude tz
2464        && num_cross((3*x0+x1)/4,(2*x0+x1)/3,y0,y1,pp,cs)<3  // not exclude zl
2465        && loop(pp,x0,y1-(y1-y0)/32,x1-x0,cs,0,RI)
2466          +loop(pp,x1,y1-(y1-y0)/32,x1-x0,cs,0,LE) > (x1-x0+1)/2
2467         ) continue; /* do not try on bvdo"o etc. */
2468 
2469       // one vertical line can not be two glued chars, lc?
2470       if ( num_cross(x0,x1,(y1+y0)/2,(y1+y0)/2,pp,cs)<=1 ) continue;
2471       {	// doublet = 2 letters
2472         // char buf[4]="\0\0\0";      // 4th byte is string end == \0
2473         // buf[0]=c1;                 // c1 is wchar_t! (0xbf00 to 0) failes
2474         // buf[1]=c2;
2475         char buf[64]="";      // end == \0
2476         if (job->cfg.verbose&2){  int l1=box2->line;
2477           fprintf(stderr,
2478              "\n# try_to_divide_box(xy,dxy): %4d %4d %3d %3d L%02d mono=%d",
2479              x0, y0, x1-x0+1, y1-y0+1, l1, job->res.lines.mono[l1] /* 1=mono */);
2480           if (job->cfg.verbose&4) out_x(box2); // list box as data+ASC-image
2481         }
2482         // for mono-spaced/teletext fonts only wide chars molten!?
2483         // Todo: or search 2 invers vectors at xi[1], p.e. diag. touch
2484         { int i4, i5, i6=-1, i7=-1, i8=-1, i9=-1;
2485           int num_allvec= box2->num_frame_vectors[box2->num_frames-1]-1;
2486           for (i4=0;i4<num_allvec-1;i4++)
2487              if ( box2->frame_vector[i4  ][0]!=x0
2488                && box2->frame_vector[i4  ][0]!=x1
2489                && box2->frame_vector[i4+1][0]!=x0
2490                && box2->frame_vector[i4+1][0]!=x1
2491                && box2->frame_vector[i4  ][1]!=y0
2492                && box2->frame_vector[i4  ][1]!=y1
2493                && box2->frame_vector[i4+1][1]!=y0
2494                && box2->frame_vector[i4+1][1]!=y1
2495                && abs(box2->frame_vector[i4+1][0]  // dx==1
2496                      -box2->frame_vector[i4  ][0])==1
2497                && abs(box2->frame_vector[i4+1][1]  // dy==1
2498                      -box2->frame_vector[i4  ][1])==1 )
2499             for (i5=i4+2;i5<num_allvec;i5++)
2500               /* same point in opposit direction == diag dots? */
2501               /* may fail if one vector is longer 1+1 ... ToDo18 */
2502               if (box2->frame_vector[i4  ][0]
2503                == box2->frame_vector[i5  ][0]
2504                && box2->frame_vector[i4  ][1]
2505                == box2->frame_vector[i5  ][1]
2506                && box2->frame_vector[i4+1][0]
2507                == box2->frame_vector[i5-1][0]
2508                && box2->frame_vector[i4+1][1]
2509                == box2->frame_vector[i5-1][1]) {
2510               if(job->cfg.verbose&2)
2511         fprintf(stderr,"DBG vsplit i45= %d %d i67 %d %d i89 %d %d"
2512                 " xy=%2d %2d ToDo\n", i4, i5, i6, i7, i8, i9,
2513                   box2->frame_vector[i4][0]-x0,
2514                   box2->frame_vector[i4][1]-y0);
2515               if (i6==-1) { i6=i4; i7=i5; } else /* unique */
2516               if (i8==-1) { i8=i4; i9=i5; } else /* unique */
2517               { i4=num_allvec; break; } // max 2 cut-points or abort
2518               /* ToDo: else break? ore store 2 max. */
2519            } // found touching vectors
2520            // ToDo18: handle split at vectors i4 i5 and/or i6 i7
2521            //  p.e. tmp13/sslmozFP.png bold 8x9 proportional font "nt" "To"
2522            //  p.e. tmp09/barcodes090916_code39.png 10x12 prop.font "ow"
2523            // ...
2524         } // inverse vectors
2525         // it would be better if testing is only if most right and left char
2526         //    has no horizontal gap (below m2) ex: be
2527         i=0; // num splittet chars
2528         xi[0]=x0; xi[1]=x0+(dx/8)+1; xi[2]=x1; // split_to xi0..1 and xi1..2
2529         for ( ; ; xi[i+1]++) { // x[i] .. x[i+1], slower? but better v0.42
2530           int bow=0;   // default = no bow = no cutting = fail divide
2531           // ToDo: skip if not a local dy-min for speedup
2532           // int num_b2vec= box2->num_frame_vectors[box2->num_frames-1]-1;
2533           int num_allvec= box2->num_frame_vectors[box2->num_frames-1]-1;
2534           // int num_b2vec= box2->num_frame_vectors[0]-1; // biggest frame
2535           int i1, i2, i3; /* vector indizes around cutting gaps */
2536           /* break if x is to near to the right border */
2537           if (xi[i+1]>x1-dx/8-1) { if (i==0) break;
2538               i--; xi[i+2]=x1; continue; }
2539           int l1=box2->line, mono=job->res.lines.mono[l1]; // 2018-10 add
2540           if (mono &&  // 2018-10 use monofont advantage
2541            ( abs(abs(xi[i+1]-x0) -   job->res.lines.pitch[l1])  /* 2 chars */
2542                                  >   job->res.lines.pitch[l1]/8
2543           && abs(abs(xi[i+1]-x0) - 2*job->res.lines.pitch[l1])  /* 3 chars */
2544                                  >   job->res.lines.pitch[l1]/8 ) ) continue;
2545           if (mono && job->cfg.verbose&2) // rnd80-Droid-Sans-Mono-Regular ww
2546             fprintf(stderr,"\n#DBG monosplit x01,xi,pitch= %4d %4d %4d %4d",
2547               x0, x1-x0+1, xi[i+1]-x0,job->res.lines.pitch[l1]);
2548           //  ToDo: search invers vectors (diagonal touching chars)
2549           //        between left-down and right-down vectors
2550           //        and right-(middle)top and left-top vectors
2551           // "To" "rn" "fi" "nt" "ity" bold 8x9 tmp13/sslmozFP.png
2552           // 2017-03 new nearest_frame-version, check lower ends
2553           if (box2->num_frames<1) fprintf(stderr,"ERROR.split frames=0\n");
2554           // search vectors near (xi1,y1) = bottom of char and (xi1,y0) = top
2555           // failed on tmp08/gocr0801_bad5.jpg "ke"
2556 //          i1=nearest_frame_vector(box2, 0,num_allvec, (xi[0]+3*xi[1])/4, y1);
2557 //          i3=nearest_frame_vector(box2, 0,num_allvec, (3*xi[1]+xi[2])/4, y1);
2558           i1=nearest_frame_vector(box2, 0,num_allvec, (xi[0]+xi[1])/2, y1);
2559           i3=nearest_frame_vector(box2, 0,num_allvec, (xi[1]+xi[2])/2, y1);
2560           i2=nearest_frame_vector(box2,i1,i3, xi[1], y0);
2561 
2562           // 2017-08 vectors may lay on border, x1=+3..-3 replaced by +dx/8+1
2563           // 2017-08 num_b2vec replaced by num_allvec to split small 'Fi'
2564           DBG( if(job->cfg.verbose&2) // 2018-09
2565             fprintf(stderr,"\nDBG split at xi,i123 %2d %2d %2d"\
2566                " #%02d #%02d #%02d dy %d",\
2567                xi[0]-x0,xi[1]-x0,xi[2]-x0,i1,i2,i3,y1-y0+1); )
2568           if (i1==i2 || i2==i3) continue;  /* must be different 2017-03 */
2569           if (-2*box2->frame_vector[i2][1]
2570                 +box2->frame_vector[i1][1]
2571                 +box2->frame_vector[i3][1]>(y1-y0)/2) bow=1; // big dy
2572           // ToDo17: do not cut holes!? check other nearest_frame_vectors?
2573           //   tmp09/barcodes090916_code39.png "ow"
2574           //   tmp13/sslmozFP.png "Fi" "To" "ity"
2575           if(job->cfg.verbose&2)
2576             fprintf(stderr,"\n# test split at  x%d= %2d %2d %2d"
2577                     " bow %d i123=%2d %2d %2d",
2578                       i, xi[i]-x0, xi[i+1]-x0, xi[i+2]-x0,
2579                       bow, i1,i2,i3);
2580           /* skip if no local minimum at xi[i+1] or if its not thin enough */
2581           // 2010-10-11 failes for ke on tmp08/gocr0801_bad5.jpg ToDo!!!
2582 //          if (bow==0 || 4*(ymax-ymin)>2*(y1-y0)) continue;
2583           if (bow==0) continue;
2584           // cuttet parts should have about the same height (max-min)
2585           // we dont want to cut an 'n' in three parts!
2586           // ToDo: thickness on xi[i+1]?
2587           // try to split successive right box if left box is recognised,
2588           // else shift the splitting point further to the right border
2589           // removing ->dots if dot only above one char !!! ??? not implemented
2590           if(job->cfg.verbose&2)
2591             fprintf(stderr,"\n# try to split, newbox[%d].x= %2d ... %2d "
2592                            "dy= %d ", i, xi[i]-x0, xi[i+1]-x0, y1-y0+1);
2593           boxa=*box2;	// copy contents, ToDo: reset ac-list (in cut_box?)
2594           boxa.x=xi[i]; boxa.y=y0;        // obsolete? mark pixel, overlap?
2595           boxa.x0=xi[i];boxa.x1=xi[i+1];  // new horizontal box range
2596           // ToDo: vector-version cut at 2vec near xi, allow dx/8 overlapp!
2597           //   see tmp13/ssl* "To"
2598           cut_box(&boxa); boxa.num_ac=0;  // ToDo: add box2 as src argument?
2599           // out_x(&boxa);
2600           // get wchar + certainty
2601           ci[i]=whatletter(&boxa,cs,0); /* get char */
2602           a2[i]=testac(&boxa,ci[i]); /* get certainty */
2603           if ((ci[i]=='c' || ci[i]=='C') && a2[i]==100) // 2018-09 "ow" read as 100% "cw"
2604             { setac(&boxa,ci[i],a2[i]=99);
2605               DBG(fprintf(stderr,"\nDBG set split certainty 99");)}
2606           if(job->cfg.verbose&2)
2607             fprintf(stderr,"\n#  certainty %d  limit= %d  cbest= %d ",
2608                            a2[i], job->cfg.certainty, cbest);
2609 	  if (a2[i]<job->cfg.certainty || a2[i]<cbest-1
2610 	   || wcschr(s1,ci[i]) ) { continue; }  // dont split here
2611 
2612           for (ar=ad,ii=0;ii<=i;ii++) {
2613             ar=a2[ii]*ar/100; }  // multiply all probabilities
2614 	  if (ar<98*job->cfg.certainty/100 || ar<cbest) {
2615             continue; } // dont go deeper, no longer string
2616 
2617 	  i++; if (i==8) break; // maximum splits
2618 	  if (i==4) break;  // at the moment its to slow to go further
2619 	  if (i+1<8) xi[i+1]=x1;  // right border of next box
2620 	  if (i+2<8) xi[i+2]=x1;
2621 
2622           if(job->cfg.verbose&2)
2623             fprintf(stderr,"\n try end split [%d].x=%d [%d].x=%d ",
2624                            i, xi[i]-x0, i+1, xi[i+1]-x0);
2625           boxb=*box2;  // try rest if it has to be split again
2626           boxb.x=xi[i]+1; boxb.y=y0;
2627           boxb.x0=xi[i]+1;boxb.x1=xi[i+1];
2628           cut_box(&boxb); boxb.num_ac=0;
2629           ci[i]=whatletter(&boxb,cs,0); a2[i]=testac(&boxb,ci[i]);
2630 	  if (a2[i]<job->cfg.certainty || a2[i]<cbest-1
2631 	   || wcschr(s1,ci[i]) ) { xi[i+1]=xi[i]+2; continue; } // split rest
2632 	  // now we have everything splittet
2633 
2634           if(job->cfg.verbose&2) {
2635             fprintf(stderr,"\n split at/to: ");
2636             for (ii=0;ii<=i;ii++)
2637             fprintf(stderr,"  %2d %s (%3d)", xi[ii+1]-x0,
2638               decode(ci[ii],ASCII), a2[ii]);
2639             fprintf(stderr,"\n");
2640           }
2641 	  // boxa..c changed!!! dots should be modified!!!
2642           // Question: cut it into boxes v0.40 or set a string v0.41?
2643           // new way of building a string v0.41 (can call setas multiple)
2644           // usefull if compare unknown with known strings (except barcode?)
2645           // ToDo: also create alternate variants? ex: I <-> l
2646           for (buf[0]=0,ar=ad,ii=0;ii<=i;ii++) {
2647             ar=a2[ii]*ar/100;  // multiply all probabilities
2648             if (i>0 && ci[ii]=='n' && ci[ii-1]=='r') ar--; // m == rn
2649             strncat(buf,decode(ci[ii],job->cfg.out_format),20);
2650           }
2651 
2652           if (ar>cbest) cbest=ar; // best (highest) certainty found
2653           // reduce, but not if we cross certainty border
2654           if (99*ar/100 > job->cfg.certainty) ar=99*ar/100;
2655           if (job->cfg.verbose&2)
2656             fprintf(stderr,"\n split result= %s (%3d) ",buf, ar);
2657           setas(box2,buf,ar); // char *, does it disturb further splitting?
2658           buf[0]=0;
2659           i--; xi[i+2]=x1;
2660         } /* xi[i+1]++ */
2661       } /* divide box */
2662     } /* unknown box dx>5 */
2663   } end_for_each(&(job->res.boxlist));
2664   if (job->cfg.verbose) fprintf(stderr,", numC %d\n",job->res.numC);
2665   return 0;
2666 }
2667 
2668 /*
2669 // ---- divide vertical glued boxes (ex: g above T);
2670 */
divide_vert_glued_boxes(pix * pp,int mo)2671 int  divide_vert_glued_boxes( pix *pp, int mo){
2672   struct box *box2,*box3,*box4;
2673   job_t *job=OCR_JOB; /* fixme */
2674   int y0,y1,y,dy,flag_found,dx;
2675   if(job->cfg.verbose)fprintf(stderr,"# divide vertical glued boxes");
2676   for_each_data(&(job->res.boxlist)) {
2677     box2 = (struct box *)list_get_current(&(job->res.boxlist));
2678     if (box2->c != UNKNOWN) continue; /* dont try on pictures */
2679     y0=box2->y0; y1=box2->y1; dy=y1-y0+1;
2680     dx=4*(job->res.avX+box2->x1-box2->x0+1);     // we want to be sure to look at 4ex distance
2681     if ( dy>2*job->res.avY && dy<6*job->res.avY && box2->m1
2682       && y0<=box2->m2+2 && y0>=box2->m1-2
2683       && y1>=box2->m4+job->res.avY-2)
2684     { // test if lower end fits one of the other lines?
2685       box4=box2; flag_found=0;
2686       for_each_data(&(job->res.boxlist)) {
2687         box4 = (struct box *)list_get_current(&(job->res.boxlist));
2688         if (box4->c != UNKNOWN) continue; /* dont try on pictures */
2689         if (box4->x1<box2->x0-dx || box4->x0>box2->x1+dx) continue; // ignore far boxes
2690         if (box4->line==box2->line  ) flag_found|=1;    // near char on same line
2691         if (box4->line==box2->line+1) flag_found|=2;    // near char on next line
2692         if (flag_found==3) break;                 // we have two vertical glued chars
2693       } end_for_each(&(job->res.boxlist));
2694       if (flag_found!=3) continue;         // do not divide big chars or special symbols
2695       y=box2->m4;  // lower end of the next line
2696       if(job->cfg.verbose&2){
2697         fprintf(stderr,"\n# divide box below y=%4d",y-y0);
2698         if(job->cfg.verbose&6)out_x(box2);
2699       }
2700       // --- insert box3 before box2
2701       box3= (struct box *) malloc_box(box2);
2702       box3->y1=y;
2703       box2->y0=y+1; box2->line++; // m1..m4 should be corrected!
2704       if (box4->line == box2->line){
2705         box2->m1=box4->m1;        box2->m2=box4->m2;
2706         box2->m3=box4->m3;        box2->m4=box4->m4;
2707       }
2708       box3->num=job->res.numC;
2709       if (list_ins(&(job->res.boxlist), box2, box3)) {
2710           fprintf(stderr,"ERROR list_ins\n"); };
2711       job->res.numC++;
2712     }
2713   } end_for_each(&(job->res.boxlist));
2714   if(job->cfg.verbose)fprintf(stderr,", numC %d\n",job->res.numC);
2715   return 0;
2716 }
2717 
2718 
/*
   On some systems isupper(>255) causes a segmentation fault (SIGSEGV),
   therefore these wrappers: the <ctype.h> function is only called for
   codes in the range 0..127, every other code yields 0.
   The lower bound matters where wchar_t is a signed type: a negative code
   would otherwise pass the "< 128" test and reach the <ctype.h> function
   with an argument outside its defined domain.
   ToDo: should be replaced (?) by wctype if available on every system
 */
int wisupper(wchar_t cc){ return ((cc>=0 && cc<128)?isupper((int)cc):0); }
int wislower(wchar_t cc){ return ((cc>=0 && cc<128)?islower((int)cc):0); }
int wisalpha(wchar_t cc){ return ((cc>=0 && cc<128)?isalpha((int)cc):0); }
int wisdigit(wchar_t cc){ return ((cc>=0 && cc<128)?isdigit((int)cc):0); }
int wisspace(wchar_t cc){ return ((cc>=0 && cc<128)?isspace((int)cc):0); }
2729 
2730 /* set box2->c to cc if cc is in the ac-list of box2, return 1 on success  */
setc(struct box * box2,wchar_t cc)2731 int setc(struct box *box2, wchar_t cc){
2732   int ret=0, w2; // w1
2733   // w1=((box2->num_ac) ? box2->wac[0] : 0);  // weight of replaced char
2734   w2=testac(box2,cc);
2735   if (OCR_JOB->cfg.verbose) {
2736     // print first 2 alternative chars
2737       fprintf(stderr, "\n#  setc old nac=%d %s %s %3d %3d  to %s %3d at %4d %4d",
2738        box2->num_ac, decode(box2->c,ASCII),
2739        (box2->num_ac<2)?" ":decode(box2->tac[1],ASCII), box2->wac[0],
2740        (box2->num_ac<2)?0:box2->wac[1],
2741        decode(cc,ASCII), (100+w2+1)/2, box2->x0, box2->y0);
2742   }
2743   if (w2) { if (box2->c!=cc) { ret=1; setac(box2,cc,(100+w2+1)/2); } }
2744   // if(OCR_JOB->cfg.verbose & 4) out_x(box2);
2745   // ToDo: modify per setac (shift ac)
2746   return ret;
2747 }
2748 
2749 
2750 /* ---- proof difficult chars Il1 by context view ----
2751   context: separator, number, vowel, nonvowel, upper case ????
2752   could be also used to find unknown chars if the environment (nonumbers)
2753     can be found in other places!
2754   ToDo:
2755    - box->tac[] as set of possible chars, ac set by engine, example:
2756        ac="l/" (not "Il|/\" because serifs detected and slant>0)
2757        correction only to one of the ac-set (alternative chars)!
2758    - should be language-settable; Unicode compatible
2759    - box2->ad and wac should be changed? (not proper yet)
2760  *  ------------- */
context_correction(job_t * job)2761 int context_correction( job_t *job ) {
2762  // const static char
2763   char *l_vowel="aeiouy";
2764     // *l_Vowel="AEIOU",chars if the environment (nonumbers)
2765   char *l_nonvo = "bcdfghjklmnpqrstvwxzBCDFGHJKLMNPQRSTVWXZ";
2766   int  hexdigits = 0, hexdivpos = 0; // "O0lI123456789ABCDEFabcdef:"
2767   struct box *box3, *box2, *prev, *next, *pre2, *pre3, *pre4;
2768   int dx, dy, O0_num=0, O0_slashed_zeros=0,
2769       O0_maxw=0, O0_minw=999999, O0_maxh=0, O0_minh=999999;
2770   //  pix *pp = &(job->src.p);
2771   int nc=0, ns=0; // num corrections
2772   wchar_t last_double_quotation=0; // correction of different quotations "
2773   pre4=pre3=pre2=prev=next=NULL;
2774 
2775   if (job->cfg.verbose)
2776     fprintf(stderr, "# context correction Il1 O0\n");
2777 
2778   // 1st loop to make max/min/num-statistics O0-correction 2018-09 rnd.tt
2779   for_each_data(&(job->res.boxlist)) {
2780     box2 = (struct box *)list_get_current(&(job->res.boxlist));
2781     dx= box2->x1 - box2->x0 + 1;
2782     dy= box2->y1 - box2->y0 + 1;
2783     if (box2->c && strchr("O0",box2->c)  /* strchr "O0" 0x00 = true */
2784         && dy >= box2->m3 - box2->m2){ /* do not at ° */
2785       // IFV fprintf(stderr,"\n# O0 stat nac=%d %s %3d at %3d %3d",
2786        // box2->num_ac,decode(box2->c,ASCII), box2->wac[0], box2->x0, box2->y0);
2787       // ToDo18 maxw < 2*mean_dx
2788       O0_num++;
2789       if (box2->num_frames==3 && box2->c=='0') O0_slashed_zeros++;
2790       if (O0_maxw < dx) O0_maxw= dx;  /* max width O */
2791       if (O0_minw > dx) O0_minw= dx;  /* min width 0 */
2792       if (O0_maxh < dy) O0_maxh= dy;  /* max high  0 */
2793       if (O0_minh > dy) O0_minh= dy;  /* min high  O */
2794     }
2795   } end_for_each(&(job->res.boxlist));
2796   if (job->cfg.verbose)
2797     fprintf(stderr, "# O0 num= %d slashed0=%d mimaxW %d %d",
2798       O0_num, O0_slashed_zeros, O0_minw, O0_maxw);
2799 
2800   // 2nd loop to make corrections
2801   for_each_data(&(job->res.boxlist)) {
2802     pre4=pre3; pre3 = pre2; pre2 = prev; // 2010-10-01 tmp08/080916_JL*_150
2803     box2 = (struct box *)list_get_current(&(job->res.boxlist));
2804     prev = (struct box *)list_get_cur_prev(&(job->res.boxlist));
2805     next = (struct box *)list_get_cur_next(&(job->res.boxlist));
2806     dx= box2->x1 - box2->x0 + 1;
2807     dy= box2->y1 - box2->y0 + 1;
2808     if (box2->c==0) continue; // 2018-09 strchr false positive
2809     // ToDo: count last_upper, lower, digits, hexdigits
2810     // 2010-10-10 hex-mode tmp08/gocr0801_bad5
2811     if (box2->c && strchr("O0lI123456789ABCDEFabcdef",box2->c)) hexdigits++;
2812     else if (box2->c && strchr(": ",box2->c) && prev && prev->c!=box2->c
2813          && (hexdigits-hexdivpos==2 || hexdigits-hexdivpos==4))
2814            hexdivpos=hexdigits;
2815       else { hexdigits=0; hexdivpos=0; }
2816     if (box2->c==' ' && prev && prev->c==' ') hexdigits=0;
2817     if (box2->c==':' && pre3 && pre3->c!=':') hexdigits=0; // :89:AB:CD:
2818     if (box2->c && strchr("O0",box2->c) && hexdigits>5) nc+=setc(box2,(wchar_t)'0');
2819     if (box2->c && strchr("l1",box2->c) && hexdigits>5) nc+=setc(box2,(wchar_t)'1');
2820     // 2010-10-01 sample tmp08/0811CSchulze_crop
2821     if (box2->c==DOUBLE_LOW_9_QUOTATION_MARK) {
2822       last_double_quotation = box2->tac[0];
2823       fprintf(stderr,"\n#  ... found DOUBLE_LOW_9_QUOTATION_MARK");
2824     }
2825     if (box2->c==QUOTATION_MARK // 0x22 = ""
2826       && last_double_quotation == DOUBLE_LOW_9_QUOTATION_MARK) {
2827       last_double_quotation = 0;
2828       box2->c = box2->tac[0] = DOUBLE_HIGH_REVERSED_9_QUOTATION_MARK;
2829       IFV fprintf(stderr,"\n#  change nac=%d %s   %3d to %s %3d at %3d %3d",
2830         box2->num_ac, "\"", box2->wac[0],
2831         decode(box2->c,ASCII), box2->wac[0], box2->x0, box2->y0);
2832     } // box2->c==QUOTATION_MARK // 0x22 = ""
2833 
2834     if (           box2->c > 0xFF ) continue; // temporary UNICODE fix 1
2835     if ((prev) && (prev->c > 0xFF)) continue; // temporary UNICODE fix 2
2836     if ((next) && (next->c > 0xFF)) continue; // temporary UNICODE fix 3
2837     if (box2->num_ac<2) continue; // no alternatives
2838     if (box2->wac[0]==100 && box2->wac[1]<100) continue;
2839     if (box2->num_ac && box2->tas[0]) continue; // buggy space_remove 0.42
2840 
2841     /* check for Il1| which are general difficult to distinguish */
2842     /* bbg: not very good. Should add some tests to check if is preceded by '.',
2843      spelling, etc */
2844     /* ToDo: only correct if not 100% sure (wac[i]<100)
2845         and new char is in wat[] */
2846     if (box2->c && strchr("Il1|", box2->c) && next && prev) {
2847 //       if( strchr(" \n",prev->c)      // SPC
2848 //        && strchr(" \n",next->c) ) box2->c='I'; else // bad idea! I have ...
2849       if (wisalpha(next->c) && next->c!='i' &&
2850           ( prev->c == '\n' || // unref-pointer pre2 fix 2017-04-25 by Norbert M.
2851 	   ( prev->c == ' ' && (!pre2 || (pre2 && pre2->c == '.' )) ) ) )
2852         {  nc+=setc(box2,(wchar_t)'I'); }
2853       else if (
2854            (   box2->c!='1'             /* lnt => Int, but 1st */
2855             && strchr(l_nonvo,next->c)
2856             && strchr("\" \n",prev->c))
2857           ||  (prev && ((!pre2) || wisupper(pre2->c) || strchr(" \n",pre2->c))
2858             && wisupper(prev->c)
2859             && box2->num_frame_vectors[0]==4
2860             && box2->frame_vector[0][0]==box2->x0
2861             && box2->frame_vector[1][0]==box2->x0
2862             && box2->frame_vector[2][0]==box2->x1
2863             && box2->frame_vector[3][0]==box2->x1
2864             ))  // " DI*"
2865         /* do not change he'll to he'Il! */
2866         { nc+=setc(box2,(wchar_t)'I'); }  // set box2->c to 'I' if 'I' is in the ac-list
2867       else if (strchr(l_vowel,next->c)) /* unusual? Ii Ie Ia Iy Iu */
2868           /*  && strchr("KkBbFfgGpP",prev->c)) */ /* kle Kla Kli */
2869           {  nc+=setc(box2,(wchar_t)'l'); }
2870       else if (wisupper(next->c)  // ToDo: check 6 neighbours for upper+spaces
2871             && !strchr("O0I123456789",next->c)
2872             && !strchr("O0I123456789",prev->c)) /* avoid lO => IO (10) */
2873 	{  nc+=setc(box2,(wchar_t)'I'); }
2874       else if (prev && wislower(prev->c))
2875 	{  nc+=setc(box2,(wchar_t)'l'); }
2876       else if (wisdigit(prev->c)
2877             || wisdigit(next->c)
2878             || (next && strchr(":-",next->c) && pre2 && pre2->c==next->c
2879              && prev && strchr("0123456789ABCDabcd",prev->c)) // hex 2010-10
2880             || (next->c=='O' && !wisalpha(prev->c)))  /* lO => 10 */
2881 	{  nc+=setc(box2,(wchar_t)'1'); }
2882     }
2883     // JS-2010-09 (ToDo: only if I is an alternate char!?)
2884     if (strchr("Il|", box2->c) && next && !prev) { // first char?
2885       if (wisalpha(next->c) && next->c!='i' && !strchr(l_vowel,next->c))
2886 	 {  nc+=setc(box2,(wchar_t)'I'); }
2887       else if (wisupper(next->c)
2888             && !strchr("O0I123456789",next->c)) /* avoid lO => IO (10) */
2889 	{  nc+=setc(box2,(wchar_t)'I'); }
2890     }
2891 
2892     // ToDo: count width of all "0O" to decide between wide and narrow O's
2893     // ToDo: set dbg-stack to context correction + setc output corrections
2894     // FreeMono-Regular 0 is slightly higher and less width than O, rnd-chars
2895     //    0 is 30x51 (m3-m0=50 H=Xx46)
2896     //    O is 40x48
2897     /* check for O0 */
2898     else if (strchr("O0", box2->c)) {
2899       int i0, have_hexhi=0, have_hexlo=0, have_digits=0, have_alpha=0,
2900               have_upper=0;
2901       wchar_t c0; /* test char loop over pre2 ... next2 */
2902       for (i0=0; i0<5; i0++) {/* ToDo: take into account 100% chars only? */
2903         c0='\0';
2904         if (i0==4 && pre4) c0=pre4->c;
2905         if (i0==0 && pre3) c0=pre3->c;
2906         if (i0==1 && pre2) c0=pre2->c;
2907         if (i0==2 && prev) c0=prev->c;
2908         if (i0==3 && next) c0=next->c; /* ToDo17 nex2 for 0.7 */
2909         if (c0=='\0') continue;
2910         if (strchr("abcdef",c0)) have_hexlo++; /* 2017-07 */
2911         if (strchr("ABCDEF",c0)) have_hexhi++; /* 2017-07 */
2912         if (strchr("123456789",c0)) have_digits++;
2913         if (strchr("ghijklmnopqrstuvwxyz",c0)) have_alpha++;
2914         if (strchr("GHIJKLMNPQRSTUVWXYZ",c0)) have_upper++;
2915       }
2916       if ((have_hexlo && have_hexhi) || have_alpha || have_upper) {
2917         have_upper+=have_hexhi; have_alpha+=have_hexlo;
2918         have_hexlo=0; have_hexhi=0; } // have_hex*=0 if isalpha
2919       // wchar_t c_ask= 'O'; // detect changes?
2920       if (O0_slashed_zeros==0 && O0_num>1  // 2018-09 rnd80.tt
2921          && O0_maxw > O0_minw + (box2->x1 - box2->x0 + 1)/32 + 1
2922          && (box2->x1 - box2->x0 + 1) > (O0_maxw+O0_minw)/2
2923          && dy >= box2->m3 - box2->m2) {
2924         nc+=setc(box2,(wchar_t)'O'); // big width
2925         IFV fprintf(stderr," DBG%04d %d,%d: O0 to O", __LINE__,
2926           box2->x0,box2->y0);
2927       } else
2928       if (O0_slashed_zeros==0 && O0_num>1  // 2018-09 rnd80.tt
2929          && O0_maxw > O0_minw + (box2->x1 - box2->x0 + 1)/32 + 1
2930          && (box2->x1 - box2->x0 + 1) < (O0_maxw+O0_minw+1)/2
2931          && dy >= box2->m3 - box2->m2) {
2932         nc+=setc(box2,(wchar_t)'0'); // small width
2933         IFV fprintf(stderr," DBG%04d %d,%d: O0 to 0", __LINE__,
2934           box2->x0,box2->y0);
2935       } else
2936       if (((!next) || !strchr(" .,", next->c))
2937       && ((((!prev) ||  wisspace(prev->c)) // first letter?
2938            && have_alpha /* words vs units? Orig vs. 0days */
2939            && (!have_digits))
2940        ||    (have_upper && (!have_digits)) )) // UPWORD?
2941       { nc+=setc(box2,(wchar_t)'O');
2942         IFV fprintf(stderr," DBG%04d %d,%d: O0 to O", __LINE__,
2943           box2->x0,box2->y0);}
2944       // ! "  Otto"
2945       // wchar_t c_ask= '0'; // + replace else if !!!
2946       else if ((have_digits || have_hexlo /* || have_hexhi */ /* C0DE39 */
2947              || (prev && strchr(" -+", prev->c) &&
2948                  next && strchr(" .,", next->c)))
2949             && (!have_upper) /*&& (!have_hexhi)*/) /* 2017-07 */
2950 	{ nc+=setc(box2,(wchar_t)'0');
2951           IFV fprintf(stderr," DBG%04d %d,%d: O0 to 0", __LINE__,
2952             box2->x0,box2->y0);}
2953     } // O0
2954 
2955     /* check for 5S */
2956     else if (strchr("5S", box2->c) && next && prev) {
2957       if (wisspace(prev->c) && wisalpha(next->c)) /* initial letter */
2958 	{ nc+=setc(box2,(wchar_t)'S'); }
2959       else if (wisalpha(prev->c) && wisalpha(next->c)
2960                                  && wisupper(next->c)) /* word in upper case */
2961 	{ nc+=setc(box2,(wchar_t)'S'); }
2962       else if (wisdigit(prev->c) || wisdigit(next->c))
2963 	{ nc+=setc(box2,(wchar_t)'5'); }
2964     }
2965 
2966     /* was a space not found? xXx => x Xx ??? */
2967     if (wisupper(box2->c) && next && prev) {
2968       if (wislower(prev->c) && wislower(next->c)
2969 	  && 2 * (box2->x0 - prev->x1) > 3 * (next->x0 - box2->x1)) {
2970 	struct box *box3 = malloc_box((struct box *) NULL);
2971 	box3->x0 = prev->x1 + 2;
2972 	box3->x1 = box2->x0 - 2;
2973 	box3->y0 = box2->y0;
2974 	box3->y1 = box2->y1;
2975 	box3->x = box2->x0 - 1;
2976 	box3->y = box2->y0;
2977 	box3->dots = 0;
2978 	box3->num_boxes = 0;
2979 	box3->num_subboxes = 0;
2980 	box3->c = ' ';
2981 	box3->modifier = 0;
2982 	setac(box3,' ',99); /* ToDo: weight depends from distance */
2983 	box3->num = -1;
2984 	box3->line = prev->line;
2985 	box3->m1 = box3->m2 = box3->m3 = box3->m4 = 0;
2986 	box3->p = &(job->src.p);
2987 	list_ins(&(job->res.boxlist), box2, box3);
2988       }
2989     }
2990 
2991     /* a space before punctuation? but not " ./file" */
2992     if ( prev && next)
2993     if (prev->c == ' ' && strchr(" \n"    , next->c)
2994                        && strchr(".,;:!?)", box2->c))
2995       if (prev->x1 - prev->x0 < 2 * job->res.avX) {	// carefully on tables
2996 	box3 = prev;
2997 	if ( !list_del(&(job->res.boxlist), box3) ) free_box(box3);
2998         prev = (struct box *)list_get_cur_prev(&(job->res.boxlist));
2999         ns++;
3000       }
3001 
3002     /* \'\' to \" */
3003     if ( prev )
3004     if ( (prev->c == '`' || prev->c == '\'')
3005       && (box2->c == '`' || box2->c == '\'') )
3006       if (prev->x1 - box2->x0 < job->res.avX) { // carefully on tables
3007         box2->c='\"';
3008 	box3 = prev;
3009 	list_del(&(job->res.boxlist), box3);
3010 	free_box(box3);
3011       }
3012   } end_for_each(&(job->res.boxlist));
3013   if (job->cfg.verbose)
3014     fprintf(stderr, " num_corrected= %d removed_spaces= %d\n", nc, ns);
3015   return 0;
3016 }
3017 
3018 
3019 /* ---- insert spaces ----
3020  *  depends strongly from the outcome of measure_pitch()
3021  * ------------------------ */
list_insert_spaces(pix * pp,job_t * job)3022 int list_insert_spaces( pix *pp, job_t *job ) {
3023   int i=0, j1, j2, i1, maxline=-1, dy=0, num_nl=0, num_spc=0, min_x0=1023;
3024   char cc;
3025   struct box *box2, *box3=NULL, *box4=NULL;
3026 
3027   // measure mean line height
3028   for(i1=1;i1<job->res.lines.num;i1++) {
3029     dy+=job->res.lines.m4[i1]-job->res.lines.m1[i1]+1;
3030     if (min_x0>job->res.lines.x0[i1])
3031         min_x0=job->res.lines.x0[i1];  // 2010-09-30
3032   } if (job->res.lines.num>1) dy/=(job->res.lines.num-1);
3033   i=0; j2=0;
3034   for(i1=1;i1<job->res.lines.num;i1++) {
3035     j1=job->res.lines.m4[i1]-job->res.lines.m1[i1]+1;
3036     if (j1>dy*120/100 || j1<dy*80/100) continue; // only most frequently
3037     j2+=j1; i++;
3038   } if (i>0 && j2/i>7) dy=j2/i;
3039   if( job->cfg.verbose&1 )
3040     fprintf(stderr,"# insert space between words (dy=%d) ...",dy);
3041   if (!dy) dy=(job->res.avY)*110/100+1;
3042 
3043   if (min_x0 < 4) min_x0 = 0; // tmp09/oebb_teletext* monospaced first gap
3044   // ToDo: rewrite, replace cc by num_spc + num_nl
3045   i=0;
3046   for_each_data(&(job->res.boxlist)) {
3047     int thispitch=0, thismono=0, pdist=0; // spacing paras per line
3048     box2 =(struct box *)list_get_current(&(job->res.boxlist));
3049     cc=0; num_nl=0; num_spc=0;
3050     box3 = (struct box *)list_prev(&(job->res.boxlist), box2);
3051     if (box2->line > maxline) {  // new line, lines and chars must be sorted!
3052       int ydist=0, ypitch=0;
3053       if (maxline>=0) {
3054         // num_nl = 1; // ToDo: allow multiple newlines
3055         if (box2->line>1)
3056          ydist = job->res.lines.m1[ box2->line   ]
3057                 -job->res.lines.m1[ box2->line-1 ]; // 2010-09-26
3058         ypitch = job->res.lines.m4[ box2->line ]
3059                 -job->res.lines.m1[ box2->line ];
3060         if (ypitch>4) num_nl = ydist / (2*ypitch); // ToDo: improve it!
3061         if (!num_nl) num_nl=1;
3062       }
3063       maxline=box2->line;
3064     }
3065     if (box2->line==maxline) {  // lines and chars must be sorted!
3066       thispitch = job->res.lines.pitch[box2->line];
3067       thismono  = job->res.lines.mono[ box2->line];
3068       if (box3) pdist  = box2->x0 - box3->x1 - 1; // 2010-09-26
3069       if (pdist < 0) pdist = 0; // overlap like proportional: "VA"
3070       if (num_nl || !box3)
3071         pdist  = box2->x0 - min_x0; // first char of new line
3072       // if (pdist >= thispitch) cc=' '; // 2010-09-24 ???
3073       if (thismono) num_spc = pdist / thispitch;
3074       else          num_spc = pdist*2 / (3*job->res.avX); // ToDo: use 1em!
3075       if (pdist>=thispitch && !num_spc) num_spc = 1; // proportional font
3076       // ToDo: multi spaces for proportional font
3077     }
3078 
3079 #if 0
3080     if ((job->cfg.verbose&48)==48)
3081       fprintf(stderr,"\n# DBG%02d %d mono=%d  %d pitch= %2d"
3082         " pdist= %2d nl %d spc %d", maxline, box2->line, thismono,
3083         job->res.lines.mono[ box2->line], thispitch, pdist, num_nl, num_spc);
3084 #endif
3085 
3086     // call this multiple times
3087     for (i1=0;i1<num_nl+num_spc;i1++) {
3088       int mdist=0;
3089       box4=(struct box *)list_prev(&(job->res.boxlist), box2);
3090       if (box4) mdist  = box2->x0 - box4->x1 + 1; // 2010-09
3091       else      mdist  = 0;
3092       if (mdist<0) mdist=0;
3093       box3=(struct box *)malloc_box(NULL);
3094       box3->x0=box2->x0-2+((num_spc)?-mdist+ i1   *mdist/num_spc:0);
3095       box3->x1=box2->x0-2+((num_spc)?-mdist+(i1+1)*mdist/num_spc:0);
3096       box3->y0=box2->y0;
3097       box3->y1=box2->y1;
3098       if (i1>=num_nl &&  box4)
3099         box3->x0 = box4->x1+2+((num_spc)?i1*mdist/num_spc:0);
3100       if (i1< num_nl || !box4)
3101         box3->x0 = job->res.lines.x0[box2->line];
3102       if (i1< num_nl && box4){
3103         box3->y0=box4->y1;	// better use lines.y1[box2->pre] ???
3104         box3->y1=box2->y0;
3105       }
3106       box3->x = box3->x0; // 2010-09
3107       box3->y = box2->y0;
3108       box3->dots = 0;
3109       box3->c = cc = ((i1<num_nl)?'\n':' ');
3110       box3->num_boxes = 0;
3111       box3->num_subboxes = 0;
3112       box3->modifier = '\0';
3113       box3->num=-1;        box3->line=box2->line;
3114       box3->m1=box2->m1;   box3->m2=box2->m2;
3115       box3->m3=box2->m3;   box3->m4=box2->m4;
3116       box3->p=pp;
3117       setac(box3,cc,100);   /* ToDo: weight depends from distance */
3118       list_ins(&(job->res.boxlist),box2,box3); // insert box3 before box2
3119       if( job->cfg.verbose&1 ) {
3120         fprintf(stderr,"\n# insert space &%d; at %4d %4d box= %p"
3121           " mono %d dx %2d pdx,mdx %2d %2d",
3122           (int)box3->c, box3->x0, box3->y0, (void*)box3,
3123           thismono, thispitch, pdist, mdist);
3124         /* out_x(box3); */
3125       }
3126       i++;
3127     }
3128   } end_for_each(&(job->res.boxlist));
3129   if( job->cfg.verbose&1 ) fprintf(stderr,"\n# ... found %d spaces\n",i);
3130   return 0;
3131 }
3132 
3133 
3134 /*
3135    add infos where the box is positioned to the box
3136    this is useful for better recognition
3137 */
add_line_info(job_t * job)3138 int  add_line_info( job_t *job /* , List *boxlist2 */){
3139   struct tlines *lines = &job->res.lines;
3140   struct box *box2;
3141   int i,xx,mindy1,mindy2,m1,m2,m3,m4,num_line_members=0,num_rest=0;
3142   if (job->cfg.verbose&1) fprintf(stderr,"# add_line_info to boxes ...");
3143   for_each_data(&(job->res.boxlist)) {
3144     box2 =(struct box *)list_get_current(&(job->res.boxlist));
3145     for (i=1;i<job->res.lines.num;i++) /* line 0 is a place holder */
3146     { // add rotated image correction dy(x)
3147       if (lines->dx) xx=lines->dy*((box2->x1+box2->x0)/2)/lines->dx;
3148                 else xx=0;
3149       m1= lines->m1[i]+xx;
3150       m2= lines->m2[i]+xx;
3151       m3= lines->m3[i]+xx;
3152       m4= lines->m4[i]+xx;
3153       if (m4-m1==0) continue; /* no text line (line==0) */
3154       /* --- 2018-10 min y-distance y0 to m1 or y1 to m4 --- */
3155       mindy1     = abs(box2->y0 - m1);  // min dy (dots and _)
3156       if (mindy1 > abs(box2->y1 - m4))
3157           mindy1 = abs(box2->y1 - m4);
3158       mindy2=999999;
3159       if (box2->m2){ mindy2= abs(box2->y0 - box2->m1);
3160         if (mindy2 > abs(box2->y1 - box2->m4))
3161             mindy2 = abs(box2->y1 - box2->m4);
3162       }
3163       // fprintf(stderr," test line %d m1=%d %d %d %d\n",i,m1,m2,m3,m4);
3164 #if 1  // added + modified again 2018-10 ToDo18 need m5 distance to next line
3165       if(( (box2->y1 -box2->y0 +1) <= (m3-m2+1)/2
3166        && box2->y0  +job->res.avY/2 +2 >= m1
3167        && box2->y1  -job->res.avY/2 -2 <= m4 ) // dots (ToDo: better 2nd run)
3168       || ( box2->y1 +job->res.avY/4 +2 >= m2   // body
3169         && box2->y0 -job->res.avY/4 -2 <= m3)) /* not to far away */
3170 #endif
3171       /* give also a comma or dot behind the line a chance */
3172       if ( box2->x0 >= lines->x0[i]
3173         && box2->x1 <= lines->x1[i]+job->res.avX )
3174       if ( box2->y0 <= m4 + 2*job->res.avY    // 2010-10-01+09 0811qemu2
3175         && box2->y1 >= m1 -   job->res.avY/2 - 1 // give "a "o ... a chance
3176         && box2->y1 <= m4 + 2*job->res.avY )  // 2010-10-09 ocr-b-'_'
3177       if ( box2->m2==0  // already put to a line? check y-distance
3178         // || abs(box2->y0 - box2->m2) > abs(box2->y0 - m2)
3179         || mindy1 < mindy2 )
3180       { /* found nearest line */
3181         if ((job->cfg.verbose&16) && (box2->y1 -box2->y0 +1) <= (m3-m2+1)/2)
3182          fprintf(stderr,"\n#  line.info.set L%02d xy= %4d %4d m14 %4d %4d avY %4d",
3183                  i, box2->x0, box2->y0, m1, m4, job->res.avY);
3184         box2->m1= m1;
3185         box2->m2= m2;
3186         box2->m3= m3;
3187         box2->m4= m4;
3188         box2->line= i;
3189       }
3190     } // i=1..lines (for every char)
3191     if (((box2->y1 -box2->y0 +1) >= (box2->m3 -box2->m2+1)/2) // body not dots
3192      && (box2->y1+2 < box2->m1
3193       || box2->y0   < box2->m1 - (box2->m3-box2->m1)/2
3194       || box2->y0-2 > box2->m4 + (box2->m3-box2->m2)/2 // bad m4 + ,._ ocr-b
3195       || box2->y1   > box2->m3 + (box2->m3-box2->m1)
3196      )) /* to far away */
3197     {  /* reset */
3198         if (job->cfg.verbose&16)
3199          fprintf(stderr,"\n#  line.info.reset L%02d xy= %4d %4d m14 %4d %4d avY %4d",
3200                  box2->line, box2->x0, box2->y0, box2->m1, box2->m4, job->res.avY);
3201         box2->m1= 0;
3202         box2->m2= 0;
3203         box2->m3= 0;
3204         box2->m4= 0;
3205         box2->line= 0;
3206         num_rest++;
3207     } else num_line_members++;
3208   } end_for_each(&(job->res.boxlist));
3209   if (job->cfg.verbose&1)
3210     fprintf(stderr," done, num_line_chars=%d rest=%d\n",
3211             num_line_members, num_rest);
3212   return 0;
3213 }
3214 
3215 
3216 /*
3217  *  bring the boxes in right order
3218  *  add_line_info must be executed first!
3219  */
sort_box_func(const void * a,const void * b)3220 int sort_box_func (const void *a, const void *b) {
3221   struct box *boxa, *boxb;
3222 
3223   boxa = (struct box *)a;
3224   boxb = (struct box *)b;
3225 
3226   if ( ( boxb->line < boxa->line ) ||
3227        ( boxb->line == boxa->line && boxb->x0 < boxa->x0 ) )
3228     return 1;
3229   return -1;
3230 }
3231 
3232 // -------------------------------------------------------------
3233 // ------             use this for entry from other programs
3234 // include pnm.h pgm2asc.h
3235 // -------------------------------------------------------------
3236 // entry point for gocr.c or if it is used as lib
3237 // better name is call_ocr ???
3238 // jb: OLD COMMENT: not removed due to set_options_* ()
3239 // args after pix *pp should be removed and new functions
3240 //   set_option_mode(int mode), set_option_spacewidth() .... etc.
3241 //   should be used instead, before calling pgm2asc(pix *pp)
3242 //   ! change if you can ! - used by X11 frontend
/*
 * Run the complete recognition pipeline on the image in job->src.p:
 * thresholding, box scanning, dust/barcode/picture handling, line
 * detection, character recognition, space insertion, context correction
 * and storing of the result lines. Progress is reported via the progress
 * counter "pgm2asc_main"; with verbose bit 32 set, debug images are
 * written by debug_img().
 *
 * job must not be NULL (asserted).
 * Returns 1 if scan_boxes() found no boxes (job->res.numC == 0),
 * otherwise 0.
 *
 * NOTE(review): orig_cs is an automatic variable and is only loaded on the
 * first call, so on every later image of a multi-image run it is 0 and
 * the threshold is always taken from otsu(). If a user supplied cs is
 * meant to persist across images this needs static storage - confirm.
 */
int pgm2asc(job_t *job)
{
  pix *pp;
  progress_counter_t *pc;
  static int multi_image_count=0;  /* number of image within multi-image */
  int orig_cs=0;

  assert(job);  /* must be checked before job is dereferenced below */

  if (!multi_image_count) orig_cs = job->cfg.cs; /* save for multi-images */

  multi_image_count++;

  /* FIXME jb: remove pp */
  pp = &(job->src.p);

  pc = open_progress(100,"pgm2asc_main");
  progress(0,pc); /* start progress output 0% 0% */
#if 0 /* don't waste memory */
  /* FIXME jb: malloc */
  if ( job->cfg.verbose & 32 ) {
    // generate 2nd imagebuffer for debugging output
    job->tmp.ppo.p = (unsigned char *)malloc(job->src.p.y * job->src.p.x);
    // buffer
    assert(job->tmp.ppo.p);
    copybox(&job->src.p,
            0, 0, job->src.p.x, job->src.p.y,
            &job->tmp.ppo,
            job->src.p.x * job->src.p.y);
  }
#else
  job->tmp.ppo=job->src.p; /* temporarily, removed later */
#endif
  // check for bad read/format-convert image writeppm/png 2018-10
  //if(job->cfg.verbose&32) debug_img("out00",job,8/* 8=clr_bit1..3 */);


  /* ----- count colors ------ create histogram -------
     - this should be used to create a upper and lower limit for cs
     - cs is the optimum gray value between cs_min and cs_max
     - also inverse scans could be detected here later */
  if (orig_cs==0)
    job->cfg.cs=otsu( pp->p,pp->y,pp->x,0,0,pp->x,pp->y,job->cfg.verbose & 1);
  else  // dont set cs, output stats + do inversion if needed 2010-10-07
    otsu( pp->p,pp->y,pp->x,0,0,pp->x,pp->y,job->cfg.verbose & 1);
//  if (job->cfg.verbose&32) debug_img("out001.ppm",job,0);
  /* renormalize the image and set the normalized threshold value */
  job->cfg.cs=thresholding( pp->p,pp->y,pp->x,0,0,pp->x,pp->y, job->cfg.cs );
  if( job->cfg.verbose )
    fprintf(stderr, "# thresholding new_threshold= %d\n", job->cfg.cs);
//  if (job->cfg.verbose&32) debug_img("out002.ppm",job,0);

  progress(5,pc); /* progress is only estimated */


  /* this is first step for reorganize the PG
     ---- look for letters, put rectangular frames around letters
     letter = connected points near color F
     should be used by dust removing (faster) and line detection!
     ---- 0..cs = black letters, last change = May 99 */

  progress(8,pc); /* progress is only estimated */

//  if (job->cfg.verbose&32) debug_img("out008.ppm",job,8);
  scan_boxes( job, pp );
  if ( !job->res.numC ){
    fprintf( stderr,"# no boxes found - stopped\n" );
    if(job->cfg.verbose&32) debug_img("out01",job,8);
    /***** should free stuff, etc) */
    close_progress(pc); /* release the counter on the early exit, too */
    return(1);
  }
  // tmp10/bug100818a.pgm creates artefacts on image
//  if (job->cfg.verbose&32) debug_img("out00",job,4+8);

  progress(10,pc); /* progress is only estimated */
  // if(job->cfg.verbose&32) debug_img("out01",job,4+8);
  // output_list(job);  // for debugging
  // ToDo: matrix printer preprocessing

  remove_dust( job ); /* from the &(job->res.boxlist)! */
// if(job->cfg.verbose&32) debug_img("out02",job,4+8);
// output_list(job);  // for debugging
#if 0 // ToDo 2010-10-15 destroys QR-barcodes
  smooth_borders( job ); /* only for big chars */
#endif
  progress(12,pc); /* progress is only estimated */
// if(job->cfg.verbose&32) debug_img("out03",job,4+8);
// output_list(job);  // for debugging

  detect_barcode( job );  /* mark barcode */
// if(job->cfg.verbose&32) debug_img("out04",job,4+8);
// output_list(job);  // for debugging

  detect_pictures( job ); /* mark pictures */
//  if(job->cfg.verbose&32) debug_img("out05",job,4+8);
// output_list(job);  // for debugging

  remove_pictures( job ); /* do this as early as possible, before layout */
//  if(job->cfg.verbose&32) debug_img("out06",job,4+8);
// output_list(job);  // for debugging

  glue_holes_inside_chars( pp ); /* including count subboxes (holes)  */

  detect_rotation_angle( job );

#if 1 		/* Rotate the whole picture! move boxes */
  if( job->res.lines.dy!=0 ){  // move down lowest first, move up highest first
    // in work! ??? (at end set dy=0) think on ppo!
  }
#endif
  detect_text_lines( pp, job->cfg.mode ); /* detect and mark job->tmp.ppo */
// if(job->cfg.verbose&32) debug_img("out07",job,4+8);
  progress(20,pc); /* progress is only estimated */

  add_line_info( job /* , &(job->res.boxlist) */);
  if (job->cfg.verbose&32) debug_img("out10",job,4+8);

  divide_vert_glued_boxes( pp, job->cfg.mode); /* after add_line_info, before list_sort! */
//  if(job->cfg.verbose&32) debug_img("out11",job,0);

  remove_melted_serifs( job, pp ); /* make some corrections on pixmap */
  /* list_ins seems to sort in the boxes on the wrong place ??? */
//  if(job->cfg.verbose&32) debug_img("out12",job,4+8);

  glue_broken_chars( job, pp ); /* 2nd glue */
//  if(job->cfg.verbose&32) debug_img("out14",job,4+8);
// 2010-09-24 overall box size is correct here, but later broken

  remove_rest_of_dust( job );
//  if(job->cfg.verbose&32) debug_img("out15",job,4+8);

  /* better sort after dust is removed (slow for lot of pixels) */
  list_sort(&(job->res.boxlist), sort_box_func);

  measure_pitch( job );

  if(job->cfg.mode&64) find_same_chars( pp );
  progress(30,pc); /* progress is only estimated */
//  if(job->cfg.verbose&32) debug_img("out16",job,4+8);

  char_recognition( pp, job->cfg.mode);
  progress(60,pc); /* progress is only estimated */
//  if(job->cfg.verbose&32) debug_img("out17",job,4+8);

  if ( adjust_text_lines( pp, job->cfg.mode ) ) { /* correct using chars */
    /* may be, characters/pictures have changed line number */
    list_sort(&(job->res.boxlist), sort_box_func);
    // 2nd recognition call if lines are adjusted
    char_recognition( pp, job->cfg.mode);
  }

#define BlownUpDrawing 1     /* german: Explosionszeichnung, temporarily */
#if     BlownUpDrawing == 1  /* german: Explosionszeichnung */
{ /* just for debugging: count unknown chars, pictures and all boxes */
  int i,ii,ni; struct box *box2;
  i=ii=ni=0;
  for_each_data(&(job->res.boxlist)) { /* count boxes */
    box2 = (struct box *)list_get_current(&(job->res.boxlist));
    if (box2->c==UNKNOWN)  i++;
    if (box2->c==PICTURE) ii++;
    ni++;
  } end_for_each(&(job->res.boxlist));
  if (job->cfg.verbose)
    fprintf(stderr,"# debug: unknown= %d picts= %d boxes= %d\n",i,ii,ni);
}
#endif
  // ----------- write out20.pgm ----------- mark lines + boxes
  if (job->cfg.verbose&32) debug_img("out20",job,1+4+8);

  compare_unknown_with_known_chars( pp, job->cfg.mode);
  progress(70,pc); /* progress is only estimated */

  try_to_divide_boxes( pp, job->cfg.mode);
  progress(80,pc); /* progress is only estimated */

  /* --- list output ---- for debugging --- */
  if (job->cfg.verbose&6) output_list(job);

  /* ---- insert spaces ---- */
  list_insert_spaces( pp , job );

  // ---- proof difficult chars Il1 by context view ----
  if (job->cfg.verbose)
    fprintf(stderr,"# context correction if !(mode&32)\n");
  if (!(job->cfg.mode&32)) context_correction( job );

  store_boxtree_lines( job, job->cfg.mode );
  progress(90,pc); /* progress is only estimated */

/* 0050002.pgm.gz ca. 109 digits, only 50 recognized (only in lines?)
 * ./gocr -v 39 -m 56 -e - -m 4 -C 0-9 -f XML tmp0406/0050002.pbm.gz
 *  awk 'BEGIN{num=0}/1<\/box>/{num++;}END{print num}' o
 * 15*0 24*1 18*2 19*3 15*4 6*5 6*6 6*7 4*8 8*9 sum=125digits counted boxes
 *  9*0 19*1 14*2 15*3 11*4 6*5 5*6 6*7 4*8 8*9 sum=97digits recognized
 * 1*1 1*7 not recognized (Oct04)
 *  33*SPC 76*NL = 109 spaces + 36*unknown sum=241 * 16 missed
 */
#if     BlownUpDrawing == 1  /* german: Explosionszeichnung */
{ /* just for debugging: per-character counts for the first 20 test chars */
  int i,ii,ni; struct box *box2; const char *testc="0123456789ABCDEFGHIJK";
    i=ii=ni=0;
  for_each_data(&(job->res.boxlist)) { /* count boxes */
    box2 = (struct box *)list_get_current(&(job->res.boxlist));
    if (box2->c==UNKNOWN)  i++;
    if (box2->c==PICTURE) ii++;
    if (box2->c>' ' && box2->c<='z') ni++;
  } end_for_each(&(job->res.boxlist));
  if(job->cfg.verbose)
    fprintf(stderr,"# debug: (_)= %d picts= %d chars= %d",i,ii,ni);
  for (i=0;i<20;i++) {
    ni=0;
    for_each_data(&(job->res.boxlist)) { /* count boxes */
      box2 = (struct box *)list_get_current(&(job->res.boxlist));
      if (box2->c==testc[i]) ni++;
    } end_for_each(&(job->res.boxlist));
    if(job->cfg.verbose && ni>0)
      fprintf(stderr," (%c)=%d",testc[i],ni);
  }
  if(job->cfg.verbose)
    fprintf(stderr,"\n");
}
#endif

  // ---- frame-size-histogram
  // ---- (my own defined) distance between letters
  // ---- write internal picture of textsite
  // ----------- write out30.pgm -----------
  if( job->cfg.verbose&32 ) debug_img("out30",job,2+4);

  progress(100,pc); /* progress is only estimated */

  close_progress(pc);

  return 0; 	/* what should I return? error-state? num-of-chars? */
}
3477