1 /*
This is an Optical-Character-Recognition program
3 Copyright (C) 2000-2018 Joerg Schulenburg
4
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License
7 as published by the Free Software Foundation; either version 2
8 of the License, or (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18
19 see README for EMAIL-address
20
21 sometimes I have written comments in german language, sorry for that
22
23 - look for ??? for preliminary code
24 - space: avX=22 11-13 (empirical estimated)
25 avX=16 5-7
26 avX= 7 5-6
27
28 ToDo: - add filter (r/s mismatch) g300c1
29 - better get_line2 function (problems on high resolution)
30 - write parallelizable code!
31 - learnmode (optimize filter)
32 - use ispell for final control or if unsure
33 - better line scanning (if not even)
34 - step 5: same chars differ? => expert mode
35 - chars dx>dy and above 50% hor-crossing > 4 is char-group ?
36 - detect color of chars and background
37 - better word space calculation (look at the examples)
38 (distance: left-left, middle-middle, left-right, thickness of e *0.75)
39
40 GLOBAL DATA (mostly structures)
41 - pix : image - one byte per pixel bits0-2=working
42 - lines : rows of the text (points to pix)
43 - box : list of bounding box for character
44 - obj : objects (lines, splines, etc. building a character)
45 */
46
47
48 #include <stdlib.h>
49 #include <stdio.h>
50 #include <assert.h>
51 #include <string.h>
52 #include <ctype.h>
53 #include "config.h"
54 #ifdef HAVE_WCHAR_H
55 #include <wchar.h>
56 #endif
57
58 #include "amiga.h"
59 #include "list.h"
60 #include "pgm2asc.h"
61 // #include "pcx.h" /* needed for writebmp (removed later) */
62 /* ocr1 is the test-engine - remember: this is development version */
63 #include "ocr1.h"
64 /* first engine */
65 #include "ocr0.h"
66 #include "otsu.h"
67 #include "barcode.h"
68 #include "progress.h"
69 #include "unicode_defs.h" /* UNKNOWN + PICTURES + ... */
70
71 #include "gocr.h"
72
73 #include "ocr0_dbg.h" /* added 2017-07 */
74
75 /* wew: will be exceeded by capitals at 1200dpi */
76 #define MaxBox (100*200) // largest possible letter (buffersize)
77 #define MAX(a,b) ((a) >= (b) ? (a) : (b))
78
79 /* if the system does not know about wchar.h, define functions here */
80 #ifndef HAVE_WCHAR_H
81 /* typedef unsigned wchar_t; */
82 /* Find the first occurrence of WC in WCS. */
/* Fallback used when <wchar.h> is not available.
 * Scans the zero-terminated wide string wcs for the first element equal
 * to wc.  Returns a pointer to that element, or NULL if wc does not
 * occur.  The terminator counts as part of the string, so wc == 0 yields
 * a pointer to the end of the string (as the standard function does). */
wchar_t *wcschr (wchar_t *wcs, wchar_t wc) {
  for ( ; ; wcs++) {
    if (*wcs == wc) return wcs;   /* checked first so that wc==0 matches */
    if (*wcs == 0) return NULL;   /* end reached without a match */
  }
}
/* Fallback wcscpy: copies the wide string src, terminator included,
 * into dest and returns dest.  No bounds checking is done. */
wchar_t *wcscpy (wchar_t *dest, const wchar_t *src) {
  wchar_t *out = dest;
  while ((*out++ = *src++) != 0)
    ;                             /* the copy happens in the condition */
  return dest;
}
/* Fallback wcslen: number of wide characters before the terminating 0. */
size_t wcslen (const wchar_t *s){
  const wchar_t *end = s;
  while (*end)
    end++;
  return (size_t)(end - s);
}
92 #endif
93 #ifndef HAVE_WCSDUP
/* wcsdup is a GNU extension; this replacement is compiled when the
 * system does not provide it.  Returns a malloc'ed copy of WS (to be
 * released with free) or NULL if the allocation fails. */
wchar_t * wcsdup (const wchar_t *WS) {
  size_t units = wcslen(WS) + 1;          /* characters incl. terminator */
  wchar_t *copy = (wchar_t *) malloc(units * sizeof(wchar_t));
  if (copy != NULL)
    wcscpy(copy, WS);
  return copy;
}
101 #endif
102
103 // ------------------------ feature extraction -----------------
104 // -------------------------------------------------------------
// detect maxima of line overlaps (returned in %) and line coordinates
106 // this is for future use
107 #define HOR 1 // horizontal
108 #define VER 2 // vertical
109 #define RIS 3 // rising=steigend
110 #define FAL 4 // falling=fallend
111
112 /* exchange two variables */
/* Exchange the two integers a and b point to (a == b is harmless). */
static void swap(int *a, int *b) {
  int first = *a, second = *b;
  *a = second;
  *b = first;
}
118
119 // calculate the overlapping of the line (0-1) with black points
120 // by recursive bisection
121 // line: y=dy/dx*x+b, implicit form: d=F(x,y)=dy*x-dx*y+b*dx=0
122 // incremental y(i+1)=m*(x(i)+1)+b, F(x+1,y+1)=f(F(x,y))
123 // ret & 1 => inverse pixel!
124 // d=2*F(x,y) integer numbers
/* Measure how much of the straight line (x0,y0)-(x1,y1) lies on "black"
 * pixels, i.e. pixels whose getpixel() value is below cs.  The line is
 * walked with an integer midpoint (Bresenham) scheme.
 *   p   - pixmap sampled through getpixel()
 *   cs  - threshold, values below cs count as black
 *   ret - bit 0 set inverts the pixel test; ret&~1 is the scale of the
 *         result (ret==100 gives percent)
 * Returns hits*(ret&~1)/samples; at least one sample is always taken. */
int get_line(int x0, int y0, int x1, int y1, pix *p, int cs, int ret){
  int dx,dy,incrE,incrNE,d,x,y,r0,r1,ty,tx,
      *px,*py,*pdx,*pdy,*ptx,*pty,*px1;
  dx=abs(x1-x0); tx=((x1>x0)?1:-1);    // tx = mirroring in x
  dy=abs(y1-y0); ty=((y1>y0)?1:-1);    // ty = mirroring in y
  // rotate coordinate system if dy>=dx: the p* pointers select which
  // variables play the role of the major (stepped) and minor axis
  if(dx>dy){ pdx=&dx;pdy=&dy;px=&x;py=&y;ptx=&tx;pty=&ty;px1=&x1; }
  else     { pdx=&dy;pdy=&dx;px=&y;py=&x;ptx=&ty;pty=&tx;px1=&y1; }
  // always walk the major axis in increasing direction
  if( *ptx<0 ){ swap(&x0,&x1);swap(&y0,&y1);tx=-tx;ty=-ty; }
  // decision variable and its increments; (*pdy)-(*pdx) is <= 0 here, so
  // multiply instead of shifting (left-shifting a negative int is undefined)
  d=2*(*pdy)-(*pdx); incrE=2*(*pdy); incrNE=2*((*pdy)-(*pdx));
  x=x0; y=y0; r0=r1=0;                 // r0 = hits, r1 = misses
  while( (*px)<=(*px1) ){
    if( ((getpixel(p,x,y)<cs)?1:0)^(ret&1) ) r0++; else r1++;
    (*px)++; if( d<=0 ){ d+=incrE; } else { d+=incrNE; (*py)+=(*pty); }
  }
  return (r0*(ret&~1))/(r0+r1);        // ret==100 => percentage %
}
144
145 // this function should detect whether a direct connection between points
146 // exists or not, not finally implemented
147 // ret & 1 => inverse pixel!
148 // d=2*F(x,y) integer numbers, ideal line: ,I pixel: I@
149 // ..@ @@@ .@. ...,@2@. +1..+3 floodfill around line ???
150 // ..@ .@@ .@. ...,.@@@ +2..+4 <= that's not implemented yet
151 // ..@ ..@ .@. ...,.@@@ +2..+4
152 // @.@ @.. .@. ...,@@@. +1..+3
153 // @.@ @@. .@. ...I@@@. 0..+3
154 // @@@ @@@ .@. ..@1@@.. 0..+2
155 // 90% 0% 100% 90% r1-r2
156 // I am not satisfied with it
/* Variant of get_line() with one pixel of tolerance perpendicular to the
 * line (marked by its author as not finally implemented).
 * A sample counts as a hit if the pixel on the line matches, or if -
 * since the last direct match - the neighbour pixel on at least one side
 * of the line has matched at every sample.
 *   p   - pixmap sampled through getpixel()
 *   cs  - threshold, values below cs count as black
 *   ret - bit 0 set inverts the pixel test; ret&~1 scales the result
 * Returns hits*(ret&~1)/samples. */
int get_line2(int x0, int y0, int x1, int y1, pix *p, int cs, int ret){
  int dx,dy,incrE,incrNE,d,x,y,r0,r1,ty,tx,q,ddy,rx,ry,
      *px,*py,*pdx,*pdy,*ptx,*pty,*px1;
  dx=abs(x1-x0); tx=((x1>x0)?1:-1);    // tx = mirroring in x
  dy=abs(y1-y0); ty=((y1>y0)?1:-1);    // ty = mirroring in y
  // rotate coordinate system if dy>=dx; (ry,rx) is the unit step
  // perpendicular to the major axis
  if(dx>dy){ pdx=&dx;pdy=&dy;px=&x;py=&y;ptx=&tx;pty=&ty;px1=&x1;rx=1;ry=0; }
  else     { pdx=&dy;pdy=&dx;px=&y;py=&x;ptx=&ty;pty=&tx;px1=&y1;rx=0;ry=1; }
  // always walk the major axis in increasing direction
  if( *ptx<0 ){ swap(&x0,&x1);swap(&y0,&y1);tx=-tx;ty=-ty; }
  // (*pdy)-(*pdx) is <= 0 here, so multiply instead of shifting
  // (left-shifting a negative int is undefined)
  d=2*(*pdy)-(*pdx); incrE=2*(*pdy); incrNE=2*((*pdy)-(*pdx));
  x=x0; y=y0; r0=r1=0; ddy=3;          // tolerance = bit 1 + bit 0 = both sides
  while( (*px)<=(*px1) ){              // not finally implemented
    q=((getpixel(p,x,y)<cs)?1:0)^(ret&1);
    if ( !q ){          // tolerance one pixel perpendicular to the line
      // what about 2 or more pixels tolerance???
      // a side keeps its bit only while its neighbour pixel matches
      ddy&=(~1)|(((getpixel(p,x+ry,y+rx)<cs)?1:0)^(ret&1));
      ddy&=(~2)|(((getpixel(p,x-ry,y-rx)<cs)?1:0)^(ret&1))*2;
    } else ddy=3;       // direct match re-arms both sides
    if( ddy ) r0++; else r1++;
    (*px)++; if( d<=0 ){ d+=incrE; } else { d+=incrNE; (*py)+=(*pty); }
  }
  return (r0*(ret&~1))/(r0+r1);        // ret==100 => percentage %
}
181
182 /* Look for dots in the rectangular region x0 <= x <= x1 and y0 <= y
183 <= y1 in pixmap p. The two low order bits in mask indicate the color
184 of dots to look for: If mask==1 then look for black dots (where a
185 pixel value less than cs is considered black). If mask==2 then look
186 for white dots. If mask==3 then look for both black and white dots.
187 If the dots are found, the corresponding bits are set in the returned
188 value. Heavily used by the engine ocr0*.cc */
/* Report which pixel colours occur in the rectangle x0..x1, y0..y1 after
 * clipping it to the pixmap.  Bit 0 of the result stands for black
 * (pixel value < cs), bit 1 for white; only bits requested in mask are
 * reported.  The scan stops as soon as every requested bit is found. */
char get_bw(int x0, int x1, int y0, int y1, pix * p, int cs, int mask) {
  int seen = 0;               /* colours met so far (bit 0 black, bit 1 white) */
  int row, col;

  /* clip the rectangle to the pixmap */
  if (x0 < 0) x0 = 0;
  if (y0 < 0) y0 = 0;
  if (x1 >= p->x) x1 = p->x - 1;
  if (y1 >= p->y) y1 = p->y - 1;

  for (row = y0; row <= y1; row++) {
    for (col = x0; col <= x1; col++) {
      if (getpixel(p, col, row) < cs)
        seen |= 1;
      else
        seen |= 2;
      if ((seen & mask) == mask)
        return mask;          /* everything asked for was found */
    }
  }
  return (seen & mask);
}
206
207 /* more general Mar2000 (x0,x1,y0,y1 instead of x0,y0,x1,y1! (history))
 * look for black crossings through a line from x0,y0 to x1,y1 and count them
209 * follow line and count crossings ([white]-black-transitions)
210 * ex: horizontal num_cross of 'm' would return 3
211 *
212 * fail for: .a... a-to-b counts no transitions, but there is
213 * ...#.
214 * ..#..
215 * .#..b
216 * ToDo18: make it tolerant against noise on big chars, +cross-width
217 * ......#.#########.#...... should count as 1 cross
218 */
/* Count white-to-black transitions along the line (x0,y0)-(x1,y1).
 * The line is sampled at max(|dx|,|dy|)+1 points; a pixel is black if
 * getpixel() < cs, and the scan starts in the "white" state.
 * A crossing is taken back (never below 1) when a black run ends and one
 * of the current run / widest earlier run is at most 1 while the other
 * is above 7, with a white counter of at most 1 (noise filter, 1810). */
int num_cross(int x0, int x1, int y0, int y1, pix *p, int cs) {
  const int dx = x1 - x0, dy = y1 - y0;
  const int steps = MAX(abs(dx), abs(dy));
  int crossings = 0;
  int prev = 0;                    /* colour of previous sample, 0=white */
  int run_black = 0;               /* width counter of the current black run */
  int max_black = 0;               /* widest black run finished so far */
  int run_white = 0;               /* white counter, reset at each crossing */
  int i;

  for (i = 0; i <= steps; i++) {
    int x = x0, y = y0, black;
    if (steps != 0) {
      x = x0 + i * dx / steps;
      y = y0 + i * dy / steps;
    }
    black = (getpixel(p, x, y) < cs) ? 1 : 0;

    if (prev == 0 && black == 1) {        /* white -> black: new crossing */
      crossings++;
      run_white = 0;
    } else if (prev == 1 && black == 1) { /* inside a black run */
      run_black++;
    } else if (prev == 1 && black == 0) { /* black run just ended */
      int thin_then_thick = (max_black <= 1 && run_black > 7);
      int thick_then_thin = (run_black <= 1 && max_black > 7);
      if (run_white <= 1 && (thin_then_thick || thick_then_thin)
          && crossings > 1)
        crossings--;                      /* 1810 remove noise */
      if (run_black > max_black)
        max_black = run_black;
      run_black = 0;
    } else {                              /* white -> white */
      run_white++;
    }
    prev = black;
  }
  return crossings;
}
245
/* Like num_cross() but without the noise filter: counts every
 * white-to-black transition met while sampling the line (x0,y0)-(x1,y1)
 * at max(|dx|,|dy|)+1 points.  The scan starts in the "white" state. */
int num_cross_fine(int x0, int x1, int y0, int y1, pix *p, int cs) {
  const int dx = x1 - x0, dy = y1 - y0;
  const int steps = MAX(abs(dx), abs(dy));
  int transitions = 0;
  int was_black = 0;
  int i;

  for (i = 0; i <= steps; i++) {
    int x = x0, y = y0, is_black;
    if (steps != 0) {
      x = x0 + i * dx / steps;
      y = y0 + i * dy / steps;
    }
    is_black = (getpixel(p, x, y) < cs) ? 1 : 0;
    if (is_black && !was_black)
      transitions++;              /* found a white-black transition */
    was_black = is_black;
  }
  return transitions;
}
262
263 /* check if test matches pattern
264 * possible pattern: "a-zA-Z0-9+--\\" (x-y dont work for c>127)
265 * return: 0 means dont fit, 1 means found
266 * ToDo: wchar_t cc + matching UTF-8 pattern for nonASCII
267 */
my_strchr(char * pattern,wchar_t cc)268 int my_strchr( char *pattern, wchar_t cc ) {
269 char *s1;
270 if (pattern==(char *)NULL) return 0;
271
272 /* if (!(cc&0x80)) s1=strchr(pattern,(char)cc); else */
273 switch (cc) {
274 case '-': /* used as a special character */
275 s1=strstr(pattern,"--"); /* search string -- in pattern */
276 if (s1) return 1; break;
277 default:
278 s1=strstr(pattern,decode(cc, UTF8)); /* search string cc in pattern */
279 if (s1) return 1; /* cc simply matches */
280 /* single char not found, now check the ranges */
281 s1=pattern;
282 while (s1) {
283 s1=strchr(s1+1,'-'); /* look for next '-' */
284 if ((!s1) || (!s1[0]) || (!s1[1])) return 0; /* nothing found or end */
285 if (*(s1-1)=='-' || *(s1+1)=='-') continue; /* skip -- pattern */
286 if (*(s1-1)<=cc && *(s1+1)>=cc) return 1; /* within range */
287 }
288 }
289 return 0;
290 }
291
292 /* set alternate chars and its weight, called from the engine
293 if a char is recognized to (weight) percent
294 can be used for filtering (only numbers etc)
295 often usefull if Il1 are looking very similar
296 should this function stay in box.c ???
297 weight is between 0 and 100 in percent, 100 means absolutely sure
298 - not final, not time critical (js)
299 - replace it by a string-function setaobj(*b,"string",weight)
300 and let call setac the setas function
301 */
302
setas(struct box * b,char * as,int weight)303 int setas(struct box *b, char *as, int weight){
304 job_t *job=OCR_JOB;
305 int i,j;
306 if (b->num_ac > NumAlt || b->num_ac<0) {
307 fprintf(stderr,"\nDBG: There is something wrong with setas()!");
308 b->num_ac=0;
309 }
310 if (as==NULL) {
311 fprintf(stderr,"\nDBG: setas(NULL) makes no sense!"); return 0; }
312 if (as[0]==0) {
313 fprintf(stderr,"\nDBG: setas(\"\") makes no sense!"
314 " x= %d %d", b->x0, b->y0);
315 // out_x(b);
316 return 0;
317 }
318
319 /* char filter (ex: only numbers) ToDo: cfilter as UTF-8 */
320 if (job->cfg.cfilter) {
321 /* do not accept chars which are not in the cfilter string */
322 if ( as[0]>0 && as[1]==0 )
323 if ( !my_strchr(job->cfg.cfilter,as[0]) ) return 0;
324 }
325 #if 0 /* obsolete, done in setac */
326 /* not sure that this is the right place, but where else? */
327 if ( as[0]>0 && as[1]==0 )
328 if (b->modifier != SPACE && b->modifier != 0) {
329 wchar_t newac;
330 newac = compose(as[0], b->modifier);
331 as = (char *)decode(newac, UTF8); /* was (const char *) */
332 if (newac == as[0]) { /* nothing composed */
333 fprintf(stderr, "\nDBG setas compose was useless %d %d",b->x0,b->y0);
334 // out_x(b);
335 }
336 }
337 #endif
338
339 /* only the first run gets the full weight */
340 weight=(100-job->tmp.n_run)*weight/100;
341
342 /* remove same entries from table */
343 for (i=0;i<b->num_ac;i++)
344 if (b->tas[i])
345 if (strcmp(as,b->tas[i])==0) break;
346 if (b->num_ac>0 && i<b->num_ac){
347 if (weight<=b->wac[i]) return 0; /* if found + less weight ignore it */
348 /* to insert the new weigth on the right place, we remove it first */
349 if (b->tas[i]) free(b->tas[i]);
350 for (j=i;j<b->num_ac-1;j++){ /* shift lower entries */
351 b->tac[j]=b->tac[j+1]; /* copy the char */
352 b->tas[j]=b->tas[j+1]; /* copy the pointer to the string */
353 b->wac[j]=b->wac[j+1]; /* copy the weight */
354 }
355 b->num_ac--; /* shrink table */
356 }
357 /* sorting and add it to the table */
358 for (i=0;i<b->num_ac;i++) if (weight>b->wac[i]) break;
359 if (b->num_ac<NumAlt-1) b->num_ac++; /* enlarge table */
360 for (j=b->num_ac-1;j>i;j--){ /* shift lower entries */
361 b->tac[j]=b->tac[j-1]; /* copy the char */
362 b->tas[j]=b->tas[j-1]; /* copy the pointer to the string */
363 b->wac[j]=b->wac[j-1]; /* copy the weight */
364 }
365 if (i<b->num_ac) { /* insert new entry */
366 b->tac[i]=0; /* insert the char=0 ... */
367 b->tas[i]=(char *)malloc(strlen(as)+1); /* ... string */
368 if (b->tas[i]) memcpy(b->tas[i],as,strlen(as)+1);
369 b->wac[i]=weight; /* ... and its weight */
370 }
371 if (i==0) b->c=b->tac[0]; /* char or 0 for string */
372 return 0;
373 }
374
375 /* ToDo: this function will be replaced by a call of setas() later */
setac(struct box * b,wchar_t ac,int weight)376 int setac(struct box *b, wchar_t ac, int weight){
377 int i,j;
378 job_t *job=OCR_JOB;
379 if ((!b) || b->num_ac > NumAlt || b->num_ac<0) {
380 fprintf(stderr,"\nDBG: This is a bad call to setac()!");
381 if(b && (job->cfg.verbose & 6)) out_x(b);
382 b->num_ac=0;
383 }
384 if (ac==0 || ac==UNKNOWN) {
385 fprintf(stderr,"\nDBG: setac(0) makes no sense!");
386 return 0;
387 }
388 /* char filter (ex: only numbers) ToDo: cfilter as UTF-8 */
389 if (job->cfg.cfilter) {
390 /* do not accept chars which are not in the cfilter string */
391 /* if ( ac>255 || !strchr(job->cfg.cfilter,(char)ac) ) return 0; */
392 if ( !my_strchr(job->cfg.cfilter,ac) ) return 0;
393 }
394 /* not sure that this is the right place, but where else? */
395 if (b->modifier != SPACE && b->modifier != 0) {
396 wchar_t newac;
397 newac = compose(ac, b->modifier);
398 if (newac == ac) { /* nothing composed */
399 if(job->cfg.verbose & 7)
400 fprintf(stderr, "\nDBG %s setac (%d,%d): compose was useless, wac=%d",
401 decode(ac,ASCII), b->x0, b->y0, weight);
402 /* if(job->cfg.verbose & 6) out_x(b); */
403 }
404 ac = newac;
405 }
406
407 /* only the first run gets the full weight */
408 weight=(100-job->tmp.n_run)*weight/100;
409
410 /* remove same entries from table */
411 for (i=0;i<b->num_ac;i++) if (ac==b->tac[i]) break;
412 if (b->num_ac>0 && i<b->num_ac){
413 if (weight<=b->wac[i]) return 0;
414 if (b->tas[i]) free(b->tas[i]);
415 for (j=i;j<b->num_ac-1;j++){ /* shift lower entries */
416 b->tac[j]=b->tac[j+1]; /* copy the char */
417 b->tas[j]=b->tas[j+1]; /* copy the pointer to the string */
418 b->wac[j]=b->wac[j+1]; /* copy the weight */
419 }
420 b->num_ac--; /* shrink table */
421 }
422 /* sorting it to the table */
423 for (i=0;i<b->num_ac;i++) if (weight>b->wac[i]) break;
424 if (b->num_ac<NumAlt-1) b->num_ac++; /* enlarge table */
425 for (j=b->num_ac-1;j>i;j--){ /* shift lower entries */
426 b->tac[j]=b->tac[j-1]; /* copy the char */
427 b->tas[j]=b->tas[j-1]; /* copy the pointer to the string */
428 b->wac[j]=b->wac[j-1]; /* copy the weight */
429 }
430 if (i<b->num_ac) { /* insert new entry */
431 b->tac[i]=ac; /* insert the char ... */
432 b->tas[i]=NULL; /* ... no string (?) 2018-09 fix ji */
433 b->wac[i]=weight; /* ... and its weight */
434 }
435 if (i==0) b->c=ac; /* store best result to b->c (will be obsolete) */
436
437 return 0;
438 }
439
440 /* test if ac in wac-table
441 usefull for contextcorrection and box-splitting
442 return 0 if not found
443 return wac if found (wac>0)
444 */
testac(struct box * b,wchar_t ac)445 int testac(struct box *b, wchar_t ac){
446 int i;
447 if (b->num_ac > NumAlt || b->num_ac<0) {
448 fprintf(stderr,"\n#DEBUG: There is something wrong with testac()!");
449 b->num_ac=0;
450 }
451 /* search entries in table */
452 for (i=0;i<b->num_ac;i++) if (ac==b->tac[i]) return b->wac[i];
453 return 0;
454 }
455
456
457 /* look for edges: follow a line from x0,y0 to x1,y1, record the
458 * location of each transition, and return their number.
459 * ex: horizontal num_cross of 'm' would return 6
460 * remark: this function is not used, obsolete? ToDo: remove?
461 */
/* Walk the line (x0,y0)-(x1,y1) and record every colour change.
 *   path->start receives the colour of the first pixel (1=black, i.e.
 *   getpixel() < cs), path->x[]/path->y[] the position of each sample
 *   whose colour differs from the previous one, path->num their count.
 *   The arrays are grown through xrealloc() when path->max is reached.
 * Returns the number of recorded transitions. */
int follow_path(int x0, int x1, int y0, int y1, pix *p, int cs, path_t *path) {
  const int dx = x1 - x0, dy = y1 - y0;
  const int steps = MAX(abs(dx), abs(dy));
  int found = 0;                  /* transitions stored so far */
  int last, now, i;

  last = getpixel(p, x0, y0) < cs;   // 0=white 1=black
  path->start = last;
  for (i = 1; i <= steps; i++) {
    /* steps >= 1 whenever the loop body runs, so the division is safe */
    const int x = x0 + i * dx / steps;
    const int y = y0 + i * dy / steps;
    now = getpixel(p, x, y) < cs;    // 0=white 1=black
    if (now != last) {
      if (found >= path->max) {      /* no room left: enlarge both arrays */
        int grown = path->max*2+10;
        path->x = (int *) xrealloc(path->x, grown*sizeof(int));
        path->y = (int *) xrealloc(path->y, grown*sizeof(int));
        path->max = grown;
      }
      path->x[found] = x;
      path->y[found] = y;
      found++;
    }
    last = now;
  }
  path->num = found;
  return found;
}
491
492 /* ToDo: only used in follow_path, which is obsolete, remove? */
/* realloc() wrapper that never returns NULL for a non-zero size:
 * on failure it prints a message to stderr and terminates with exit(1). */
void *xrealloc(void *ptr, size_t size){
  void *grown = realloc(ptr, size);
  if (grown != NULL || size == 0)
    return grown;
  fprintf(stderr, "insufficient memory");
  exit(1);
}
502
503 /*
504 * -------------------------------------------------------------
505 * mark edge-points
506 * - first move forward until b/w-edge
507 * - more than 2 pixel?
508 * - loop around
509 * - if forward pixel : go up, rotate right
510 * - if forward no pixel : rotate left
511 * - stop if found first 2 pixel in same order
512 * go_along_the_right_wall strategy is very similar and used otherwhere
513 * --------------------------------------------------------------
514 * turmite game: inp: start-x,y, regel r_black=UP,r_white=RIght until border
515 * out: last-position
516 *
517 * could be used to extract more features:
518 * by counting stepps, dead-end streets ,xmax,ymax,ro-,ru-,lo-,lu-edges
519 *
520 * use this little animal to find features, I first was happy about it
521 * but now I prefer the loop() function
522 */
523
/* Move the position (*x,*y) step by step: on a black pixel
 * (getpixel() < cs) rule rb is applied, otherwise rule rw.  A rule is one
 * of UP, DO, RI, LE (move one pixel) or ST (stop).  The walk ends when
 * the position leaves the window x0..x1, y0..y1 or the rule is ST; the
 * final position is left in *x,*y.  Nothing happens if (x0,y0) is
 * outside the pixmap. */
void turmite(pix *p, int *x, int *y,
             int x0, int x1, int y0, int y1, int cs, int rw, int rb) {
  int rule;
  if (outbounds(p, x0, y0))     // out of pixmap
    return;
  for (;;) {
    if (*x < x0 || *y < y0 || *x > x1 || *y > y1)
      break;                    /* left the window */
    rule = (getpixel(p, *x, *y) < cs) ? rb : rw;   // select rule
    if (rule == ST)
      break;                    /* stop rule: stay here */
    switch (rule) {
      case UP: *y -= 1; break;
      case DO: *y += 1; break;
      case RI: *x += 1; break;
      case LE: *x -= 1; break;
      default: assert(0);       /* unknown rule */
    }
  }
}
542
543 /* search a way from p0 to p1 without crossing pixels of type t
544 * only two directions, useful to test if there is a gap 's'
545 * labyrinth algorithm - do you know a faster way? */
/* Labyrinth walk: test whether (x1,y1) can be reached from (x0,y0) by
 * stepping only onto pixels that have the same colour as the start pixel
 * and lie inside the rectangle spanned by the two points.
 * The walker always tries the cell (x+dy, y-dx) beside its heading
 * (dx,dy): if that cell is free it turns there and steps onto it,
 * otherwise it turns the other way.
 * Returns 1 when (x1,y1) is reached, 0 when the walker is back at the
 * start with its initial heading component dx==1. */
int joined(pix *p, int x0, int y0, int x1, int y1, int cs){
  int px = x0, py = y0;           /* walker position */
  int hx = 1, hy = 0;             /* walker heading  */
  int xmin, xmax, ymin, ymax;     /* allowed area    */
  int color;                      /* colour of the start pixel, 1=black */

  if (x1 > x0) { xmin = x0; xmax = x1; } else { xmax = x0; xmin = x1; }
  if (y1 > y0) { ymin = y0; ymax = y1; } else { ymax = y0; ymin = y1; }
  color = (getpixel(p, px, py) < cs) ? 1 : 0;

  for (;;) {
    const int sx = px + hy, sy = py - hx;   /* cell beside the walker */
    const int same = (color == ((getpixel(p, sx, sy) < cs) ? 1 : 0));
    if (same && sx >= xmin && sx <= xmax && sy >= ymin && sy <= ymax) {
      /* side cell is free: turn towards it and step onto it */
      const int old_hy = hy;
      hy = -hx;
      hx = old_hy;
      px = sx;
      py = sy;
    } else {
      /* blocked by a wall or the area border: turn the other way */
      const int old_hx = hx;
      hx = -hy;
      hy = old_hx;
    }
    if (px == x1 && py == y1) return 1;
    if (px == x0 && py == y0 && hx == 1) return 0;
  }
}
563
564 /* move from x,y to direction r until pixel of color col is found
565 * or maximum of l steps
566 * return the number of steps done */
/* Starting at (x,y), step in direction r (UP, DO, LE or RI) until the
 * test (getpixel() < cs) ^ col becomes non-zero, the pixmap border is
 * passed or l steps are done.  Returns the number of steps taken; 0 if
 * the start lies outside the pixmap or r is no known direction. */
int loop(pix *p,int x,int y,int l,int cs,int col, DIRECTION r){
  int steps = 0;
  int sx = 0, sy = 0;             /* unit step for direction r */

  if (x < 0 || y < 0 || x >= p->x || y >= p->y)
    return 0;
  switch (r) {
    case UP: sy = -1; break;
    case DO: sy =  1; break;
    case LE: sx = -1; break;
    case RI: sx =  1; break;
    default: return 0;
  }
  while (steps < l && x >= 0 && y >= 0 && x < p->x && y < p->y) {
    if ( (getpixel(p,x,y)<cs)^col )
      break;
    steps++;
    x += sx;
    y += sy;
  }
  return steps;
}
596
597 /* Given a point, frames a rectangle containing all points of the same
598 * color surrounding it, and mark these points.
 * ToDo: obsolete and replaced by frame_vector
600 *
601 * looking for better algo: go horizontally and look for upper/lower non_marked_pixel/nopixel
602 * use lowest three bits for mark
603 * - recursive version removed! AmigaOS has no Stack-OVL-Event
 * run around the shape using laby-robot
605 * bad changes can lead to endless loop!
606 * - this is not absolutely sure but mostly works well
607 * diag - 0: only pi/2 direction, 1: pi/4 directions (diagonal)
608 * mark - 3 bit marker, mark each valid pixel with it
609 */
/* Mark the connected region of equally coloured pixels that contains
 * (x,y) and enlarge the frame *x0..*x1, *y0..*y1 to cover it.
 *   p     - pixmap; marks are OR'ed into p->p[] as (mark & 7)
 *   x,y   - start point; nothing is done if it is out of bounds or
 *           already carries all bits of mark
 *   cs    - threshold, getpixel() < cs means black
 *   mark  - marker bits
 *   diag  - non-zero: also follow diagonal neighbours
 * Returns the pixel count rc accumulated while marking (0 on early
 * exit or if the work buffer cannot be allocated).
 * Two implementations are kept; the "#if 1" flood fill is the active one. */
int frame_nn(pix *p, int x, int y,
             int *x0, int *x1, int *y0, int *y1, // enlarge frame
             int cs, int mark,int diag){
#if 1 /* flood-fill to detect black objects, simple and faster? */
  /* overflow is static: bit 0 = a seed was dropped, bit 1 = warning
   * already printed, so the warning appears only once per process */
  int rc = 0, dx, col, maxstack=0; static int overflow=0;
  int bmax=1024, blen=0, *buf; /* buffer as replacement for recursion stack */

  /* check bounds */
  if (outbounds(p, x, y)) return 0;
  /* check if already marked (with mark since v0.4) */
  if ((marked(p,x,y)&mark)==mark) return 0;

  col = ((getpixel(p, x, y) < cs) ? 0 : 1); /* colour to follow: 0=black 1=white */
  buf=(int *)malloc(bmax*sizeof(int)*2);    /* room for bmax (x,y) pairs */
  if (!buf) { fprintf(stderr,"malloc failed (frame_nn)\n");return 0;}
  buf[0]=x;
  buf[1]=y;
  blen=1;                                   /* the start point is the first seed */

  g_debug(fprintf(stderr,"\nframe_nn x=%4d y=%4d",x,y);)
  for ( ; blen ; ) {
    /* max stack depth is complexity of the object */
    if (blen>maxstack) maxstack=blen;
    blen--; /* reduce the stack */
    x=buf[blen*2+0];
    y=buf[blen*2+1];
    if (y < *y0) *y0 = y;
    if (y > *y1) *y1 = y;
    /* first go to leftmost pixel */
    for ( ; x>0 && (col == ((getpixel(p, x-1, y) < cs) ? 0 : 1)) ; x--);
    if ((marked(p,x,y)&mark)==mark) continue; /* already scanned */
    for (dx=-1;dx<2;dx+=2) /* look at upper and lower line, left */
      if ( diag && x<p->x && x-1>0 && y+dx >=0 && y+dx < p->y
        && col != ((getpixel(p, x , y+dx) < cs) ? 0 : 1)
        && col == ((getpixel(p, x-1, y+dx) < cs) ? 0 : 1)
        && !((marked(p,x-1,y+dx)&mark)==mark)
      ) {
        /* buffer full: remember the overflow and drop this seed */
        if (blen+1>=bmax) { overflow|=1; continue; }
        buf[blen*2+0]=x-1;
        buf[blen*2+1]=y+dx;
        blen++;
      }
    if (x < *x0) *x0 = x;
    /* second go right, mark and get new starting points */
    for ( ; x<p->x && (col == ((getpixel(p, x , y) < cs) ? 0 : 1)) ; x++) {
      p->p[x + y * p->x] |= (mark & 7); rc++; /* mark pixel */
      /* enlarge frame */
      if (x > *x1) *x1 = x;
      for (dx=-1;dx<2;dx+=2) /* look at upper and lower line */
        /* seed a neighbour line only where its run of same colour starts
         * (pixel to the left, here or there, has the other colour) */
        if ( col == ((getpixel(p, x , y+dx) < cs) ? 0 : 1)
          && (
               col != ((getpixel(p, x-1, y ) < cs) ? 0 : 1)
            || col != ((getpixel(p, x-1, y+dx) < cs) ? 0 : 1) )
          && !((marked(p,x,y+dx)&mark)==mark) && y+dx<p->y && y+dx>=0
        ) {
          if (blen+1>=bmax) { overflow|=1; continue; }
          buf[blen*2+0]=x;
          buf[blen*2+1]=y+dx;
          blen++;
        }
    }
    /* here x is one past the right end of the run */
    for (dx=-1;dx<2;dx+=2) /* look at upper and lower line, right */
      if ( diag && x<p->x && x-1>0 && y+dx >=0 && y+dx < p->y
        && col == ((getpixel(p, x-1, y ) < cs) ? 0 : 1)
        && col != ((getpixel(p, x , y ) < cs) ? 0 : 1)
        && col != ((getpixel(p, x-1, y+dx) < cs) ? 0 : 1)
        && col == ((getpixel(p, x , y+dx) < cs) ? 0 : 1)
        && !((marked(p,x,y+dx)&mark)==mark)
      ) {
        if (blen+1>=bmax) { overflow|=1; continue; }
        buf[blen*2+0]=x;
        buf[blen*2+1]=y+dx;
        blen++;
      }
  }

  /* debug, ToDo: use info maxstack and pixels for image classification */
  g_debug(fprintf(stderr," maxstack= %4d pixels= %6d",maxstack,rc);)
  if (overflow==1){   /* first overflow seen: warn once */
    overflow|=2;
    fprintf(stderr,"# Warning: frame_nn stack oerflow\n");
  }
  free(buf);
#else /* old version, ToDo: improve it for tmp04/005*.pgm.gz */
  /* wall-following robot; kept for reference, not compiled */
  int i, j, d, dx, ox, oy, od, nx, ny, rc = 0, rot = 0, x2 = x, y2 = y, ln;

  /* the 8 directions as (x,y) steps */
  static const int d0[8][2] = { { 0, -1} /* up */, {-1, -1},
                                {-1, 0} /* left */, {-1, 1},
                                { 0, 1} /* down */, { 1, 1},
                                { 1, 0} /* right */, { 1, -1}};

  /* check bounds */
  if (outbounds(p, x, y))
    return 0;
  /* check if already marked */
  if ((marked(p,x,y)&mark)==mark)
    return 0;

  i = ((getpixel(p, x, y) < cs) ? 0 : 1); /* colour to follow */
  rc = 0;

  g_debug(fprintf(stderr," start frame:");)

  for (ln = 0; ln < 2 && rot >= 0; ln++) { // repeat if right-loop
    g_debug(fprintf(stderr," ln=%d diag=%d cs=%d x=%d y=%d - go to border\n",ln,diag,cs,x,y);)

    od=d=(8+4*ln-diag)&7; // start robot looks up, right is a wall
    // go to right (left) border
    if (ln==1) {
      x=x2; y=y2;
    }
    /* start on leftmost position */
    for (dx = 1 - 2*ln; x + dx < p->x && x + dx >= 0 /* bounds */ &&
                        i == ((getpixel(p, x + dx, y) < cs) ? 0 : 1) /* color */;
                        x += dx);

    g_debug(fprintf(stderr," ln=%d diag=%d cs=%d x=%d y=%d\n",ln,diag,cs,x,y);)

    /* robot stores start-position */
    ox = x; oy = y;
    for (rot = 0; abs(rot) <= 64; ) { /* for sure max. 8 spirals */
      /* leftmost position */
      if (ln == 0 && x < x2) {
        x2 = x; y2 = y;
      }

      g_debug(fprintf(stderr," xy %3d %3d d=%d i=%d p=%3d rc=%d\n",x,y,d,i,getpixel(p,x,y),rc);)

      if ( abs(d0[d][1]) ) { /* mark left (right) pixels */
        for (j = 0, dx = d0[d][1]; x + j >= 0 && x + j < p->x
               && i == ((getpixel(p, x + j, y) < cs) ? 0 : 1); j += dx) {
          if (!((marked(p, x + j, y)&mark)==mark))
            rc++;
          p->p[x + j + y * p->x] |= (mark & 7);
        }
      }
      /* look to the front of robot */
      nx = x + d0[d][0];
      ny = y + d0[d][1];
      /* if right is a wall */
      if ( outbounds(p, nx, ny) || i != ((getpixel(p,nx,ny)<cs) ? 0 : 1) ) {
        /* rotate left */
        d=(d+2-diag) & 7; rot-=2-diag;
      }
      else { /* if no wall, go, turn back and rotate left */
        x=nx; y=ny; d=(d+4+2-diag) & 7; rot+=2-diag+4;
        /* enlarge frame */
        if (x < *x0) *x0 = x;
        if (x > *x1) *x1 = x;
        if (y < *y0) *y0 = y;
        if (y > *y1) *y1 = y;
      }
      if(x==ox && y==oy && d==od) break; // round trip finished
    }
  }
  g_debug(fprintf(stderr," rot=%d\n",rot);)
#endif
  return rc;
}
769
770 /* obsolete! replaced by vectors
771 * mark neighbouring pixel of same color, return number
772 * better with neighbours of same color (more general) ???
773 * parameters: (&~7)-pixmap, start-point, critical_value, mark
774 * recursion is removed */
/* Mark the region of equally coloured pixels around (x,y) with marker r
 * by delegating to frame_nn(); the frame frame_nn computes is discarded.
 * Diagonal linking follows the lowest bit of OCR_JOB->tmp.n_run.
 * Returns frame_nn()'s pixel count, or 0 if (x,y) is out of bounds or
 * already carries all bits of r. */
int mark_nn(pix * p, int x, int y, int cs, int r) {
  int fx0 = x, fx1 = x;         /* frame required by frame_nn, not used */
  int fy0 = y, fy1 = y;

  if (outbounds(p, x, y))
    return 0;
  if ((marked(p, x, y) & r) == r)
    return 0;
  return frame_nn(p, x, y, &fx0, &fx1, &fy0, &fy1, cs, r,
                  OCR_JOB->tmp.n_run & 1);
}
787
788 /* ToDo: finish to replace old frame by this new one
789 *
790 * @...........#@@@@@@@. # = marked as already scanned black pixels
791 * @........@@@@@@@@@@@# only left and right border
792 * .......#@@@@@@@@@@@@@ left side on even y
793 * ......@@@@@@@@#.@@@@# right side on odd y
794 * .....#@@@@@......#@@@ no border is marked twice
795 * ....@@@@@#......@@@#. works also for thinn lines
796 * ...#@@@@........#@@@. - outer loop is stored as first
797 * ..@@@@#........@@@#.. - inner loop is stored as second
798 * .#@@@@........#@@@@.. 1st in an extra box (think on white chars)
799 * @@@@#.......@@@@#.... 2nd merge in an extra step
800 * #@@@@@....#@@@@@.....
801 * @@@@@@@@@@@@@@#......
802 * .#@@@@@@@@@@@@.......
803 *
804 * run around the chape using laby-robot
805 * - used for scanning boxes, look for horizontal b/w transitions
806 * with unmarked black pixels and call this routine
807 * - stop if crossing a marked box in same direction (left=up, right=down)
808 * box - char box, store frame_vectors and box
809 * x,y - starting point
810 * mark - 3 bit marker, mark each valid pixel with it
811 * diag - 0: only pi/2 direction, 1: pi/4 directions (diagonal)
812 * ds - start direction, 6=right of right border, 2=left of left border
813 * ret - 0=ok, -1=already marked, -2=max_num_frames_exceeded
814 * -7=no border in direction ds
815 */
816 #if 0
817 #undef g_debug
818 #define g_debug(x) x
819 #endif
820 /* grep keywords: scan_vectors frame_vector */
frame_vector(struct box * box1,int x,int y,int cs,int mark,int diag,int ds)821 int frame_vector(struct box *box1, int x, int y,
822 int cs, int mark, int diag, int ds) {
823 int i1, i2, i2o,
824 new_x=1, /* flag for storing the vector x,y */
825 steps=1, /* steps between stored vectors, speedup for big frames */
826 d, /* direction */
827 ox, oy, /* starting point */
828 nx, ny, mx, my, /* used for simplification */
829 /* ToDo: add periphery to box (german: Umfang?) */
830 rc = 1, /* return code, circumference, sum vector lengths */
831 rot = 0, /* memory for rotation, rot=8 means one full rotation */
832 vol = 0; /* volume inside frame, negative for white inside black */
833 pix *p=box1->p;
834
835 /* translate the 8 directions to (x,y) pairs,
836 * if only four directions are used, only every 2nd vector is accessed,
837 * +1 turn left, -1 turn right
838 */
839 static const int d0[8][2] =
840 { { 0, -1}, /* up */ {-1, -1}, /* up-le */
841 {-1, 0}, /* left */ {-1, 1}, /* do-le */
842 { 0, 1}, /* down */ { 1, 1}, /* do-ri */
843 { 1, 0}, /* right */ { 1, -1} }; /* up-ri */
844
845 /* check bounds */
846 if (outbounds(p, x, y))
847 return 0;
848
849 /* pixel color we are looking for, 0=black, 1=white */
850 d = ds;
851 i1 = ((getpixel(p, x, y ) < cs) ? 0 : 1);
852 i2 = ((getpixel(p, x + d0[d][0], y + d0[d][1]) < cs) ? 0 : 1);
853
854 g_debug(fprintf(stderr,"\nLEV2 frame_vector @ %3d %3d d%d %2d %2d"
855 " %d-%d pix=%3d mark=%d cs=%d",\
856 x,y,ds,d0[ds][0],d0[ds][1],i1,i2,getpixel(p,x,y),mark,cs);)
857
858 if (i1==i2){
859 fprintf(stderr,"ERROR frame_vector: no border\n");
860 return -7; /* no border detected */
861 }
862
863 /* initialize boxframe outside this function
864 box1->x0=box1->x1=x;
865 box1->y0=box1->y1=y;
866 */
867
868 /* initialize boxvector outside this function
869 box1->num_frames=0
870 num_frame_vectors[0]=0 ???
871 and store start value
872 */
873 if (box1->num_frames >= MaxNumFrames) return -2;
874 /* index to next (x,y) */
875 i2o=i2=( (box1->num_frames==0)?0:
876 box1->num_frame_vectors[ box1->num_frames ] );
877 #if 0 // obsolete v0.43
878 box1->frame_vector[i2][0]=x;
879 box1->frame_vector[i2][1]=y;
880 i2++;
881 box1->num_frame_vectors[ box1->num_frames ]=i2;
882 #endif
883 box1->num_frames++;
884
885 /* robot stores start-position */
886 ox = x; oy = y; /* look forward to white pixel */
887
888 for (;;) { /* stop if same marked pixel touched */
889
890 g_debug(fprintf(stderr,"\nLEV3: xy %3d %3d d= %d rot= %2d %3d",x,y,d,rot,i2);)
891
892 /* ToDo: store max. abs(rot) ??? for better recognition */
893 if (new_x) {
894 g_debug(fprintf(stderr,"\nLEV2: markB xy= %3d %3d ", x, y);)
895 p->p[x + y * p->x] |= (mark & 7); /* mark black pixel */
896 }
897
898 /* store a new vector or enlarge the predecessor */
899 if (new_x && (rc%steps)==0) { /* dont store everything on big chars */
900 if (i2>=MaxFrameVectors) {
901 box1->num_frame_vectors[ box1->num_frames-1 ]=i2;
902 reduce_vectors(box1,1); /* simplify loop */
903 i2=box1->num_frame_vectors[ box1->num_frames-1 ];
904 /* enlarge steps on big chars getting speedup */
905 steps=(box1->y1-box1->y0+box1->x1-box1->x0)/32+1;
906 }
907 /* store frame-vector */
908 if (i2<MaxFrameVectors) {
909 box1->frame_vector[i2][0]=x;
910 box1->frame_vector[i2][1]=y;
911 /* test if older vector points to the same direction */
912 if (i2>1) {
913 /* get predecessor */
914 nx=box1->frame_vector[i2-1][0]-box1->frame_vector[i2-2][0];
915 ny=box1->frame_vector[i2-1][1]-box1->frame_vector[i2-2][1];
916 mx=x -box1->frame_vector[i2-1][0];
917 my=y -box1->frame_vector[i2-1][1];
918 /* same direction? */
919 if (nx*my-ny*mx==0 && nx*mx>=0 && ny*my>=0) {
920 /* simplify by removing predecessor */
921 i2--;
922 box1->frame_vector[i2][0]=x;
923 box1->frame_vector[i2][1]=y;
924 } /* do not simplify */
925 }
926 i2++;
927 box1->num_frame_vectors[ box1->num_frames-1 ]=i2;
928 }
929 g_debug(fprintf(stderr," stored @ %3d steps= %d", i2-1, steps);)
930 }
931 new_x=0; /* work for new pixel (x,y) done */
932
933 /* check if round trip is finished */
934 if (x==ox && y==oy && abs(rot)>=8) break;
935
936 /* look to the front of robot (turtle or ant) */
937 nx = x + d0[d][0];
938 ny = y + d0[d][1];
939
940 /* next step, if right is a wall turn the turtle left */
941 if ( outbounds(p, nx, ny) || i1 != ((getpixel(p,nx,ny)<cs) ? 0 : 1) ) {
942 if (y==ny && nx>=0 && nx<p->x) { /* if inbound */
943 g_debug(fprintf(stderr,"\nLEV2: markW xy= %3d %3d ", nx, ny);)
944 p->p[nx + ny * p->x] |= (mark & 7); /* mark white pixel */
945 }
946 /* rotate left 90 or 45 degrees */
947 d=(d+2-diag) & 7; rot+=2-diag;
948 /* calculate volume inside frame */
949 switch (d+diag) {
950 case 2+2: vol-=x-1; break;
951 case 6+2: vol+=x; break;
952 }
953 }
954 else { /* if no wall, go forward and turn right (90 or 45 degrees) */
955 x=nx; y=ny;
956 /* turn back and rotate left */
957 d=(d+4+2-diag) & 7; rot+=2-diag-4;
958 rc++; /* counting steps, used for speedup */
959
960 /* enlarge frame */
961 if (x < box1->x0) box1->x0 = x;
962 if (x > box1->x1) box1->x1 = x;
963 if (y < box1->y0) box1->y0 = y;
964 if (y > box1->y1) box1->y1 = y;
965
966 new_x=1;
967 }
968 }
969
970 /* to distinguish inner and outer frames, store volume as +v or -v */
971 box1->frame_vol[ box1->num_frames-1 ] = vol;
972 box1->frame_per[ box1->num_frames-1 ] = rc-1;
973
974 /* dont count and store the first vector twice */
975 if (i2-i2o>1) {
976 i2--; rc--; box1->num_frame_vectors[ box1->num_frames-1 ]=i2;
977 }
978 /* output break conditions */
979 g_debug(fprintf(stderr,"\nLEV2 o= %3d %3d xy %3d %3d r=%d v=%d",ox,oy,x,y,rot,vol);)
980 /* rc=1 for a single point, rc=2 for a two pixel sized point */
981 g_debug(fprintf(stderr," steps= %3d vectors= %3d",rc,i2);)
982 /* out_x(box1); ToDo: output only the first thousend */
983 return rc; /* return number of bordering pixels = periphery? */
984 }
985
986
987
988 /* clear lowest 3 (marked) bits (they are used for marking) */
clr_bits(pix * p,int x0,int x1,int y0,int y1)989 void clr_bits(pix * p, int x0, int x1, int y0, int y1) {
990 int x, y;
991 for ( y=y0; y <= y1; y++)
992 for ( x=x0; x <= x1; x++)
993 p->p[x+y*p->x] &= ~7;
994 }
995
996 /* look for white holes surrounded by black points
997 * at the moment look for white point with black in all four directions
998 * - store position of hole in coordinates relativ to box!
999 * ToDo: count only holes with vol>10% ???
1000 * ToDo: rewrite for frame vectors (faster, no malloc)
1001 * holes are frames rotating left hand
1002 * obsolete, do it with vectors
1003 */
num_hole(int x0,int x1,int y0,int y1,pix * p,int cs,holes_t * holes)1004 int num_hole(int x0, int x1, int y0, int y1, pix * p, int cs, holes_t *holes) {
1005 int num_holes = 0, x, y, hole_size;
1006 pix b; // temporary mini-page
1007 int dx = x1 - x0 + 1, dy = y1 - y0 + 1;
1008 unsigned char *buf; // 2nd copy of picture, for working
1009
1010 if (holes) holes->num=0;
1011 if(dx<3 || dy<3) return 0;
1012 b.p = buf = (unsigned char *) malloc( dx * dy );
1013 if( !buf ){
1014 fprintf( stderr, "\nFATAL: malloc(%d) failed, skip num_hole", dx*dy );
1015 return 0;
1016 }
1017 if (copybox(p, x0, y0, dx, dy, &b, dx * dy))
1018 { free(b.p); return -1;}
1019
1020 // printf(" num_hole(");
1021 /* --- mark white-points connected with border */
1022 for (x = 0; x < b.x; x++) {
1023 if (getpixel(&b, x, 0) >= cs)
1024 mark_nn(&b, x, 0, cs, AT);
1025 if (getpixel(&b, x, b.y - 1) >= cs)
1026 mark_nn(&b, x, b.y - 1, cs, AT);
1027 }
1028 for (y = 0; y < b.y; y++) {
1029 if (getpixel(&b, 0, y) >= cs)
1030 mark_nn(&b, 0, y, cs, AT);
1031 if (getpixel(&b, b.x - 1, y) >= cs)
1032 mark_nn(&b, b.x - 1, y, cs, AT);
1033 }
1034
1035 g_debug(out_b(NULL,&b,0,0,b.x,b.y,cs);)
1036 // --- look for unmarked white points => hole
1037 for (x = 0; x < b.x; x++)
1038 for (y = 0; y < b.y; y++)
1039 if (!((marked(&b, x, y)&AT)==AT)) // unmarked
1040 if (getpixel(&b, x, y) >= cs) { // hole found
1041 #if 0
1042 hole_size=mark_nn(&b, x, y, cs, AT); /* old version */
1043 if (hole_size > 1 || dx * dy <= 40)
1044 num_holes++;
1045 #else
1046 { /* new version, for future store of hole characteristics */
1047 int x0, x1, y0, y1, i, j;
1048 x0 = x1 = x;
1049 y0 = y1 = y; // not used
1050 hole_size=frame_nn(&b, x, y, &x0, &x1, &y0, &y1, cs, AT, OCR_JOB->tmp.n_run & 1);
1051 // store hole for future use, num is initialized with 0
1052 if (hole_size > 1 || dx * dy <= 40){
1053 num_holes++;
1054 if (holes) {
1055 // sort in table
1056 for (i=0;i<holes->num && i<MAX_HOLES;i++)
1057 if (holes->hole[i].size < hole_size) break;
1058 for (j=MAX_HOLES-2;j>=i;j--)
1059 holes->hole[j+1]=holes->hole[j];
1060 if (i<MAX_HOLES) {
1061 // printf(" i=%d size=%d\n",i,hole_size);
1062 holes->hole[i].size=hole_size;
1063 holes->hole[i].x=x;
1064 holes->hole[i].y=y;
1065 holes->hole[i].x0=x0;
1066 holes->hole[i].y0=y0;
1067 holes->hole[i].x1=x1;
1068 holes->hole[i].y1=y1;
1069 }
1070 holes->num++;
1071 }
1072 }
1073 }
1074 #endif
1075 }
1076 free(b.p);
1077 // printf(")=%d",num_holes);
1078 return num_holes;
1079 }
1080
1081 /* count for black nonconnected objects --- used for i,auml,ouml,etc. */
1082 /* ToDo: obsolete, replaced by vectors and box.num_boxes */
num_obj(int x0,int x1,int y0,int y1,pix * p,int cs)1083 int num_obj(int x0, int x1, int y0, int y1, pix * p, int cs) {
1084 int x, y, rc = 0; // rc=num_obj
1085 unsigned char *buf; // 2nd copy of picture, for working
1086 pix b;
1087
1088 if(x1<x0 || y1<y0) return 0;
1089 b.p = buf = (unsigned char *) malloc( (x1-x0+1) * (y1-y0+1) );
1090 if( !buf ){
1091 fprintf( stderr, "\nFATAL: malloc(%d) failed, skip num_obj",(x1-x0+1)*(y1-y0+1) );
1092 return 0;
1093 }
1094 if (copybox(p, x0, y0, x1 - x0 + 1, y1 - y0 + 1, &b, (x1-x0+1) * (y1-y0+1)))
1095 { free(b.p); return -1; }
1096 // --- mark black-points connected with neighbours
1097 for (x = 0; x < b.x; x++)
1098 for (y = 0; y < b.y; y++)
1099 if (getpixel(&b, x, y) < cs)
1100 if (!((marked(&b, x, y)&AT)==AT)) {
1101 rc++;
1102 mark_nn(&b, x, y, cs, AT);
1103 }
1104 free(b.p);
1105 return rc;
1106 }
1107
#if 0
// ----------------------------------------------------------------------
// first idea for making recognition based on probability
//  - start with a list of all possible chars
//  - call recognition_of_char(box *)
//  - remove chars from list which could clearly excluded
//  - reduce probability of chars which have wrong features
//  - font types list could also build
// at the moment it is only an idea, I should put it to the todo list
// (disabled code, kept for reference)
//
// candidate characters; the three hex escapes are the Latin-1 codes of
// a-umlaut (228), o-umlaut (246) and u-umlaut (252).
// They were written as "\0xe4\0xf6\0xfc" before, which embeds NUL bytes
// and so ended the list right after ",.".
char *list="0123456789,.\xe4\xf6\xfc"
           "abcdefghijklmnopqrstuvwxyz"
           "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
int wert[100];              // per character exclusion counter (0 = possible)
int listlen=0,numrest=0;    // length of list, number of chars still possible
// initialize a new character list (for future):
// clear all counters, every character of list is possible again
void ini_list(){ int i;
    for(i=0;list[i]!=0 && i<100;i++) wert[i]=0;
    numrest=listlen=i; }
// exclude??? (for future) oh it was long time ago, I wrote that :/
// every character of filt found in list gets its counter increased,
// numrest is reduced when a character is excluded for the first time
void exclude(char *filt){ int i,j;
    for(j=0;filt[j]!=0 && j<100;j++)
    for(i=0;list[i]!=0 && i<100;i++)
    if( filt[j]==list[i] ) { if(!wert[i])numrest--; wert[i]++; } }
// get the result after all the work (for future):
// the only character left if exactly one remains, otherwise '_'
char getresult(){ int i;
    if( numrest==1 )
    for(i=0;list[i]!=0 && i<100;i++) if(!wert[i]) return list[i];
    return '_';
 }
#endif
1139
1140 // look at the environment of the pixel too (contrast etc.)
1141 // detailed analysis only of diff pixels!
1142 //
1143 // 100% * "distance", 0 is ideal fit
1144 // = similarity of two chars for recognition of garbled (verstuemmelter) chars
1145 // weight of pixels with only one same neighbour set to 0
1146 // look at contours too! v0.2.4: B==H
1147 // changed for v0.41, Mar06
distance(pix * p1,struct box * box1,pix * p2,struct box * box2,int cs)1148 int distance( pix *p1, struct box *box1,
1149 pix *p2, struct box *box2, int cs){
1150 int rc=0,x,y,v1,v2,i1,i2,rgood=0,rbad=0,x1,y1,x2,y2,dx,dy,dx1,dy1,dx2,dy2;
1151 x1=box1->x0;y1=box1->y0;x2=box2->x0;y2=box2->y0;
1152 dx1=box1->x1-box1->x0+1; dx2=box2->x1-box2->x0+1; dx=((dx1>dx2)?dx1:dx2);
1153 dy1=box1->y1-box1->y0+1; dy2=box2->y1-box2->y0+1; dy=((dy1>dy2)?dy1:dy2);
1154 if(abs(dx1-dx2)>1+dx/16 || abs(dy1-dy2)>1+dy/16) return 100;
1155 // compare relations to baseline and upper line
1156 if(2*box1->y1>box1->m3+box1->m4 && 2*box2->y1<box2->m3+box2->m4) rbad+=128;
1157 if(2*box1->y0>box1->m1+box1->m2 && 2*box2->y0<box2->m1+box2->m2) rbad+=128;
1158 // compare pixels
1159 for( y=0;y<dy;y++ )
1160 for( x=0;x<dx;x++ ) { // try global shift too ???
1161 v1 =((getpixel(p1,x1+x ,y1+y )<cs)?1:0); i1=8; // better gray?
1162 v2 =((getpixel(p2,x2+x ,y2+y )<cs)?1:0); i2=8; // better gray?
1163 if(v1==v2) { rgood+=8; continue; } // all things are right!
1164 // what about different pixel???
1165 // test overlap of 8 surounding pixels ??? bad if two nb. are bad
1166 v1=-1;
1167 for(i1=-1;i1<2;i1++)
1168 for(i2=-1;i2<2;i2++)if(i1!=0 || i2!=0){
1169 if( ((getpixel(p1,x1+x+i1*(1+dx/32),y1+y+i2*(1+dy/32))<cs)?1:0)
1170 !=((getpixel(p2,x2+x+i1*(1+dx/32),y2+y+i2*(1+dy/32))<cs)?1:0) ) v1++;
1171 }
1172 if (v1>0) rbad+=16*v1;
1173 else rbad++;
1174 }
1175 if(rgood+rbad) rc= (100*rbad+(rgood+rbad-1))/(rgood+rbad); else rc=99;
1176 if(rc<10 && OCR_JOB->cfg.verbose & 7){
1177 fprintf(stderr,"\n# distance rc=%d good=%d bad=%d",rc,rgood,rbad);
1178 // out_x(box1);out_x(box2);
1179 }
1180 return rc;
1181 }
1182
1183
1184
1185 // ============================= call OCR engine ================== ;)
1186 // nrun=0 from outside, nrun=1 from inside (allows modifications, oobsolete)
whatletter(struct box * box1,int cs,int nrun)1187 wchar_t whatletter(struct box *box1, int cs, int nrun){
1188 wchar_t bc=UNKNOWN; // best letter
1189 wchar_t um=SPACE; // umlaut? '" => modifier
1190 pix *p=box1->p; // whole image
1191 int x,y,dots,xa,ya,x0,x1,y0,y1,dx,dy,i;
1192 pix b; // box
1193 struct box bbuf=*box1; // restore after modifikation!
1194
1195 if (box1->num_ac>0 && box1->wac[0]>=OCR_JOB->cfg.certainty && bc==UNKNOWN) {
1196 bc=box1->tac[0];
1197 }
1198 // if (bc!=UNKNOWN) return bc;
1199 // if whatletter() called again, only unknown chars are processed
1200 // bad for splitting!
1201
1202 // store box data, which can be modified for modified chars in 2nd run
1203 bbuf.x0=box1->x0; bbuf.y0=box1->y0;
1204 bbuf.x1=box1->x1; bbuf.y1=box1->y1;
1205
1206 xa=box1->x; ya=box1->y;
1207 x0=box1->x0; y0=box1->y0;
1208 x1=box1->x1; y1=box1->y1;
1209 // int vol=(y1-y0+1)*(x1-x0+1); // volume
1210 // crossed l-m , divided chars
1211 while( get_bw(x0,x1,y0,y0,p,cs,1)!=1 && y0+1<y1) y0++;
1212 while( get_bw(x0,x1,y1,y1,p,cs,1)!=1 && y0+1<y1) y1--;
1213 dx=x1-x0+1;
1214 dy=y1-y0+1; // size
1215
1216 // better to proof the white frame too!!! ????
1217 // --- test for german umlaut and points above, not robust enough???
1218 // if three chars are connected i-dots (ari) sometimes were not detected
1219 // - therefore after division a test could be useful
1220 // modify y0 only in second run!?
1221 // we need it here to have the right copybox
1222 if (um==SPACE && dy>5 && box1->num_boxes>1)
1223 testumlaut(box1,cs,2,&um); /* set box1->modifier + new y0 */
1224
1225 dots=box1->dots;
1226 y0 =box1->y0; // dots==2 => y0 below double dots
1227 dy =y1-y0+1;
1228
1229 // move upper and lower border (for divided letters)
1230 while( get_bw(x0,x1,y0,y0,p,cs,1)==0 && y0+1<y1) y0++;
1231 while( get_bw(x0,x1,y1,y1,p,cs,1)==0 && y0+1<y1) y1--;
1232 while( get_bw(x0,x0,y0,y1,p,cs,1)==0 && x0+1<x1) x0++;
1233 while( get_bw(x1,x1,y0,y1,p,cs,1)==0 && x0+1<x1) x1--;
1234 dx=x1-x0+1;
1235 dy=y1-y0+1; // size
1236 box1->x0=x0; box1->y0=y0; // set reduced frame
1237 box1->x1=x1; box1->y1=y1;
1238
1239 // set good startpoint (probably bad from division)?
1240 if( xa<x0 || xa>x1 || ya<y0 || ya>y1
1241 || getpixel(p,xa,ya)>=cs /* || 2*ya<y0+y1 */ || dots>0 ){
1242 // subfunction? also called after division of two glued chars?
1243 for(y=y1;y>=y0;y--) // low to high (not i-dot)
1244 for(x=(x0+x1)/2,i=0;x>=x0 && x<=x1;i++,x+=((2*i&2)-1)*i) /* is that ok? */
1245 if (getpixel(p,x,y)<cs && (getpixel(p,x+1,y)<cs
1246 || getpixel(p,x,y+1)<cs)){ xa=x;ya=y;y=-1;break; }
1247 /* should box1->x,y be set? */
1248 }
1249
1250 // ----- create char-only-box -------------------------------------
1251 // ToDo: this will be obsolete if vectors are used only
1252 if(dx<1 || dy<1) return bc; /* should not happen */
1253 b.p = (unsigned char *) malloc( dx * dy );
1254 if (!b.p) fprintf(stderr,"Warning: malloc failed L%d\n",__LINE__);
1255 if( copybox(p,x0,y0,dx,dy,&b,dx*dy) )
1256 { free(b.p); return bc; }
1257 // clr_bits(&b,0,b.x-1,0,b.y-1);
1258 // ------ use diagonal too (only 2nd run?)
1259 /* following code failes on ! and ? obsolete if vectors are used
1260 ToDo:
1261 - mark pixels neighoured to pixels outside and remove them from &b
1262 v0.40
1263 will be replaced by list of edge vectors
1264 - mark accents, dots and remove them from &b
1265 */
1266 #if 1 /* becomes obsolate by vector code */
1267 if (y0>0) // mark upper overlap
1268 for ( x=x0; x<=x1; x++) {
1269 if (getpixel(p,x,y0-1)<cs
1270 && getpixel(p,x,y0 )<cs && (marked(&b,x-x0,0)&1)!=1)
1271 mark_nn(&b,x-x0,0,cs,1);
1272 }
1273 if (x0>0) // mark left overlap
1274 for ( y=y0; y<=y1; y++) {
1275 if (getpixel(p,x0-1,y)<cs
1276 && getpixel(p,x0 ,y)<cs && (marked(&b,0,y-y0 )&1)!=1)
1277 mark_nn(&b,0,y-y0,cs,1);
1278 }
1279 if (x1<p->x-1) // mark right overlap
1280 for ( y=y0; y<=y1; y++) {
1281 if (getpixel(p,x1+1,y)<cs
1282 && getpixel(p,x1 ,y)<cs && (marked(&b,x1-x0,y-y0)&1)!=1)
1283 mark_nn(&b,x1-x0,y-y0,cs,1);
1284 }
1285 mark_nn(&b,xa-x0,ya-y0,cs,2); // not glued chars
1286 for(x=0;x<b.x;x++)
1287 for(y=0;y<b.y;y++){
1288 if ( (marked(&b,x,y )&3)==1 && getpixel(&b,x,y )<cs )
1289 b.p[x+y*b.x] = 255&~7; /* reset pixel */
1290 }
1291 #endif
1292
1293 // if (bc == UNKNOWN) // cause split to fail
1294 bc=ocr0(box1,&b,cs);
1295
1296 /* ToDo: try to change pixels near cs?? or melt? */
1297 if (box1->num_ac>0 && box1->wac[0]>=OCR_JOB->cfg.certainty && bc==UNKNOWN) {
1298 bc=box1->tac[0];
1299 }
1300
1301 if (um!=0 && um!=SPACE && bc<127) { /* ToDo: is that obsolete now? */
1302 wchar_t newbc;
1303 newbc = compose(bc, um );
1304 if (newbc == bc) { /* nothing composed */
1305 if(OCR_JOB->cfg.verbose & 7)
1306 fprintf(stderr, "\nDBG whatletter: compose(%s) was useless (%d,%d)",
1307 decode(bc,ASCII), box1->x0, box1->y0);
1308 // if(OCR_JOB->cfg.verbose & 6) out_x(box1);
1309 }
1310 bc = newbc;
1311 }
1312 // restore modified boxes
1313 box1->x0=bbuf.x0; box1->y0=bbuf.y0;
1314 box1->x1=bbuf.x1; box1->y1=bbuf.y1;
1315 // if (box1->c==UNKNOWN) out_b(box1,&b,0,0,dx,dy,cs); // test
1316
1317 free(b.p);
1318 return bc;
1319 }
1320
1321 /*
1322 ** creates a list of boxes/frames around objects detected
1323 ** on the pixmap p for further work
1324 ** returns number of boxes created.
1325 ** - by the way: get average X, Y (avX=sumX/numC,..)
1326 ** ToDo18?: do not put diagonal touched fat objects? easier to melt than to
1327 ** divide boxes? or for bold fonts (min-xpixels bigger 1?)
1328 */
scan_boxes(job_t * job,pix * p)1329 int scan_boxes( job_t *job, pix *p ){
1330 int x, y, nx, cs, rc, ds;
1331 struct box *box3;
1332 // job_t *job=OCR_JOB; /* fixme */
1333
1334 if (job->cfg.verbose)
1335 fprintf(stderr,"# scan_boxes");
1336
1337 cs = job->cfg.cs;
1338 job->res.sumX = job->res.sumY = job->res.numC = 0;
1339
1340 /* clear the lowest bits of each pixel, later used as "scanned"-marker */
1341 /* so boxes can overlap like bold "To" (proportional-font) */
1342 clr_bits( p, 0, p->x - 1, 0, p->y - 1);
1343
1344 for (y=0; y < p->y; y++)
1345 for (x=0; x < p->x; x++) // ds = direction to go 2=left 6=right
1346 for (ds=2; ds<7; ds+=4) { // NO - dust of size 1 is not removed !!!
1347 nx=x+((ds==2)?-1:+1);
1348 if (nx<0 || nx>=p->x) continue; /* out of image, ex: recframe */
1349 if ( getpixel(p, x,y)>=cs || getpixel(p,nx,y)< cs) // b/w transition?
1350 continue;
1351 if ((marked(p, x,y) & 1)&&(marked(p, nx, y) & 1))
1352 continue;
1353 /* non-marked b/w-transition found, start boxing connected pixels */
1354 /* check (and mark) only horizontal b/w transitions */
1355 // --- insert new box in list
1356 box3 = (struct box *)malloc_box(NULL);
1357 box3->x0=box3->x1=box3->x=x;
1358 box3->y0=box3->y1=box3->y=y;
1359 box3->num_frames=0;
1360 box3->dots=0;
1361 box3->num_boxes=1;
1362 box3->num_subboxes=0;
1363 box3->modifier='\0';
1364 box3->num=job->res.numC;
1365 box3->line=0; // not used here
1366 box3->m1=0; box3->m2=0; box3->m3=0; box3->m4=0;
1367 box3->p=p;
1368 box3->num_ac=0; // for future use
1369
1370 /* frame, vectorize and mark only odd/even horizontal b/w transitions
1371 * args: box, x,y, cs, mark, diag={0,1}, ds={2,6}
1372 * ds - start direction, 6=right of right border, 2=left of left border
1373 * ret - 0=ok, -1=already marked, -2=max_num_frames_exceeded
1374 * -7=no border in direction ds
1375 * ToDo: count errors and print out for debugging
1376 */
1377 rc=frame_vector(box3, x, y, cs, 1, 1, ds);
1378 g_debug(fprintf(stderr,"\n# ... scan xy= %3d %3d rc= %2d", x, y, rc);)
1379 if (rc<0) { free_box(box3); continue; }
1380 if (box3->num_frames && !box3->num_frame_vectors[0])
1381 fprintf(stderr,"\nERROR scan_boxes: no vector in frame (%d,%d)",x,y);
1382
1383 job->res.numC++;
1384 job->res.sumX += box3->x1 - box3->x0 + 1;
1385 job->res.sumY += box3->y1 - box3->y0 + 1;
1386
1387 box3->c=(((box3->y1-box3->y0+1)
1388 *(box3->x1-box3->x0+1)>=MaxBox)? PICTURE : UNKNOWN);
1389 list_app(&(job->res.boxlist), box3); // append to list
1390 // ToDo: debug
1391 // if (job->cfg.verbose && box3->y0==29) out_x(box3);
1392 }
1393 if(job->res.numC){
1394 if (job->cfg.verbose)
1395 fprintf(stderr," nC= %3d avD= %2d %2d\n",job->res.numC,
1396 (job->res.sumX+job->res.numC/2)/job->res.numC,
1397 (job->res.sumY+job->res.numC/2)/job->res.numC);
1398 }
1399 return job->res.numC;
1400 }
1401
/* compare ints for sorting (qsort callback). Return -1, 0, or 1 according
   to whether *vr < *vs, *vr == *vs, or *vr > *vs.
   The values are read through const pointers, the elements are never
   modified. */
int
intcompare (const void *vr, const void *vs)
{
  const int r = *(const int *)vr;
  const int s = *(const int *)vs;

  /* (r > s) - (r < s) gives -1/0/1 without the overflow risk of r - s */
  return (r > s) - (r < s);
}
1414
1415 /*
1416 * measure_pitch - detect monospaced font and measure the pitch
1417 * measure overall pitch for difficult lines,
1418 * after that measure pitch per line
1419 * dists arrays are limited to 1024 elements to reduce
1420 * cpu usage for qsort on images with extreme high number of objects
1421 * insert space if dist>=pitch in list_insert_spaces()
1422 * ToDo: ???
1423 * - min/max distance-matrix a-a,a-b,a-c,a-d ... etc; td,rd > ie,el,es
1424 * - OR measuring distance as min. pixel distance instead of box distance
1425 * especially useful for italic font!
1426 * - Kerning detection? minspace<=0 ???
1427 * - iterate minMono+maxMonoWidth and count fitting and misfitting pairs
1428 * Lit:
1429 * http://en.wikibooks.org/wiki/LaTeX/Formatting
1430 * #The_Space_between_Words_and_Sentences
1431 * \frenchspacing == no extra space after periods (word vs. sentences)
1432 * \sloppypar == some spaces between words may be to large
1433 * inter word space
1434 * http://en.wikipedia.org/wiki/Space_(punctuation)
1435 * Variable-width general-purpose space == 1/5-em to 1/3-em
1436 * http://en.wikipedia.org/wiki/Em_(typography)
1437 * em = absolute maximum high,
1438 * median cap height=0.70em,
1439 * x-height=1ex=0.45..0.48..0.5em
1440 * http://en.wikipedia.org/wiki/En_(typography) = n-width=0.5em
1441 * http://pfaedit.sourceforge.net/glossary.html#overshoot
1442 * i: left + right side bearing (character specifique, may be negative: VA)
1443 * http://en.wikipedia.org/wiki/Typeface
1444 * http://en.wikipedia.org/wiki/Letter-spacing
1445 * http://en.wikipedia.org/wiki/Tracking_(typography) # Overlap VA
1446 * http://en.wikipedia.org/wiki/Kerning # Overlap VA AT Tx etc.
1447 * similar blank 2D-area between pairs of characters
1448 * Helvetica: ry=+30 AV=-80 units?
1449 * ToDo18: better mono detection
1450 * 1st round min. mono_width = max char_width (except melted chars)
1451 * 2nd round max. mono_width = min x0-pre.x0, x1-pre.x1 (+check against min_mono_em)
 *   3rd round if something does not fit, mono=0
1453 *
1454 */
measure_pitch(job_t * job)1455 void measure_pitch( job_t *job ){ /* word spacing */
1456 int numdists=0, spc=0, /* number of stored distances */
1457 pitch_p=2, pdist, pdists[1024], /* proportional distances */
1458 pitch_m=10, /* monospaced em width */
1459 monospaced=1, l1, char_width_min=1023, char_width_max=0,
1460 mono_em_min=0, // maximum monospace char width + 1 2010-09-25
1461 mono_em_max=2047, // minimum distance left side of two chars
1462 d1l, d1r; // left-left and right-right distance of 2 chars
1463 int d1, d2; // temporary vars, d1l + d1r sorted
1464 struct box *box2, *pre1=NULL, *pre2=NULL;
1465
1466 if(job->cfg.verbose){ fprintf(stderr,"# check for word pitch"); }
1467 for (l1=0; l1<job->res.lines.num; l1++)
1468 { /* 0 means all lines */
1469 if(job->cfg.verbose){ fprintf(stderr,"\n# line %2d\n# ...",l1); }
1470 numdists = 0; /* clear distance lists */
1471 monospaced=1; mono_em_min=0; mono_em_max=2047; // reset, 2010-09-28
1472 char_width_min=1023; char_width_max=0; // reset, 2010-09-28
1473 for_each_data(&(job->res.boxlist)) {
1474 box2 = (struct box *)list_get_current(&(job->res.boxlist));
1475 if (l1>0 && box2->line!=l1) continue; /* ignore other lines */
1476 /* ignore dots and pictures (min. font is 4x6) */
1477 if (box2->y1 - box2->y0 + 1 < 4 || box2->c==PICTURE) pre2=pre1=NULL;
1478 if (!pre1) { pre1=box2; continue; } /* we need a predecessor */
1479 if (pre1 && pre1->line != box2->line) { pre1=box2; continue; } /* 201809 */
1480 /* use gap for proportional fonts */
1481 pdist = box2->x0 - pre1->x1 - 1; /* do not add 1, subtract 1 ! */
1482 if (pdist<0) { // new line
1483 pre2=NULL; pre1=box2; continue; }
1484 if ((box2->x1 - box2->x0 + 1)
1485 >2*(box2->y1 - box2->y0 + 1)) { // skip long object
1486 continue; }
1487 if ((pre1->x1 - pre1->x0 + 1)
1488 >2*(pre1->y1 - pre1->y0 + 1)) { // skip long object
1489 pre1=box2; continue; }
1490 // JS-2010-09 sample spaces20100910.jpg 7 chars, fix bad auto space
1491 if (char_width_min > box2->x1 - box2->x0 + 1)
1492 char_width_min = box2->x1 - box2->x0 + 1;
1493 if (box2->x1 - box2->x0 < 4*(pre1->x1 - pre1->x0)) // ~ big lines
1494 if (char_width_max < box2->x1 - box2->x0 + 1)
1495 char_width_max = box2->x1 - box2->x0 + 1;
1496 // may cause problems if "_" is of width em (not em-1 like mwMW etc.)
1497 if (mono_em_min < char_width_max + 1)
1498 mono_em_min = char_width_max + 1; // minimum monospaced width
1499
1500 // will fail on monospaced fonts where chars are not centered
1501 if (pre1) { // 2010-09-28
1502 d1l = box2->x0 - pre1->x0; // left to left distance
1503 d1r = box2->x1 - pre1->x1; // right to right distance
1504 if (d1l > d1r) { d1=d1r; d2=d1l; } // thinner char on the right
1505 else { d1=d1l; d2=d1r; } // thicker char on the right
1506 /* d1 < 2*width && d2 < 2*width, may fail for "IIIM" d2<2*max OK */
1507 if (d1>0 && d1 < 2*char_width_max && d2 < 2*mono_em_max) {
1508 if (mono_em_min<d1-1) mono_em_min = d1; }
1509 if (d1>0) {
1510 if (mono_em_max>d2+2) mono_em_max = d2; } // not best, shifted ()
1511 // 2010-10-06 examples/ocr-b add -1 +2, bad for "()"
1512 #if 1
1513 if ((48 & job->cfg.verbose) == 48)
1514 if (monospaced && l1) // debugging until monospaced=0
1515 fprintf(stderr," L%02d DBG1 x %3d %+4d %3d %+4d d %3d %3d"
1516 " em %2d %2d ex %2d\n# ...",
1517 l1, pre1->x0, pre1->x1-pre1->x0+1,
1518 box2->x0, box2->x1-box2->x0+1, d1, d2,
1519 mono_em_min, mono_em_max, char_width_max);
1520 #endif
1521 }
1522 #if 1 // needed for correct spacing of last line of tmp08/0810CSchulze_crop
1523 if (pre2) {
1524 d1l = box2->x0 - pre2->x0; // left to left distance
1525 d1r = box2->x1 - pre2->x1; // right to right distance
1526 if (d1l > d1r) { d1=d1r; d2=d1l; } // thinner char on the right
1527 else { d1=d1l; d2=d1r; } // thicker char on the right
1528 if (d1>0 && d1 < 3*char_width_max && d2 < 3*mono_em_max) {
1529 if (2*mono_em_min<d1) mono_em_min = (d1+1)/2; }
1530 if (d1>0) {
1531 if (2*mono_em_max>d2) mono_em_max = (d2+1)/2; }
1532 #if 1
1533 if ((48 & job->cfg.verbose) == 48)
1534 if (monospaced && l1) // debugging until monospaced=0
1535 fprintf(stderr," L%02d DBG2 x %3d %+4d %3d %+4d d %3d %3d"
1536 " em %2d %2d ex %2d\n# ...",
1537 l1, pre2->x0, pre2->x1-pre2->x0+1,
1538 box2->x0, box2->x1-box2->x0+1, d1, d2,
1539 mono_em_min, mono_em_max, char_width_max);
1540 #endif
1541 }
1542 #endif
1543
1544 // the upper part does good work, we do not need this stuff ... ???
1545 #if 0
1546 // min distance between next neighbours of pre
1547 if (pre2 && 1 < box2->x0 - pre2->x1)
1548 if (mono_em_max > box2->x0 - pre2->x1)
1549 mono_em_max = box2->x0 - pre2->x1;
1550 // ToDo: could be a problem for " ???
1551 if (pre2)
1552 if (pre1->x1 - pre1->x0 >= mono_em_min) // best max mono_dx
1553 if (pre1->x1 - pre1->x0 == box2->x1 - box2->x0) // best max mono_dx
1554 if (mono_em_max > box2->x0 - pre1->x0)
1555 mono_em_max = box2->x0 - pre1->x0;
1556 /* ToDo: better take 3 instead of 2 neighbours?, smallest font 4x6 */
1557 /* tmp08/gocr0801_bad5.jpg was not mono, need 2 to 3 chars */
1558 /* 2010-09-27 gives precise range! 16..22 to 16..17 */
1559 /* ToDo: no 2 char variant? */
1560 if (pre2 && 1 < box2->x0 - pre2->x1)
1561 if (box2->x0-pre1->x1+1 < mono_em_min) // no spc between char + pre1
1562 if (pre1->x0-pre2->x1+1 < mono_em_min) // no spc between pre1 + pre2
1563 {
1564 if (3*mono_em_min < box2->x1 - pre2->x0)
1565 mono_em_min = (box2->x1 - pre2->x0 + 2)/3;
1566 }
1567 #endif
1568 //# tmp09/oebb_teletext_836_0001_sw.png
1569 //# line 4 12 - 12 pre2 134 142 181 190
1570 //# 0 8 47 56
1571 //# 0 12 24 36 48
1572 // n=2: (n-1)*min < d1 <= (n )*max && (2*n+1)*max < (2*n+2)*min
1573 // (n )*min < d2 <= (n+1)*max && (2*n+2)*max < (2*n+3)*min
1574 if (monospaced && pre1) { // check 2 chars for non mono space within
1575 d1l = box2->x0 - pre1->x0; // left to left distance (do not + 1!)
1576 d1r = box2->x1 - pre1->x1; // right to right distance
1577 if (d1l > d1r) { d1=d1r; d2=d1l; } // thinner char on the right
1578 else { d1=d1l; d2=d1r; } // thicker char on the right
1579 if ( mono_em_max < 2*mono_em_min
1580 && mono_em_min < 2*mono_em_max) // 2018-10 valid range?
1581 if ((box2->x0 - pre1->x1 <= mono_em_max // no space between
1582 // && box2->x1 - pre1->x0 > 2*mono_em_max) // crossing 1 em border?
1583 && box2->x1 - pre1->x0 > 2*mono_em_min+mono_em_min/8) // 2018-09 rnd80
1584 || (box2->x0 - pre1->x1 > mono_em_min // space between
1585 && box2->x0 - pre1->x1 <= 2*mono_em_max-mono_em_max/16
1586 // && box2->x1 - pre1->x0 > 3*mono_em_max)) { // crossing 2 em border?
1587 && box2->x1 - pre1->x0 > 3*mono_em_min+mono_em_min/8)) { // 2018-09 rnd80
1588 monospaced = 0; // can not be monospaced in that case 2010-09-25
1589 if (job->cfg.verbose)
1590 fprintf(stderr, " L%02d mono:=0 %d - %d pre1 %d %d %d %d y %d DBG%d\n# ...",
1591 l1, mono_em_min, mono_em_max,
1592 pre1->x0, pre1->x1, box2->x0, box2->x1,box2->y0,__LINE__);
1593 }
1594 }
1595 // n=3: (n-1)*min < d1 <= (n )*max && (2*n+1)*max < (2*n+2)*min
1596 // (n )*min < d2 <= (n+1)*max && (2*n+2)*max < (2*n+3)*min
1597 if (monospaced && pre2 && (2*2+2)*mono_em_max < (2*2+3)*mono_em_min)
1598 { // check 2 chars for non mono space within
1599 d1l = box2->x0 - pre2->x0; // left to left distance
1600 d1r = box2->x1 - pre2->x1; // right to right distance
1601 if (d1l > d1r) { d1=d1r; d2=d1l; } // thinner char on the right
1602 else { d1=d1l; d2=d1r; } // thicker char on the right
1603 if ((box2->x0 - pre2->x1 > mono_em_min+mono_em_min/16 // min. 1 char between
1604 && box2->x0 - pre2->x1 <= 2*mono_em_max-mono_em_max/8 // max. 2 chars
1605 // && box2->x1 - pre2->x0 > 3*mono_em_max) // crossing 2 em border?
1606 && box2->x1 - pre2->x0 > 3*mono_em_min+mono_em_min/8) // 2018-09 rnd80
1607 || 0*(box2->x0 - pre2->x1 > 2*mono_em_min+mono_em_min/8 // min. 2 ex between
1608 && box2->x0 - pre2->x1 <= 3*mono_em_max-mono_em_max/4 // ?????? ToDo oebb
1609 // && box2->x1 - pre2->x0 > 4*mono_em_max)) { // crossing 3 em border?
1610 && box2->x1 - pre2->x0 > 4*mono_em_min+mono_em_min/2)) { // 2018-09 rnd80
1611 monospaced = 0; // can not be monospaced in that case 2010-09-25
1612 if (job->cfg.verbose)
1613 fprintf(stderr, " L%02d mono:=0 %d - %d pre2 %d %d %d %d DBG%d\n# ...",
1614 l1, mono_em_min, mono_em_max,
1615 pre2->x0, pre2->x1, box2->x0, box2->x1, __LINE__);
1616 }
1617 }
1618 /* fonts are expected to be 6 to 60 pixels high, which is about
1619 4 to 50 pixels wide. We allow some extra margin.
1620 space > 0 2010-09-27
1621 ToDo: compare left and right gap (or additional nearest 4 gaps)
1622 similar to mono space detection, check min distance
1623 between upper, middle and lower rightmost vector of prev char and
1624 leftmost vector of right char (hight is defined by the lower char)
1625 (if overlapping chars are detected! WAV,Te,...)
1626 */
1627 if (0 < pdist && pdist < 140) { /* better mdist < 3*Xaverage ? */
1628 // ignore extra wide spaces, tmp09/gocr_screen_capture* 2010-09-28
1629 if (2*pdist<5*char_width_max)
1630 /* two options for overflow: 1) ignore, 2) store randomly */
1631 if (numdists<1024) { /* we do ignore here */
1632 pdists[numdists] = pdist;
1633 numdists++;
1634 }
1635 }
1636 pre2 = pre1; pre1 = box2;
1637 } end_for_each(&(job->res.boxlist));
1638
1639 if (job->cfg.verbose)
1640 fprintf(stderr, " L%02d num_gaps= %2d x_width= %2d - %2d"
1641 " mono_em= %2d - %2d mono= %d",
1642 l1, numdists, char_width_min, char_width_max,
1643 mono_em_min, mono_em_max, monospaced);
1644 if (numdists<8) {
1645 if (job->cfg.verbose && l1==0) /* only for all lines */
1646 fprintf(stderr," (WARNING num_gaps<8)");
1647 }
1648 #if 1 /* debugging */
1649 if ((job->cfg.verbose&(32+16))==48) {
1650 int i;
1651 fprintf(stderr,"\n# ...");
1652 for (i=0;i<numdists;i++) fprintf(stderr," %2d",pdists[i]);
1653 fprintf(stderr," <- pdist[%d]\n# ...",l1);
1654 }
1655 #endif
1656 if (numdists>0) {
1657 int i, diff, ni_min, max, best_p, ni;
1658 /* aware: takes long time for big data sets */
1659 /* dilute? (german: ausduennen?) */
1660 qsort (pdists, numdists, sizeof (int), intcompare);
1661 /* the new method, div0? */
1662 best_p=4*numdists/5;
1663 /* try to find better pitch for monospaced font (ok for prop) */
1664 // tolerant to 090729num* tmp09/barcodes090916_interleaved*
1665 if (mono_em_min > mono_em_max+(mono_em_min+4)/9+1 // rnd80 52 45
1666 || mono_em_max>=2*mono_em_min) {
1667 monospaced = 0;
1668 if (job->cfg.verbose)
1669 fprintf(stderr, "\n# ... L%02d mono:=0 %d - %d DBG%d",
1670 l1, mono_em_min, mono_em_max, __LINE__);
1671 } else
1672 pitch_m=((mono_em_max<3*mono_em_min)?
1673 (mono_em_max+3*mono_em_min)/4:mono_em_min);
1674 /* try to find better pitch for proportional font */
1675 // the largest diff could be the best, if diff is always 1,
1676 // take the diff with the lowest weight
1677 // JS-2010-09 add numdists<8 sample spaces20100908.jpg
1678 // todo: search most offen biggest gapdiff (ignore big table gaps)
1679 // mean gapdiff? gap[n-1-i]-gap[0+i] until gapdiff=0, skip table gaps
1680 // 2010-09-28 check until end of table, because old bad wide gaps are
1681 // no more added to the table
1682 for (ni=ni_min=1024,max=0,i=((numdists<8)?0:numdists/2+1);
1683 i<numdists;i++) {
1684 if (pdists[i]<=char_width_min/3) continue; // JS-2010-09
1685 if (pdists[i]> char_width_max*2) {
1686 /* set 2nd best which is numdists as default */; break; } // JS-2010-27 table gaps
1687 if (numdists<16) // single word?
1688 if (pdists[i]<=char_width_max/3) continue; // JS-2010-09
1689 diff=pdists[i]-pdists[i-1];
1690 if (diff>max) {
1691 max=diff; best_p=i-1;
1692 if ((job->cfg.verbose&(32+16))==48)
1693 fprintf(stderr," L%02d best_p= %3d + maxdiff=%3d\n# ...",
1694 l1, pdists[best_p], max);
1695 if (max>3 && 3*pdists[i]>=4*pdists[i-1]) { break; }
1696 if (max>1 && 3*i>numdists*2 && 3*pdists[i]>=4*pdists[i-1]) { break; }
1697 }
1698 if (diff) {
1699 if (ni<ni_min) {
1700 // do not try to divide one word per line
1701 ni_min=ni; if (max<=1 && numdists>16) best_p=i-1;
1702 if ((job->cfg.verbose&(32+16))==48)
1703 fprintf(stderr," L%02d best_p=%3d ni_min=%3d\n# ...",
1704 l1, pdists[best_p], ni_min);
1705 }
1706 ni=1;
1707 } else ni++;
1708 }
1709 if (numdists<16 && max<=1 && ni_min>1) best_p=numdists-1; // one word
1710 #if 1 /* debugging */
1711 if ((job->cfg.verbose&(32+16))==48) {
1712 // fprintf(stderr,"\n# ...");
1713 for (i=0;i<numdists;i++) fprintf(stderr," %2d",pdists[i]);
1714 fprintf(stderr," <- pdist[%d] sorted\n# ...",l1);
1715 fprintf(stderr," L%02d maxdiff=%d min_samediffs=%d", l1, max, ni_min);
1716 }
1717 #endif
1718 /* we measure spaces in two different ways (mono, prop) */
1719 /* prop: gap between boxes, mono: distance of middle */
1720 if (best_p<numdists-1) pitch_p = ((pdists[best_p]+pdists[best_p+1])/2+1);
1721 else pitch_p = (pdists[best_p]+1 );
1722 if (numdists)
1723 if ( pdists[numdists-1]*2 <= pdists[0]*3
1724 || pdists[numdists-1] <= pdists[0]+3) {
1725 /* line is just a single word */
1726 pitch_p = pdists[numdists-1]+10;
1727 }
1728 if (l1>0 && job->cfg.spc==0) {
1729 job->res.lines.pitch[l1]=(monospaced?pitch_m:pitch_p);
1730 job->res.lines.mono[l1]=monospaced;
1731 }
1732 if (job->cfg.verbose) {
1733 fprintf(stderr,"\n# ...");
1734 fprintf(stderr," L%02d mono: num=%3d min=%3d max=%3d pitch=%3d\n# ...",
1735 l1, numdists, mono_em_min,mono_em_max,pitch_m);
1736 fprintf(stderr," L%02d prop: num=%3d min=%3d max=%3d pitch=%3d @ %2d%%\n# ...",
1737 l1, numdists, pdists[0],pdists[numdists-1],pitch_p,best_p*100/numdists);
1738 fprintf(stderr," L%02d result: mono=%d distance >= %d considered as space\n# ...",
1739 l1, monospaced, job->res.lines.pitch[l1]);
1740 }
1741 } /* if (not) enough spaces */
1742 if (l1==0) { /* set default spaces to each line */
1743 int l2;
1744 spc = job->cfg.spc;
1745 if (spc==0) /* set only if not set by option */
1746 spc = ((monospaced)?pitch_m:pitch_p);
1747 for (l2=0; l2<job->res.lines.num; l2++ )
1748 job->res.lines.pitch[l2]=spc;
1749 }
1750 } /* each line */
1751 if (job->cfg.spc==0)
1752 job->cfg.spc = spc;
1753 if (job->cfg.verbose)
1754 fprintf(stderr," overall space width is %d %s\n",
1755 spc, ((monospaced)?"monospaced":"proportional"));
1756
1757
1758 }
1759
1760 /* ---- count subboxes (white holes within black area) --------
1761 * new: count boxes lying inside another box (usually holes, ex: "aeobdg")
1762 * needed for glue_boxes, do not join textboxes, tables and other complex
1763 * objects
1764 * ToDo: count only frames of invers spin? do we need sorted list here? -> no
1765 */
/*
 * Count, for every box of the job's box list, how many other boxes lie
 * completely inside it and store the result in box->num_subboxes.
 *
 * pp is not used by this function.
 * Boxes narrower or lower than 2 pixels (x1-x0 < 2 or y1-y0 < 2) keep a
 * count of 0.  A contained box is only counted while its own
 * num_subboxes is 0.  The inner scan stops at the first box that starts
 * below the outer box, which relies on the list being sorted.
 * Always returns 0.
 */
int count_subboxes( pix *pp ){
  job_t *job = OCR_JOB; /* fixme */
  progress_counter_t *pc = NULL;
  struct box *outer, *inner;
  int total = 0;     /* number of contained boxes found            */
  int tiny = 0;      /* contained boxes with an area below 17      */
  int twins = 0;     /* pairs with identical bounds (seen twice)   */
  int visited = 0;   /* number of outer boxes processed            */

  if (job->cfg.verbose) {
    fprintf(stderr,"# count subboxes\n# ...");
  }

  pc = open_progress(job->res.boxlist.n,"count_subboxes");
  for_each_data(&(job->res.boxlist)) {
    outer = (struct box *)list_get_current(&(job->res.boxlist));
    outer->num_subboxes = 0;
    progress(visited++, pc);

    /* speedup for dotted background: skip very small boxes */
    if ((outer->x1 - outer->x0) < 2) continue;
    if ((outer->y1 - outer->y0) < 2) continue;

    /* holes inside the outer char (aoebdqg) */
    for_each_data(&(job->res.boxlist)) {
      inner = (struct box *)list_get_current(&(job->res.boxlist));

      /* faster, but boxes need to be sorted */
      if (inner->y0 > outer->y1) break;
      if (inner == outer) continue;

      if (inner->x0 == outer->x0 && inner->y0 == outer->y0
       && inner->x1 == outer->x1 && inner->y1 == outer->y1) {
        twins++; /* erroneous!? */
      }

      /* is inner inside outer and has no counted subboxes itself? */
      if (inner->x0 < outer->x0 || inner->x1 > outer->x1) continue;
      if (inner->y0 < outer->y0 || inner->y1 > outer->y1) continue;
      if (inner->num_subboxes != 0) continue;

      outer->num_subboxes++;
      total++;
      if ((inner->x1 - inner->x0 + 1) * (inner->y1 - inner->y0 + 1) < 17) {
        tiny++;
      }
    } end_for_each(&(job->res.boxlist));
  } end_for_each(&(job->res.boxlist));
  close_progress(pc);

  if (job->cfg.verbose) {
    /* identical pairs are met once from each side, hence the halving */
    fprintf(stderr," %3d subboxes counted (mini=%d, same=%d) nC= %d\n",
            total, tiny, twins/2, visited);
  }
  return 0;
}
1811
1812 /* ---- join holes to chars( before step1 ) v0.42 -----------------------
1813 join boxes lying inside another box (usually holes, ex: "aeobdg46890")
1814 Dont add dust to a char! (ij-dots later)
1815 lines are not detected yet
1816 */
/*
 * Merge boxes that lie inside another box (usually the holes of chars
 * such as "aeobdg46890") into the enclosing box and remove them from
 * the job's box list.
 *
 * pp is only handed on to count_subboxes(), which is called first so
 * that num_subboxes is available for every box.
 * Boxes marked PICTURE, and enclosing boxes with more than 7 subboxes,
 * are left alone.  job->res.numC is decremented for every removed box.
 * Always returns 0.
 */
int glue_holes_inside_chars( pix *pp ){
  job_t *job = OCR_JOB; /* fixme */
  progress_counter_t *pc = NULL;
  struct box *host, *cand;
  int x0, y0, x1, y1;           /* current bounds of the host box */
  int visited = 0;
  int n_same = 0, n_holes = 0;  /* statistics for the verbose output */

  count_subboxes( pp ); /* move to pgm2asc() later */

  pc = open_progress(job->res.boxlist.n,"glue_holes_inside_chars");
  if (job->cfg.verbose)
    fprintf(stderr,"# glue_holes to chars nC= %d\n# ...",job->res.numC);

  for_each_data(&(job->res.boxlist)) {
    host = (struct box *)list_get_current(&(job->res.boxlist));
    x0 = host->x0;  y0 = host->y0;
    x1 = host->x1;  y1 = host->y1;

    progress(visited++, pc);

    /* do not remove chars inside pictures (car plates on photos) */
    if (host->c == PICTURE) continue;
    if (host->num_subboxes > 7) continue;

    /* search boxes inside host; do not merge boxes which have
       (more than very few) subboxes by themselves */
    for_each_data(&(job->res.boxlist)) {
      int identical, enclosed, cand_dx, cand_dy;

      cand = (struct box *)list_get_current(&(job->res.boxlist));
      if (cand == host || cand->c == PICTURE) continue;

      /* 2010-10: ignore candidates whose frame volume is tiny compared
         to the host (bad invalid_ogv.jpg) */
      if (abs(cand->frame_vol[0]) < abs(host->frame_vol[0])/512) continue;

      identical = ( cand->x0 == x0 && cand->x1 == x1
                 && cand->y0 == y0 && cand->y1 == y1 );
      /* 2010-09: allow one subbox (0 with a dot in it) */
      enclosed  = ( cand->x0 >= x0 && cand->x1 <= x1
                 && cand->y0 >= y0 && cand->y1 <= y1
                 && cand->num_subboxes < 2 );
      if (!identical && !enclosed) continue;

      /* same box happens for very small but hollow chars (4x5 o) */
      if (identical) n_same++;
      else           n_holes++;

      cand_dx = cand->x1 - cand->x0 + 1;
      cand_dy = cand->y1 - cand->y0 + 1;

      if ((job->cfg.verbose & 7) == 7) /* LEV3 */
        fprintf(stderr," join hole %4d %4d %+4d %+4d %+6d"
          " + %4d %4d %+4d %+4d %+6d %d\n# ...",
          x0, y0, x1-x0+1, y1-y0+1, host->frame_vol[0],
          cand->x0, cand->y0, cand_dx, cand_dy,
          cand->frame_vol[0], n_same);

      if (cand_dx < 8*(x1-x0+1) || cand_dy < 12*(y1-y0+1)) /* skip dust */
        merge_boxes( host, cand );  /* add cand to the host box */

      /* the host may have changed, refresh the cached bounds */
      x0 = host->x0;  y0 = host->y0;
      x1 = host->x1;  y1 = host->y1;

      job->res.numC--;              /* dont count fragments as chars */
      list_del(&(job->res.boxlist), cand);
      free_box(cand);
      /* now search another hole inside host */
    } end_for_each(&(job->res.boxlist));

  } end_for_each(&(job->res.boxlist));

  if (job->cfg.verbose)
    fprintf(stderr," joined: %3d holes, %3d same, nC= %d\n",
      n_holes, n_same, job->res.numC);
  close_progress(pc);

  return 0;
}
1904
1905
1906 /* ---- join broken chars ( before step1 ??? ) -----------------------
1907 use this carefully, do not destroy previous detection ~fi, broken K=k' g
1908 join if boxes are near or diagonally connected
1909 other strategy: mark boxes for deleting and delete in extra loop at end
1910 faster: check only next two following boxes because list is sorted!
1911 ToDo: store m4 of upper line to m4_of_prev_line, and check that "-points are below
1912 done: join boxes lying inside another box (usually holes, ex: "aeobdg")
1913 Dont add dust to a char!
1914 lines should be detected already (Test it for m1-m4 unknown)
1915 ToDo: divide in glue_idots, glue_thin_chars etc. and optimize it
1916 */
/*
 * Join fragments of broken characters in the job's box list.
 *
 * job  job whose box list (job->res.boxlist) is modified in place
 * pp   image; read through get_bw()/num_cross()/loop() and written
 *      through put() when two horizontally adjacent parts are bridged
 *
 * For every box (box2) that is not a PICTURE, has at most 7 subboxes and
 * is not placed outside its line (tested via the m1..m4 fields of the
 * box), three kinds of joins are tried:
 *   1. small upper or lower fragments (dots of i, j, umlauts, "!?;:=",
 *      quote/comma pairs) are merged with the x-nearest box in the same
 *      line; the reason is recorded in do_join (1..4),
 *   2. an upper fragment directly right of another box (w' K'),
 *   3. a box spanning m2..m3 whose left neighbour ends right next to it
 *      (broken n, h).
 * A joined partner (box4) is merged into box2, removed from the list
 * and freed.  job->res.numC is decremented for joins of kind 2 and 3
 * only.  Always returns 0.
 */
int glue_broken_chars( job_t *job, pix *pp ){
  int y, cs, x0, y0, x1, y1, cnt=0,
      num_frags=0, glued_frags=0, glued_hor=0,
      do_join=0; /* 1..n means we have a reason to join two objects to one */
  /* Names of the join reasons, only used for the verbose output.
   * They are passed as a "%s" argument, so a percent sign must NOT be
   * doubled here (it would be printed as "%%").
   * do_join:                                 0     1            2       3      4  */
  static const char *const join_reason[5] = {"no", "\"A\"Uij%", "!?;%", "=:;", "'',,"};
  struct box *box2, *box4;
  progress_counter_t *pc = NULL;

  cs = job->cfg.cs;
  count_subboxes( pp ); /* move to pgm2asc() later */

  pc = open_progress(job->res.boxlist.n,"glue_broken_chars");
  if (job->cfg.verbose)
    fprintf(stderr,"# glue broken chars nC= %d avX= %d\n# ...",
      job->res.numC, job->res.avX);

  for_each_data(&(job->res.boxlist)) {
    // get the box which may be extended by boxes around it
    box2 = (struct box *)list_get_current(&(job->res.boxlist));
    x0 = box2->x0; x1 = box2->x1;
    y0 = box2->y0; y1 = box2->y1;
    progress(cnt++,pc);
    do_join=0;
    // vertical broken (g965T umlauts etc.), not: f
    // do not remove chars inside pictures (car plates on photos)
    if (box2->c == PICTURE || box2->num_subboxes > 7) continue;
    /* continue loop if box is below or above line = dust */
    if (box2->m4>0 && y0>box2->m4) continue; /* dust outside ? */
    if (box2->m1>0 && y0<box2->m1-(box2->m3-box2->m2)) continue;
    /* ToDo:
     *  - check that y0 is greater as m3 of the char/line above
     */

    // --- variant 1 = ij-dots umlaut-dots :;= ---
    // check small boxes (box2) whether they belong
    // to near same size or bigger boxes (box4)
    if( 2*(y1-y0) < box2->m4 - box2->m1     // care for dots etc.
     && (   2*y1<=(box2->m3+box2->m2)       // upper fragments
         || 2*y0>=(box2->m3+box2->m2)) ) {  // lower fragments
      struct box *box5=NULL;  // nearest box
      box4=NULL;
      num_frags++;   /* count for debugging */
      // get the x-nearest box in the same line
      for_each_data(&(job->res.boxlist)) {
        box4=(struct box *)list_get_current(&(job->res.boxlist));
        if (box4 == box2 || box4->c == PICTURE) continue;
        /* 0.42 speed up for background pixel pattern, box4 to small */
        if ( box4->x1 - box4->x0 + 1 < x1-x0+1
          && box4->y1 - box4->y0 + 1 < y1-y0+1 ) continue;
        // have in mind that line number may be wrong for dust
        if (box4->line>=0 && box2->line>=0 && box4->line==box2->line)
        {
          if (!box5) box5=box4;
          if ( abs(box4->x0 + box4->x1 - 2*box2->x0)
              <abs(box5->x0 + box5->x1 - 2*box2->x0))
            { box5=box4; }
        }
      } end_for_each(&(job->res.boxlist));
      box4=box5; // nearest box within the same line
      if (box4) {
        // do not glue "%^" in 0811qemu2.png 2010-09-28:
        // both boxes are wide and do not overlap horizontally
        if (box4->x1 - box4->x0 + 1 > job->res.avX / 2
         && box2->x1 - box2->x0 + 1 > job->res.avX / 2
         && (  box2->x0 > box4->x1
            || box4->x0 > box2->x1)) continue;
        if ( /* umlaut "a "o "u, ij; box2 is the small dot, box4 the body */
            4*y1 <= 3*box2->m2 + box2->m3     // y1=box2->y1, ocr-a %
         && 4*box4->y1 >= 3*box2->m2 + box2->m3 // dont join 2 dots
         && 2* y1 < box4->y1 + box4->y0       // box2 above box4
         && box4->x1 + job->res.avX/2 >= x0
         && box4->x0 - job->res.avX/2 <= x1
         && (y1 < box4->y0 || x0 < box4->x1) // dont melt "d'"
         && 3* ( y1 - box4->y0)
          <= 2* (box4->y1 - box4->y0)          // too far away? dust!
         // ToDo mono-serif-i dot is 8x smaller char but stroke width?
         && 9* ( x1 - x0 + 1)                  // rnd80.i 4x8 vs. 35x34
          >= (box4->x1 - box4->x0 + 1)         // dot must have minimum size
         && 10* ( y1 - y0 + 1)
          >= (box4->y1 - box4->y0 + 1)         // dot must have minimum size
           ) do_join=1;
        if ( (!do_join) /* !?; box2 is the dot, box4 the body */
         && 2*box4->x1>=x0+x1 /* test if box4 is around box2 */
         && 2*box4->x0<=2*x1 /* +x0+1 Jan00 */
         && ( x1-x0 <= box4->x1-box4->x0+2 )
         && 2*y0>=box2->m2+box2->m3
         && 4*y1>=box2->m2+3*box2->m3
         && 4*(y1-y0)<box2->m4-box2->m1
         && (8*box4->y1 < box4->m2+7*box4->m3
             || box4->m4-box4->m1<16) /* Jan00 */
           ) do_join=2;
        if ( (!do_join) /* =;: box2 is the upper box, box4 the lower box */
         && 2*box4->x1>=x0+x1 /* test if box4 is around box2 */
         && 2*box4->x0<=2*x1 /* +x0+1 */
         && ( x1-x0 <= box4->x1-box4->x0+4 )
         && ( 4*x0 <= 3*box4->x1+box4->x0 )
         && (( box2->m2 && box4->m2
            && y1< box2->m3
            && 2*box4->y1 > box4->m3+box4->m2 // can be bigger than m3
            && 4*box4->y0 >= 3*box4->m2+box4->m3
            && 2*box2->y0 < box2->m3+box2->m2
             )
          || ( (!box2->m2) || (!box4->m2) )
            )
           ) do_join=3;
        /* '' ,, tmp08/0811qemu2 2010-10-01 + rnd80.png=mono
           (not guarded by !do_join, so it overrides an earlier reason) */
        if ( abs(box2->y1 - box4->y1) <= (y1-y0)/8+1 // same y1
          && abs(box2->y0 - box4->y0) <= (y1-y0)/8+1 // same y0
          && abs((box4->x1 - box4->x0) - (x1-x0)) <= (x1-x0)/8+1 // same dx
          && x1-x0 <= job->res.avX/2 // small width
          && ( abs(box4->x0 - x1 - 1) <= job->res.avX/2 // small gap
            || abs(x0 - box4->x1 - 1) <= job->res.avX/2) // ocr-b
          && ( 4*y1 <= 3*box2->m2 + box2->m3 // ''
            || 4*y0 >= 2*box2->m2 + 2*box2->m3 ) // ,,
           ) do_join=4;
        if (do_join>0) { // fkt melt(box2,box4)
          if (job->cfg.verbose & 7) // space "( " for better " x"-searching
            fprintf(stderr," join objects %4d %4d %+4d %+4d"
              " + %4d %4d %+4d %+4d %s\n# ...",
              x0, y0, x1-x0+1, y1-y0+1, box4->x0, box4->y0,
              box4->x1-box4->x0+1, box4->y1-box4->y0+1,join_reason[do_join]);
          merge_boxes( box2, box4 ); // add box4 to box2
          x0 = box2->x0; x1 = box2->x1;
          y0 = box2->y0; y1 = box2->y1;
          // 2010-09-24 numC is intentionally not decremented here
          glued_frags++;
          list_del(&(job->res.boxlist), box4); /* ret&1: error-message ??? */
          free_box(box4);
        }
      }
    }

    // --- variant 2 = horizontally broken w' K' ---
    if( 2*y1 < (box2->m3+box2->m2)
     && 2*(y1-y0) < (box2->m3+box2->m2) ) { // fragment
      for_each_data(&(job->res.boxlist)) {
        box4=(struct box *)list_get_current(&(job->res.boxlist));
        if (box4!=box2 && box4->c != PICTURE)
        {
          if( box4->line>=0 && box4->line==box2->line
           && box4->x1>=x0-1 && box4->x1<x0 // do not glue 6-
           && box4->x0+3*box4->x1<4*x0)
          if( get_bw(x0 ,x0 ,y1,y1 ,pp,cs,1) == 1)
          if( get_bw(x0-2,x0-1,y1,y1+2,pp,cs,1) == 1)
          { // fkt melt(box2,box4)
            if (job->cfg.verbose & 7)
              fprintf(stderr," join objects %4d %4d %+4d %+4d"
                " + %4d %4d %+4d %+4d w'K'\n# ...",
                x0, y0, x1-x0+1, y1-y0+1, box4->x0, box4->y0,
                box4->x1-box4->x0+1, box4->y1-box4->y0+1);
            // NOTE(review): presumably sets a connecting pixel between
            // both parts - confirm against put()
            put(pp,x0,y1+1,~(128+64),0);
            merge_boxes( box2, box4 );
            x0 = box2->x0; x1 = box2->x1;
            y0 = box2->y0; y1 = box2->y1;
            job->res.numC--; // dont count the removed fragment
            glued_hor++;
            list_del(&(job->res.boxlist), box4);
            free_box(box4);
          }
        }
      } end_for_each(&(job->res.boxlist));
    }

    // --- variant 3 = horizontally broken n h (h=l_) v0.2.5 Jun00 ---
    if( abs(box2->m2-y0)<=(y1-y0)/8
     && abs(box2->m3-y1)<=(y1-y0)/8
     && num_cross(x0, x1,(y0+ y1)/2,(y0+ y1)/2,pp,cs) == 1
     && num_cross(x0, x1,(y0+3*y1)/4,(y0+3*y1)/4,pp,cs) == 1
     && get_bw((3*x0+x1)/4,(3*x0+x1)/4,(3*y0+y1)/4,y1,pp,cs,1) == 0
     && get_bw(x0,(3*x0+x1)/4,(3*y0+y1)/4,(y0+3*y1)/4,pp,cs,1) == 0
     && get_bw(x0, x0, y0,(3*y0+y1)/4,pp,cs,1) == 1 ) {
      for_each_data(&(job->res.boxlist)) {
        box4=(struct box *)list_get_current(&(job->res.boxlist));
        if (box4!=box2 && box4->c != PICTURE)
        {
          if( box4->line>=0 && box4->line==box2->line
           && box4->x1>x0-3 && box4->x1-2<x0
           && abs(box4->y1-box2->m3)<2)
          { // fkt melt(box2,box4)
            // Decide first whether the join really happens, so that the
            // verbose message below is only printed for real joins.
            y=loop(pp,x0,y0,y1-y0,cs,0,DO);
            if(2*y>y1-y0) continue;
            if (job->cfg.verbose & 7)
              fprintf(stderr," join objects %4d %4d %+4d %+4d"
                " + %4d %4d %+4d %+4d nh\n# ...",
                x0, y0, x1-x0+1, y1-y0+1, box4->x0, box4->y0,
                box4->x1-box4->x0+1, box4->y1-box4->y0+1);
            // NOTE(review): presumably sets two connecting pixels left
            // of box2 - confirm against put()
            put(pp,x0-1,y0+y ,~(128+64),0);
            put(pp,x0-1,y0+y+1,~(128+64),0);
            merge_boxes( box2, box4 ); // add box4 to box2
            x0 = box2->x0; x1 = box2->x1;
            y0 = box2->y0; y1 = box2->y1;
            job->res.numC--; // dont count the removed fragment
            glued_hor++;
            list_del(&(job->res.boxlist), box4);
            free_box(box4);
          }
        }
      } end_for_each(&(job->res.boxlist));
    }
  } end_for_each(&(job->res.boxlist));

  if (job->cfg.verbose)
    fprintf(stderr," joined: %3d fragments (found %3d), %3d rest, nC= %d\n",
      glued_frags, num_frags, glued_hor, job->res.numC);
  close_progress(pc);

  return 0;
}
2144
2145 /*
2146 ** this is a simple way to improve results on noisy images:
2147 ** - find similar chars (build cluster of same chars)
2148 ** - analyze clusters (could be used for generating unknown font-base)
2149 ** - the quality of the result depends mainly on the distance function
2150 */
2151 // ---- analyse boxes, compare chars, compress picture ------------
2152 // ToDo: - error-correction only on large chars!
/*
 * Cluster boxes that look alike.
 *
 * First pass: every box wider than 3 pixels is compared, via
 * distance(), with all boxes following it in the list.  If the distance
 * is below 5, all boxes carrying the follower's num get the num of the
 * reference box.
 * Second pass: for the first box of every num the members are counted
 * and the member with the largest distance is determined; the result is
 * only reported on stderr (verbose bit 8).
 *
 * pp is the image the boxes refer to; a by-value copy of *pp is handed
 * to distance().  Always returns 0.
 */
int find_same_chars( pix *pp){
  job_t *job = OCR_JOB; /* fixme */
  pix img = (*pp);
  int cs = job->cfg.cs;
  int classes;        /* running counter, printed in the messages */
  int sum = 0;        /* sum of all cluster sizes                 */
  struct box *ref, *other, *scan;

  if (job->cfg.verbose) fprintf(stderr,"# packing");
  classes = list_total(&(job->res.boxlist));

  /* ---- pass 1: unify the num of similar boxes ---- */
  for_each_data(&(job->res.boxlist)) {
    ref = (struct box *)list_get_current(&(job->res.boxlist));

    if (job->cfg.verbose) fprintf(stderr,"\r# packing %5d",classes);
    if (ref->x1 - ref->x0 + 1 <= 3) continue;   /* too narrow */

    other = (struct box *)list_next(&(job->res.boxlist),ref);
    while (other) {
      if (ref->num != other->num
       && distance(&img,ref,&img,other,cs) < 5) { // good limit = 5% ???
        int old_num = other->num;
        classes--;
        /* set all num==old_num to ref->num */
        for_each_data(&(job->res.boxlist)) {
          scan = (struct box *)list_get_current(&(job->res.boxlist));
          if (scan != ref && scan->num == old_num) scan->num = ref->num;
        } end_for_each(&(job->res.boxlist));
      }
      other = (struct box *)list_next(&(job->res.boxlist),other);
    }
  } end_for_each(&(job->res.boxlist));

  if (job->cfg.verbose) fprintf(stderr," %d different chars",classes);

  /* ---- pass 2: report size and worst fit of every cluster ---- */
  for_each_data(&(job->res.boxlist)) {
    struct box *worst;
    int members = 0, maxdist = 0, d;

    ref = (struct box *)list_get_current(&(job->res.boxlist));

    /* only the first box of a cluster is handled: look for an earlier
       box carrying the same num */
    other = (struct box *)list_get_header(&(job->res.boxlist));
    while (other != ref && other != NULL && other->num != ref->num)
      other = (struct box *)list_next(&(job->res.boxlist), other);
    if (other != ref && other != NULL) continue;

    classes++;   /* the counter is continued, not restarted */

    /* count number of same chars and remember the worst fit */
    worst = ref;
    for (other = ref; other != NULL;
         other = (struct box *)list_next(&(job->res.boxlist), other)) {
      if (other->num != ref->num) continue;
      members++;
      d = distance(&img,ref,&img,other,cs);
      if (d > maxdist) { maxdist = d; worst = other; }
    }

    if (job->cfg.verbose & 8) {
      out_x2(ref,worst);
      fprintf(stderr," no %d char %4d %5d times maxdist=%d\n",classes,ref->num,members,maxdist);
    }
    // ToDo: calculate mean-char (error-correction), maxdist in group
    sum += members;
    if (job->cfg.verbose & 8)
      fprintf(stderr," no %d char %4d %5d times sum=%d\n",classes,ref->num,members,sum);
  } end_for_each(&(job->res.boxlist));

  if (job->cfg.verbose) fprintf(stderr," ok\n");
  return 0;
}
2228
2229 /*
2230 ** call the first engine for all boxes and set box->c=result;
2231 **
2232 */
/*
 * Run the recognition engines over all boxes of the job's box list.
 *
 * pp  image, only used for the debug output (out_b)
 * mo  mode bits: if bit 256 is clear whatletter() is called,
 *     if bit 2 is set ocr_db() is called afterwards
 *
 * An engine is only called for a box while its char is UNKNOWN, it has
 * no alternative chars (num_ac==0) or its first weight is below
 * job->cfg.certainty.  PICTURE boxes are skipped.  The returned char is
 * not written to box->c here.  Always returns 0.
 */
int char_recognition( pix *pp, int mo){
  job_t *job = OCR_JOB; /* fixme */
  int cs = job->cfg.cs;
  int n_unknown = 0, n_pictures = 0, n_boxes = 0;
  int n_failed, n_done;
  int bx0, by0, bx1, by1;
  struct box *cur;
  progress_counter_t *pc;
  wchar_t code;

  // ---- analyse boxes, find chars ---------------------------------
  if (job->cfg.verbose)
    fprintf(stderr,"# char recognition");

  /* Count the boxes; it is cheap and it is not certain whether numC
     holds the number of boxes or of valid characters. */
  for_each_data(&(job->res.boxlist)) {
    cur = (struct box *)list_get_current(&(job->res.boxlist));
    if (cur->c == UNKNOWN) n_unknown++;
    if (cur->c == PICTURE) n_pictures++;
    n_boxes++;
  } end_for_each(&(job->res.boxlist));

  if (job->cfg.verbose)
    fprintf(stderr," unknown= %d picts= %d boxes= %d\n# ",n_unknown,n_pictures,n_boxes);
  if (n_boxes == 0) return 0;

  n_failed = 0;
  n_done = 0;
  pc = open_progress(n_boxes,"char_recognition");
  for_each_data(&(job->res.boxlist)) {
    cur = (struct box *)list_get_current(&(job->res.boxlist));
    /* remember the bounds before any engine is called */
    bx0 = cur->x0;  by0 = cur->y0;
    bx1 = cur->x1;  by1 = cur->y1;
    code = cur->c;
    if (code == PICTURE) continue;

    /* main engine, this case should be the default */
    if ((mo & 256) == 0
     && (code == UNKNOWN || cur->num_ac == 0
         || cur->wac[0] < job->cfg.certainty)) {
      code = whatletter(cur,cs ,0);
    }

    /* database engine, only if still unsure */
    if ((mo & 2)
     && (code == UNKNOWN || cur->num_ac == 0
         || cur->wac[0] < job->cfg.certainty)) {
      code = ocr_db(cur, job);
    }

    // box->c=code; bad idea (May03 removed)

    if (code == UNKNOWN) n_failed++;
    n_done++;

    if (job->cfg.verbose & 8) {
      fprintf(stderr,"\n# code= %04lx %c",(long)code,(char)((code<255)?code:'_'));
      out_b(cur,pp,bx0,by0,bx1-bx0+1,by1-by0+1,cs);
    }
    progress(n_done,pc); /* n_done = 0..n_boxes */

  } end_for_each(&(job->res.boxlist));
  close_progress(pc);

  if (job->cfg.verbose)
    fprintf(stderr," %d of %d chars unidentified\n",n_failed,n_done);
  return 0;
}
2295
2296
2297 /*
2298 ** compare unknown with known chars,
2299 ** very similar to the find_similar_char_function but here only to
2300 ** improve the result
2301 */
/*
 * Try to identify unsure boxes by comparing them with recognised ones.
 *
 * pp  image handed to distance()
 * mo  mode bits; nothing but the messages is done if bit 8 is set
 *
 * A box is a candidate if its char is UNKNOWN or its first weight is
 * below 97, and if it is big enough (y1-y0 > 4 and x1-x0 > 1).  It is
 * compared with every other box that has a char and a weight of at
 * least job->cfg.certainty.  If the smallest distance is below 10, the
 * char of the best fit is set via setac() with a weight reduced by that
 * distance (minimum 1).  Always returns 0.
 */
int compare_unknown_with_known_chars(pix * pp, int mo) {
  job_t *job = OCR_JOB; /* fixme */
  int cs = job->cfg.cs;
  int found = 0;      /* number of boxes that got a char   */
  int visited = 0;    /* number of boxes looked at         */
  int total, best_dist, d, weight, ref_weight;
  struct box *cur, *known, *best;
  progress_counter_t *pc = NULL;
  wchar_t best_char;

  if (job->cfg.verbose)
    fprintf(stderr, "# try to compare unknown with known chars !(mode&8)");

  if ((mo & 8) == 0) {
    total = 0;
    for_each_data(&(job->res.boxlist)) {
      total++;
    } end_for_each(&(job->res.boxlist));

    pc = open_progress(total,"compare_chars");
    for_each_data(&(job->res.boxlist)) {
      cur = (struct box *)list_get_current(&(job->res.boxlist));
      visited++;

      /* only unknown or unsure boxes ... */
      if (!(cur->c == UNKNOWN || (cur->num_ac > 0 && cur->wac[0] < 97)))
        continue;
      /* ... which are no dots */
      if (!(cur->y1 - cur->y0 > 4 && cur->x1 - cur->x0 > 1))
        continue;

      best = (struct box *)list_get_header(&(job->res.boxlist));
      best_dist = 1000;     /* 100% maximum */
      best_char = UNKNOWN;  /* best fit char */

      for_each_data(&(job->res.boxlist)) {
        known = (struct box *)list_get_current(&(job->res.boxlist));
        ref_weight = (known->num_ac > 0) ? known->wac[0] : 100;
        if (known == cur) continue;
        if (known->c == UNKNOWN) continue;
        if (ref_weight < job->cfg.certainty) continue;
        if (cur->y1 - cur->y0 < 5 || cur->x1 - cur->x0 < 3) continue;
        d = distance(pp, cur, pp, known, cs);
        if (d < best_dist) {
          best_dist = d;
          best_char = known->c;
          best = known;
        }
      } end_for_each(&(job->res.boxlist));

      if (best_dist < 10) { // limit as option???
        /* sureness can be maximal that of the best fit */
        weight = (best->num_ac > 0) ? best->wac[0] : 97;
        weight -= best_dist;
        if (weight < 1) weight = 1;
        /* ToDo: weight should depend on the weight of the best fit */
        setac(cur,(wchar_t)best_char,weight);
        found++;
      }

      if (best_dist < 50 && (job->cfg.verbose & 7)) { // only for debugging
        fprintf(stderr,"\n# L%02d xy= %4d %4d best fit was %04x=%c"
          " dist=%3d%% i=%d", cur->line, cur->x0, cur->y0,
          (int)best_char, (char)((best_char<128)?best_char:'_'),
          best_dist, found);
        if (best->num_ac > 0) fprintf(stderr," w= %3d%%",best->wac[0]);
        if ((job->cfg.verbose & 4) && best_dist < 10)
          out_x2(cur, best);
      }
      progress(visited,pc);
    } end_for_each(&(job->res.boxlist));
    close_progress(pc);
  }

  if (job->cfg.verbose)
    fprintf(stderr, " - found %d (nC=%d)\n", found, visited);
  return 0;
}
2361
2362 /*
2363 // ---- divide overlapping chars which !strchr("_,.:;",c);
2364 // block-splitting (two or three glued chars)
2365 // division if dots>0 does not work properly! ???
2366 //
2367 // ToDo: what about glued "be"? simply try vert. cut on fat vert. line?
2368 // what about recursive division?
2369 // ToDo: mark divided boxes to give the engine a chance to
2370 // handle wrong divisions
2371 // sample: tmp13/sslmozFP.png bold 8x9-to-9x9-overlapfont
2372 // Todo: check min-x-neigbours_of_all_black_pixels if>1 erosion right ?
2373 // also if two vectors between are same but reverse = cut, 'nt' 'To'
2374 // Todo: tmp08/gocr0801_bad5.jpg double-touching-"ke"= 2 holes!
2375 // middle hole must be splitted to left and right char, ToDo18
2376 */
/*
 * Try to split boxes which probably contain several horizontally glued
 * characters.
 *
 *   pp  image handed to loop()/num_cross()/... for the pixel tests
 *   mo  mode bits; if bit 16 is set nothing is split at all
 *
 * Every box of job->res.boxlist that is UNKNOWN (or whose best alternative
 * is below job->cfg.certainty) and larger than 5x4 pixels is examined.
 * Candidate cutting columns xi[] are moved from left to right; each cut
 * part is copied to a temporary box (boxa/boxb), reduced by cut_box() and
 * recognised by whatletter().  If all parts are recognised with sufficient
 * certainty the resulting string is attached to the original box via
 * setas(); the box itself is not replaced.
 * Always returns 0.
 */
int try_to_divide_boxes( pix *pp, int mo){
  struct box *box2, boxa, boxb;
  job_t *job=OCR_JOB; /* fixme */
  int cs=job->cfg.cs, ad=100,
      a2[8], ar, // certainty of each part, ar = product of all certainties
      cbest;     // best certainty, skip search of certainty<cbest-1 for speed
  wchar_t ci[8], // split max. 8 chars
    s1[]={ UNKNOWN, '_', '.', ',', '\'', '!', ';', '?', ':', '-',
           '=', '(', ')', '/', '\\', '\0' }; // not accepted chars, \0-terminated!
  int x0, x1, y0, y1,
      xi[8+1]; // cutting positions
  int i, ii, i1, i2, n1, dx;
  if (job->cfg.verbose)
    fprintf(stderr,"# try to divide unknown chars !(mode&16)");
  if(!(mo&16)) // put this to the caller
  for_each_data(&(job->res.boxlist)) {
    box2 = (struct box *)list_get_current(&(job->res.boxlist));
    // don't try to split simple structures (ex: 400x30 square)
    if ((!box2->num_frames)
       || box2->num_frame_vectors[ box2->num_frames-1 ]<9) continue;
    if((box2->c==UNKNOWN || (box2->num_ac && box2->wac[0]<job->cfg.certainty))
      && box2->x1-box2->x0>5 && box2->y1-box2->y0>4){
      x0=box2->x0; x1=box2->x1; dx= x1-x0+1;
      y0=box2->y0; y1=box2->y1;
      ad=100;
      cbest=0;

      /* ocr1809_12minus "-5": a thin horizontal bar glued to the left */
      ii=loop(pp,x0+1,y0,y1-y0,cs,0,DO);
      i =loop(pp,x0+1,y1,y1-y0,cs,0,UP);
      if (ii+i >= 7*(y1-y0-1)/8
        && y0+ii>box2->m2 && y0+ii<box2->m3) { // check for "-5"
        for (i1=0;i1<(x1-x0)/2;i1++) { // check v-symmetry
          i2=loop(pp,x0+i1,y0,y1-y0,cs,0,DO);
          if (abs(i2 - ii) > (y1-y0)/16) break; // not or end of -
          i2=loop(pp,x0+i1,y1,y1-y0,cs,0,UP);
          if (abs(i2 - i ) > (y1-y0)/16) break; // not or end of -
        }
        if ((job->cfg.verbose&2) /* && i1>(x1-x0)/3*/){
          fprintf(stderr,
           "\n# try_to_divide_box(xy,dxy): %4d %4d %3d %3d as -5 xcut= %d-1",
           x0, y0, x1-x0+1, y1-y0+1, i1); }
        if (i1>(x1-x0-1)/4) {
          i=0; boxa=*box2; // copy contents, ToDo: reset ac-list (in cut_box?)
          boxa.x=x0; boxa.y=y0; // obsolete? mark pixel, overlap?
          boxa.x0=xi[i]=x0;boxa.x1=xi[i+1]=x0+i1-1; // new horizontal box range
          cut_box(&boxa); boxa.num_ac=0; // ToDo: add box2 as src argument?
          ci[i]=whatletter(&boxa,cs,0); /* get char */
          a2[i]=testac(&boxa,ci[i]); /* get certainty */
          if ((ci[i]=='-' || ci[i]=='_') && a2[i]>=97) // 2018-09 "-5"
            { setac(&boxa,ci[i],a2[i]=99);
              if ((job->cfg.verbose&2)) {
              DBG(fprintf(stderr,"\nDBG %s set split certainty 99",\
                decode(ci[0],ASCII))); }}
          i++; boxb=*box2; // try rest if it has to be split again
          boxb.x=xi[i]+1; boxb.y=y0;
          boxb.x0=xi[i]+1;boxb.x1=xi[i+1]=box2->x1;
          cut_box(&boxb); boxb.num_ac=0;
          ci[i]=whatletter(&boxb,cs,0); a2[i]=testac(&boxb,ci[i]);
          if (a2[0]>=97 && a2[1]>=99) // 2018-09 "-5"
            { char buf[8]=""; setac(&boxb,ci[i],a2[i]=99);
              if ((job->cfg.verbose&2)) {
              DBG(fprintf(stderr,"\nDBG %s set split certainty 99",\
                decode(ci[1],ASCII)));}
              // NOTE(review): wchar_t is narrowed to char here, unlike the
              // decode() based string building further below
              buf[0]=ci[0];buf[1]=ci[1];buf[2]=0;
              ar=a2[1]; // not final, just testing
              if (buf[0]) setas(box2,buf,ar); }
        }
      } /* check "-5" split */

      /* get minimum vertical lines, but fails on ocr1809_12minus "-5" */
      n1 = num_cross(x0,x1,(  y1+y0)/2,(  y1+y0)/2,pp,cs);
      ii = num_cross(x0,x1,(3*y1+y0)/4,(3*y1+y0)/4,pp,cs); if (ii<n1) n1=ii;
      if (box2->m2 && box2->m3 > box2->m2+2)
      for (i=box2->m2+1;i<=box2->m3-1;i++) {
        // 2017-07 patch from LLeroy2005
        if ((i<=y0) || (i>=y1)) continue; // box smaller than baseline
        if (loop(pp,x0+1,i,x1-x0,cs,1,RI) > (x1-x0-2)) continue; // ll
        ii = num_cross(x0,x1,i,i,pp,cs); if (ii<n1) n1=ii;
      }
      if (n1<2) continue; // seems to make no sense to divide
      if (n1<4) ad=99*ad/100; // not to strong because m2+m3 could be wrong
      if (n1<3) ad=99*ad/100;

      if( 2*y1 < box2->m3+box2->m4 /* baseline char ? */
       && num_cross(x0,x1,y1-1,y1-1,pp,cs)==1 // -1 for slopes
       && num_cross((x0+2*x1)/3,(x0+3*x1)/4,y0,y1,pp,cs)<3 // not exclude tz
       && num_cross((3*x0+x1)/4,(2*x0+x1)/3,y0,y1,pp,cs)<3 // not exclude zl
       && loop(pp,x0,y1-(y1-y0)/32,x1-x0,cs,0,RI)
         +loop(pp,x1,y1-(y1-y0)/32,x1-x0,cs,0,LE) > (x1-x0+1)/2
        ) continue; /* do not try on bvdo"o etc. */

      // one vertical line can not be two glued chars, lc?
      if ( num_cross(x0,x1,(y1+y0)/2,(y1+y0)/2,pp,cs)<=1 ) continue;
      { // doublet = 2 letters
        char buf[64]=""; // result string, always \0 terminated
        if (job->cfg.verbose&2){ int l1=box2->line;
          fprintf(stderr,
           "\n# try_to_divide_box(xy,dxy): %4d %4d %3d %3d L%02d mono=%d",
           x0, y0, x1-x0+1, y1-y0+1, l1, job->res.lines.mono[l1] /* 1=mono */);
          if (job->cfg.verbose&4) out_x(box2); // list box as data+ASC-image
        }
        // for mono-spaced/teletext fonts only wide chars molten!?
        // Todo: or search 2 invers vectors at xi[1], p.e. diag. touch
        // The following search only reports candidates (debug output);
        // i6..i9 are not used for cutting yet.
        { int i4, i5, i6=-1, i7=-1, i8=-1, i9=-1;
          int num_allvec= box2->num_frame_vectors[box2->num_frames-1]-1;
          for (i4=0;i4<num_allvec-1;i4++)
          if ( box2->frame_vector[i4  ][0]!=x0
            && box2->frame_vector[i4  ][0]!=x1
            && box2->frame_vector[i4+1][0]!=x0
            && box2->frame_vector[i4+1][0]!=x1
            && box2->frame_vector[i4  ][1]!=y0
            && box2->frame_vector[i4  ][1]!=y1
            && box2->frame_vector[i4+1][1]!=y0
            && box2->frame_vector[i4+1][1]!=y1
            && abs(box2->frame_vector[i4+1][0]    // dx==1
                  -box2->frame_vector[i4  ][0])==1
            && abs(box2->frame_vector[i4+1][1]    // dy==1
                  -box2->frame_vector[i4  ][1])==1 )
          for (i5=i4+2;i5<num_allvec;i5++)
          /* same point in opposit direction == diag dots? */
          /* may fail if one vector is longer 1+1 ... ToDo18 */
          if (box2->frame_vector[i4  ][0]
           == box2->frame_vector[i5  ][0]
           && box2->frame_vector[i4  ][1]
           == box2->frame_vector[i5  ][1]
           && box2->frame_vector[i4+1][0]
           == box2->frame_vector[i5-1][0]
           && box2->frame_vector[i4+1][1]
           == box2->frame_vector[i5-1][1]) {
            if(job->cfg.verbose&2)
            fprintf(stderr,"DBG vsplit i45= %d %d i67 %d %d i89 %d %d"
              " xy=%2d %2d ToDo\n", i4, i5, i6, i7, i8, i9,
              box2->frame_vector[i4][0]-x0,
              box2->frame_vector[i4][1]-y0);
            if (i6==-1) { i6=i4; i7=i5; } else /* unique */
            if (i8==-1) { i8=i4; i9=i5; } else /* unique */
            { i4=num_allvec; break; } // max 2 cut-points or abort
            /* ToDo: else break? ore store 2 max. */
          } // found touching vectors
          // ToDo18: handle split at vectors i4 i5 and/or i6 i7
          //  p.e. tmp13/sslmozFP.png bold 8x9 proportional font "nt" "To"
          //  p.e. tmp09/barcodes090916_code39.png 10x12 prop.font "ow"
        } // inverse vectors
        // it would be better if testing is only if most right and left char
        //   has no horizontal gap (below m2) ex: be
        i=0; // num splittet chars
        xi[0]=x0; xi[1]=x0+(dx/8)+1; xi[2]=x1; // split_to xi0..1 and xi1..2
        for ( ; ; xi[i+1]++) { // x[i] .. x[i+1], slower? but better v0.42
          int bow=0; // default = no bow = no cutting = fail divide
          // ToDo: skip if not a local dy-min for speedup
          int num_allvec= box2->num_frame_vectors[box2->num_frames-1]-1;
          int i1, i2, i3; /* vector indizes around cutting gaps */
          /* break if x is to near to the right border */
          if (xi[i+1]>x1-dx/8-1) { if (i==0) break;
                                   i--; xi[i+2]=x1; continue; }
          int l1=box2->line, mono=job->res.lines.mono[l1]; // 2018-10 add
          if (mono && // 2018-10 use monofont advantage
           ( abs(abs(xi[i+1]-x0) -   job->res.lines.pitch[l1]) /* 2 chars */
                                   > job->res.lines.pitch[l1]/8
          && abs(abs(xi[i+1]-x0) - 2*job->res.lines.pitch[l1]) /* 3 chars */
                                   > job->res.lines.pitch[l1]/8 ) ) continue;
          if (mono && job->cfg.verbose&2) // rnd80-Droid-Sans-Mono-Regular ww
            fprintf(stderr,"\n#DBG monosplit x01,xi,pitch= %4d %4d %4d %4d",
              x0, x1-x0+1, xi[i+1]-x0,job->res.lines.pitch[l1]);
          // ToDo: search invers vectors (diagonal touching chars)
          //   between left-down and right-down vectors
          //   and right-(middle)top and left-top vectors
          //   "To" "rn" "fi" "nt" "ity" bold 8x9 tmp13/sslmozFP.png
          // 2017-03 new nearest_frame-version, check lower ends
          if (box2->num_frames<1) fprintf(stderr,"ERROR.split frames=0\n");
          // search vectors near (xi1,y1) = bottom of char and (xi1,y0) = top
          // failed on tmp08/gocr0801_bad5.jpg "ke"
          i1=nearest_frame_vector(box2, 0,num_allvec, (xi[0]+xi[1])/2, y1);
          i3=nearest_frame_vector(box2, 0,num_allvec, (xi[1]+xi[2])/2, y1);
          i2=nearest_frame_vector(box2,i1,i3, xi[1], y0);

          // 2017-08 vectors may lay on border, x1=+3..-3 replaced by +dx/8+1
          // 2017-08 num_b2vec replaced by num_allvec to split small 'Fi'
          DBG( if(job->cfg.verbose&2) // 2018-09
            fprintf(stderr,"\nDBG split at xi,i123 %2d %2d %2d"\
              " #%02d #%02d #%02d dy %d",\
              xi[0]-x0,xi[1]-x0,xi[2]-x0,i1,i2,i3,y1-y0+1); )
          if (i1==i2 || i2==i3) continue; /* must be different 2017-03 */
          if (-2*box2->frame_vector[i2][1]
                +box2->frame_vector[i1][1]
                +box2->frame_vector[i3][1]>(y1-y0)/2) bow=1; // big dy
          // ToDo17: do not cut holes!? check other nearest_frame_vectors?
          //   tmp09/barcodes090916_code39.png "ow"
          //   tmp13/sslmozFP.png "Fi" "To" "ity"
          if(job->cfg.verbose&2)
            fprintf(stderr,"\n#  test split at  x%d= %2d %2d %2d"
                           " bow %d i123=%2d %2d %2d",
                           i, xi[i]-x0, xi[i+1]-x0, xi[i+2]-x0,
                           bow, i1,i2,i3);
          /* skip if no local minimum at xi[i+1] or if its not thin enough */
          // 2010-10-11 failes for ke on tmp08/gocr0801_bad5.jpg ToDo!!!
          if (bow==0) continue;
          // cuttet parts should have about the same height (max-min)
          // we dont want to cut an 'n' in three parts!
          // ToDo: thickness on xi[i+1]?
          // try to split successive right box if left box is recognised,
          // else shift the splitting point further to the right border
          // removing ->dots if dot only above one char !!! ??? not implemented
          if(job->cfg.verbose&2)
            fprintf(stderr,"\n#  try to split, newbox[%d].x= %2d ... %2d "
                           "dy= %d ", i, xi[i]-x0, xi[i+1]-x0, y1-y0+1);
          boxa=*box2; // copy contents, ToDo: reset ac-list (in cut_box?)
          boxa.x=xi[i]; boxa.y=y0; // obsolete? mark pixel, overlap?
          boxa.x0=xi[i];boxa.x1=xi[i+1]; // new horizontal box range
          // ToDo: vector-version cut at 2vec near xi, allow dx/8 overlapp!
          //       see tmp13/ssl* "To"
          cut_box(&boxa); boxa.num_ac=0; // ToDo: add box2 as src argument?
          // get wchar + certainty
          ci[i]=whatletter(&boxa,cs,0); /* get char */
          a2[i]=testac(&boxa,ci[i]); /* get certainty */
          if ((ci[i]=='c' || ci[i]=='C') && a2[i]==100) // 2018-09 "ow" read as 100% "cw"
            { setac(&boxa,ci[i],a2[i]=99);
              DBG(fprintf(stderr,"\nDBG set split certainty 99");)}
          if(job->cfg.verbose&2)
            fprintf(stderr,"\n#  certainty %d  limit= %d  cbest= %d ",
                           a2[i], job->cfg.certainty, cbest);
          if (a2[i]<job->cfg.certainty || a2[i]<cbest-1
           || wcschr(s1,ci[i]) ) { continue; } // dont split here

          for (ar=ad,ii=0;ii<=i;ii++) {
            ar=a2[ii]*ar/100; } // multiply all probabilities
          if (ar<98*job->cfg.certainty/100 || ar<cbest) {
            continue; } // dont go deeper, no longer string

          i++; if (i==8) break; // maximum splits
          if (i==4) break; // at the moment its to slow to go further
          if (i+1<8) xi[i+1]=x1; // right border of next box
          if (i+2<8) xi[i+2]=x1;

          if(job->cfg.verbose&2)
            fprintf(stderr,"\n try end split [%d].x=%d [%d].x=%d ",
                      i, xi[i]-x0, i+1, xi[i+1]-x0);
          boxb=*box2; // try rest if it has to be split again
          boxb.x=xi[i]+1; boxb.y=y0;
          boxb.x0=xi[i]+1;boxb.x1=xi[i+1];
          cut_box(&boxb); boxb.num_ac=0;
          ci[i]=whatletter(&boxb,cs,0); a2[i]=testac(&boxb,ci[i]);
          if (a2[i]<job->cfg.certainty || a2[i]<cbest-1
           || wcschr(s1,ci[i]) ) { xi[i+1]=xi[i]+2; continue; } // split rest
          // now we have everything splittet

          if(job->cfg.verbose&2) {
            fprintf(stderr,"\n split at/to: ");
            for (ii=0;ii<=i;ii++)
            fprintf(stderr,"  %2d %s (%3d)", xi[ii+1]-x0,
              decode(ci[ii],ASCII), a2[ii]);
            fprintf(stderr,"\n");
          }
          // boxa..c changed!!! dots should be modified!!!
          // Question: cut it into boxes v0.40 or set a string v0.41?
          // new way of building a string v0.41 (can call setas multiple)
          // usefull if compare unknown with known strings (except barcode?)
          // ToDo: also create alternate variants? ex: I <-> l
          for (buf[0]=0,ar=ad,ii=0;ii<=i;ii++) {
            size_t room = sizeof(buf) - strlen(buf) - 1; // free bytes in buf
            ar=a2[ii]*ar/100; // multiply all probabilities
            // m == rn; the guard must test ii (not i), else ci[-1] is read
            if (ii>0 && ci[ii]=='n' && ci[ii-1]=='r') ar--;
            // append at most 20 bytes per char but never overrun buf
            strncat(buf,decode(ci[ii],job->cfg.out_format),
                    (room<20)?room:20);
          }

          if (ar>cbest) cbest=ar; // best (highest) certainty found
          // reduce, but not if we cross certainty border
          if (99*ar/100 > job->cfg.certainty) ar=99*ar/100;
          if (job->cfg.verbose&2)
            fprintf(stderr,"\n split result= %s (%3d) ",buf, ar);
          setas(box2,buf,ar); // char *, does it disturb further splitting?
          buf[0]=0;
          i--; xi[i+2]=x1;
        } /* xi[i+1]++ */
      } /* divide box */
    } /* unknown box dx>5 */
  } end_for_each(&(job->res.boxlist));
  if (job->cfg.verbose) fprintf(stderr,", numC %d\n",job->res.numC);
  return 0;
}
2667
2668 /*
2669 // ---- divide vertical glued boxes (ex: g above T);
2670 */
/*
 * Split UNKNOWN boxes which span two text lines (ex: g above T).
 *
 *   pp, mo  not used by this function
 *
 * A box is a candidate if it is between 2 and 6 average char heights
 * high, starts near the top of its line (m1..m2) and ends clearly below
 * the line (m4 + avY).  It is only divided if other UNKNOWN boxes are
 * found nearby (within dx) on the same and on the following line.
 * The box is then cut at y = m4: a copy (box3) keeps the upper part and is
 * inserted before box2, box2 keeps the lower part and moves to line+1.
 * Always returns 0.
 */
int divide_vert_glued_boxes( pix *pp, int mo){
  struct box *box2,*box3,*box4;
  job_t *job=OCR_JOB; /* fixme */
  int y0,y1,y,dy,flag_found,dx;
  if(job->cfg.verbose)fprintf(stderr,"# divide vertical glued boxes");
  for_each_data(&(job->res.boxlist)) {
    box2 = (struct box *)list_get_current(&(job->res.boxlist));
    if (box2->c != UNKNOWN) continue; /* dont try on pictures */
    y0=box2->y0; y1=box2->y1; dy=y1-y0+1;
    dx=4*(job->res.avX+box2->x1-box2->x0+1); // we want to be sure to look at 4ex distance
    if ( dy>2*job->res.avY && dy<6*job->res.avY && box2->m1
      && y0<=box2->m2+2 && y0>=box2->m1-2
      && y1>=box2->m4+job->res.avY-2)
    { // test if lower end fits one of the other lines?
      box4=box2; flag_found=0;
      for_each_data(&(job->res.boxlist)) {
        box4 = (struct box *)list_get_current(&(job->res.boxlist));
        if (box4->c != UNKNOWN) continue; /* dont try on pictures */
        if (box4->x1<box2->x0-dx || box4->x0>box2->x1+dx) continue; // ignore far boxes
        if (box4->line==box2->line  ) flag_found|=1; // near char on same line
        if (box4->line==box2->line+1) flag_found|=2; // near char on next line
        if (flag_found==3) break; // we have two vertical glued chars
      } end_for_each(&(job->res.boxlist));
      if (flag_found!=3) continue; // do not divide big chars or special symbols
      y=box2->m4; // lower end of the upper line = cutting position
      if(job->cfg.verbose&2){
        fprintf(stderr,"\n# divide box below y=%4d",y-y0);
        if(job->cfg.verbose&6)out_x(box2);
      }
      // --- insert box3 before box2
      box3= (struct box *) malloc_box(box2);
      // presumably malloc_box returns NULL if no memory is left;
      // leave the box undivided instead of dereferencing NULL
      if (!box3) { fprintf(stderr,"ERROR malloc_box\n"); continue; }
      box3->y1=y;
      box2->y0=y+1; box2->line++; // m1..m4 should be corrected!
      // box4 is the last box visited by the search above; take over its
      // line metrics if it belongs to the new line of box2
      if (box4->line == box2->line){
        box2->m1=box4->m1; box2->m2=box4->m2;
        box2->m3=box4->m3; box2->m4=box4->m4;
      }
      box3->num=job->res.numC;
      if (list_ins(&(job->res.boxlist), box2, box3)) {
         fprintf(stderr,"ERROR list_ins\n"); };
      job->res.numC++;
    }
  } end_for_each(&(job->res.boxlist));
  if(job->cfg.verbose)fprintf(stderr,", numC %d\n",job->res.numC);
  return 0;
}
2717
2718
/*
  On some systems isupper() & co. called with values >255 cause a
  segmentation fault (SIGSEGV), hence the following wrapper functions.
  ToDo: should be replaced (?) by wctype if available on every system
*/
/* isupper() restricted to 7-bit ASCII: returns 0 for every cc outside
   0..127 (negative values included, they are not valid for isupper). */
int wisupper(wchar_t cc){ return ((cc>=0 && cc<128)?isupper((int)cc):0); }
/* islower() restricted to 7-bit ASCII: returns 0 for every cc outside
   0..127 (negative values included, they are not valid for islower). */
int wislower(wchar_t cc){ return ((cc>=0 && cc<128)?islower((int)cc):0); }
/* isalpha() restricted to 7-bit ASCII: returns 0 for every cc outside
   0..127 (negative values included, they are not valid for isalpha). */
int wisalpha(wchar_t cc){ return ((cc>=0 && cc<128)?isalpha((int)cc):0); }
/* isdigit() restricted to 7-bit ASCII: returns 0 for every cc outside
   0..127 (negative values included, they are not valid for isdigit). */
int wisdigit(wchar_t cc){ return ((cc>=0 && cc<128)?isdigit((int)cc):0); }
/* isspace() restricted to 7-bit ASCII: returns 0 for every cc outside
   0..127 (negative values included, they are not valid for isspace). */
int wisspace(wchar_t cc){ return ((cc>=0 && cc<128)?isspace((int)cc):0); }
2729
2730 /* set box2->c to cc if cc is in the ac-list of box2, return 1 on success */
setc(struct box * box2,wchar_t cc)2731 int setc(struct box *box2, wchar_t cc){
2732 int ret=0, w2; // w1
2733 // w1=((box2->num_ac) ? box2->wac[0] : 0); // weight of replaced char
2734 w2=testac(box2,cc);
2735 if (OCR_JOB->cfg.verbose) {
2736 // print first 2 alternative chars
2737 fprintf(stderr, "\n# setc old nac=%d %s %s %3d %3d to %s %3d at %4d %4d",
2738 box2->num_ac, decode(box2->c,ASCII),
2739 (box2->num_ac<2)?" ":decode(box2->tac[1],ASCII), box2->wac[0],
2740 (box2->num_ac<2)?0:box2->wac[1],
2741 decode(cc,ASCII), (100+w2+1)/2, box2->x0, box2->y0);
2742 }
2743 if (w2) { if (box2->c!=cc) { ret=1; setac(box2,cc,(100+w2+1)/2); } }
2744 // if(OCR_JOB->cfg.verbose & 4) out_x(box2);
2745 // ToDo: modify per setac (shift ac)
2746 return ret;
2747 }
2748
2749
2750 /* ---- proof difficult chars Il1 by context view ----
2751 context: separator, number, vowel, nonvowel, upper case ????
2752 could be also used to find unknown chars if the environment (nonumbers)
2753 can be found in other places!
2754 ToDo:
2755 - box->tac[] as set of possible chars, ac set by engine, example:
2756 ac="l/" (not "Il|/\" because serifs detected and slant>0)
2757 correction only to one of the ac-set (alternative chars)!
2758 - should be language-settable; Unicode compatible
2759 - box2->ad and wac should be changed? (not proper yet)
2760 * ------------- */
/*
 * Helper of context_correction(): test whether the wide char c is a member
 * of the byte string set.  Plain strchr() converts its argument to char,
 * therefore wide chars >0xFF would match by their low byte and chars with
 * a zero low byte (and c==0 itself) would match the string terminator.
 * Returns 1 if c is within 1..0xFF and contained in set, else 0.
 */
static int ctx_in_set(const char *set, wchar_t c){
  if (c <= 0 || c > 0xFF) return 0;
  return strchr(set, (int)c) != NULL;
}

/*
 * Correct chars which are hard to distinguish (Il1|, O0, 5S) by looking
 * at their neighbours, and repair some spacing/quotation artefacts.
 *
 *   job  OCR job; job->res.boxlist is read and modified
 *
 * 1st loop: collect width/height statistics over all boxes read as O or 0.
 * 2nd loop: for every box
 *   - track runs of hex digits (": " separated) and prefer 0 and 1 there
 *   - replace " after a low-9 quotation mark by the matching high quote
 *   - for boxes having alternatives: choose between I/l/1, O/0 and 5/S
 *     via setc(), i.e. only chars of the ac-list can be selected
 *   - insert a missing space in front of an upper case char (xXx => x Xx)
 *   - remove a space in front of punctuation
 *   - merge two neighbouring single quotes into one double quote
 * Always returns 0.
 */
int context_correction( job_t *job ) {
  char *l_vowel="aeiouy";
  char *l_nonvo = "bcdfghjklmnpqrstvwxzBCDFGHJKLMNPQRSTVWXZ";
  int hexdigits = 0, hexdivpos = 0; // "O0lI123456789ABCDEFabcdef:"
  struct box *box3, *box2, *prev, *next, *pre2, *pre3, *pre4;
  int dx, dy, O0_num=0, O0_slashed_zeros=0,
      O0_maxw=0, O0_minw=999999, O0_maxh=0, O0_minh=999999;
  int nc=0, ns=0; // num corrections, num removed spaces
  wchar_t last_double_quotation=0; // correction of different quotations "
  pre4=pre3=pre2=prev=next=NULL;

  if (job->cfg.verbose)
    fprintf(stderr, "# context correction Il1 O0\n");

  // 1st loop to make max/min/num-statistics O0-correction 2018-09 rnd.tt
  for_each_data(&(job->res.boxlist)) {
    box2 = (struct box *)list_get_current(&(job->res.boxlist));
    dx= box2->x1 - box2->x0 + 1;
    dy= box2->y1 - box2->y0 + 1;
    if (ctx_in_set("O0",box2->c)
       && dy >= box2->m3 - box2->m2){ /* do not at degree sign */
      // ToDo18 maxw < 2*mean_dx
      O0_num++;
      if (box2->num_frames==3 && box2->c=='0') O0_slashed_zeros++;
      if (O0_maxw < dx) O0_maxw= dx; /* max width O */
      if (O0_minw > dx) O0_minw= dx; /* min width 0 */
      if (O0_maxh < dy) O0_maxh= dy; /* max high  0 */
      if (O0_minh > dy) O0_minh= dy; /* min high  O */
    }
  } end_for_each(&(job->res.boxlist));
  if (job->cfg.verbose)
    fprintf(stderr, "# O0 num= %d slashed0=%d mimaxW %d %d",
      O0_num, O0_slashed_zeros, O0_minw, O0_maxw);

  // 2nd loop to make corrections
  for_each_data(&(job->res.boxlist)) {
    pre4=pre3; pre3 = pre2; pre2 = prev; // 2010-10-01 tmp08/080916_JL*_150
    box2 = (struct box *)list_get_current(&(job->res.boxlist));
    prev = (struct box *)list_get_cur_prev(&(job->res.boxlist));
    next = (struct box *)list_get_cur_next(&(job->res.boxlist));
    dx= box2->x1 - box2->x0 + 1;
    dy= box2->y1 - box2->y0 + 1;
    if (box2->c==0) continue; // 2018-09 strchr false positive
    // ToDo: count last_upper, lower, digits, hexdigits
    // 2010-10-10 hex-mode tmp08/gocr0801_bad5
    if (ctx_in_set("O0lI123456789ABCDEFabcdef",box2->c)) hexdigits++;
    else if (ctx_in_set(": ",box2->c) && prev && prev->c!=box2->c
          && (hexdigits-hexdivpos==2 || hexdigits-hexdivpos==4))
      hexdivpos=hexdigits;
    else { hexdigits=0; hexdivpos=0; }
    if (box2->c==' ' && prev && prev->c==' ') hexdigits=0;
    if (box2->c==':' && pre3 && pre3->c!=':') hexdigits=0; // :89:AB:CD:
    if (ctx_in_set("O0",box2->c) && hexdigits>5) nc+=setc(box2,(wchar_t)'0');
    if (ctx_in_set("l1",box2->c) && hexdigits>5) nc+=setc(box2,(wchar_t)'1');
    // 2010-10-01 sample tmp08/0811CSchulze_crop
    if (box2->c==DOUBLE_LOW_9_QUOTATION_MARK) {
      last_double_quotation = box2->tac[0];
      // debug message, only shown in verbose mode like its sibling below
      IFV fprintf(stderr,"\n# ... found DOUBLE_LOW_9_QUOTATION_MARK");
    }
    if (box2->c==QUOTATION_MARK  // 0x22 = ""
       && last_double_quotation == DOUBLE_LOW_9_QUOTATION_MARK) {
      last_double_quotation = 0;
      box2->c = box2->tac[0] = DOUBLE_HIGH_REVERSED_9_QUOTATION_MARK;
      IFV fprintf(stderr,"\n# change nac=%d %s %3d to %s %3d at %3d %3d",
           box2->num_ac, "\"", box2->wac[0],
           decode(box2->c,ASCII), box2->wac[0], box2->x0, box2->y0);
    } // box2->c==QUOTATION_MARK  // 0x22 = ""

    if (            box2->c > 0xFF  ) continue; // temporary UNICODE fix 1
    if ((prev) && (prev->c > 0xFF)) continue;  // temporary UNICODE fix 2
    if ((next) && (next->c > 0xFF)) continue;  // temporary UNICODE fix 3
    if (box2->num_ac<2) continue; // no alternatives
    if (box2->wac[0]==100 && box2->wac[1]<100) continue;
    if (box2->num_ac && box2->tas[0]) continue; // buggy space_remove 0.42

    /* check for Il1| which are general difficult to distinguish */
    /* bbg: not very good. Should add some tests to check if is preceded by '.',
     spelling, etc */
    /* ToDo: only correct if not 100% sure (wac[i]<100)
        and new char is in wat[] */
    if (ctx_in_set("Il1|", box2->c) && next && prev) {
       if (wisalpha(next->c) && next->c!='i' &&
           ( prev->c == '\n' || // unref-pointer pre2 fix 2017-04-25 by Norbert M.
            ( prev->c == ' ' && (!pre2 || (pre2 && pre2->c == '.' )) ) ) )
         { nc+=setc(box2,(wchar_t)'I'); }
       else if (
            (  box2->c!='1'  /* lnt => Int, but 1st */
            && ctx_in_set(l_nonvo,next->c)
            && ctx_in_set("\" \n",prev->c))
         || (prev && ((!pre2) || wisupper(pre2->c) || ctx_in_set(" \n",pre2->c))
            && wisupper(prev->c)
            && box2->num_frame_vectors[0]==4
            && box2->frame_vector[0][0]==box2->x0
            && box2->frame_vector[1][0]==box2->x0
            && box2->frame_vector[2][0]==box2->x1
            && box2->frame_vector[3][0]==box2->x1
            )) // " DI*"
         /* do not change he'll to he'Il! */
         { nc+=setc(box2,(wchar_t)'I'); } // set box2->c to 'I' if 'I' is in the ac-list
       else if (ctx_in_set(l_vowel,next->c)) /* unusual? Ii Ie Ia Iy Iu */
         { nc+=setc(box2,(wchar_t)'l'); }
       else if (wisupper(next->c) // ToDo: check 6 neighbours for upper+spaces
             && !ctx_in_set("O0I123456789",next->c)
             && !ctx_in_set("O0I123456789",prev->c)) /* avoid lO => IO (10) */
         { nc+=setc(box2,(wchar_t)'I'); }
       else if (prev && wislower(prev->c))
         { nc+=setc(box2,(wchar_t)'l'); }
       else if (wisdigit(prev->c)
             || wisdigit(next->c)
             || (next && ctx_in_set(":-",next->c) && pre2 && pre2->c==next->c
                && prev && ctx_in_set("0123456789ABCDabcd",prev->c)) // hex 2010-10
             || (next->c=='O' && !wisalpha(prev->c)))  /* lO => 10 */
         { nc+=setc(box2,(wchar_t)'1'); }
    }
    // JS-2010-09 (ToDo: only if I is an alternate char!?)
    if (ctx_in_set("Il|", box2->c) && next && !prev) { // first char?
       if (wisalpha(next->c) && next->c!='i' && !ctx_in_set(l_vowel,next->c))
         { nc+=setc(box2,(wchar_t)'I'); }
       else if (wisupper(next->c)
             && !ctx_in_set("O0I123456789",next->c)) /* avoid lO => IO (10) */
         { nc+=setc(box2,(wchar_t)'I'); }
    }

    // ToDo: count width of all "0O" to decide between wide and narrow O's
    // ToDo: set dbg-stack to context correction + setc output corrections
    // FreeMono-Regular 0 is slightly higher and less width than O, rnd-chars
    //   0 is 30x51 (m3-m0=50 H=Xx46)
    //   O is 40x48
    /* check for O0 */
    else if (ctx_in_set("O0", box2->c)) {
      int i0, have_hexhi=0, have_hexlo=0, have_digits=0, have_alpha=0,
          have_upper=0;
      wchar_t c0; /* test char loop over pre4 ... next */
      for (i0=0; i0<5; i0++) {/* ToDo: take into account 100% chars only? */
        c0='\0';
        if (i0==4 && pre4) c0=pre4->c;
        if (i0==0 && pre3) c0=pre3->c;
        if (i0==1 && pre2) c0=pre2->c;
        if (i0==2 && prev) c0=prev->c;
        if (i0==3 && next) c0=next->c; /* ToDo17 nex2 for 0.7 */
        if (c0=='\0') continue;
        if (ctx_in_set("abcdef",c0)) have_hexlo++; /* 2017-07 */
        if (ctx_in_set("ABCDEF",c0)) have_hexhi++; /* 2017-07 */
        if (ctx_in_set("123456789",c0)) have_digits++;
        if (ctx_in_set("ghijklmnopqrstuvwxyz",c0)) have_alpha++;
        if (ctx_in_set("GHIJKLMNPQRSTUVWXYZ",c0)) have_upper++;
      }
      if ((have_hexlo && have_hexhi) || have_alpha || have_upper) {
         have_upper+=have_hexhi; have_alpha+=have_hexlo;
         have_hexlo=0; have_hexhi=0; } // have_hex*=0 if isalpha
      if (O0_slashed_zeros==0 && O0_num>1 // 2018-09 rnd80.tt
        && O0_maxw > O0_minw + (box2->x1 - box2->x0 + 1)/32 + 1
        && (box2->x1 - box2->x0 + 1) > (O0_maxw+O0_minw)/2
        && dy >= box2->m3 - box2->m2) {
        nc+=setc(box2,(wchar_t)'O'); // big width
        IFV fprintf(stderr," DBG%04d %d,%d: O0 to O", __LINE__,
          box2->x0,box2->y0);
      } else
      if (O0_slashed_zeros==0 && O0_num>1 // 2018-09 rnd80.tt
        && O0_maxw > O0_minw + (box2->x1 - box2->x0 + 1)/32 + 1
        && (box2->x1 - box2->x0 + 1) < (O0_maxw+O0_minw+1)/2
        && dy >= box2->m3 - box2->m2) {
        nc+=setc(box2,(wchar_t)'0'); // small width
        IFV fprintf(stderr," DBG%04d %d,%d: O0 to 0", __LINE__,
          box2->x0,box2->y0);
      } else
      if (((!next) || !ctx_in_set(" .,", next->c))
        && ((((!prev) || wisspace(prev->c)) // first letter?
              && have_alpha  /* words vs units? Orig vs. 0days */
              && (!have_digits))
          || (have_upper && (!have_digits)) )) // UPWORD?
        { nc+=setc(box2,(wchar_t)'O');
          IFV fprintf(stderr," DBG%04d %d,%d: O0 to O", __LINE__,
          box2->x0,box2->y0);}
        // ! " Otto"
      else if ((have_digits || have_hexlo /* || have_hexhi */ /* C0DE39 */
            || (prev && ctx_in_set(" -+", prev->c) &&
                next && ctx_in_set(" .,", next->c)))
            && (!have_upper) /*&& (!have_hexhi)*/) /* 2017-07 */
        { nc+=setc(box2,(wchar_t)'0');
          IFV fprintf(stderr," DBG%04d %d,%d: O0 to 0", __LINE__,
            box2->x0,box2->y0);}
    } // O0

    /* check for 5S */
    else if (ctx_in_set("5S", box2->c) && next && prev) {
      if (wisspace(prev->c) && wisalpha(next->c)) /* initial letter */
        { nc+=setc(box2,(wchar_t)'S'); }
      else if (wisalpha(prev->c) && wisalpha(next->c)
            && wisupper(next->c)) /* word in upper case */
        { nc+=setc(box2,(wchar_t)'S'); }
      else if (wisdigit(prev->c) || wisdigit(next->c))
        { nc+=setc(box2,(wchar_t)'5'); }
    }

    /* was a space not found? xXx => x Xx ??? */
    if (wisupper(box2->c) && next && prev) {
      if (wislower(prev->c) && wislower(next->c)
        && 2 * (box2->x0 - prev->x1) > 3 * (next->x0 - box2->x1)) {
        struct box *spc = malloc_box((struct box *) NULL);
        // presumably malloc_box returns NULL if no memory is left;
        // then simply no space is inserted
        if (spc) {
          spc->x0 = prev->x1 + 2;
          spc->x1 = box2->x0 - 2;
          spc->y0 = box2->y0;
          spc->y1 = box2->y1;
          spc->x = box2->x0 - 1;
          spc->y = box2->y0;
          spc->dots = 0;
          spc->num_boxes = 0;
          spc->num_subboxes = 0;
          spc->c = ' ';
          spc->modifier = 0;
          setac(spc,' ',99); /* ToDo: weight depends from distance */
          spc->num = -1;
          spc->line = prev->line;
          spc->m1 = spc->m2 = spc->m3 = spc->m4 = 0;
          spc->p = &(job->src.p);
          list_ins(&(job->res.boxlist), box2, spc); // insert before box2
        }
      }
    }

    /* a space before punctuation? but not " ./file" */
    if ( prev && next)
    if (prev->c == ' ' && ctx_in_set(" \n"    , next->c)
                       && ctx_in_set(".,;:!?)", box2->c))
      if (prev->x1 - prev->x0 < 2 * job->res.avX) { // carefully on tables
        box3 = prev;
        if ( !list_del(&(job->res.boxlist), box3) ) free_box(box3);
        prev = (struct box *)list_get_cur_prev(&(job->res.boxlist));
        ns++;
      }

    /* \'\' to \" */
    if ( prev )
    if ( (prev->c == '`' || prev->c == '\'')
      && (box2->c == '`' || box2->c == '\'') )
      // NOTE(review): prev->x1 - box2->x0 is the negated gap between both
      // quotes, so this test looks always true for sorted boxes - confirm
      if (prev->x1 - box2->x0 < job->res.avX) { // carefully on tables
        box2->c='\"';
        box3 = prev;
        // free only after successful unlinking (as done for spaces above)
        if ( !list_del(&(job->res.boxlist), box3) ) free_box(box3);
        // prev must not point to the released box, it becomes pre2 in
        // the next iteration and is dereferenced there
        prev = (struct box *)list_get_cur_prev(&(job->res.boxlist));
      }
  } end_for_each(&(job->res.boxlist));
  if (job->cfg.verbose)
    fprintf(stderr, " num_corrected= %d removed_spaces= %d\n", nc, ns);
  return 0;
}
3017
3018
/* ---- insert spaces ----
 * depends strongly on the outcome of measure_pitch()
 * ------------------------ */
/*
 * Insert boxes for spaces (' ') and line breaks ('\n') into the box list.
 *
 *   pp   image, stored as ->p of every inserted box
 *   job  OCR job; job->res.boxlist is extended, job->res.lines is read
 *
 * For the first box of a new text line the number of newlines is
 * estimated from the distance of the m1 values of both lines.  The number
 * of spaces in front of a box is estimated from the gap to the previous
 * box (or to the left text border for the first char of a line) divided
 * by the line pitch (monospaced lines) or by 1.5*avX (proportional).
 * Lines and chars of the box list must be sorted.
 * Always returns 0.
 */
int list_insert_spaces( pix *pp, job_t *job ) {
  // NOTE(review): min_x0 starts at 1023, text starting right of x=1023
  //   is therefore treated as if the left border were at 1023 - confirm
  int i=0, j1, j2, i1, maxline=-1, dy=0, num_nl=0, num_spc=0, min_x0=1023;
  char cc;
  struct box *box2, *box3=NULL, *box4=NULL;

  // measure mean line height
  for(i1=1;i1<job->res.lines.num;i1++) {
    dy+=job->res.lines.m4[i1]-job->res.lines.m1[i1]+1;
    if (min_x0>job->res.lines.x0[i1])
        min_x0=job->res.lines.x0[i1]; // 2010-09-30
  }
  if (job->res.lines.num>1) dy/=(job->res.lines.num-1);
  // 2nd pass: mean over lines within 80%..120% of the first mean only
  i=0; j2=0;
  for(i1=1;i1<job->res.lines.num;i1++) {
    j1=job->res.lines.m4[i1]-job->res.lines.m1[i1]+1;
    if (j1>dy*120/100 || j1<dy*80/100) continue; // only most frequently
    j2+=j1; i++;
  }
  if (i>0 && j2/i>7) dy=j2/i;
  if( job->cfg.verbose&1 )
    fprintf(stderr,"# insert space between words (dy=%d) ...",dy);
  if (!dy) dy=(job->res.avY)*110/100+1;

  if (min_x0 < 4) min_x0 = 0; // tmp09/oebb_teletext* monospaced first gap
  // ToDo: rewrite, replace cc by num_spc + num_nl
  i=0;
  for_each_data(&(job->res.boxlist)) {
    int thispitch=0, thismono=0, pdist=0; // spacing paras per line
    box2 =(struct box *)list_get_current(&(job->res.boxlist));
    cc=0; num_nl=0; num_spc=0;
    box3 = (struct box *)list_prev(&(job->res.boxlist), box2);
    if (box2->line > maxline) {  // new line, lines and chars must be sorted!
      int ydist=0, ypitch=0;
      if (maxline>=0) {
        // ToDo: allow multiple newlines
        if (box2->line>1)
          ydist = job->res.lines.m1[ box2->line   ]
                 -job->res.lines.m1[ box2->line-1 ];   // 2010-09-26
        ypitch = job->res.lines.m4[ box2->line ]
                -job->res.lines.m1[ box2->line ];
        if (ypitch>4) num_nl = ydist / (2*ypitch); // ToDo: improve it!
        if (!num_nl) num_nl=1;
      }
      maxline=box2->line;
    }
    if (box2->line==maxline) { // lines and chars must be sorted!
      thispitch = job->res.lines.pitch[box2->line];
      thismono  = job->res.lines.mono[ box2->line];
      if (box3) pdist = box2->x0 - box3->x1 - 1;  // 2010-09-26
      if (pdist < 0) pdist = 0; // overlap like proportional: "VA"
      if (num_nl || !box3)
        pdist = box2->x0 - min_x0; // first char of new line
      // never divide by zero: an unset pitch falls back to the
      // proportional estimate, an unset avX leaves num_spc at 0
      if (thismono && thispitch>0) num_spc = pdist / thispitch;
      else if (job->res.avX>0)
                    num_spc = pdist*2 / (3*job->res.avX); // ToDo: use 1em!
      if (pdist>=thispitch && !num_spc) num_spc = 1; // proportional font
      // ToDo: multi spaces for proportional font
    }

#if 0
    if ((job->cfg.verbose&48)==48)
      fprintf(stderr,"\n# DBG%02d %d mono=%d %d pitch= %2d"
      " pdist= %2d nl %d spc %d", maxline, box2->line, thismono,
      job->res.lines.mono[ box2->line], thispitch, pdist, num_nl, num_spc);
#endif

    // first the newline boxes, then the space boxes, all before box2
    for (i1=0;i1<num_nl+num_spc;i1++) {
      int mdist=0;
      box4=(struct box *)list_prev(&(job->res.boxlist), box2);
      if (box4) mdist = box2->x0 - box4->x1 + 1;  // 2010-09
      else      mdist = 0;
      if (mdist<0) mdist=0;
      box3=(struct box *)malloc_box(NULL);
      // presumably malloc_box returns NULL if no memory is left
      if (!box3) { fprintf(stderr,"ERROR malloc_box\n"); break; }
      box3->x0=box2->x0-2+((num_spc)?-mdist+ i1   *mdist/num_spc:0);
      box3->x1=box2->x0-2+((num_spc)?-mdist+(i1+1)*mdist/num_spc:0);
      box3->y0=box2->y0;
      box3->y1=box2->y1;
      if (i1>=num_nl && box4)
        box3->x0 = box4->x1+2+((num_spc)?i1*mdist/num_spc:0);
      if (i1< num_nl || !box4)
        box3->x0 = job->res.lines.x0[box2->line];
      if (i1< num_nl && box4){
        box3->y0=box4->y1;	// better use lines.y1[box2->pre] ???
        box3->y1=box2->y0;
      }
      box3->x = box3->x0;  // 2010-09
      box3->y = box2->y0;
      box3->dots = 0;
      box3->c = cc = ((i1<num_nl)?'\n':' ');
      box3->num_boxes = 0;
      box3->num_subboxes = 0;
      box3->modifier = '\0';
      box3->num=-1; box3->line=box2->line;
      box3->m1=box2->m1; box3->m2=box2->m2;
      box3->m3=box2->m3; box3->m4=box2->m4;
      box3->p=pp;
      setac(box3,cc,100);   /* ToDo: weight depends from distance */
      list_ins(&(job->res.boxlist),box2,box3); // insert box3 before box2
      if( job->cfg.verbose&1 ) {
        fprintf(stderr,"\n# insert space &%d; at %4d %4d box= %p"
          " mono %d dx %2d pdx,mdx %2d %2d",
          (int)box3->c, box3->x0, box3->y0, (void*)box3,
          thismono, thispitch, pdist, mdist);
      }
      i++;
    }
  } end_for_each(&(job->res.boxlist));
  if( job->cfg.verbose&1 ) fprintf(stderr,"\n# ... found %d spaces\n",i);
  return 0;
}
3132
3133
3134 /*
3135 add infos where the box is positioned to the box
3136 this is useful for better recognition
3137 */
add_line_info(job_t * job)3138 int add_line_info( job_t *job /* , List *boxlist2 */){
3139 struct tlines *lines = &job->res.lines;
3140 struct box *box2;
3141 int i,xx,mindy1,mindy2,m1,m2,m3,m4,num_line_members=0,num_rest=0;
3142 if (job->cfg.verbose&1) fprintf(stderr,"# add_line_info to boxes ...");
3143 for_each_data(&(job->res.boxlist)) {
3144 box2 =(struct box *)list_get_current(&(job->res.boxlist));
3145 for (i=1;i<job->res.lines.num;i++) /* line 0 is a place holder */
3146 { // add rotated image correction dy(x)
3147 if (lines->dx) xx=lines->dy*((box2->x1+box2->x0)/2)/lines->dx;
3148 else xx=0;
3149 m1= lines->m1[i]+xx;
3150 m2= lines->m2[i]+xx;
3151 m3= lines->m3[i]+xx;
3152 m4= lines->m4[i]+xx;
3153 if (m4-m1==0) continue; /* no text line (line==0) */
3154 /* --- 2018-10 min y-distance y0 to m1 or y1 to m4 --- */
3155 mindy1 = abs(box2->y0 - m1); // min dy (dots and _)
3156 if (mindy1 > abs(box2->y1 - m4))
3157 mindy1 = abs(box2->y1 - m4);
3158 mindy2=999999;
3159 if (box2->m2){ mindy2= abs(box2->y0 - box2->m1);
3160 if (mindy2 > abs(box2->y1 - box2->m4))
3161 mindy2 = abs(box2->y1 - box2->m4);
3162 }
3163 // fprintf(stderr," test line %d m1=%d %d %d %d\n",i,m1,m2,m3,m4);
3164 #if 1 // added + modified again 2018-10 ToDo18 need m5 distance to next line
3165 if(( (box2->y1 -box2->y0 +1) <= (m3-m2+1)/2
3166 && box2->y0 +job->res.avY/2 +2 >= m1
3167 && box2->y1 -job->res.avY/2 -2 <= m4 ) // dots (ToDo: better 2nd run)
3168 || ( box2->y1 +job->res.avY/4 +2 >= m2 // body
3169 && box2->y0 -job->res.avY/4 -2 <= m3)) /* not to far away */
3170 #endif
3171 /* give also a comma or dot behind the line a chance */
3172 if ( box2->x0 >= lines->x0[i]
3173 && box2->x1 <= lines->x1[i]+job->res.avX )
3174 if ( box2->y0 <= m4 + 2*job->res.avY // 2010-10-01+09 0811qemu2
3175 && box2->y1 >= m1 - job->res.avY/2 - 1 // give "a "o ... a chance
3176 && box2->y1 <= m4 + 2*job->res.avY ) // 2010-10-09 ocr-b-'_'
3177 if ( box2->m2==0 // already put to a line? check y-distance
3178 // || abs(box2->y0 - box2->m2) > abs(box2->y0 - m2)
3179 || mindy1 < mindy2 )
3180 { /* found nearest line */
3181 if ((job->cfg.verbose&16) && (box2->y1 -box2->y0 +1) <= (m3-m2+1)/2)
3182 fprintf(stderr,"\n# line.info.set L%02d xy= %4d %4d m14 %4d %4d avY %4d",
3183 i, box2->x0, box2->y0, m1, m4, job->res.avY);
3184 box2->m1= m1;
3185 box2->m2= m2;
3186 box2->m3= m3;
3187 box2->m4= m4;
3188 box2->line= i;
3189 }
3190 } // i=1..lines (for every char)
3191 if (((box2->y1 -box2->y0 +1) >= (box2->m3 -box2->m2+1)/2) // body not dots
3192 && (box2->y1+2 < box2->m1
3193 || box2->y0 < box2->m1 - (box2->m3-box2->m1)/2
3194 || box2->y0-2 > box2->m4 + (box2->m3-box2->m2)/2 // bad m4 + ,._ ocr-b
3195 || box2->y1 > box2->m3 + (box2->m3-box2->m1)
3196 )) /* to far away */
3197 { /* reset */
3198 if (job->cfg.verbose&16)
3199 fprintf(stderr,"\n# line.info.reset L%02d xy= %4d %4d m14 %4d %4d avY %4d",
3200 box2->line, box2->x0, box2->y0, box2->m1, box2->m4, job->res.avY);
3201 box2->m1= 0;
3202 box2->m2= 0;
3203 box2->m3= 0;
3204 box2->m4= 0;
3205 box2->line= 0;
3206 num_rest++;
3207 } else num_line_members++;
3208 } end_for_each(&(job->res.boxlist));
3209 if (job->cfg.verbose&1)
3210 fprintf(stderr," done, num_line_chars=%d rest=%d\n",
3211 num_line_members, num_rest);
3212 return 0;
3213 }
3214
3215
3216 /*
3217 * bring the boxes in right order
3218 * add_line_info must be executed first!
3219 */
sort_box_func(const void * a,const void * b)3220 int sort_box_func (const void *a, const void *b) {
3221 struct box *boxa, *boxb;
3222
3223 boxa = (struct box *)a;
3224 boxb = (struct box *)b;
3225
3226 if ( ( boxb->line < boxa->line ) ||
3227 ( boxb->line == boxa->line && boxb->x0 < boxa->x0 ) )
3228 return 1;
3229 return -1;
3230 }
3231
3232 // -------------------------------------------------------------
3233 // ------ use this for entry from other programs
3234 // include pnm.h pgm2asc.h
3235 // -------------------------------------------------------------
3236 // entry point for gocr.c or if it is used as lib
3237 // better name is call_ocr ???
3238 // jb: OLD COMMENT: not removed due to set_options_* ()
3239 // args after pix *pp should be removed and new functions
3240 // set_option_mode(int mode), set_option_spacewidth() .... etc.
3241 // should be used instead, before calling pgm2asc(pix *pp)
3242 // ! change if you can ! - used by X11 frontend
pgm2asc(job_t * job)3243 int pgm2asc(job_t *job)
3244 {
3245 pix *pp;
3246 progress_counter_t *pc;
3247 static int multi_image_count=0; /* number of image within multi-image */
3248 int orig_cs=0;
3249
3250 if (!multi_image_count) orig_cs = job->cfg.cs; /* save for multi-images */
3251
3252 multi_image_count++;
3253
3254 assert(job);
3255 /* FIXME jb: remove pp */
3256 pp = &(job->src.p);
3257
3258 pc = open_progress(100,"pgm2asc_main");
3259 progress(0,pc); /* start progress output 0% 0% */
3260 #if 0 /* dont vast memory */
3261 /* FIXME jb: malloc */
3262 if ( job->cfg.verbose & 32 ) {
3263 // generate 2nd imagebuffer for debugging output
3264 job->tmp.ppo.p = (unsigned char *)malloc(job->src.p.y * job->src.p.x);
3265 // buffer
3266 assert(job->tmp.ppo.p);
3267 copybox(&job->src.p,
3268 0, 0, job->src.p.x, job->src.p.y,
3269 &job->tmp.ppo,
3270 job->src.p.x * job->src.p.y);
3271 }
3272 #else
3273 job->tmp.ppo=job->src.p; /* temporarely, removed later */
3274 #endif
3275 // check for bad read/format-convert image writeppm/png 2018-10
3276 //if(job->cfg.verbose&32) debug_img("out00",job,8/* 8=clr_bit1..3 */);
3277
3278
3279 /* ----- count colors ------ create histogram -------
3280 - this should be used to create a upper and lower limit for cs
3281 - cs is the optimum gray value between cs_min and cs_max
3282 - also inverse scans could be detected here later */
3283 if (orig_cs==0)
3284 job->cfg.cs=otsu( pp->p,pp->y,pp->x,0,0,pp->x,pp->y,job->cfg.verbose & 1);
3285 else // dont set cs, output stats + do inversion if needed 2010-10-07
3286 otsu( pp->p,pp->y,pp->x,0,0,pp->x,pp->y,job->cfg.verbose & 1);
3287 // if (job->cfg.verbose&32) debug_img("out001.ppm",job,0);
3288 /* renormalize the image and set the normalized threshold value */
3289 job->cfg.cs=thresholding( pp->p,pp->y,pp->x,0,0,pp->x,pp->y, job->cfg.cs );
3290 if( job->cfg.verbose )
3291 fprintf(stderr, "# thresholding new_threshold= %d\n", job->cfg.cs);
3292 // if (job->cfg.verbose&32) debug_img("out002.ppm",job,0);
3293
3294 progress(5,pc); /* progress is only estimated */
3295
3296
3297 /* this is first step for reorganize the PG
3298 ---- look for letters, put rectangular frames around letters
3299 letter = connected points near color F
3300 should be used by dust removing (faster) and line detection!
3301 ---- 0..cs = black letters, last change = Mai99 */
3302
3303 progress(8,pc); /* progress is only estimated */
3304
3305 // if (job->cfg.verbose&32) debug_img("out008.ppm",job,8);
3306 scan_boxes( job, pp );
3307 if ( !job->res.numC ){
3308 fprintf( stderr,"# no boxes found - stopped\n" );
3309 if(job->cfg.verbose&32) debug_img("out01",job,8);
3310 /***** should free stuff, etc) */
3311 return(1);
3312 }
3313 // tmp10/bug100818a.pgm creates artefacts on image
3314 // if (job->cfg.verbose&32) debug_img("out00",job,4+8);
3315
3316 progress(10,pc); /* progress is only estimated */
3317 // if(job->cfg.verbose&32) debug_img("out01",job,4+8);
3318 // output_list(job); // for debugging
3319 // ToDo: matrix printer preprocessing
3320
3321 remove_dust( job ); /* from the &(job->res.boxlist)! */
3322 // if(job->cfg.verbose&32) debug_img("out02",job,4+8);
3323 // output_list(job); // for debugging
3324 #if 0 // ToDo 2010-10-15 destroys QR-barcodes
3325 smooth_borders( job ); /* only for big chars */
3326 #endif
3327 progress(12,pc); /* progress is only estimated */
3328 // if(job->cfg.verbose&32) debug_img("out03",job,4+8);
3329 // output_list(job); // for debugging
3330
3331 detect_barcode( job ); /* mark barcode */
3332 // if(job->cfg.verbose&32) debug_img("out04",job,4+8);
3333 // output_list(job); // for debugging
3334
3335 detect_pictures( job ); /* mark pictures */
3336 // if(job->cfg.verbose&32) debug_img("out05",job,4+8);
3337 // output_list(job); // for debugging
3338
3339 remove_pictures( job ); /* do this as early as possible, before layout */
3340 // if(job->cfg.verbose&32) debug_img("out06",job,4+8);
3341 // output_list(job); // for debugging
3342
3343 glue_holes_inside_chars( pp ); /* including count subboxes (holes) */
3344
3345 detect_rotation_angle( job );
3346
3347 #if 1 /* Rotate the whole picture! move boxes */
3348 if( job->res.lines.dy!=0 ){ // move down lowest first, move up highest first
3349 // in work! ??? (at end set dy=0) think on ppo!
3350 }
3351 #endif
3352 detect_text_lines( pp, job->cfg.mode ); /* detect and mark job->tmp.ppo */
3353 // if(job->cfg.verbose&32) debug_img("out07",job,4+8);
3354 progress(20,pc); /* progress is only estimated */
3355
3356 add_line_info( job /* , &(job->res.boxlist) */);
3357 if (job->cfg.verbose&32) debug_img("out10",job,4+8);
3358
3359 divide_vert_glued_boxes( pp, job->cfg.mode); /* after add_line_info, before list_sort! */
3360 // if(job->cfg.verbose&32) debug_img("out11",job,0);
3361
3362 remove_melted_serifs( job, pp ); /* make some corrections on pixmap */
3363 /* list_ins seems to sort in the boxes on the wrong place ??? */
3364 // if(job->cfg.verbose&32) debug_img("out12",job,4+8);
3365
3366 glue_broken_chars( job, pp ); /* 2nd glue */
3367 // if(job->cfg.verbose&32) debug_img("out14",job,4+8);
3368 // 2010-09-24 overall box size is correct here, but later broken
3369
3370 remove_rest_of_dust( job );
3371 // if(job->cfg.verbose&32) debug_img("out15",job,4+8);
3372
3373 /* better sort after dust is removed (slow for lot of pixels) */
3374 list_sort(&(job->res.boxlist), sort_box_func);
3375
3376 measure_pitch( job );
3377
3378 if(job->cfg.mode&64) find_same_chars( pp );
3379 progress(30,pc); /* progress is only estimated */
3380 // if(job->cfg.verbose&32) debug_img("out16",job,4+8);
3381
3382 char_recognition( pp, job->cfg.mode);
3383 progress(60,pc); /* progress is only estimated */
3384 // if(job->cfg.verbose&32) debug_img("out17",job,4+8);
3385
3386 if ( adjust_text_lines( pp, job->cfg.mode ) ) { /* correct using chars */
3387 /* may be, characters/pictures have changed line number */
3388 list_sort(&(job->res.boxlist), sort_box_func);
3389 // 2nd recognition call if lines are adjusted
3390 char_recognition( pp, job->cfg.mode);
3391 }
3392
3393 #define BlownUpDrawing 1 /* german: Explosionszeichnung, temporarly */
3394 #if BlownUpDrawing == 1 /* german: Explosionszeichnung */
3395 { /* just for debugging */
3396 int i,ii,ni; struct box *box2;
3397 i=ii=ni=0;
3398 for_each_data(&(job->res.boxlist)) { /* count boxes */
3399 box2 = (struct box *)list_get_current(&(job->res.boxlist));
3400 if (box2->c==UNKNOWN) i++;
3401 if (box2->c==PICTURE) ii++;
3402 ni++;
3403 } end_for_each(&(job->res.boxlist));
3404 if (job->cfg.verbose)
3405 fprintf(stderr,"# debug: unknown= %d picts= %d boxes= %d\n",i,ii,ni);
3406 }
3407 #endif
3408 // ----------- write out20.pgm ----------- mark lines + boxes
3409 if (job->cfg.verbose&32) debug_img("out20",job,1+4+8);
3410
3411 compare_unknown_with_known_chars( pp, job->cfg.mode);
3412 progress(70,pc); /* progress is only estimated */
3413
3414 try_to_divide_boxes( pp, job->cfg.mode);
3415 progress(80,pc); /* progress is only estimated */
3416
3417 /* --- list output ---- for debugging --- */
3418 if (job->cfg.verbose&6) output_list(job);
3419
3420 /* ---- insert spaces ---- */
3421 list_insert_spaces( pp , job );
3422
3423 // ---- proof difficult chars Il1 by context view ----
3424 if (job->cfg.verbose)
3425 fprintf(stderr,"# context correction if !(mode&32)\n");
3426 if (!(job->cfg.mode&32)) context_correction( job );
3427
3428 store_boxtree_lines( job, job->cfg.mode );
3429 progress(90,pc); /* progress is only estimated */
3430
3431 /* 0050002.pgm.gz ca. 109 digits, only 50 recognized (only in lines?)
3432 * ./gocr -v 39 -m 56 -e - -m 4 -C 0-9 -f XML tmp0406/0050002.pbm.gz
3433 * awk 'BEGIN{num=0}/1<\/box>/{num++;}END{print num}' o
3434 * 15*0 24*1 18*2 19*3 15*4 6*5 6*6 6*7 4*8 8*9 sum=125digits counted boxes
3435 * 9*0 19*1 14*2 15*3 11*4 6*5 5*6 6*7 4*8 8*9 sum=97digits recognized
3436 * 1*1 1*7 not recognized (Oct04)
3437 * 33*SPC 76*NL = 109 spaces + 36*unknown sum=241 * 16 missed
3438 */
3439 #if BlownUpDrawing == 1 /* german: Explosionszeichnung */
3440 { /* just for debugging */
3441 int i,ii,ni; struct box *box2; const char *testc="0123456789ABCDEFGHIJK";
3442 i=ii=ni=0;
3443 for_each_data(&(job->res.boxlist)) { /* count boxes */
3444 box2 = (struct box *)list_get_current(&(job->res.boxlist));
3445 if (box2->c==UNKNOWN) i++;
3446 if (box2->c==PICTURE) ii++;
3447 if (box2->c>' ' && box2->c<='z') ni++;
3448 } end_for_each(&(job->res.boxlist));
3449 if(job->cfg.verbose)
3450 fprintf(stderr,"# debug: (_)= %d picts= %d chars= %d",i,ii,ni);
3451 for (i=0;i<20;i++) {
3452 ni=0;
3453 for_each_data(&(job->res.boxlist)) { /* count boxes */
3454 box2 = (struct box *)list_get_current(&(job->res.boxlist));
3455 if (box2->c==testc[i]) ni++;
3456 } end_for_each(&(job->res.boxlist));
3457 if(job->cfg.verbose && ni>0)
3458 fprintf(stderr," (%c)=%d",testc[i],ni);
3459 }
3460 if(job->cfg.verbose)
3461 fprintf(stderr,"\n");
3462 }
3463 #endif
3464
3465 // ---- frame-size-histogram
3466 // ---- (my own defined) distance between letters
3467 // ---- write internal picture of textsite
3468 // ----------- write out30.pgm -----------
3469 if( job->cfg.verbose&32 ) debug_img("out30",job,2+4);
3470
3471 progress(100,pc); /* progress is only estimated */
3472
3473 close_progress(pc);
3474
3475 return 0; /* what should I return? error-state? num-of-chars? */
3476 }
3477