1 /*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 -
4 - Redistribution and use in source and binary forms, with or without
5 - modification, are permitted provided that the following conditions
6 - are met:
7 - 1. Redistributions of source code must retain the above copyright
8 - notice, this list of conditions and the following disclaimer.
9 - 2. Redistributions in binary form must reproduce the above
10 - copyright notice, this list of conditions and the following
11 - disclaimer in the documentation and/or other materials
12 - provided with the distribution.
13 -
14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *====================================================================*/
26
27 /*!
28 * \file recogdid.c
29 * <pre>
30 *
31 * Top-level identification
32 * BOXA *recogDecode()
33 *
34 * Generate decoding arrays
35 * static l_int32 recogPrepareForDecoding()
36 * static l_int32 recogMakeDecodingArray()
37 *
38 * Dynamic programming for best path
39 * static l_int32 recogRunViterbi()
40 * static l_int32 recogRescoreDidResult()
41 * static PIX *recogShowPath()
42 *
43 * Create/destroy temporary DID data
44 * l_int32 recogCreateDid()
45 * l_int32 recogDestroyDid()
46 *
47 * Various helpers
48 * l_int32 recogDidExists()
49 * L_RDID *recogGetDid()
50 * static l_int32 recogGetWindowedArea()
51 * l_int32 recogSetChannelParams()
52 * static l_int32 recogTransferRchToDid()
53 *
54 * See recogbasic.c for examples of training a recognizer, which is
55 * required before it can be used for document image decoding.
56 *
57 * Gary Kopec pioneered this hidden markov approach to "Document Image
58 * Decoding" (DID) in the early 1990s. It is based on estimation
59 * using a generative model of the image generation process, and
60 * provides the most likely decoding of an image if the model is correct.
61 * Given the model, it finds the maximum a posteriori (MAP) "message"
62 * given the observed image. The model describes how to generate
63 * an image from a message, and the MAP message is derived from the
64 * observed image using Bayes' theorem. This approach can also be used
65 * to build the model, using the iterative expectation/maximization
66 * method from labeled but errorful data.
67 *
68 * In a little more detail: The model comprises three things: the ideal
69 * printed character templates, the independent bit-flip noise model, and
70 * the character setwidths. When a character is printed, the setwidth
71 * is the distance in pixels that you move forward before being able
72 * to print the next character. It is typically slightly less than the
73 * width of the character template: if too small, an extra character can be
74 * hallucinated; if too large, it will not be able to match the next
75 * character template on the line. The model assumes that the probabilities
76 * of bit flip depend only on the assignment of the pixel to background
77 * or template foreground. The multilevel templates have different
78 * bit flip probabilities for each level. Because a character image
79 * is composed of many pixels, each of which can be independently flipped,
80 * the actual probability of seeing any rendering is exceedingly small,
81 * being composed of the product of the probabilities for each pixel.
82 * The log likelihood is used both to avoid numeric underflow and,
83 * more importantly, because it results in a summation of independent
84 * pixel probabilities. That summation can be shown, in Kopec's
85 * original paper, to consist of a sum of two terms: (a) the number of
86 * fg pixels in the bit-and of the observed image with the ideal
87 * template and (b) the number of fg pixels in the template. Each
88 * has a coefficient that depends only on the bit-flip probabilities
89 * for the fg and bg. A beautiful result, and computationally simple!
90 * One nice feature of this approach is that the result of the decoding
91 * is not very sensitive to the values used for the bit flip probabilities.
92 *
93 * The procedure for finding the best decoding (MAP) for a given image goes
94 * under several names: Viterbi, dynamic programming, hidden markov model.
95 * It is called a "hidden markov model" because the templates are assumed
96 * to be printed serially and we don't know what they are -- the identity
97 * of the templates must be inferred from the observed image.
98 * The possible decodings form a dense trellis over the pixel positions,
99 * where at each pixel position you have the possibility of having any
100 * of the characters printed there (with some reference point) or having
101 * a single pixel wide space inserted there. Thus, before the trellis
102 * can be traversed, we must do the work of finding the log probability,
103 * at each pixel location, that each of the templates was printed there.
104 * Armed with those arrays of data, the dynamic programming procedure
105 * moves from left to right, one pixel at a time, recursively finding
106 * the path with the highest log probability that gets to that pixel
107 * position (and noting which template was printed to arrive there).
108 * After reaching the right side of the image, we can simply backtrack
109 * along the path, jumping over each template that lies on the highest
110 * scoring path. This best path thus only goes through a few of the
111 * pixel positions.
112 *
113 * There are two refinements to the original Kopec paper. In the first,
114 * one uses multiple, non-overlapping fg templates, each with its own
115 * bit flip probability. This makes sense, because the probability
116 * that a fg boundary pixel flips to bg is greater than that of a fg
117 * pixel not on the boundary. And the flip probability of a fg boundary
118 * pixel is smaller than that of a bg boundary pixel, which in turn
119 * is greater than that of a bg pixel not on a boundary (the latter
120 * is taken to be the true background). Then the simplest realistic
121 * multiple template model has three templates that are not background.
122 *
123 * In the second refinement, a heuristic (strict upper bound) is used
124 * iteratively in the Viterbi process to compute the log probabilities.
125 * Using the heuristic, you find the best path, and then score all nodes
126 * on that path with the actual probability, which is guaranteed to
127 * be a smaller number. You run this iteratively, rescoring just the best
128 * found path each time. After each rescoring, the path may change because
129 * the local scores have been reduced. However, the process converges
130 * rapidly, and when it doesn't change, it must be the best path because
131 * it is properly scored (even if neighboring paths are heuristically
132 * scored). The heuristic score is found column-wise by assuming
133 * that all the fg pixels in the template are on fg pixels in the image --
134 * we just take the minimum of the number of pixels in the template
135 * and image column. This can easily give a 10-fold reduction in
136 * computation because the heuristic score can be computed much faster
137 * than the exact score.
138 *
139 * For reference, the classic paper on the approach by Kopec is:
140 * * "Document Image Decoding Using Markov Source Models", IEEE Trans.
141 * PAMI, Vol 16, No. 6, June 1994, pp 602-617.
142 * A refinement of the method for multilevel templates by Kopec is:
143 * * "Multilevel Character Templates for Document Image Decoding",
144 * Proc. SPIE 3027, Document Recognition IV, p. 168ff, 1997.
145 * Further refinements for more efficient decoding are given in these
146 * two papers, which are both stored on leptonica.org:
147 * * "Document Image Decoding using Iterated Complete Path Search", Minka,
148 * Bloomberg and Popat, Proc. SPIE Vol 4307, p. 250-258, Document
149 * Recognition and Retrieval VIII, San Jose, CA 2001.
150 * * "Document Image Decoding using Iterated Complete Path Search with
151 * Subsampled Heuristic Scoring", Bloomberg, Minka and Popat, ICDAR 2001,
152 * p. 344-349, Sept. 2001, Seattle.
153 * </pre>
154 */
155
156 #include <string.h>
157 #include <math.h>
158 #include "allheaders.h"
159
160 static l_int32 recogPrepareForDecoding(L_RECOG *recog, PIX *pixs,
161 l_int32 debug);
162 static l_int32 recogMakeDecodingArray(L_RECOG *recog, l_int32 index,
163 l_int32 debug);
164 static l_int32 recogRunViterbi(L_RECOG *recog, PIX **ppixdb);
165 static l_int32 recogRescoreDidResult(L_RECOG *recog, PIX **ppixdb);
166 static PIX *recogShowPath(L_RECOG *recog, l_int32 select);
167 static l_int32 recogGetWindowedArea(L_RECOG *recog, l_int32 index,
168 l_int32 x, l_int32 *pdely, l_int32 *pwsum);
169 static l_int32 recogTransferRchToDid(L_RECOG *recog, l_int32 x, l_int32 y);
170
171 /* Parameters for modeling the decoding */
172 static const l_float32 SetwidthFraction = 0.95;
173 static const l_int32 MaxYShift = 1;
174
175 /* Channel parameters. alpha[0] is the probability that a bg pixel
176 * is OFF. alpha[1] is the probability that level 1 fg is ON.
177 * The actual values are not too critical, but they must be larger
178 * than 0.5 and smaller than 1.0. For more accuracy in template
179 * matching, use a 4-level template, where levels 2 and 3 are
180 * boundary pixels in the fg and bg, respectively. */
181 static const l_float32 DefaultAlpha2[] = {0.95f, 0.9f};
182 static const l_float32 DefaultAlpha4[] = {0.95f, 0.9f, 0.75f, 0.25f};
183
184
185 /*------------------------------------------------------------------------*
186 * Top-level identification *
187 *------------------------------------------------------------------------*/
188 /*!
189 * \brief recogDecode()
190 *
191 * \param[in] recog with LUT's pre-computed
192 * \param[in] pixs typically of multiple touching characters, 1 bpp
193 * \param[in] nlevels of templates; 2 for now
194 * \param[out] ppixdb [optional] debug result; can be null
195 * \return boxa segmentation of pixs into characters, or NULL on error
196 *
197 * <pre>
198 * Notes:
199 * (1) The input pixs has been filtered so that it is likely to be
200 * composed of more than one touching character. Specifically,
201 * its height can only slightly exceed that of the tallest
202 * unscaled template, the width is somewhat larger than the
203 * width of the widest unscaled template, and the w/h aspect ratio
204 * is bounded by max_wh_ratio.
205 * (2) This uses the DID mechanism with labeled templates to
206 * segment the input %pixs. The resulting segmentation is
207 * returned. (It is given by did->boxa).
208 * (3) In debug mode, the Viterbi path is rescored based on all
209 * the templates. In non-debug mode, the same procedure is
210 * carried out by recogIdentifyPix() on the result of the
211 * segmentation.
212 * </pre>
213 */
214 BOXA *
recogDecode(L_RECOG * recog,PIX * pixs,l_int32 nlevels,PIX ** ppixdb)215 recogDecode(L_RECOG *recog,
216 PIX *pixs,
217 l_int32 nlevels,
218 PIX **ppixdb)
219 {
220 l_int32 debug;
221 PIX *pix1;
222 PIXA *pixa;
223
224 PROCNAME("recogDecode");
225
226 if (ppixdb) *ppixdb = NULL;
227 if (!recog)
228 return (BOXA *)ERROR_PTR("recog not defined", procName, NULL);
229 if (!pixs || pixGetDepth(pixs) != 1)
230 return (BOXA *)ERROR_PTR("pixs undefined or not 1 bpp", procName, NULL);
231 if (!recog->train_done)
232 return (BOXA *)ERROR_PTR("training not finished", procName, NULL);
233 if (nlevels != 2)
234 return (BOXA *)ERROR_PTR("nlevels != 2 (for now)", procName, NULL);
235
236 debug = (ppixdb) ? 1 : 0;
237 if (recogPrepareForDecoding(recog, pixs, debug))
238 return (BOXA *)ERROR_PTR("error making arrays", procName, NULL);
239 recogSetChannelParams(recog, nlevels);
240
241 /* Normal path; just run Viterbi */
242 if (!debug) {
243 if (recogRunViterbi(recog, NULL) == 0)
244 return boxaCopy(recog->did->boxa, L_COPY);
245 else
246 return (BOXA *)ERROR_PTR("error in Viterbi", procName, NULL);
247 }
248
249 /* Debug path */
250 if (recogRunViterbi(recog, &pix1))
251 return (BOXA *)ERROR_PTR("error in viterbi", procName, NULL);
252 pixa = pixaCreate(2);
253 pixaAddPix(pixa, pix1, L_INSERT);
254 if (recogRescoreDidResult(recog, &pix1)) {
255 pixaDestroy(&pixa);
256 return (BOXA *)ERROR_PTR("error in rescoring", procName, NULL);
257 }
258 pixaAddPix(pixa, pix1, L_INSERT);
259 *ppixdb = pixaDisplayTiledInRows(pixa, 32, 2 * pixGetWidth(pix1) + 100,
260 1.0, 0, 30, 2);
261 pixaDestroy(&pixa);
262 return boxaCopy(recog->did->boxa, L_COPY);
263 }
264
265
266 /*------------------------------------------------------------------------*
267 * Generate decoding arrays *
268 *------------------------------------------------------------------------*/
269 /*!
270 * \brief recogPrepareForDecoding()
271 *
272 * \param[in] recog with LUT's pre-computed
273 * \param[in] pixs typically of multiple touching characters, 1 bpp
274 * \param[in] debug 1 for debug output; 0 otherwise
275 * \return 0 if OK, 1 on error
276 *
277 * <pre>
278 * Notes:
279 * (1) Binarizes and crops input %pixs.
280 * (2) Removes previous L_RDID struct and makes a new one.
281 * (3) Generates the bit-and sum arrays for each character template
282 * at each pixel position in %pixs. These are used in the
283 * Viterbi dynamic programming step.
284 * (4) The values are saved in the scoring arrays at the left edge
285 * of the template. They are used in the Viterbi process
286 * at the setwidth position (which is near the RHS of the template
287 * as it is positioned on pixs) in the generated trellis.
288 * </pre>
289 */
290 static l_int32
recogPrepareForDecoding(L_RECOG * recog,PIX * pixs,l_int32 debug)291 recogPrepareForDecoding(L_RECOG *recog,
292 PIX *pixs,
293 l_int32 debug)
294 {
295 l_int32 i;
296 PIX *pix1;
297 L_RDID *did;
298
299 PROCNAME("recogPrepareForDecoding");
300
301 if (!recog)
302 return ERROR_INT("recog not defined", procName, 1);
303 if (!pixs || pixGetDepth(pixs) != 1)
304 return ERROR_INT("pixs not defined or not 1 bpp", procName, 1);
305 if (!recog->train_done)
306 return ERROR_INT("training not finished", procName, 1);
307
308 if (!recog->ave_done)
309 recogAverageSamples(&recog, 0);
310
311 /* Binarize and crop to foreground if necessary */
312 if ((pix1 = recogProcessToIdentify(recog, pixs, 0)) == NULL)
313 return ERROR_INT("pix1 not made", procName, 1);
314
315 /* Remove any existing RecogDID and set up a new one */
316 recogDestroyDid(recog);
317 if (recogCreateDid(recog, pix1)) {
318 pixDestroy(&pix1);
319 return ERROR_INT("decoder not made", procName, 1);
320 }
321
322 /* Compute vertical sum and first moment arrays */
323 did = recogGetDid(recog); /* owned by recog */
324 did->nasum = pixCountPixelsByColumn(pix1);
325 did->namoment = pixGetMomentByColumn(pix1, 1);
326
327 /* Generate the arrays */
328 for (i = 0; i < recog->did->narray; i++)
329 recogMakeDecodingArray(recog, i, debug);
330
331 pixDestroy(&pix1);
332 return 0;
333 }
334
335
336 /*!
337 * \brief recogMakeDecodingArray()
338 *
339 * \param[in] recog
340 * \param[in] index of averaged template
341 * \param[in] debug 1 for debug output; 0 otherwise
342 * \return 0 if OK, 1 on error
343 *
344 * <pre>
345 * Notes:
346 * (1) Generates the bit-and sum array for a character template along pixs.
347 * (2) The values are saved in the scoring arrays at the left edge
348 * of the template as it is positioned on pixs.
349 * </pre>
350 */
351 static l_int32
recogMakeDecodingArray(L_RECOG * recog,l_int32 index,l_int32 debug)352 recogMakeDecodingArray(L_RECOG *recog,
353 l_int32 index,
354 l_int32 debug)
355 {
356 l_int32 i, j, w1, h1, w2, h2, nx, ycent2, count, maxcount, maxdely;
357 l_int32 sum, moment, dely, shifty;
358 l_int32 *counta, *delya, *ycent1, *arraysum, *arraymoment, *sumtab;
359 NUMA *nasum, *namoment;
360 PIX *pix1, *pix2, *pix3;
361 L_RDID *did;
362
363 PROCNAME("recogMakeDecodingArray");
364
365 if (!recog)
366 return ERROR_INT("recog not defined", procName, 1);
367 if ((did = recogGetDid(recog)) == NULL)
368 return ERROR_INT("did not defined", procName, 1);
369 if (index < 0 || index >= did->narray)
370 return ERROR_INT("invalid index", procName, 1);
371
372 /* Check that pix1 is large enough for this template. */
373 pix1 = did->pixs; /* owned by did; do not destroy */
374 pixGetDimensions(pix1, &w1, &h1, NULL);
375 pix2 = pixaGetPix(recog->pixa_u, index, L_CLONE);
376 pixGetDimensions(pix2, &w2, &h2, NULL);
377 if (w1 < w2) {
378 L_INFO("w1 = %d < w2 = %d for index %d\n", procName, w1, w2, index);
379 pixDestroy(&pix2);
380 return 0;
381 }
382
383 nasum = did->nasum;
384 namoment = did->namoment;
385 ptaGetIPt(recog->pta_u, index, NULL, &ycent2);
386 sumtab = recog->sumtab;
387 counta = did->counta[index];
388 delya = did->delya[index];
389
390 /* Set up the array for ycent1. This gives the y-centroid location
391 * for a window of width w2, starting at location i. */
392 nx = w1 - w2 + 1; /* number of positions w2 can be placed in w1 */
393 ycent1 = (l_int32 *)LEPT_CALLOC(nx, sizeof(l_int32));
394 arraysum = numaGetIArray(nasum);
395 arraymoment = numaGetIArray(namoment);
396 for (i = 0, sum = 0, moment = 0; i < w2; i++) {
397 sum += arraysum[i];
398 moment += arraymoment[i];
399 }
400 for (i = 0; i < nx - 1; i++) {
401 ycent1[i] = (sum == 0) ? ycent2 : (l_float32)moment / (l_float32)sum;
402 sum += arraysum[w2 + i] - arraysum[i];
403 moment += arraymoment[w2 + i] - arraymoment[i];
404 }
405 ycent1[nx - 1] = (sum == 0) ? ycent2 : (l_float32)moment / (l_float32)sum;
406
407 /* Compute the bit-and sum between the template pix2 and pix1, at
408 * locations where the left side of pix2 goes from 0 to nx - 1
409 * in pix1. Do this around the vertical alignment of the pix2
410 * centroid and the windowed pix1 centroid.
411 * (1) Start with pix3 cleared and approximately equal in size to pix1.
412 * (2) Blit the y-shifted pix2 onto pix3. Then all ON pixels
413 * are within the intersection of pix1 and the shifted pix2.
414 * (3) AND pix1 with pix3. */
415 pix3 = pixCreate(w2, h1, 1);
416 for (i = 0; i < nx; i++) {
417 shifty = (l_int32)(ycent1[i] - ycent2 + 0.5);
418 maxcount = 0;
419 maxdely = 0;
420 for (j = -MaxYShift; j <= MaxYShift; j++) {
421 pixClearAll(pix3);
422 dely = shifty + j; /* amount pix2 is shifted relative to pix1 */
423 pixRasterop(pix3, 0, dely, w2, h2, PIX_SRC, pix2, 0, 0);
424 pixRasterop(pix3, 0, 0, w2, h1, PIX_SRC & PIX_DST, pix1, i, 0);
425 pixCountPixels(pix3, &count, sumtab);
426 if (count > maxcount) {
427 maxcount = count;
428 maxdely = dely;
429 }
430 }
431 counta[i] = maxcount;
432 delya[i] = maxdely;
433 }
434 did->fullarrays = TRUE;
435
436 pixDestroy(&pix2);
437 pixDestroy(&pix3);
438 LEPT_FREE(ycent1);
439 LEPT_FREE(arraysum);
440 LEPT_FREE(arraymoment);
441 return 0;
442 }
443
444
445 /*------------------------------------------------------------------------*
446 * Dynamic programming for best path
447 *------------------------------------------------------------------------*/
448 /*!
449 * \brief recogRunViterbi()
450 *
451 * \param[in] recog with LUT's pre-computed
452 * \param[out] ppixdb [optional] debug result; can be null
453 * \return 0 if OK, 1 on error
454 *
455 * <pre>
456 * Notes:
457 * (1) This can be used when the templates are unscaled. It works by
458 * matching the average, unscaled templates of each class to
459 * all positions.
460 * (2) It is recursive, in that
461 * (a) we compute the score successively at all pixel positions x,
462 * (b) to compute the score at x in the trellis, for each
463 * template we look backwards to (x - setwidth) to get the
464 * score if that template were to be printed with its
465 * setwidth location at x. We save at x the template and
466 * score that maximizes the sum of the score at (x - setwidth)
467 * and the log-likelihood for the template to be printed with
468 * its LHS there.
469 * (3) The primary output is a boxa of the locations for splitting
470 * the input image. These locations are used later to split the
471 * image and send the pieces individually for recognition.
472 * This can be done in either recogIdentifyMultiple(), or
473 * for debugging in recogRescoreDidResult().
474 * </pre>
475 */
476 static l_int32
recogRunViterbi(L_RECOG * recog,PIX ** ppixdb)477 recogRunViterbi(L_RECOG *recog,
478 PIX **ppixdb)
479 {
480 l_int32 i, w1, w2, h1, xnz, x, narray, minsetw;
481 l_int32 first, templ, xloc, dely, counts, area1;
482 l_int32 besttempl, spacetempl;
483 l_int32 *setw, *didtempl;
484 l_int32 *area2; /* must be freed */
485 l_float32 prevscore, matchscore, maxscore, correl;
486 l_float32 *didscore;
487 BOX *box;
488 PIX *pix1;
489 L_RDID *did;
490
491 PROCNAME("recogRunViterbi");
492
493 if (ppixdb) *ppixdb = NULL;
494 if (!recog)
495 return ERROR_INT("recog not defined", procName, 1);
496 if ((did = recogGetDid(recog)) == NULL)
497 return ERROR_INT("did not defined", procName, 1);
498 if (did->fullarrays == 0)
499 return ERROR_INT("did full arrays not made", procName, 1);
500
501 /* Compute the minimum setwidth. Bad templates with very small
502 * width can cause havoc because the setwidth is too small. */
503 w1 = did->size;
504 narray = did->narray;
505 spacetempl = narray;
506 setw = did->setwidth;
507 minsetw = 100000;
508 for (i = 0; i < narray; i++) {
509 if (setw[i] < minsetw)
510 minsetw = setw[i];
511 }
512 if (minsetw <= 2)
513 return ERROR_INT("minsetw <= 2; bad templates", procName, 1);
514
515 /* The score array is initialized to 0.0. As we proceed to
516 * the left, the log likelihood for the partial paths goes
517 * negative, and we prune for the max (least negative) path.
518 * No matches will be computed until we reach x = min(setwidth);
519 * until then first == TRUE after looping over templates. */
520 didscore = did->trellisscore;
521 didtempl = did->trellistempl;
522 area2 = numaGetIArray(recog->nasum_u);
523 for (x = minsetw; x < w1; x++) { /* will always get a score */
524 first = TRUE;
525 for (i = 0; i < narray; i++) {
526 if (x - setw[i] < 0) continue;
527 matchscore = didscore[x - setw[i]] +
528 did->gamma[1] * did->counta[i][x - setw[i]] +
529 did->beta[1] * area2[i];
530 if (first) {
531 maxscore = matchscore;
532 besttempl = i;
533 first = FALSE;
534 } else {
535 if (matchscore > maxscore) {
536 maxscore = matchscore;
537 besttempl = i;
538 }
539 }
540 }
541
542 /* We can also put down a single pixel space, with no cost
543 * because all pixels are bg. */
544 prevscore = didscore[x - 1];
545 if (prevscore > maxscore) { /* 1 pixel space is best */
546 maxscore = prevscore;
547 besttempl = spacetempl;
548 }
549 didscore[x] = maxscore;
550 didtempl[x] = besttempl;
551 }
552
553 /* Backtrack to get the best path.
554 * Skip over (i.e., ignore) all single pixel spaces. */
555 for (x = w1 - 1; x >= 0; x--) {
556 if (didtempl[x] != spacetempl) break;
557 }
558 h1 = pixGetHeight(did->pixs);
559 while (x > 0) {
560 if (didtempl[x] == spacetempl) { /* skip over spaces */
561 x--;
562 continue;
563 }
564 templ = didtempl[x];
565 xloc = x - setw[templ];
566 if (xloc < 0) break;
567 counts = did->counta[templ][xloc]; /* bit-and counts */
568 recogGetWindowedArea(recog, templ, xloc, &dely, &area1);
569 correl = ((l_float32)(counts) * counts) /
570 (l_float32)(area2[templ] * area1);
571 pix1 = pixaGetPix(recog->pixa_u, templ, L_CLONE);
572 w2 = pixGetWidth(pix1);
573 numaAddNumber(did->natempl, templ);
574 numaAddNumber(did->naxloc, xloc);
575 numaAddNumber(did->nadely, dely);
576 numaAddNumber(did->nawidth, pixGetWidth(pix1));
577 numaAddNumber(did->nascore, correl);
578 xnz = L_MAX(xloc, 0);
579 box = boxCreate(xnz, dely, w2, h1);
580 boxaAddBox(did->boxa, box, L_INSERT);
581 pixDestroy(&pix1);
582 x = xloc;
583 }
584
585 if (ppixdb) {
586 numaWriteStream(stderr, did->natempl);
587 numaWriteStream(stderr, did->naxloc);
588 numaWriteStream(stderr, did->nadely);
589 numaWriteStream(stderr, did->nawidth);
590 numaWriteStream(stderr, did->nascore);
591 boxaWriteStream(stderr, did->boxa);
592 *ppixdb = recogShowPath(recog, 0);
593 }
594
595 LEPT_FREE(area2);
596 return 0;
597 }
598
599
600 /*!
601 * \brief recogRescoreDidResult()
602 *
603 * \param[in] recog with LUT's pre-computed
604 * \param[out] ppixdb [optional] debug result; can be null
605 * \return 0 if OK, 1 on error
606 *
607 * <pre>
608 * Notes:
609 * (1) This does correlation matching with all unscaled templates,
610 * using the character segmentation determined by the Viterbi path.
611 * </pre>
612 */
613 static l_int32
recogRescoreDidResult(L_RECOG * recog,PIX ** ppixdb)614 recogRescoreDidResult(L_RECOG *recog,
615 PIX **ppixdb)
616 {
617 l_int32 i, n, sample, x, dely, index;
618 char *text;
619 l_float32 score;
620 BOX *box1;
621 PIX *pixs, *pix1;
622 L_RDID *did;
623
624 PROCNAME("recogRescoreDidResult");
625
626 if (ppixdb) *ppixdb = NULL;
627 if (!recog)
628 return ERROR_INT("recog not defined", procName, 1);
629 if ((did = recogGetDid(recog)) == NULL)
630 return ERROR_INT("did not defined", procName, 1);
631 if (did->fullarrays == 0)
632 return ERROR_INT("did full arrays not made", procName, 1);
633 if ((n = numaGetCount(did->naxloc)) == 0)
634 return ERROR_INT("no elements in path", procName, 1);
635
636 pixs = did->pixs;
637 for (i = 0; i < n; i++) {
638 box1 = boxaGetBox(did->boxa, i, L_COPY);
639 boxGetGeometry(box1, &x, &dely, NULL, NULL);
640 pix1 = pixClipRectangle(pixs, box1, NULL);
641 recogIdentifyPix(recog, pix1, NULL);
642 recogTransferRchToDid(recog, x, dely);
643 if (ppixdb) {
644 rchExtract(recog->rch, &index, &score, &text,
645 &sample, NULL, NULL, NULL);
646 fprintf(stderr, "text = %s, index = %d, sample = %d,"
647 " score = %5.3f\n", text, index, sample, score);
648 }
649 pixDestroy(&pix1);
650 boxDestroy(&box1);
651 LEPT_FREE(text);
652 }
653
654 if (ppixdb)
655 *ppixdb = recogShowPath(recog, 1);
656
657 return 0;
658 }
659
660
661 /*!
662 * \brief recogShowPath()
663 *
664 * \param[in] recog with LUT's pre-computed
665 * \param[in] select 0 for Viterbi; 1 for rescored
666 * \return pix debug output), or NULL on error
667 */
668 static PIX *
recogShowPath(L_RECOG * recog,l_int32 select)669 recogShowPath(L_RECOG *recog,
670 l_int32 select)
671 {
672 char textstr[16];
673 l_int32 i, j, n, index, xloc, dely;
674 l_float32 score;
675 L_BMF *bmf;
676 NUMA *natempl_s, *nasample_s, *nascore_s, *naxloc_s, *nadely_s;
677 PIX *pixs, *pix0, *pix1, *pix2, *pix3, *pix4, *pix5;
678 L_RDID *did;
679
680 PROCNAME("recogShowPath");
681
682 if (!recog)
683 return (PIX *)ERROR_PTR("recog not defined", procName, NULL);
684 if ((did = recogGetDid(recog)) == NULL)
685 return (PIX *)ERROR_PTR("did not defined", procName, NULL);
686
687 bmf = bmfCreate(NULL, 8);
688 pixs = pixScale(did->pixs, 4.0, 4.0);
689 pix0 = pixAddBorderGeneral(pixs, 0, 0, 0, 40, 0);
690 pix1 = pixConvertTo32(pix0);
691 if (select == 0) { /* Viterbi */
692 natempl_s = did->natempl;
693 nascore_s = did->nascore;
694 naxloc_s = did->naxloc;
695 nadely_s = did->nadely;
696 } else { /* rescored */
697 natempl_s = did->natempl_r;
698 nasample_s = did->nasample_r;
699 nascore_s = did->nascore_r;
700 naxloc_s = did->naxloc_r;
701 nadely_s = did->nadely_r;
702 }
703
704 n = numaGetCount(natempl_s);
705 for (i = 0; i < n; i++) {
706 numaGetIValue(natempl_s, i, &index);
707 if (select == 0) {
708 pix2 = pixaGetPix(recog->pixa_u, index, L_CLONE);
709 } else {
710 numaGetIValue(nasample_s, i, &j);
711 pix2 = pixaaGetPix(recog->pixaa_u, index, j, L_CLONE);
712 }
713 pix3 = pixScale(pix2, 4.0, 4.0);
714 pix4 = pixErodeBrick(NULL, pix3, 5, 5);
715 pixXor(pix4, pix4, pix3);
716 numaGetFValue(nascore_s, i, &score);
717 snprintf(textstr, sizeof(textstr), "%5.3f", score);
718 pix5 = pixAddTextlines(pix4, bmf, textstr, 1, L_ADD_BELOW);
719 numaGetIValue(naxloc_s, i, &xloc);
720 numaGetIValue(nadely_s, i, &dely);
721 pixPaintThroughMask(pix1, pix5, 4 * xloc, 4 * dely, 0xff000000);
722 pixDestroy(&pix2);
723 pixDestroy(&pix3);
724 pixDestroy(&pix4);
725 pixDestroy(&pix5);
726 }
727 pixDestroy(&pixs);
728 pixDestroy(&pix0);
729 bmfDestroy(&bmf);
730 return pix1;
731 }
732
733
734 /*------------------------------------------------------------------------*
735 * Create/destroy temporary DID data *
736 *------------------------------------------------------------------------*/
737 /*!
738 * \brief recogCreateDid()
739 *
740 * \param[in] recog
741 * \param[in] pixs of 1 bpp image to match
742 * \return 0 if OK, 1 on error
743 */
744 l_int32
recogCreateDid(L_RECOG * recog,PIX * pixs)745 recogCreateDid(L_RECOG *recog,
746 PIX *pixs)
747 {
748 l_int32 i;
749 PIX *pix1;
750 L_RDID *did;
751
752 PROCNAME("recogCreateDid");
753
754 if (!recog)
755 return ERROR_INT("recog not defined", procName, 1);
756 if (!pixs)
757 return ERROR_INT("pixs not defined", procName, 1);
758
759 recogDestroyDid(recog);
760
761 did = (L_RDID *)LEPT_CALLOC(1, sizeof(L_RDID));
762 recog->did = did;
763 did->pixs = pixClone(pixs);
764 did->narray = recog->setsize;
765 did->size = pixGetWidth(pixs);
766 did->natempl = numaCreate(5);
767 did->naxloc = numaCreate(5);
768 did->nadely = numaCreate(5);
769 did->nawidth = numaCreate(5);
770 did->boxa = boxaCreate(5);
771 did->nascore = numaCreate(5);
772 did->natempl_r = numaCreate(5);
773 did->nasample_r = numaCreate(5);
774 did->naxloc_r = numaCreate(5);
775 did->nadely_r = numaCreate(5);
776 did->nawidth_r = numaCreate(5);
777 did->nascore_r = numaCreate(5);
778
779 /* Make the arrays */
780 did->setwidth = (l_int32 *)LEPT_CALLOC(did->narray, sizeof(l_int32));
781 did->counta = (l_int32 **)LEPT_CALLOC(did->narray, sizeof(l_int32 *));
782 did->delya = (l_int32 **)LEPT_CALLOC(did->narray, sizeof(l_int32 *));
783 did->beta = (l_float32 *)LEPT_CALLOC(5, sizeof(l_float32));
784 did->gamma = (l_float32 *)LEPT_CALLOC(5, sizeof(l_float32));
785 did->trellisscore = (l_float32 *)LEPT_CALLOC(did->size, sizeof(l_float32));
786 did->trellistempl = (l_int32 *)LEPT_CALLOC(did->size, sizeof(l_int32));
787 for (i = 0; i < did->narray; i++) {
788 did->counta[i] = (l_int32 *)LEPT_CALLOC(did->size, sizeof(l_int32));
789 did->delya[i] = (l_int32 *)LEPT_CALLOC(did->size, sizeof(l_int32));
790 }
791
792 /* Populate the setwidth array */
793 for (i = 0; i < did->narray; i++) {
794 pix1 = pixaGetPix(recog->pixa_u, i, L_CLONE);
795 did->setwidth[i] = (l_int32)(SetwidthFraction * pixGetWidth(pix1));
796 pixDestroy(&pix1);
797 }
798
799 return 0;
800 }
801
802
803 /*!
804 * \brief recogDestroyDid()
805 *
806 * \param[in] recog
807 * \return 0 if OK, 1 on error
808 *
809 * <pre>
810 * Notes:
811 * (1) As the signature indicates, this is owned by the recog, and can
812 * only be destroyed using this function.
813 * </pre>
814 */
815 l_int32
recogDestroyDid(L_RECOG * recog)816 recogDestroyDid(L_RECOG *recog)
817 {
818 l_int32 i;
819 L_RDID *did;
820
821 PROCNAME("recogDestroyDid");
822
823 if (!recog)
824 return ERROR_INT("recog not defined", procName, 1);
825
826 if ((did = recog->did) == NULL) return 0;
827 if (!did->counta || !did->delya)
828 return ERROR_INT("ptr array is null; shouldn't happen!", procName, 1);
829
830 for (i = 0; i < did->narray; i++) {
831 LEPT_FREE(did->counta[i]);
832 LEPT_FREE(did->delya[i]);
833 }
834 LEPT_FREE(did->setwidth);
835 LEPT_FREE(did->counta);
836 LEPT_FREE(did->delya);
837 LEPT_FREE(did->beta);
838 LEPT_FREE(did->gamma);
839 LEPT_FREE(did->trellisscore);
840 LEPT_FREE(did->trellistempl);
841 pixDestroy(&did->pixs);
842 numaDestroy(&did->nasum);
843 numaDestroy(&did->namoment);
844 numaDestroy(&did->natempl);
845 numaDestroy(&did->naxloc);
846 numaDestroy(&did->nadely);
847 numaDestroy(&did->nawidth);
848 boxaDestroy(&did->boxa);
849 numaDestroy(&did->nascore);
850 numaDestroy(&did->natempl_r);
851 numaDestroy(&did->nasample_r);
852 numaDestroy(&did->naxloc_r);
853 numaDestroy(&did->nadely_r);
854 numaDestroy(&did->nawidth_r);
855 numaDestroy(&did->nascore_r);
856 LEPT_FREE(did);
857 recog->did = NULL;
858 return 0;
859 }
860
861
862 /*------------------------------------------------------------------------*
863 * Various helpers *
864 *------------------------------------------------------------------------*/
865 /*!
866 * \brief recogDidExists()
867 *
868 * \param[in] recog
869 * \return 1 if recog->did exists; 0 if not or on error.
870 */
871 l_int32
recogDidExists(L_RECOG * recog)872 recogDidExists(L_RECOG *recog)
873 {
874 PROCNAME("recogDidExists");
875
876 if (!recog)
877 return ERROR_INT("recog not defined", procName, 0);
878 return (recog->did) ? 1 : 0;
879 }
880
881
882 /*!
883 * \brief recogGetDid()
884 *
885 * \param[in] recog
886 * \return did still owned by the recog, or NULL on error
887 *
888 * <pre>
889 * Notes:
890 * (1) This also makes sure the arrays are defined.
891 * </pre>
892 */
893 L_RDID *
recogGetDid(L_RECOG * recog)894 recogGetDid(L_RECOG *recog)
895 {
896 l_int32 i;
897 L_RDID *did;
898
899 PROCNAME("recogGetDid");
900
901 if (!recog)
902 return (L_RDID *)ERROR_PTR("recog not defined", procName, NULL);
903 if ((did = recog->did) == NULL)
904 return (L_RDID *)ERROR_PTR("did not defined", procName, NULL);
905 if (!did->counta || !did->delya)
906 return (L_RDID *)ERROR_PTR("did array ptrs not defined",
907 procName, NULL);
908 for (i = 0; i < did->narray; i++) {
909 if (!did->counta[i] || !did->delya[i])
910 return (L_RDID *)ERROR_PTR("did arrays not defined",
911 procName, NULL);
912 }
913
914 return did;
915 }
916
917
918 /*!
919 * \brief recogGetWindowedArea()
920 *
921 * \param[in] recog
922 * \param[in] index of template
923 * \param[in] x pixel position of left hand edge of template
924 * \param[out] pdely y shift of template relative to pix1
925 * \param[out] pwsum number of fg pixels in window of pixs
926 * \return 0 if OK, 1 on error
927 *
928 * <pre>
929 * Notes:
930 * (1) This is called after the best path has been found through
931 * the trellis, in order to produce a correlation that can be used
932 * to evaluate the confidence we have in the identification.
933 * The correlation is |1 & 2|^2 / (|1| * |2|).
934 * |1 & 2| is given by the count array, |2| is found from
935 * nasum_u[], and |1| is wsum returned from this function.
936 * </pre>
937 */
938 static l_int32
recogGetWindowedArea(L_RECOG * recog,l_int32 index,l_int32 x,l_int32 * pdely,l_int32 * pwsum)939 recogGetWindowedArea(L_RECOG *recog,
940 l_int32 index,
941 l_int32 x,
942 l_int32 *pdely,
943 l_int32 *pwsum)
944 {
945 l_int32 w1, h1, w2, h2;
946 PIX *pix1, *pix2, *pixt;
947 L_RDID *did;
948
949 PROCNAME("recogGetWindowedArea");
950
951 if (pdely) *pdely = 0;
952 if (pwsum) *pwsum = 0;
953 if (!pdely || !pwsum)
954 return ERROR_INT("&dely and &wsum not both defined", procName, 1);
955 if (!recog)
956 return ERROR_INT("recog not defined", procName, 1);
957 if ((did = recogGetDid(recog)) == NULL)
958 return ERROR_INT("did not defined", procName, 1);
959 if (index < 0 || index >= did->narray)
960 return ERROR_INT("invalid index", procName, 1);
961 pix1 = did->pixs;
962 pixGetDimensions(pix1, &w1, &h1, NULL);
963 if (x >= w1)
964 return ERROR_INT("invalid x position", procName, 1);
965
966 pix2 = pixaGetPix(recog->pixa_u, index, L_CLONE);
967 pixGetDimensions(pix2, &w2, &h2, NULL);
968 if (w1 < w2) {
969 L_INFO("template %d too small\n", procName, index);
970 pixDestroy(&pix2);
971 return 0;
972 }
973
974 *pdely = did->delya[index][x];
975 pixt = pixCreate(w2, h1, 1);
976 pixRasterop(pixt, 0, *pdely, w2, h2, PIX_SRC, pix2, 0, 0);
977 pixRasterop(pixt, 0, 0, w2, h1, PIX_SRC & PIX_DST, pix1, x, 0);
978 pixCountPixels(pixt, pwsum, recog->sumtab);
979 pixDestroy(&pix2);
980 pixDestroy(&pixt);
981 return 0;
982 }
983
984
985 /*!
986 * \brief recogSetChannelParams()
987 *
988 * \param[in] recog
989 * \param[in] nlevels
990 * \return 0 if OK, 1 on error
991 *
992 * <pre>
993 * Notes:
994 * (1) This converts the independent bit-flip probabilities in the
995 * "channel" into log-likelihood coefficients on image sums.
996 * These coefficients are only defined for the non-background
997 * template levels. Thus for nlevels = 2 (one fg, one bg),
998 * only beta[1] and gamma[1] are used. For nlevels = 4 (three
999 * fg templates), we use beta[1-3] and gamma[1-3].
1000 * </pre>
1001 */
1002 l_int32
recogSetChannelParams(L_RECOG * recog,l_int32 nlevels)1003 recogSetChannelParams(L_RECOG *recog,
1004 l_int32 nlevels)
1005 {
1006 l_int32 i;
1007 const l_float32 *da;
1008 L_RDID *did;
1009
1010 PROCNAME("recogSetChannelParams");
1011
1012 if (!recog)
1013 return ERROR_INT("recog not defined", procName, 1);
1014 if ((did = recogGetDid(recog)) == NULL)
1015 return ERROR_INT("did not defined", procName, 1);
1016 if (nlevels == 2)
1017 da = DefaultAlpha2;
1018 else if (nlevels == 4)
1019 da = DefaultAlpha4;
1020 else
1021 return ERROR_INT("nlevels not 2 or 4", procName, 1);
1022
1023 for (i = 1; i < nlevels; i++) {
1024 did->beta[i] = log((1.0 - da[i]) / da[0]);
1025 did->gamma[i] = log(da[0] * da[i] / ((1.0 - da[0]) * (1.0 - da[i])));
1026 /* fprintf(stderr, "beta[%d] = %7.3f, gamma[%d] = %7.3f\n",
1027 i, did->beta[i], i, did->gamma[i]); */
1028 }
1029
1030 return 0;
1031 }
1032
1033
1034 /*!
1035 * \brief recogTransferRchToDid()
1036 *
1037 * \param[in] recog with rch and did defined
1038 * \param[in] x left edge of extracted region, relative to decoded line
1039 * \param[in] y top edge of extracted region, relative to input image
1040 * \return 0 if OK, 1 on error
1041 *
1042 * <pre>
1043 * Notes:
1044 * (1) This is used to transfer the results for a single character match
1045 * to the rescored did arrays.
1046 * </pre>
1047 */
1048 static l_int32
recogTransferRchToDid(L_RECOG * recog,l_int32 x,l_int32 y)1049 recogTransferRchToDid(L_RECOG *recog,
1050 l_int32 x,
1051 l_int32 y)
1052 {
1053 L_RDID *did;
1054 L_RCH *rch;
1055
1056 PROCNAME("recogTransferRchToDid");
1057
1058 if (!recog)
1059 return ERROR_INT("recog not defined", procName, 1);
1060 if ((did = recogGetDid(recog)) == NULL)
1061 return ERROR_INT("did not defined", procName, 1);
1062 if ((rch = recog->rch) == NULL)
1063 return ERROR_INT("rch not defined", procName, 1);
1064
1065 numaAddNumber(did->natempl_r, rch->index);
1066 numaAddNumber(did->nasample_r, rch->sample);
1067 numaAddNumber(did->naxloc_r, rch->xloc + x);
1068 numaAddNumber(did->nadely_r, rch->yloc + y);
1069 numaAddNumber(did->nawidth_r, rch->width);
1070 numaAddNumber(did->nascore_r, rch->score);
1071 return 0;
1072 }
1073