1 /*====================================================================*
2  -  Copyright (C) 2001 Leptonica.  All rights reserved.
3  -
4  -  Redistribution and use in source and binary forms, with or without
5  -  modification, are permitted provided that the following conditions
6  -  are met:
7  -  1. Redistributions of source code must retain the above copyright
8  -     notice, this list of conditions and the following disclaimer.
9  -  2. Redistributions in binary form must reproduce the above
10  -     copyright notice, this list of conditions and the following
11  -     disclaimer in the documentation and/or other materials
12  -     provided with the distribution.
13  -
14  -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15  -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16  -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17  -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
18  -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23  -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *====================================================================*/
26 
27 /*!
28  * \file recogdid.c
29  * <pre>
30  *
31  *      Top-level identification
32  *         BOXA             *recogDecode()
33  *
34  *      Generate decoding arrays
35  *         static l_int32    recogPrepareForDecoding()
36  *         static l_int32    recogMakeDecodingArray()
37  *
38  *      Dynamic programming for best path
39  *         static l_int32    recogRunViterbi()
40  *         static l_int32    recogRescoreDidResult()
41  *         static PIX       *recogShowPath()
42  *
43  *      Create/destroy temporary DID data
44  *         l_int32           recogCreateDid()
45  *         l_int32           recogDestroyDid()
46  *
47  *      Various helpers
48  *         l_int32           recogDidExists()
49  *         L_RDID           *recogGetDid()
50  *         static l_int32    recogGetWindowedArea()
51  *         l_int32           recogSetChannelParams()
52  *         static l_int32    recogTransferRchToDid()
53  *
54  *  See recogbasic.c for examples of training a recognizer, which is
55  *  required before it can be used for document image decoding.
56  *
57  *  Gary Kopec pioneered this hidden markov approach to "Document Image
58  *  Decoding" (DID) in the early 1990s.  It is based on estimation
59  *  using a generative model of the image generation process, and
60  *  provides the most likely decoding of an image if the model is correct.
61  *  Given the model, it finds the maximum a posteriori (MAP) "message"
62  *  given the observed image.  The model describes how to generate
63  *  an image from a message, and the MAP message is derived from the
64  *  observed image using Bayes' theorem.  This approach can also be used
65  *  to build the model, using the iterative expectation/maximization
66  *  method from labeled but errorful data.
67  *
68  *  In a little more detail: The model comprises three things: the ideal
69  *  printed character templates, the independent bit-flip noise model, and
70  *  the character setwidths.  When a character is printed, the setwidth
71  *  is the distance in pixels that you move forward before being able
72  *  to print the next character.  It is typically slightly less than the
73  *  width of the character template: if too small, an extra character can be
74  *  hallucinated; if too large, it will not be able to match the next
75  *  character template on the line.  The model assumes that the probabilities
76  *  of bit flip depend only on the assignment of the pixel to background
77  *  or template foreground.  The multilevel templates have different
78  *  bit flip probabilities for each level.  Because a character image
79  *  is composed of many pixels, each of which can be independently flipped,
80  *  the actual probability of seeing any rendering is exceedingly small,
81  *  being composed of the product of the probabilities for each pixel.
82  *  The log likelihood is used both to avoid numeric underflow and,
83  *  more importantly, because it results in a summation of independent
84  *  pixel probabilities.  That summation can be shown, in Kopec's
85  *  original paper, to consist of a sum of two terms: (a) the number of
86  *  fg pixels in the bit-and of the observed image with the ideal
87  *  template and (b) the number of fg pixels in the template.  Each
88  *  has a coefficient that depends only on the bit-flip probabilities
89  *  for the fg and bg.  A beautiful result, and computationally simple!
90  *  One nice feature of this approach is that the result of the decoding
91  *  is not very sensitive to the values  used for the bit flip probabilities.
92  *
93  *  The procedure for finding the best decoding (MAP) for a given image goes
94  *  under several names: Viterbi, dynamic programming, hidden markov model.
95  *  It is called a "hidden markov model" because the templates are assumed
96  *  to be printed serially and we don't know what they are -- the identity
97  *  of the templates must be inferred from the observed image.
98  *  The possible decodings form a dense trellis over the pixel positions,
99  *  where at each pixel position you have the possibility of having any
100  *  of the characters printed there (with some reference point) or having
101  *  a single pixel wide space inserted there.  Thus, before the trellis
102  *  can be traversed, we must do the work of finding the log probability,
103  *  at each pixel location, that each of the templates was printed there.
104  *  Armed with those arrays of data, the dynamic programming procedure
105  *  moves from left to right, one pixel at a time, recursively finding
106  *  the path with the highest log probability that gets to that pixel
107  *  position (and noting which template was printed to arrive there).
108  *  After reaching the right side of the image, we can simply backtrack
109  *  along the path, jumping over each template that lies on the highest
110  *  scoring path.  This best path thus only goes through a few of the
111  *  pixel positions.
112  *
113  *  There are two refinements to the original Kopec paper.  In the first,
114  *  one uses multiple, non-overlapping fg templates, each with its own
115  *  bit flip probability.  This makes sense, because the probability
116  *  that a fg boundary pixel flips to bg is greater than that of a fg
117  *  pixel not on the boundary.  And the flip probability of a fg boundary
118  *  pixel is smaller than that of a bg boundary pixel, which in turn
119  *  is greater than that of a bg pixel not on a boundary (the latter
120  *  is taken to be the true background).  Then the simplest realistic
121  *  multiple template model has three templates that are not background.
122  *
123  *  In the second refinement, a heuristic (strict upper bound) is used
124  *  iteratively in the Viterbi process to compute the log probabilities.
125  *  Using the heuristic, you find the best path, and then score all nodes
126  *  on that path with the actual probability, which is guaranteed to
127  *  be a smaller number.  You run this iteratively, rescoring just the best
128  *  found path each time.  After each rescoring, the path may change because
129  *  the local scores have been reduced.  However, the process converges
130  *  rapidly, and when it doesn't change, it must be the best path because
131  *  it is properly scored (even if neighboring paths are heuristically
132  *  scored).  The heuristic score is found column-wise by assuming
133  *  that all the fg pixels in the template are on fg pixels in the image --
134  *  we just take the minimum of the number of pixels in the template
135  *  and image column.  This can easily give a 10-fold reduction in
136  *  computation because the heuristic score can be computed much faster
137  *  than the exact score.
138  *
139  *  For reference, the classic paper on the approach by Kopec is:
140  *  * "Document Image Decoding Using Markov Source Models", IEEE Trans.
141  *    PAMI, Vol 16, No. 6, June 1994, pp 602-617.
142  *  A refinement of the method for multilevel templates by Kopec is:
143  *  * "Multilevel Character Templates for Document Image Decoding",
144  *    Proc. SPIE 3027, Document Recognition IV, p. 168ff, 1997.
145  *  Further refinements for more efficient decoding are given in these
146  *  two papers, which are both stored on leptonica.org:
147  *  * "Document Image Decoding using Iterated Complete Path Search", Minka,
148  *    Bloomberg and Popat, Proc. SPIE Vol 4307, p. 250-258, Document
149  *    Recognition and Retrieval VIII, San Jose, CA 2001.
150  *  * "Document Image Decoding using Iterated Complete Path Search with
151  *    Subsampled Heuristic Scoring", Bloomberg, Minka and Popat, ICDAR 2001,
152  *    p. 344-349, Sept. 2001, Seattle.
153  * </pre>
154  */
155 
156 #include <string.h>
157 #include <math.h>
158 #include "allheaders.h"
159 
160 static l_int32 recogPrepareForDecoding(L_RECOG *recog, PIX *pixs,
161                                        l_int32 debug);
162 static l_int32 recogMakeDecodingArray(L_RECOG *recog, l_int32 index,
163                                       l_int32 debug);
164 static l_int32 recogRunViterbi(L_RECOG *recog, PIX **ppixdb);
165 static l_int32 recogRescoreDidResult(L_RECOG *recog, PIX **ppixdb);
166 static PIX *recogShowPath(L_RECOG *recog, l_int32 select);
167 static l_int32 recogGetWindowedArea(L_RECOG *recog, l_int32 index,
168                                     l_int32 x, l_int32 *pdely, l_int32 *pwsum);
169 static l_int32 recogTransferRchToDid(L_RECOG *recog, l_int32 x, l_int32 y);
170 
171     /* Parameters for modeling the decoding */
172 static const l_float32  SetwidthFraction = 0.95;
173 static const l_int32    MaxYShift = 1;
174 
175     /* Channel parameters.  alpha[0] is the probability that a bg pixel
176      * is OFF.  alpha[1] is the probability that level 1 fg is ON.
177      * The actual values are not too critical, but they must be larger
178      * than 0.5 and smaller than 1.0.  For more accuracy in template
179      * matching, use a 4-level template, where levels 2 and 3 are
180      * boundary pixels in the fg and bg, respectively. */
181 static const l_float32  DefaultAlpha2[] = {0.95f, 0.9f};
182 static const l_float32  DefaultAlpha4[] = {0.95f, 0.9f, 0.75f, 0.25f};
183 
184 
185 /*------------------------------------------------------------------------*
186  *                       Top-level identification                         *
187  *------------------------------------------------------------------------*/
188 /*!
189  * \brief   recogDecode()
190  *
191  * \param[in]    recog with LUT's pre-computed
192  * \param[in]    pixs typically of multiple touching characters, 1 bpp
193  * \param[in]    nlevels of templates; 2 for now
194  * \param[out]   ppixdb [optional] debug result; can be null
195  * \return  boxa  segmentation of pixs into characters, or NULL on error
196  *
197  * <pre>
198  * Notes:
199  *      (1) The input pixs has been filtered so that it is likely to be
200  *          composed of more than one touching character.  Specifically,
201  *          its height can only slightly exceed that of the tallest
202  *          unscaled template, the width is somewhat larger than the
203  *          width of the widest unscaled template, and the w/h aspect ratio
204  *          is bounded by max_wh_ratio.
205  *      (2) This uses the DID mechanism with labeled templates to
206  *          segment the input %pixs.  The resulting segmentation is
207  *          returned.  (It is given by did->boxa).
208  *      (3) In debug mode, the Viterbi path is rescored based on all
209  *          the templates.  In non-debug mode, the same procedure is
210  *          carried out by recogIdentifyPix() on the result of the
211  *          segmentation.
212  * </pre>
213  */
214 BOXA  *
recogDecode(L_RECOG * recog,PIX * pixs,l_int32 nlevels,PIX ** ppixdb)215 recogDecode(L_RECOG  *recog,
216             PIX      *pixs,
217             l_int32   nlevels,
218             PIX     **ppixdb)
219 {
220 l_int32  debug;
221 PIX     *pix1;
222 PIXA    *pixa;
223 
224     PROCNAME("recogDecode");
225 
226     if (ppixdb) *ppixdb = NULL;
227     if (!recog)
228         return (BOXA *)ERROR_PTR("recog not defined", procName, NULL);
229     if (!pixs || pixGetDepth(pixs) != 1)
230         return (BOXA *)ERROR_PTR("pixs undefined or not 1 bpp", procName, NULL);
231     if (!recog->train_done)
232         return (BOXA *)ERROR_PTR("training not finished", procName, NULL);
233     if (nlevels != 2)
234         return (BOXA *)ERROR_PTR("nlevels != 2 (for now)", procName, NULL);
235 
236     debug = (ppixdb) ? 1 : 0;
237     if (recogPrepareForDecoding(recog, pixs, debug))
238         return (BOXA *)ERROR_PTR("error making arrays", procName, NULL);
239     recogSetChannelParams(recog, nlevels);
240 
241         /* Normal path; just run Viterbi */
242     if (!debug) {
243         if (recogRunViterbi(recog, NULL) == 0)
244             return boxaCopy(recog->did->boxa, L_COPY);
245         else
246             return (BOXA *)ERROR_PTR("error in Viterbi", procName, NULL);
247     }
248 
249         /* Debug path */
250     if (recogRunViterbi(recog, &pix1))
251         return (BOXA *)ERROR_PTR("error in viterbi", procName, NULL);
252     pixa = pixaCreate(2);
253     pixaAddPix(pixa, pix1, L_INSERT);
254     if (recogRescoreDidResult(recog, &pix1)) {
255         pixaDestroy(&pixa);
256         return (BOXA *)ERROR_PTR("error in rescoring", procName, NULL);
257     }
258     pixaAddPix(pixa, pix1, L_INSERT);
259     *ppixdb = pixaDisplayTiledInRows(pixa, 32, 2 * pixGetWidth(pix1) + 100,
260                                      1.0, 0, 30, 2);
261     pixaDestroy(&pixa);
262     return boxaCopy(recog->did->boxa, L_COPY);
263 }
264 
265 
266 /*------------------------------------------------------------------------*
267  *                       Generate decoding arrays                         *
268  *------------------------------------------------------------------------*/
269 /*!
270  * \brief   recogPrepareForDecoding()
271  *
272  * \param[in]    recog with LUT's pre-computed
273  * \param[in]    pixs typically of multiple touching characters, 1 bpp
274  * \param[in]    debug 1 for debug output; 0 otherwise
275  * \return  0 if OK, 1 on error
276  *
277  * <pre>
278  * Notes:
279  *      (1) Binarizes and crops input %pixs.
280  *      (2) Removes previous L_RDID struct and makes a new one.
281  *      (3) Generates the bit-and sum arrays for each character template
282  *          at each pixel position in %pixs.  These are used in the
283  *          Viterbi dynamic programming step.
284  *      (4) The values are saved in the scoring arrays at the left edge
285  *          of the template.  They are used in the Viterbi process
286  *          at the setwidth position (which is near the RHS of the template
287  *          as it is positioned on pixs) in the generated trellis.
288  * </pre>
289  */
290 static l_int32
recogPrepareForDecoding(L_RECOG * recog,PIX * pixs,l_int32 debug)291 recogPrepareForDecoding(L_RECOG  *recog,
292                         PIX      *pixs,
293                         l_int32   debug)
294 {
295 l_int32  i;
296 PIX     *pix1;
297 L_RDID  *did;
298 
299     PROCNAME("recogPrepareForDecoding");
300 
301     if (!recog)
302         return ERROR_INT("recog not defined", procName, 1);
303     if (!pixs || pixGetDepth(pixs) != 1)
304         return ERROR_INT("pixs not defined or not 1 bpp", procName, 1);
305     if (!recog->train_done)
306         return ERROR_INT("training not finished", procName, 1);
307 
308     if (!recog->ave_done)
309         recogAverageSamples(&recog, 0);
310 
311         /* Binarize and crop to foreground if necessary */
312     if ((pix1 = recogProcessToIdentify(recog, pixs, 0)) == NULL)
313         return ERROR_INT("pix1 not made", procName, 1);
314 
315         /* Remove any existing RecogDID and set up a new one */
316     recogDestroyDid(recog);
317     if (recogCreateDid(recog, pix1)) {
318         pixDestroy(&pix1);
319         return ERROR_INT("decoder not made", procName, 1);
320     }
321 
322         /* Compute vertical sum and first moment arrays */
323     did = recogGetDid(recog);  /* owned by recog */
324     did->nasum = pixCountPixelsByColumn(pix1);
325     did->namoment = pixGetMomentByColumn(pix1, 1);
326 
327         /* Generate the arrays */
328     for (i = 0; i < recog->did->narray; i++)
329         recogMakeDecodingArray(recog, i, debug);
330 
331     pixDestroy(&pix1);
332     return 0;
333 }
334 
335 
336 /*!
337  * \brief   recogMakeDecodingArray()
338  *
339  * \param[in]    recog
340  * \param[in]    index of averaged template
341  * \param[in]    debug 1 for debug output; 0 otherwise
342  * \return  0 if OK, 1 on error
343  *
344  * <pre>
345  * Notes:
346  *      (1) Generates the bit-and sum array for a character template along pixs.
347  *      (2) The values are saved in the scoring arrays at the left edge
348  *          of the template as it is positioned on pixs.
349  * </pre>
350  */
351 static l_int32
recogMakeDecodingArray(L_RECOG * recog,l_int32 index,l_int32 debug)352 recogMakeDecodingArray(L_RECOG  *recog,
353                        l_int32   index,
354                        l_int32   debug)
355 {
356 l_int32   i, j, w1, h1, w2, h2, nx, ycent2, count, maxcount, maxdely;
357 l_int32   sum, moment, dely, shifty;
358 l_int32  *counta, *delya, *ycent1, *arraysum, *arraymoment, *sumtab;
359 NUMA     *nasum, *namoment;
360 PIX      *pix1, *pix2, *pix3;
361 L_RDID   *did;
362 
363     PROCNAME("recogMakeDecodingArray");
364 
365     if (!recog)
366         return ERROR_INT("recog not defined", procName, 1);
367     if ((did = recogGetDid(recog)) == NULL)
368         return ERROR_INT("did not defined", procName, 1);
369     if (index < 0 || index >= did->narray)
370         return ERROR_INT("invalid index", procName, 1);
371 
372         /* Check that pix1 is large enough for this template. */
373     pix1 = did->pixs;  /* owned by did; do not destroy */
374     pixGetDimensions(pix1, &w1, &h1, NULL);
375     pix2 = pixaGetPix(recog->pixa_u, index, L_CLONE);
376     pixGetDimensions(pix2, &w2, &h2, NULL);
377     if (w1 < w2) {
378         L_INFO("w1 = %d < w2 = %d for index %d\n", procName, w1, w2, index);
379         pixDestroy(&pix2);
380         return 0;
381     }
382 
383     nasum = did->nasum;
384     namoment = did->namoment;
385     ptaGetIPt(recog->pta_u, index, NULL, &ycent2);
386     sumtab = recog->sumtab;
387     counta = did->counta[index];
388     delya = did->delya[index];
389 
390         /* Set up the array for ycent1.  This gives the y-centroid location
391          * for a window of width w2, starting at location i. */
392     nx = w1 - w2 + 1;  /* number of positions w2 can be placed in w1 */
393     ycent1 = (l_int32 *)LEPT_CALLOC(nx, sizeof(l_int32));
394     arraysum = numaGetIArray(nasum);
395     arraymoment = numaGetIArray(namoment);
396     for (i = 0, sum = 0, moment = 0; i < w2; i++) {
397         sum += arraysum[i];
398         moment += arraymoment[i];
399     }
400     for (i = 0; i < nx - 1; i++) {
401         ycent1[i] = (sum == 0) ? ycent2 : (l_float32)moment / (l_float32)sum;
402         sum += arraysum[w2 + i] - arraysum[i];
403         moment += arraymoment[w2 + i] - arraymoment[i];
404     }
405     ycent1[nx - 1] = (sum == 0) ? ycent2 : (l_float32)moment / (l_float32)sum;
406 
407         /* Compute the bit-and sum between the template pix2 and pix1, at
408          * locations where the left side of pix2 goes from 0 to nx - 1
409          * in pix1.  Do this around the vertical alignment of the pix2
410          * centroid and the windowed pix1 centroid.
411          *  (1) Start with pix3 cleared and approximately equal in size to pix1.
412          *  (2) Blit the y-shifted pix2 onto pix3.  Then all ON pixels
413          *      are within the intersection of pix1 and the shifted pix2.
414          *  (3) AND pix1 with pix3. */
415     pix3 = pixCreate(w2, h1, 1);
416     for (i = 0; i < nx; i++) {
417         shifty = (l_int32)(ycent1[i] - ycent2 + 0.5);
418         maxcount = 0;
419         maxdely = 0;
420         for (j = -MaxYShift; j <= MaxYShift; j++) {
421             pixClearAll(pix3);
422             dely = shifty + j;  /* amount pix2 is shifted relative to pix1 */
423             pixRasterop(pix3, 0, dely, w2, h2, PIX_SRC, pix2, 0, 0);
424             pixRasterop(pix3, 0, 0, w2, h1, PIX_SRC & PIX_DST, pix1, i, 0);
425             pixCountPixels(pix3, &count, sumtab);
426             if (count > maxcount) {
427                 maxcount = count;
428                 maxdely = dely;
429             }
430         }
431         counta[i] = maxcount;
432         delya[i] = maxdely;
433     }
434     did->fullarrays = TRUE;
435 
436     pixDestroy(&pix2);
437     pixDestroy(&pix3);
438     LEPT_FREE(ycent1);
439     LEPT_FREE(arraysum);
440     LEPT_FREE(arraymoment);
441     return 0;
442 }
443 
444 
445 /*------------------------------------------------------------------------*
446  *                  Dynamic programming for best path
447  *------------------------------------------------------------------------*/
448 /*!
449  * \brief   recogRunViterbi()
450  *
451  * \param[in]    recog with LUT's pre-computed
452  * \param[out]   ppixdb [optional] debug result; can be null
453  * \return  0 if OK, 1 on error
454  *
455  * <pre>
456  * Notes:
457  *      (1) This can be used when the templates are unscaled.  It works by
458  *          matching the average, unscaled templates of each class to
459  *          all positions.
460  *      (2) It is recursive, in that
461  *          (a) we compute the score successively at all pixel positions x,
462  *          (b) to compute the score at x in the trellis, for each
463  *              template we look backwards to (x - setwidth) to get the
464  *              score if that template were to be printed with its
465  *              setwidth location at x.  We save at x the template and
466  *              score that maximizes the sum of the score at (x - setwidth)
467  *              and the log-likelihood for the template to be printed with
468  *              its LHS there.
469  *      (3) The primary output is a boxa of the locations for splitting
470  *          the input image.  These locations are used later to split the
471  *          image and send the pieces individually for recognition.
472  *          This can be done in either recogIdentifyMultiple(), or
473  *          for debugging in recogRescoreDidResult().
474  * </pre>
475  */
476 static l_int32
recogRunViterbi(L_RECOG * recog,PIX ** ppixdb)477 recogRunViterbi(L_RECOG  *recog,
478                 PIX     **ppixdb)
479 {
480 l_int32     i, w1, w2, h1, xnz, x, narray, minsetw;
481 l_int32     first, templ, xloc, dely, counts, area1;
482 l_int32     besttempl, spacetempl;
483 l_int32    *setw, *didtempl;
484 l_int32    *area2;  /* must be freed */
485 l_float32   prevscore, matchscore, maxscore, correl;
486 l_float32  *didscore;
487 BOX        *box;
488 PIX        *pix1;
489 L_RDID     *did;
490 
491     PROCNAME("recogRunViterbi");
492 
493     if (ppixdb) *ppixdb = NULL;
494     if (!recog)
495         return ERROR_INT("recog not defined", procName, 1);
496     if ((did = recogGetDid(recog)) == NULL)
497         return ERROR_INT("did not defined", procName, 1);
498     if (did->fullarrays == 0)
499         return ERROR_INT("did full arrays not made", procName, 1);
500 
501         /* Compute the minimum setwidth. Bad templates with very small
502          * width can cause havoc because the setwidth is too small. */
503     w1 = did->size;
504     narray = did->narray;
505     spacetempl = narray;
506     setw = did->setwidth;
507     minsetw = 100000;
508     for (i = 0; i < narray; i++) {
509         if (setw[i] < minsetw)
510             minsetw = setw[i];
511     }
512     if (minsetw <= 2)
513         return ERROR_INT("minsetw <= 2; bad templates", procName, 1);
514 
515         /* The score array is initialized to 0.0.  As we proceed to
516          * the left, the log likelihood for the partial paths goes
517          * negative, and we prune for the max (least negative) path.
518          * No matches will be computed until we reach x = min(setwidth);
519          * until then first == TRUE after looping over templates. */
520     didscore = did->trellisscore;
521     didtempl = did->trellistempl;
522     area2 = numaGetIArray(recog->nasum_u);
523     for (x = minsetw; x < w1; x++) {  /* will always get a score */
524         first = TRUE;
525         for (i = 0; i < narray; i++) {
526             if (x - setw[i] < 0) continue;
527             matchscore = didscore[x - setw[i]] +
528                          did->gamma[1] * did->counta[i][x - setw[i]] +
529                          did->beta[1] * area2[i];
530             if (first) {
531                 maxscore = matchscore;
532                 besttempl = i;
533                 first = FALSE;
534             } else {
535                 if (matchscore > maxscore) {
536                     maxscore = matchscore;
537                     besttempl = i;
538                 }
539             }
540         }
541 
542             /* We can also put down a single pixel space, with no cost
543              * because all pixels are bg. */
544         prevscore = didscore[x - 1];
545         if (prevscore > maxscore) {  /* 1 pixel space is best */
546             maxscore = prevscore;
547             besttempl = spacetempl;
548         }
549         didscore[x] = maxscore;
550         didtempl[x] = besttempl;
551     }
552 
553         /* Backtrack to get the best path.
554          * Skip over (i.e., ignore) all single pixel spaces. */
555     for (x = w1 - 1; x >= 0; x--) {
556         if (didtempl[x] != spacetempl) break;
557     }
558     h1 = pixGetHeight(did->pixs);
559     while (x > 0) {
560         if (didtempl[x] == spacetempl) {  /* skip over spaces */
561             x--;
562             continue;
563         }
564         templ = didtempl[x];
565         xloc = x - setw[templ];
566         if (xloc < 0) break;
567         counts = did->counta[templ][xloc];  /* bit-and counts */
568         recogGetWindowedArea(recog, templ, xloc, &dely, &area1);
569         correl = ((l_float32)(counts) * counts) /
570                   (l_float32)(area2[templ] * area1);
571         pix1 = pixaGetPix(recog->pixa_u, templ, L_CLONE);
572         w2 = pixGetWidth(pix1);
573         numaAddNumber(did->natempl, templ);
574         numaAddNumber(did->naxloc, xloc);
575         numaAddNumber(did->nadely, dely);
576         numaAddNumber(did->nawidth, pixGetWidth(pix1));
577         numaAddNumber(did->nascore, correl);
578         xnz = L_MAX(xloc, 0);
579         box = boxCreate(xnz, dely, w2, h1);
580         boxaAddBox(did->boxa, box, L_INSERT);
581         pixDestroy(&pix1);
582         x = xloc;
583     }
584 
585     if (ppixdb) {
586         numaWriteStream(stderr, did->natempl);
587         numaWriteStream(stderr, did->naxloc);
588         numaWriteStream(stderr, did->nadely);
589         numaWriteStream(stderr, did->nawidth);
590         numaWriteStream(stderr, did->nascore);
591         boxaWriteStream(stderr, did->boxa);
592         *ppixdb = recogShowPath(recog, 0);
593     }
594 
595     LEPT_FREE(area2);
596     return 0;
597 }
598 
599 
600 /*!
601  * \brief   recogRescoreDidResult()
602  *
603  * \param[in]    recog with LUT's pre-computed
604  * \param[out]   ppixdb [optional] debug result; can be null
605  * \return  0 if OK, 1 on error
606  *
607  * <pre>
608  * Notes:
609  *      (1) This does correlation matching with all unscaled templates,
610  *          using the character segmentation determined by the Viterbi path.
611  * </pre>
612  */
613 static l_int32
recogRescoreDidResult(L_RECOG * recog,PIX ** ppixdb)614 recogRescoreDidResult(L_RECOG  *recog,
615                       PIX     **ppixdb)
616 {
617 l_int32    i, n, sample, x, dely, index;
618 char      *text;
619 l_float32  score;
620 BOX       *box1;
621 PIX       *pixs, *pix1;
622 L_RDID    *did;
623 
624     PROCNAME("recogRescoreDidResult");
625 
626     if (ppixdb) *ppixdb = NULL;
627     if (!recog)
628         return ERROR_INT("recog not defined", procName, 1);
629     if ((did = recogGetDid(recog)) == NULL)
630         return ERROR_INT("did not defined", procName, 1);
631     if (did->fullarrays == 0)
632         return ERROR_INT("did full arrays not made", procName, 1);
633     if ((n = numaGetCount(did->naxloc)) == 0)
634         return ERROR_INT("no elements in path", procName, 1);
635 
636     pixs = did->pixs;
637     for (i = 0; i < n; i++) {
638         box1 = boxaGetBox(did->boxa, i, L_COPY);
639         boxGetGeometry(box1, &x, &dely, NULL, NULL);
640         pix1 = pixClipRectangle(pixs, box1, NULL);
641         recogIdentifyPix(recog, pix1, NULL);
642         recogTransferRchToDid(recog, x, dely);
643         if (ppixdb) {
644             rchExtract(recog->rch, &index, &score, &text,
645                        &sample, NULL, NULL, NULL);
646             fprintf(stderr, "text = %s, index = %d, sample = %d,"
647                     " score = %5.3f\n", text, index, sample, score);
648         }
649         pixDestroy(&pix1);
650         boxDestroy(&box1);
651         LEPT_FREE(text);
652     }
653 
654     if (ppixdb)
655         *ppixdb = recogShowPath(recog, 1);
656 
657     return 0;
658 }
659 
660 
661 /*!
662  * \brief   recogShowPath()
663  *
664  * \param[in]    recog with LUT's pre-computed
665  * \param[in]    select 0 for Viterbi; 1 for rescored
666  * \return  pix debug output), or NULL on error
667  */
668 static PIX *
recogShowPath(L_RECOG * recog,l_int32 select)669 recogShowPath(L_RECOG  *recog,
670               l_int32   select)
671 {
672 char       textstr[16];
673 l_int32    i, j, n, index, xloc, dely;
674 l_float32  score;
675 L_BMF     *bmf;
676 NUMA      *natempl_s, *nasample_s, *nascore_s, *naxloc_s, *nadely_s;
677 PIX       *pixs, *pix0, *pix1, *pix2, *pix3, *pix4, *pix5;
678 L_RDID    *did;
679 
680     PROCNAME("recogShowPath");
681 
682     if (!recog)
683         return (PIX *)ERROR_PTR("recog not defined", procName, NULL);
684     if ((did = recogGetDid(recog)) == NULL)
685         return (PIX *)ERROR_PTR("did not defined", procName, NULL);
686 
687     bmf = bmfCreate(NULL, 8);
688     pixs = pixScale(did->pixs, 4.0, 4.0);
689     pix0 = pixAddBorderGeneral(pixs, 0, 0, 0, 40, 0);
690     pix1 = pixConvertTo32(pix0);
691     if (select == 0) {  /* Viterbi */
692         natempl_s = did->natempl;
693         nascore_s = did->nascore;
694         naxloc_s = did->naxloc;
695         nadely_s = did->nadely;
696     } else {  /* rescored */
697         natempl_s = did->natempl_r;
698         nasample_s = did->nasample_r;
699         nascore_s = did->nascore_r;
700         naxloc_s = did->naxloc_r;
701         nadely_s = did->nadely_r;
702     }
703 
704     n = numaGetCount(natempl_s);
705     for (i = 0; i < n; i++) {
706         numaGetIValue(natempl_s, i, &index);
707         if (select == 0) {
708             pix2 = pixaGetPix(recog->pixa_u, index, L_CLONE);
709         } else {
710             numaGetIValue(nasample_s, i, &j);
711             pix2 = pixaaGetPix(recog->pixaa_u, index, j, L_CLONE);
712         }
713         pix3 = pixScale(pix2, 4.0, 4.0);
714         pix4 = pixErodeBrick(NULL, pix3, 5, 5);
715         pixXor(pix4, pix4, pix3);
716         numaGetFValue(nascore_s, i, &score);
717         snprintf(textstr, sizeof(textstr), "%5.3f", score);
718         pix5 = pixAddTextlines(pix4, bmf, textstr, 1, L_ADD_BELOW);
719         numaGetIValue(naxloc_s, i, &xloc);
720         numaGetIValue(nadely_s, i, &dely);
721         pixPaintThroughMask(pix1, pix5, 4 * xloc, 4 * dely, 0xff000000);
722         pixDestroy(&pix2);
723         pixDestroy(&pix3);
724         pixDestroy(&pix4);
725         pixDestroy(&pix5);
726     }
727     pixDestroy(&pixs);
728     pixDestroy(&pix0);
729     bmfDestroy(&bmf);
730     return pix1;
731 }
732 
733 
734 /*------------------------------------------------------------------------*
735  *                  Create/destroy temporary DID data                     *
736  *------------------------------------------------------------------------*/
737 /*!
738  * \brief   recogCreateDid()
739  *
740  * \param[in]    recog
741  * \param[in]    pixs of 1 bpp image to match
742  * \return  0 if OK, 1 on error
743  */
744 l_int32
recogCreateDid(L_RECOG * recog,PIX * pixs)745 recogCreateDid(L_RECOG  *recog,
746                PIX      *pixs)
747 {
748 l_int32  i;
749 PIX     *pix1;
750 L_RDID  *did;
751 
752     PROCNAME("recogCreateDid");
753 
754     if (!recog)
755         return ERROR_INT("recog not defined", procName, 1);
756     if (!pixs)
757         return ERROR_INT("pixs not defined", procName, 1);
758 
759     recogDestroyDid(recog);
760 
761     did = (L_RDID *)LEPT_CALLOC(1, sizeof(L_RDID));
762     recog->did = did;
763     did->pixs = pixClone(pixs);
764     did->narray = recog->setsize;
765     did->size = pixGetWidth(pixs);
766     did->natempl = numaCreate(5);
767     did->naxloc = numaCreate(5);
768     did->nadely = numaCreate(5);
769     did->nawidth = numaCreate(5);
770     did->boxa = boxaCreate(5);
771     did->nascore = numaCreate(5);
772     did->natempl_r = numaCreate(5);
773     did->nasample_r = numaCreate(5);
774     did->naxloc_r = numaCreate(5);
775     did->nadely_r = numaCreate(5);
776     did->nawidth_r = numaCreate(5);
777     did->nascore_r = numaCreate(5);
778 
779         /* Make the arrays */
780     did->setwidth = (l_int32 *)LEPT_CALLOC(did->narray, sizeof(l_int32));
781     did->counta = (l_int32 **)LEPT_CALLOC(did->narray, sizeof(l_int32 *));
782     did->delya = (l_int32 **)LEPT_CALLOC(did->narray, sizeof(l_int32 *));
783     did->beta = (l_float32 *)LEPT_CALLOC(5, sizeof(l_float32));
784     did->gamma = (l_float32 *)LEPT_CALLOC(5, sizeof(l_float32));
785     did->trellisscore = (l_float32 *)LEPT_CALLOC(did->size, sizeof(l_float32));
786     did->trellistempl = (l_int32 *)LEPT_CALLOC(did->size, sizeof(l_int32));
787     for (i = 0; i < did->narray; i++) {
788         did->counta[i] = (l_int32 *)LEPT_CALLOC(did->size, sizeof(l_int32));
789         did->delya[i] = (l_int32 *)LEPT_CALLOC(did->size, sizeof(l_int32));
790     }
791 
792         /* Populate the setwidth array */
793     for (i = 0; i < did->narray; i++) {
794         pix1 = pixaGetPix(recog->pixa_u, i, L_CLONE);
795         did->setwidth[i] = (l_int32)(SetwidthFraction * pixGetWidth(pix1));
796         pixDestroy(&pix1);
797     }
798 
799     return 0;
800 }
801 
802 
803 /*!
804  * \brief   recogDestroyDid()
805  *
806  * \param[in]    recog
807  * \return  0 if OK, 1 on error
808  *
809  * <pre>
810  * Notes:
811  *      (1) As the signature indicates, this is owned by the recog, and can
812  *          only be destroyed using this function.
813  * </pre>
814  */
815 l_int32
recogDestroyDid(L_RECOG * recog)816 recogDestroyDid(L_RECOG  *recog)
817 {
818 l_int32  i;
819 L_RDID  *did;
820 
821     PROCNAME("recogDestroyDid");
822 
823     if (!recog)
824         return ERROR_INT("recog not defined", procName, 1);
825 
826     if ((did = recog->did) == NULL) return 0;
827     if (!did->counta || !did->delya)
828         return ERROR_INT("ptr array is null; shouldn't happen!", procName, 1);
829 
830     for (i = 0; i < did->narray; i++) {
831         LEPT_FREE(did->counta[i]);
832         LEPT_FREE(did->delya[i]);
833     }
834     LEPT_FREE(did->setwidth);
835     LEPT_FREE(did->counta);
836     LEPT_FREE(did->delya);
837     LEPT_FREE(did->beta);
838     LEPT_FREE(did->gamma);
839     LEPT_FREE(did->trellisscore);
840     LEPT_FREE(did->trellistempl);
841     pixDestroy(&did->pixs);
842     numaDestroy(&did->nasum);
843     numaDestroy(&did->namoment);
844     numaDestroy(&did->natempl);
845     numaDestroy(&did->naxloc);
846     numaDestroy(&did->nadely);
847     numaDestroy(&did->nawidth);
848     boxaDestroy(&did->boxa);
849     numaDestroy(&did->nascore);
850     numaDestroy(&did->natempl_r);
851     numaDestroy(&did->nasample_r);
852     numaDestroy(&did->naxloc_r);
853     numaDestroy(&did->nadely_r);
854     numaDestroy(&did->nawidth_r);
855     numaDestroy(&did->nascore_r);
856     LEPT_FREE(did);
857     recog->did = NULL;
858     return 0;
859 }
860 
861 
862 /*------------------------------------------------------------------------*
863  *                            Various helpers                             *
864  *------------------------------------------------------------------------*/
865 /*!
866  * \brief   recogDidExists()
867  *
868  * \param[in]    recog
869  * \return  1 if recog->did exists; 0 if not or on error.
870  */
871 l_int32
recogDidExists(L_RECOG * recog)872 recogDidExists(L_RECOG  *recog)
873 {
874     PROCNAME("recogDidExists");
875 
876     if (!recog)
877         return ERROR_INT("recog not defined", procName, 0);
878     return (recog->did) ? 1 : 0;
879 }
880 
881 
882 /*!
883  * \brief   recogGetDid()
884  *
885  * \param[in]    recog
886  * \return  did still owned by the recog, or NULL on error
887  *
888  * <pre>
889  * Notes:
890  *      (1) This also makes sure the arrays are defined.
891  * </pre>
892  */
893 L_RDID *
recogGetDid(L_RECOG * recog)894 recogGetDid(L_RECOG  *recog)
895 {
896 l_int32  i;
897 L_RDID  *did;
898 
899     PROCNAME("recogGetDid");
900 
901     if (!recog)
902         return (L_RDID *)ERROR_PTR("recog not defined", procName, NULL);
903     if ((did = recog->did) == NULL)
904         return (L_RDID *)ERROR_PTR("did not defined", procName, NULL);
905     if (!did->counta || !did->delya)
906         return (L_RDID *)ERROR_PTR("did array ptrs not defined",
907                                    procName, NULL);
908     for (i = 0; i < did->narray; i++) {
909         if (!did->counta[i] || !did->delya[i])
910             return (L_RDID *)ERROR_PTR("did arrays not defined",
911                                        procName, NULL);
912     }
913 
914     return did;
915 }
916 
917 
918 /*!
919  * \brief   recogGetWindowedArea()
920  *
921  * \param[in]    recog
922  * \param[in]    index of template
923  * \param[in]    x pixel position of left hand edge of template
924  * \param[out]   pdely y shift of template relative to pix1
925  * \param[out]   pwsum number of fg pixels in window of pixs
926  * \return  0 if OK, 1 on error
927  *
928  * <pre>
929  * Notes:
930  *      (1) This is called after the best path has been found through
931  *          the trellis, in order to produce a correlation that can be used
932  *          to evaluate the confidence we have in the identification.
933  *          The correlation is |1 & 2|^2 / (|1| * |2|).
934  *          |1 & 2| is given by the count array, |2| is found from
935  *          nasum_u[], and |1| is wsum returned from this function.
936  * </pre>
937  */
938 static l_int32
recogGetWindowedArea(L_RECOG * recog,l_int32 index,l_int32 x,l_int32 * pdely,l_int32 * pwsum)939 recogGetWindowedArea(L_RECOG  *recog,
940                      l_int32   index,
941                      l_int32   x,
942                      l_int32  *pdely,
943                      l_int32  *pwsum)
944 {
945 l_int32  w1, h1, w2, h2;
946 PIX     *pix1, *pix2, *pixt;
947 L_RDID  *did;
948 
949     PROCNAME("recogGetWindowedArea");
950 
951     if (pdely) *pdely = 0;
952     if (pwsum) *pwsum = 0;
953     if (!pdely || !pwsum)
954         return ERROR_INT("&dely and &wsum not both defined", procName, 1);
955     if (!recog)
956         return ERROR_INT("recog not defined", procName, 1);
957     if ((did = recogGetDid(recog)) == NULL)
958         return ERROR_INT("did not defined", procName, 1);
959     if (index < 0 || index >= did->narray)
960         return ERROR_INT("invalid index", procName, 1);
961     pix1 = did->pixs;
962     pixGetDimensions(pix1, &w1, &h1, NULL);
963     if (x >= w1)
964         return ERROR_INT("invalid x position", procName, 1);
965 
966     pix2 = pixaGetPix(recog->pixa_u, index, L_CLONE);
967     pixGetDimensions(pix2, &w2, &h2, NULL);
968     if (w1 < w2) {
969         L_INFO("template %d too small\n", procName, index);
970         pixDestroy(&pix2);
971         return 0;
972     }
973 
974     *pdely = did->delya[index][x];
975     pixt = pixCreate(w2, h1, 1);
976     pixRasterop(pixt, 0, *pdely, w2, h2, PIX_SRC, pix2, 0, 0);
977     pixRasterop(pixt, 0, 0, w2, h1, PIX_SRC & PIX_DST, pix1, x, 0);
978     pixCountPixels(pixt, pwsum, recog->sumtab);
979     pixDestroy(&pix2);
980     pixDestroy(&pixt);
981     return 0;
982 }
983 
984 
985 /*!
986  * \brief   recogSetChannelParams()
987  *
988  * \param[in]    recog
989  * \param[in]    nlevels
990  * \return  0 if OK, 1 on error
991  *
992  * <pre>
993  * Notes:
994  *      (1) This converts the independent bit-flip probabilities in the
995  *          "channel" into log-likelihood coefficients on image sums.
996  *          These coefficients are only defined for the non-background
997  *          template levels.  Thus for nlevels = 2 (one fg, one bg),
998  *          only beta[1] and gamma[1] are used.  For nlevels = 4 (three
999  *          fg templates), we use beta[1-3] and gamma[1-3].
1000  * </pre>
1001  */
1002 l_int32
recogSetChannelParams(L_RECOG * recog,l_int32 nlevels)1003 recogSetChannelParams(L_RECOG  *recog,
1004                       l_int32   nlevels)
1005 {
1006 l_int32           i;
1007 const l_float32  *da;
1008 L_RDID           *did;
1009 
1010     PROCNAME("recogSetChannelParams");
1011 
1012     if (!recog)
1013         return ERROR_INT("recog not defined", procName, 1);
1014     if ((did = recogGetDid(recog)) == NULL)
1015         return ERROR_INT("did not defined", procName, 1);
1016     if (nlevels == 2)
1017         da = DefaultAlpha2;
1018     else if (nlevels == 4)
1019         da = DefaultAlpha4;
1020     else
1021         return ERROR_INT("nlevels not 2 or 4", procName, 1);
1022 
1023     for (i = 1; i < nlevels; i++) {
1024         did->beta[i] = log((1.0 - da[i]) / da[0]);
1025         did->gamma[i] = log(da[0] * da[i] / ((1.0 - da[0]) * (1.0 - da[i])));
1026 /*        fprintf(stderr, "beta[%d] = %7.3f, gamma[%d] = %7.3f\n",
1027                 i, did->beta[i], i, did->gamma[i]);  */
1028     }
1029 
1030     return 0;
1031 }
1032 
1033 
1034 /*!
1035  * \brief   recogTransferRchToDid()
1036  *
1037  * \param[in]    recog with rch and did defined
1038  * \param[in]    x left edge of extracted region, relative to decoded line
1039  * \param[in]    y top edge of extracted region, relative to input image
1040  * \return  0 if OK, 1 on error
1041  *
1042  * <pre>
1043  * Notes:
1044  *      (1) This is used to transfer the results for a single character match
1045  *          to the rescored did arrays.
1046  * </pre>
1047  */
1048 static l_int32
recogTransferRchToDid(L_RECOG * recog,l_int32 x,l_int32 y)1049 recogTransferRchToDid(L_RECOG  *recog,
1050                       l_int32   x,
1051                       l_int32   y)
1052 {
1053 L_RDID  *did;
1054 L_RCH   *rch;
1055 
1056     PROCNAME("recogTransferRchToDid");
1057 
1058     if (!recog)
1059         return ERROR_INT("recog not defined", procName, 1);
1060     if ((did = recogGetDid(recog)) == NULL)
1061         return ERROR_INT("did not defined", procName, 1);
1062     if ((rch = recog->rch) == NULL)
1063         return ERROR_INT("rch not defined", procName, 1);
1064 
1065     numaAddNumber(did->natempl_r, rch->index);
1066     numaAddNumber(did->nasample_r, rch->sample);
1067     numaAddNumber(did->naxloc_r, rch->xloc + x);
1068     numaAddNumber(did->nadely_r, rch->yloc + y);
1069     numaAddNumber(did->nawidth_r, rch->width);
1070     numaAddNumber(did->nascore_r, rch->score);
1071     return 0;
1072 }
1073