1 /*====================================================================*
2  -  Copyright (C) 2001 Leptonica.  All rights reserved.
3  -
4  -  Redistribution and use in source and binary forms, with or without
5  -  modification, are permitted provided that the following conditions
6  -  are met:
7  -  1. Redistributions of source code must retain the above copyright
8  -     notice, this list of conditions and the following disclaimer.
9  -  2. Redistributions in binary form must reproduce the above
10  -     copyright notice, this list of conditions and the following
11  -     disclaimer in the documentation and/or other materials
12  -     provided with the distribution.
13  -
14  -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15  -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16  -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17  -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
18  -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23  -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *====================================================================*/
26 
27 #ifndef  LEPTONICA_RECOG_H
28 #define  LEPTONICA_RECOG_H
29 
30 /*!
31  * \file recog.h
32  *
33  * <pre>
34  *     This is a simple utility for training and recognizing individual
35  *     machine-printed text characters.  It is designed to be adapted
36  *     to a particular set of character images; e.g., from a book.
37  *
38  *     There are two methods of training the recognizer.  In the most
39  *     simple, a set of bitmaps has been labeled by some means, such
40  *     as a generic OCR program.  This is input either one template at a time
41  *     or as a pixa of templates, to a function that creates a recog.
42  *     If in a pixa, the text string label must be embedded in the
43  *     text field of each pix.
44  *
45  *     If labeled data is not available, we start with a bootstrap
46  *     recognizer (BSR) that has labeled data from a variety of sources.
47  *     These images are scaled, typically to a fixed height, and then
48  *     fed similarly scaled unlabeled images from the source (e.g., book),
49  *     and the BSR attempts to identify them.  All images that have
50  *     a high enough correlation score with one of the templates in the
51  *     BSR are emitted in a pixa, which now holds unscaled and labeled
52  *     templates from the source.  This is the generator for a book adapted
53  *     recognizer (BAR).
54  *
55  *     The pixa should always be thought of as the primary structure.
56  *     It is the generator for the recog, because a recog is built
57  *     from a pixa of unscaled images.
58  *
59  *     New image templates can be added to a recog as long as it is
60  *     in training mode.  Once training is finished, to add templates
61  *     it is necessary to extract the generating pixa, add templates
62  *     to that pixa, and make a new recog.  Similarly, we do not
63  *     join two recog; instead, we simply join their generating pixa,
64  *     and make a recog from that.
65  *
66  *     To remove outliers from a pixa of labeled pix, make a recog,
67  *     determine the outliers, and generate a new pixa with the
68  *     outliers removed.  The outliers are determined by building
69  *     special templates for each character set that are scaled averages
70  *     of the individual templates.  Then a correlation score is found
71  *     between each template and the averaged templates.  There are
72  *     two implementations; outliers are determined as either:
73  *      (1) a template having a correlation score with its class average
74  *          that is below a threshold, or
75  *      (2) a template having a correlation score with its class average
76  *          that is smaller than the correlation score with the average
77  *          of another class.
78  *     Outliers are removed from the generating pixa.  Scaled averaging
79  *     is only performed for determining outliers and for splitting
80  *     characters; it is never used in a trained recognizer for identifying
81  *     unlabeled samples.
82  *
83  *     Two methods using averaged templates are provided for splitting
84  *     touching characters:
85  *      (1) greedy matching
86  *      (2) document image decoding (DID)
87  *     The DID method is the default.  It is about 5x faster and
88  *     possibly more accurate.
89  *
90  *     Once a BAR has been made, unlabeled sample images are identified
91  *     by finding the individual template in the BAR with highest
92  *     correlation.  The input images and images in the BAR can be
93  *     represented in two ways:
94  *      (1) as scanned, binarized to 1 bpp
95  *      (2) as a width-normalized outline formed by thinning to a
96  *          skeleton and then dilating by a fixed amount.
97  *
98  *     The recog can be serialized to file and read back.  The serialized
99  *     version holds the templates used for correlation (which may have
100  *     been modified by scaling and turning into lines from the unscaled
101  *     templates), plus, for arbitrary character sets, the UTF8
102  *     representation and the lookup table mapping from the character
103  *     representation to index.
104  *
105  *     Why do we not use averaged templates for recognition?
106  *     Letterforms can take on significantly different shapes (e.g.,
107  *     the letters 'a' and 'g'), and it makes no sense to average these.
108  *     The previous version of this utility allowed multiple recognizers
109  *     to exist, but this is an unnecessary complication if recognition
110  *     is done on all samples instead of on averages.
111  * </pre>
112  */
113 
114 #define  RECOG_VERSION_NUMBER      2   /* NOTE(review): presumably the version of the serialized recog format -- confirm */
115 
116 struct L_Recog {   /* recognizer: parameters, lookup tables, templates, debug and temp data */
117     l_int32        scalew;       /*!< scale all examples to this width;      */
118                                  /*!< use 0 to prevent horizontal scaling    */
119     l_int32        scaleh;       /*!< scale all examples to this height;     */
120                                  /*!< use 0 to prevent vertical scaling      */
121     l_int32        linew;        /*!< use a value > 0 to convert the bitmap  */
122                                  /*!< to lines of fixed width; 0 to skip     */
123     l_int32        templ_use;    /*!< template use: either all templates     */
124                                  /*!< (L_USE_ALL_TEMPLATES) or the averaged  */
125                                  /*!< ones (L_USE_AVERAGE_TEMPLATES)         */
126     l_int32        maxarraysize; /*!< initialize container arrays to this    */
127     l_int32        setsize;      /*!< size of character set                  */
128     l_int32        threshold;    /*!< for binarizing if depth > 1            */
129     l_int32        maxyshift;    /*!< vertical jiggle on nominal centroid    */
130                                  /*!< alignment; typically 0 or 1            */
131     l_int32        charset_type; /*!< one of L_ARABIC_NUMERALS, etc.         */
132     l_int32        charset_size; /*!< expected number of classes in charset  */
133     l_int32        min_nopad;    /*!< min number of samples without padding  */
134     l_int32        num_samples;  /*!< number of training samples             */
135     l_int32        minwidth_u;   /*!< min width averaged unscaled templates  */
136     l_int32        maxwidth_u;   /*!< max width averaged unscaled templates  */
137     l_int32        minheight_u;  /*!< min height averaged unscaled templates */
138     l_int32        maxheight_u;  /*!< max height averaged unscaled templates */
139     l_int32        minwidth;     /*!< min width averaged scaled templates    */
140     l_int32        maxwidth;     /*!< max width averaged scaled templates    */
141     l_int32        ave_done;     /*!< set to 1 when averaged bitmaps are made */
142     l_int32        train_done;   /*!< set to 1 when training is complete or  */
143                                  /*!< identification has started             */
144     l_float32      max_wh_ratio; /*!< max width/height ratio to split        */
145     l_float32      max_ht_ratio; /*!< max of max/min template height ratio   */
146     l_int32        min_splitw;   /*!< min component width kept in splitting  */
147     l_int32        max_splith;   /*!< max component height kept in splitting */
148     struct Sarray *sa_text;      /*!< text array for arbitrary char set      */
149     struct L_Dna  *dna_tochar;   /*!< index-to-char lut for arbitrary charset */
150     l_int32       *centtab;      /*!< table for finding centroids            */
151     l_int32       *sumtab;       /*!< table for finding pixel sums           */
152     struct Pixaa  *pixaa_u;      /*!< all unscaled templates for each class  */
153     struct Ptaa   *ptaa_u;       /*!< centroids of all unscaled templates    */
154     struct Numaa  *naasum_u;     /*!< area of all unscaled templates         */
155     struct Pixaa  *pixaa;        /*!< all (scaled) templates for each class  */
156     struct Ptaa   *ptaa;         /*!< centroids of all (scaled) templates    */
157     struct Numaa  *naasum;       /*!< area of all (scaled) templates         */
158     struct Pixa   *pixa_u;       /*!< averaged unscaled templates per class  */
159     struct Pta    *pta_u;        /*!< centroids of unscaled ave. templates   */
160     struct Numa   *nasum_u;      /*!< area of unscaled averaged templates    */
161     struct Pixa   *pixa;         /*!< averaged (scaled) templates per class  */
162     struct Pta    *pta;          /*!< centroids of (scaled) ave. templates   */
163     struct Numa   *nasum;        /*!< area of (scaled) averaged templates    */
164     struct Pixa   *pixa_tr;      /*!< all input training images              */
165     struct Pixa   *pixadb_ave;   /*!< unscaled and scaled averaged bitmaps   */
166     struct Pixa   *pixa_id;      /*!< input images for identifying           */
167     struct Pix    *pixdb_ave;    /*!< debug: best match of input against ave. */
168     struct Pix    *pixdb_range;  /*!< debug: best matches within range       */
169     struct Pixa   *pixadb_boot;  /*!< debug: bootstrap training results      */
170     struct Pixa   *pixadb_split; /*!< debug: splitting results               */
171     struct L_Bmf  *bmf;          /*!< bmf fonts                              */
172     l_int32        bmf_size;     /*!< font size of bmf; default is 6 pt      */
173     struct L_Rdid *did;          /*!< temp data used for image decoding      */
174     struct L_Rch  *rch;          /*!< temp data used for holding best char   */
175     struct L_Rcha *rcha;         /*!< temp data used for array of best chars */
176 };
177 typedef struct L_Recog L_RECOG;
178 
179 /*!
180  *  Best-match data returned from correlation matching on a single character (L_Recog.rch)
181  */
182 struct L_Rch {
183     l_int32        index;      /*!< index of best template                   */
184     l_float32      score;      /*!< correlation score of best template       */
185     char          *text;       /*!< character string of best template        */
186     l_int32        sample;     /*!< index of best sample (within the best    */
187                                /*!< template class, if all samples are used) */
188     l_int32        xloc;       /*!< x-location of template (delx + shiftx)   */
189     l_int32        yloc;       /*!< y-location of template (dely + shifty)   */
190     l_int32        width;      /*!< width of best template                   */
191 };
192 typedef struct L_Rch L_RCH;
193 
194 /*!
195  *  Best-match data returned from correlation matching on an array of characters (L_Recog.rcha)
196  */
197 struct L_Rcha {
198     struct Numa   *naindex;    /*!< indices of best templates                */
199     struct Numa   *nascore;    /*!< correlation scores of best templates     */
200     struct Sarray *satext;     /*!< character strings of best templates      */
201     struct Numa   *nasample;   /*!< indices of best samples                  */
202     struct Numa   *naxloc;     /*!< x-locations of templates (delx + shiftx) */
203     struct Numa   *nayloc;     /*!< y-locations of templates (dely + shifty) */
204     struct Numa   *nawidth;    /*!< widths of best templates                 */
205 };
206 typedef struct L_Rcha L_RCHA;
207 
208 /*!
209  *  Temporary data used for decoding a line of characters (L_Recog.did).
210  */
211 struct L_Rdid {
212     struct Pix    *pixs;         /*!< clone of pix to be decoded             */
213     l_int32      **counta;       /*!< count array for each averaged template */
214     l_int32      **delya;        /*!< best y-shift array per average template */
215     l_int32        narray;       /*!< number of averaged templates           */
216     l_int32        size;         /*!< size of count array (width of pixs)    */
217     l_int32       *setwidth;     /*!< setwidths for each template            */
218     struct Numa   *nasum;        /*!< pixel count in pixs by column          */
219     struct Numa   *namoment;     /*!< first moment of pixels in pixs by cols */
220     l_int32        fullarrays;   /*!< 1 if full arrays are made; 0 otherwise */
221     l_float32     *beta;         /*!< channel coeffs for template fg term    */
222     l_float32     *gamma;        /*!< channel coeffs for bit-and term        */
223     l_float32     *trellisscore; /*!< score on trellis                       */
224     l_int32       *trellistempl; /*!< template on trellis (for backtrack)    */
225     struct Numa   *natempl;      /*!< indices of best path templates         */
226     struct Numa   *naxloc;       /*!< x locations of best path templates     */
227     struct Numa   *nadely;       /*!< y locations of best path templates     */
228     struct Numa   *nawidth;      /*!< widths of best path templates          */
229     struct Boxa   *boxa;         /*!< Viterbi result for splitting input pixs */
230     struct Numa   *nascore;      /*!< correlation scores: best path templates */
231     struct Numa   *natempl_r;    /*!< indices of best rescored templates     */
232     struct Numa   *nasample_r;   /*!< samples of best scored templates       */
233     struct Numa   *naxloc_r;     /*!< x locations of best rescored templates */
234     struct Numa   *nadely_r;     /*!< y locations of best rescored templates */
235     struct Numa   *nawidth_r;    /*!< widths of best rescored templates      */
236     struct Numa   *nascore_r;    /*!< correlation scores: rescored templates */
237 };
238 typedef struct L_Rdid L_RDID;
239 
240 
241 /*-------------------------------------------------------------------------*
242  *             Flags for describing limited character sets                 *
243  *-------------------------------------------------------------------------*/
244 /*! Flags for describing limited character sets: recog->charset_type */
245 enum {
246     L_UNKNOWN = 0,           /*!< character set type is not specified      */
247     L_ARABIC_NUMERALS = 1,   /*!< 10 digits                                */
248     L_LC_ROMAN_NUMERALS = 2, /*!< 7 lower-case letters (i,v,x,l,c,d,m)     */
249     L_UC_ROMAN_NUMERALS = 3, /*!< 7 upper-case letters (I,V,X,L,C,D,M)     */
250     L_LC_ALPHA = 4,          /*!< 26 lower-case letters                    */
251     L_UC_ALPHA = 5           /*!< 26 upper-case letters                    */
252 };
253 
254 /*-------------------------------------------------------------------------*
255  *      Flags for selecting between using average and all templates        *
256  *-------------------------------------------------------------------------*/
257 /*! Flags for selecting all or averaged templates: recog->templ_use */
258 enum {
259     L_USE_ALL_TEMPLATES = 0,     /*!< use all templates; default            */
260     L_USE_AVERAGE_TEMPLATES = 1  /*!< use averaged templates; special cases */
261 };
262 
263 #endif  /* LEPTONICA_RECOG_H */
264