1 /*====================================================================*
2  -  Copyright (C) 2001 Leptonica.  All rights reserved.
3  -
4  -  Redistribution and use in source and binary forms, with or without
5  -  modification, are permitted provided that the following conditions
6  -  are met:
7  -  1. Redistributions of source code must retain the above copyright
8  -     notice, this list of conditions and the following disclaimer.
9  -  2. Redistributions in binary form must reproduce the above
10  -     copyright notice, this list of conditions and the following
11  -     disclaimer in the documentation and/or other materials
12  -     provided with the distribution.
13  -
14  -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15  -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16  -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17  -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
18  -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23  -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *====================================================================*/
26 
27 #ifndef  LEPTONICA_RECOG_H
28 #define  LEPTONICA_RECOG_H
29 
30 /*!
31  * \file recog.h
32  *
33  * <pre>
34  *     A simple utility for training and recognizing individual
35  *     machine-printed text characters.  In an application, one can
36  *     envision using a number of these, one for each trained set.
37  *
38  *     In training mode, a set of labelled bitmaps is presented, either
39  *     one at a time, or in a directory, or in a pixa.  If in a directory,
40  *     or a pixa, the labelling text string must be embedded in the
41  *     text field of the image file.
42  *
43  *     Any number of recognizers (L_Recog) can be trained and then used
44  *     together in an array (L_Recoga).  All these trained structures
45  *     can be serialized to file and read back.  The serialized version
46  *     holds all the bitmaps used for training, plus, for arbitrary
47  *     character sets, the UTF8 representation and the lookup table
48  *     mapping from the character representation to index.
49  *
50  *     There are three levels of "sets" here:
51  *
52  *       (1) Example set: the examples representing a character that
53  *           were printed in the same way, so that they can be combined
54  *           without scaling to form an "average" template for the character.
55  *           In the recognition phase, we use either this aligned average,
56  *           or the individual bitmaps.  All examples in the set are given
57  *           the same character label.   Example: the letter 'a' in the
58  *           predominant font in a book.
59  *
60  *       (2) Character set (represented by L_Recog, a single recognizer):
61  *           The set of different characters, each of which is described
62  *           by (1).  Each element of the set has a different character
63  *           label.  Example: the digits '0' through '9' that are used for
64  *           page numbering in a book.
65  *
66  *       (3) Recognizer set (represented by L_Recoga, an array of recogs):
67  *           A set of recognizers, each of which is described by (2).
68  *           In general, we do not want to combine the character sets
69  *           with the same labels within different recognizer sets,
70  *           because the bitmaps can differ in font type, style or size.
71  *           Example 1: the letter 'a' can be printed in two very different
72  *           ways (either with a large loop or with a smaller loop in
73  *           the lower half); both share the same label but need to be
74  *           distinguished so that they are not mixed when averaging.
75  *           Example 2: a recognizer trained for a book may be missing
76  *           some characters, so we need to supplement it with another
77  *           "generic" or "bootstrap" recognizer that has the additional
78  *           characters from a variety of sources.  Bootstrap recognizers
79  *           must be run in a mode where all characters are scaled.
80  *
81  *     In the recognition process, for each component in an input image,
82  *     each recognizer (L_Recog) records the best match (highest
83  *     correlation score).  If there is more than one recognizer, these
84  *     results are aggregated to find the best match for each character
85  *     for all the recognizers, and this is stored in L_Recoga.
86  * </pre>
87  */
88 
89 #define  RECOG_VERSION_NUMBER      1  /*!< NOTE(review): presumably the version of the serialized recog format -- confirm */
90 /*! Array of recognizers (L_Recog), plus the aggregated best-match results */
91 struct L_Recoga {
92     l_int32              n;      /*!< number of recogs                       */
93     l_int32              nalloc; /*!< number of recog ptrs allocated         */
94     struct L_Recog     **recog;  /*!< recog ptr array                        */
95     struct L_Rcha       *rcha;   /*!< stores the array of best chars         */
96 };
97 typedef struct L_Recoga L_RECOGA;
98 
99 /*! A single recognizer: one character set, with its templates and working data */
100 struct L_Recog {
101     l_int32        scalew;       /*!< scale all examples to this width;      */
102                                  /*!< use 0 to prevent horizontal scaling    */
103     l_int32        scaleh;       /*!< scale all examples to this height;     */
104                                  /*!< use 0 to prevent vertical scaling      */
105     l_int32        templ_type;   /*!< template type: either an average of    */
106                                  /*!< examples (L_USE_AVERAGE) or the set    */
107                                  /*!< of all examples (L_USE_ALL)            */
108     l_int32        maxarraysize; /*!< initialize container arrays to this    */
109     l_int32        setsize;      /*!< size of character set                  */
110     l_int32        threshold;    /*!< for binarizing if depth > 1            */
111     l_int32        maxyshift;    /*!< vertical jiggle on nominal centroid    */
112                                  /*!< alignment; typically 0 or 1            */
113     l_float32      asperity_fr;  /*!< +- allowed fractional asperity ratio   */
114     l_int32        charset_type; /*!< one of L_ARABIC_NUMERALS, etc.         */
115     l_int32        charset_size; /*!< expected number of classes in charset  */
116     char          *bootdir;      /*!< dir with bootstrap pixa charsets       */
117     char          *bootpattern;  /*!< file pattern: bootstrap pixa charsets  */
118     char          *bootpath;     /*!< path for single bootstrap pixa charset */
119     l_int32        boot_iters;   /*!< num of 2x2 erosion iters on boot pixa  */
120     l_int32        min_nopad;    /*!< min number of samples without padding  */
121     l_int32        max_afterpad; /*!< max number of samples after padding    */
122     l_int32        min_samples;  /*!< min num of total samples; else use boot */
123     l_int32        num_samples;  /*!< number of training samples             */
124     l_int32        minwidth_u;   /*!< min width averaged unscaled templates  */
125     l_int32        maxwidth_u;   /*!< max width averaged unscaled templates  */
126     l_int32        minheight_u;  /*!< min height averaged unscaled templates */
127     l_int32        maxheight_u;  /*!< max height averaged unscaled templates */
128     l_int32        minwidth;     /*!< min width averaged scaled templates    */
129     l_int32        maxwidth;     /*!< max width averaged scaled templates    */
130     l_int32        ave_done;     /*!< set to 1 when averaged bitmaps are made */
131     l_int32        train_done;   /*!< set to 1 when training is complete or  */
132                                  /*!< identification has started             */
133     l_int32        min_splitw;   /*!< min component width kept in splitting  */
134     l_int32        min_splith;   /*!< min component height kept in splitting */
135     l_int32        max_splith;   /*!< max component height kept in splitting */
136     struct Sarray *sa_text;      /*!< text array for arbitrary char set      */
137     struct L_Dna  *dna_tochar;   /*!< index-to-char lut for arbitrary charset */
138     l_int32       *centtab;      /*!< table for finding centroids            */
139     l_int32       *sumtab;       /*!< table for finding pixel sums           */
140     struct Pixaa  *pixaa_u;      /*!< all unscaled bitmaps for each class    */
141     struct Pixa   *pixa_u;       /*!< averaged unscaled bitmaps per class    */
142     struct Ptaa   *ptaa_u;       /*!< centroids of all unscaled bitmaps      */
143     struct Pta    *pta_u;        /*!< centroids of unscaled averaged bitmaps */
144     struct Numaa  *naasum_u;     /*!< area of all unscaled bitmap examples   */
145     struct Numa   *nasum_u;      /*!< area of unscaled averaged bitmaps      */
146     struct Pixaa  *pixaa;        /*!< all bitmap examples for each class     */
147     struct Pixa   *pixa;         /*!< averaged bitmaps for each class        */
148     struct Ptaa   *ptaa;         /*!< centroids of all bitmap examples       */
149     struct Pta    *pta;          /*!< centroids of averaged bitmaps          */
150     struct Numaa  *naasum;       /*!< area of all bitmap examples            */
151     struct Numa   *nasum;        /*!< area of averaged bitmaps               */
152     struct Pixa   *pixa_tr;      /*!< input training images                  */
153     struct Pixa   *pixadb_ave;   /*!< unscaled and scaled averaged bitmaps   */
154     struct Pixa   *pixa_id;      /*!< input images for identifying           */
155     struct Pix    *pixdb_ave;    /*!< debug: best match of input against ave. */
156     struct Pix    *pixdb_range;  /*!< debug: best matches within range       */
157     struct Pixa   *pixadb_boot;  /*!< debug: bootstrap training results      */
158     struct Pixa   *pixadb_split; /*!< debug: splitting results               */
159     struct L_Bmf  *bmf;          /*!< bmf fonts                              */
160     l_int32        bmf_size;     /*!< font size of bmf; default is 6 pt      */
161     struct L_Rdid *did;          /*!< temp data used for image decoding      */
162     struct L_Rch  *rch;          /*!< temp data used for holding best char   */
163     struct L_Rcha *rcha;         /*!< temp data used for array of best chars */
164     l_int32        bootrecog;    /*!< 1 if using bootstrap samples; else 0   */
165     l_int32        index;        /*!< recog index in recoga; -1 if no parent */
166     struct L_Recoga  *parent;    /*!< ptr to parent array; can be null       */
167 
168 };
169 typedef struct L_Recog L_RECOG;
170 
171 /*!
172  *  Data returned from correlation matching on a single character (held in L_Recog.rch)
173  */
174 struct L_Rch {
175     l_int32        index;      /*!< index of best template                   */
176     l_float32      score;      /*!< correlation score of best template       */
177     char          *text;       /*!< character string of best template        */
178     l_int32        sample;     /*!< index of best sample (within the best    */
179                                /*!< template class, if all samples are used) */
180     l_int32        xloc;       /*!< x-location of template (delx + shiftx)   */
181     l_int32        yloc;       /*!< y-location of template (dely + shifty)   */
182     l_int32        width;      /*!< width of best template                   */
183 };
184 typedef struct L_Rch L_RCH;
185 
186 /*!
187  *  Data returned from correlation matching on an array of characters (held in L_Recog.rcha and L_Recoga.rcha)
188  */
189 struct L_Rcha {
190     struct Numa   *naindex;    /*!< indices of best templates                */
191     struct Numa   *nascore;    /*!< correlation scores of best templates     */
192     struct Sarray *satext;     /*!< character strings of best templates      */
193     struct Numa   *nasample;   /*!< indices of best samples                  */
194     struct Numa   *naxloc;     /*!< x-locations of templates (delx + shiftx) */
195     struct Numa   *nayloc;     /*!< y-locations of templates (dely + shifty) */
196     struct Numa   *nawidth;    /*!< widths of best templates                 */
197 };
198 typedef struct L_Rcha L_RCHA;
199 
200 /*!
201  *  Data used for decoding a line of characters (held in L_Recog.did).
202  */
203 struct L_Rdid {
204     struct Pix    *pixs;         /*!< clone of pix to be decoded             */
205     l_int32      **counta;       /*!< count array for each averaged template */
206     l_int32      **delya;        /*!< best y-shift array per average template */
207     l_int32        narray;       /*!< number of averaged templates           */
208     l_int32        size;         /*!< size of count array (width of pixs)    */
209     l_int32       *setwidth;     /*!< setwidths for each template            */
210     struct Numa   *nasum;        /*!< pixel count in pixs by column          */
211     struct Numa   *namoment;     /*!< first moment of pixels in pixs by cols */
212     l_int32        fullarrays;   /*!< 1 if full arrays are made; 0 otherwise */
213     l_float32     *beta;         /*!< channel coeffs for template fg term    */
214     l_float32     *gamma;        /*!< channel coeffs for bit-and term        */
215     l_float32     *trellisscore; /*!< score on trellis                       */
216     l_int32       *trellistempl; /*!< template on trellis (for backtrack)    */
217     struct Numa   *natempl;      /*!< indices of best path templates         */
218     struct Numa   *naxloc;       /*!< x locations of best path templates     */
219     struct Numa   *nadely;       /*!< y locations of best path templates     */
220     struct Numa   *nawidth;      /*!< widths of best path templates          */
221     struct Numa   *nascore;      /*!< correlation scores: best path templates */
222     struct Numa   *natempl_r;    /*!< indices of best rescored templates     */
223     struct Numa   *naxloc_r;     /*!< x locations of best rescored templates */
224     struct Numa   *nadely_r;     /*!< y locations of best rescored templates */
225     struct Numa   *nawidth_r;    /*!< widths of best rescored templates      */
226     struct Numa   *nascore_r;    /*!< correlation scores: rescored templates */
227 };
228 typedef struct L_Rdid L_RDID;
229 
230 
231 /*-------------------------------------------------------------------------*
232  *                    Flags for selecting processing                       *
233  *-------------------------------------------------------------------------*/
234 
235 /*! Flags for selecting which bitmaps to process: unscaled, scaled, or both */
236 enum {
237     L_SELECT_UNSCALED = 0,     /*!< select the unscaled bitmaps            */
238     L_SELECT_SCALED = 1,       /*!< select the scaled bitmaps              */
239     L_SELECT_BOTH = 2          /*!< select both unscaled and scaled        */
240 };
241 
242 /*-------------------------------------------------------------------------*
243  *                Flags for determining what to test against               *
244  *-------------------------------------------------------------------------*/
245 
246 /*! Flags for determining what to test against; values for L_Recog.templ_type */
247 enum {
248     L_USE_AVERAGE = 0,       /*!< form template from class average         */
249     L_USE_ALL = 1            /*!< match against all elements of each class */
250 };
251 
252 /*-------------------------------------------------------------------------*
253  *             Flags for describing limited character sets                 *
254  *-------------------------------------------------------------------------*/
255 
256 /*! Flags for describing limited character sets; values for L_Recog.charset_type */
257 enum {
258     L_UNKNOWN = 0,           /*!< character set type is not specified      */
259     L_ARABIC_NUMERALS = 1,   /*!< 10 digits                                */
260     L_LC_ROMAN_NUMERALS = 2, /*!< 7 lower-case letters (i,v,x,l,c,d,m)     */
261     L_UC_ROMAN_NUMERALS = 3, /*!< 7 upper-case letters (I,V,X,L,C,D,M)     */
262     L_LC_ALPHA = 4,          /*!< 26 lower-case letters                    */
263     L_UC_ALPHA = 5           /*!< 26 upper-case letters                    */
264 };
265 
266 #endif  /* LEPTONICA_RECOG_H */
267