1 /*====================================================================* 2 - Copyright (C) 2001 Leptonica. All rights reserved. 3 - 4 - Redistribution and use in source and binary forms, with or without 5 - modification, are permitted provided that the following conditions 6 - are met: 7 - 1. Redistributions of source code must retain the above copyright 8 - notice, this list of conditions and the following disclaimer. 9 - 2. Redistributions in binary form must reproduce the above 10 - copyright notice, this list of conditions and the following 11 - disclaimer in the documentation and/or other materials 12 - provided with the distribution. 13 - 14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY 18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 *====================================================================*/ 26 27 #ifndef LEPTONICA_RECOG_H 28 #define LEPTONICA_RECOG_H 29 30 /*! 31 * \file recog.h 32 * 33 * <pre> 34 * A simple utility for training and recognizing individual 35 * machine-printed text characters. In an application, one can 36 * envision using a number of these, one for each trained set. 37 * 38 * In training mode, a set of labelled bitmaps is presented, either 39 * one at a time, or in a directory, or in a pixa. If in a directory, 40 * or a pixa, the labelling text string must be embedded in the 41 * text field of the image file. 42 * 43 * Any number of recognizers (L_Recog) can be trained and then used 44 * together in an array (L_Recoga). All these trained structures 45 * can be serialized to file and read back. The serialized version 46 * holds all the bitmaps used for training, plus, for arbitrary 47 * character sets, the UTF8 representation and the lookup table 48 * mapping from the character representation to index. 49 * 50 * There are three levels of "sets" here: 51 * 52 * (1) Example set: the examples representing a character that 53 * were printed in the same way, so that they can be combined 54 * without scaling to form an "average" template for the character. 55 * In the recognition phase, we use either this aligned average, 56 * or the individual bitmaps. All examples in the set are given 57 * the same character label. Example: the letter 'a' in the 58 * predominant font in a book. 59 * 60 * (2) Character set (represented by L_Recog, a single recognizer): 61 * The set of different characters, each of which is described 62 * by (1). Each element of the set has a different character 63 * label. Example: the digits '0' through '9' that are used for 64 * page numbering in a book. 65 * 66 * (3) Recognizer set (represented by L_Recoga, an array of recogs): 67 * A set of recognizers, each of which is described by (2). 68 * In general, we do not want to combine the character sets 69 * with the same labels within different recognizer sets, 70 * because the bitmaps can differ in font type, style or size. 71 * Example 1: the letter 'a' can be printed in two very different 72 * ways (either with a large loop or with a smaller loop in 73 * the lower half); both share the same label but need to be 74 * distinguished so that they are not mixed when averaging. 75 * Example 2: a recognizer trained for a book may be missing 76 * some characters, so we need to supplement it with another 77 * "generic" or "bootstrap" recognizer that has the additional 78 * characters from a variety of sources. Bootstrap recognizers 79 * must be run in a mode where all characters are scaled. 80 * 81 * In the recognition process, for each component in an input image, 82 * each recognizer (L_Recog) records the best match (highest 83 * correlation score). If there is more than one recognizer, these 84 * results are aggregated to find the best match for each character 85 * for all the recognizers, and this is stored in L_Recoga. 86 * </pre> 87 */ 88 89 #define RECOG_VERSION_NUMBER 1 90 91 struct L_Recoga { 92 l_int32 n; /*!< number of recogs */ 93 l_int32 nalloc; /*!< number of recog ptrs allocated */ 94 struct L_Recog **recog; /*!< recog ptr array */ 95 struct L_Rcha *rcha; /*!< stores the array of best chars */ 96 }; 97 typedef struct L_Recoga L_RECOGA; 98 99 100 struct L_Recog { 101 l_int32 scalew; /*!< scale all examples to this width; */ 102 /*!< use 0 prevent horizontal scaling */ 103 l_int32 scaleh; /*!< scale all examples to this height; */ 104 /*!< use 0 prevent vertical scaling */ 105 l_int32 templ_type; /*!< template type: either an average of */ 106 /*!< examples (L_USE_AVERAGE) or the set */ 107 /*!< of all examples (L_USE_ALL) */ 108 l_int32 maxarraysize; /*!< initialize container arrays to this */ 109 l_int32 setsize; /*!< size of character set */ 110 l_int32 threshold; /*!< for binarizing if depth > 1 */ 111 l_int32 maxyshift; /*!< vertical jiggle on nominal centroid */ 112 /*!< alignment; typically 0 or 1 */ 113 l_float32 asperity_fr; /*!< +- allowed fractional asperity ratio */ 114 l_int32 charset_type; /*!< one of L_ARABIC_NUMERALS, etc. */ 115 l_int32 charset_size; /*!< expected number of classes in charse */ 116 char *bootdir; /*!< dir with bootstrap pixa charsets */ 117 char *bootpattern; /*!< file pattern: bootstrap pixa charsets */ 118 char *bootpath; /*!< path for single bootstrap pixa charset */ 119 l_int32 boot_iters; /*!< num of 2x2 erosion iters on boot pixa */ 120 l_int32 min_nopad; /*!< min number of samples without padding */ 121 l_int32 max_afterpad; /*!< max number of samples after padding */ 122 l_int32 min_samples; /*!< min num of total samples; else use boot */ 123 l_int32 num_samples; /*!< number of training samples */ 124 l_int32 minwidth_u; /*!< min width averaged unscaled templates */ 125 l_int32 maxwidth_u; /*!< max width averaged unscaled templates */ 126 l_int32 minheight_u; /*!< min height averaged unscaled templates */ 127 l_int32 maxheight_u; /*!< max height averaged unscaled templates */ 128 l_int32 minwidth; /*!< min width averaged scaled templates */ 129 l_int32 maxwidth; /*!< max width averaged scaled templates */ 130 l_int32 ave_done; /*!< set to 1 when averaged bitmaps are made */ 131 l_int32 train_done; /*!< set to 1 when training is complete or */ 132 /*!< identification has started */ 133 l_int32 min_splitw; /*!< min component width kept in splitting */ 134 l_int32 min_splith; /*!< min component height kept in splitting */ 135 l_int32 max_splith; /*!< max component height kept in splitting */ 136 struct Sarray *sa_text; /*!< text array for arbitrary char set */ 137 struct L_Dna *dna_tochar; /*!< index-to-char lut for arbitrary charset */ 138 l_int32 *centtab; /*!< table for finding centroids */ 139 l_int32 *sumtab; /*!< table for finding pixel sums */ 140 struct Pixaa *pixaa_u; /*!< all unscaled bitmaps for each class */ 141 struct Pixa *pixa_u; /*!< averaged unscaled bitmaps per class */ 142 struct Ptaa *ptaa_u; /*!< centroids of all unscaled bitmaps */ 143 struct Pta *pta_u; /*!< centroids of unscaled averaged bitmaps */ 144 struct Numaa *naasum_u; /*!< area of all unscaled bitmap examples */ 145 struct Numa *nasum_u; /*!< area of unscaled averaged bitmaps */ 146 struct Pixaa *pixaa; /*!< all bitmap examples for each class */ 147 struct Pixa *pixa; /*!< averaged bitmaps for each class */ 148 struct Ptaa *ptaa; /*!< centroids of all bitmap examples */ 149 struct Pta *pta; /*!< centroids of averaged bitmaps */ 150 struct Numaa *naasum; /*!< area of all bitmap examples */ 151 struct Numa *nasum; /*!< area of averaged bitmaps */ 152 struct Pixa *pixa_tr; /*!< input training images */ 153 struct Pixa *pixadb_ave; /*!< unscaled and scaled averaged bitmaps */ 154 struct Pixa *pixa_id; /*!< input images for identifying */ 155 struct Pix *pixdb_ave; /*!< debug: best match of input against ave. */ 156 struct Pix *pixdb_range; /*!< debug: best matches within range */ 157 struct Pixa *pixadb_boot; /*!< debug: bootstrap training results */ 158 struct Pixa *pixadb_split; /*!< debug: splitting results */ 159 struct L_Bmf *bmf; /*!< bmf fonts */ 160 l_int32 bmf_size; /*!< font size of bmf; default is 6 pt */ 161 struct L_Rdid *did; /*!< temp data used for image decoding */ 162 struct L_Rch *rch; /*!< temp data used for holding best char */ 163 struct L_Rcha *rcha; /*!< temp data used for array of best chars */ 164 l_int32 bootrecog; /*!< 1 if using bootstrap samples; else 0 */ 165 l_int32 index; /*!< recog index in recoga; -1 if no parent */ 166 struct L_Recoga *parent; /*!< ptr to parent array; can be null */ 167 168 }; 169 typedef struct L_Recog L_RECOG; 170 171 /*! 172 * Data returned from correlation matching on a single character 173 */ 174 struct L_Rch { 175 l_int32 index; /*!< index of best template */ 176 l_float32 score; /*!< correlation score of best template */ 177 char *text; /*!< character string of best template */ 178 l_int32 sample; /*!< index of best sample (within the best */ 179 /*!< template class, if all samples are used) */ 180 l_int32 xloc; /*!< x-location of template (delx + shiftx) */ 181 l_int32 yloc; /*!< y-location of template (dely + shifty) */ 182 l_int32 width; /*!< width of best template */ 183 }; 184 typedef struct L_Rch L_RCH; 185 186 /*! 187 * Data returned from correlation matching on an array of characters 188 */ 189 struct L_Rcha { 190 struct Numa *naindex; /*!< indices of best templates */ 191 struct Numa *nascore; /*!< correlation scores of best templates */ 192 struct Sarray *satext; /*!< character strings of best templates */ 193 struct Numa *nasample; /*!< indices of best samples */ 194 struct Numa *naxloc; /*!< x-locations of templates (delx + shiftx) */ 195 struct Numa *nayloc; /*!< y-locations of templates (dely + shifty) */ 196 struct Numa *nawidth; /*!< widths of best templates */ 197 }; 198 typedef struct L_Rcha L_RCHA; 199 200 /*! 201 * Data used for decoding a line of characters. 202 */ 203 struct L_Rdid { 204 struct Pix *pixs; /*!< clone of pix to be decoded */ 205 l_int32 **counta; /*!< count array for each averaged template */ 206 l_int32 **delya; /*!< best y-shift array per average template */ 207 l_int32 narray; /*!< number of averaged templates */ 208 l_int32 size; /*!< size of count array (width of pixs) */ 209 l_int32 *setwidth; /*!< setwidths for each template */ 210 struct Numa *nasum; /*!< pixel count in pixs by column */ 211 struct Numa *namoment; /*!< first moment of pixels in pixs by cols */ 212 l_int32 fullarrays; /*!< 1 if full arrays are made; 0 otherwise */ 213 l_float32 *beta; /*!< channel coeffs for template fg term */ 214 l_float32 *gamma; /*!< channel coeffs for bit-and term */ 215 l_float32 *trellisscore; /*!< score on trellis */ 216 l_int32 *trellistempl; /*!< template on trellis (for backtrack) */ 217 struct Numa *natempl; /*!< indices of best path templates */ 218 struct Numa *naxloc; /*!< x locations of best path templates */ 219 struct Numa *nadely; /*!< y locations of best path templates */ 220 struct Numa *nawidth; /*!< widths of best path templates */ 221 struct Numa *nascore; /*!< correlation scores: best path templates */ 222 struct Numa *natempl_r; /*!< indices of best rescored templates */ 223 struct Numa *naxloc_r; /*!< x locations of best rescoredtemplates */ 224 struct Numa *nadely_r; /*!< y locations of best rescoredtemplates */ 225 struct Numa *nawidth_r; /*!< widths of best rescoredtemplates */ 226 struct Numa *nascore_r; /*!< correlation scores: rescored templates */ 227 }; 228 typedef struct L_Rdid L_RDID; 229 230 231 /*-------------------------------------------------------------------------* 232 * Flags for selecting processing * 233 *-------------------------------------------------------------------------*/ 234 235 /*! Flags for selecting processing */ 236 enum { 237 L_SELECT_UNSCALED = 0, /*!< select the unscaled bitmaps */ 238 L_SELECT_SCALED = 1, /*!< select the scaled bitmaps */ 239 L_SELECT_BOTH = 2 /*!< select both unscaled and scaled */ 240 }; 241 242 /*-------------------------------------------------------------------------* 243 * Flags for determining what to test against * 244 *-------------------------------------------------------------------------*/ 245 246 /*! Flags for determining what to test against */ 247 enum { 248 L_USE_AVERAGE = 0, /*!< form template from class average */ 249 L_USE_ALL = 1 /*!< match against all elements of each class */ 250 }; 251 252 /*-------------------------------------------------------------------------* 253 * Flags for describing limited character sets * 254 *-------------------------------------------------------------------------*/ 255 256 /*! Flags for describing limited character sets */ 257 enum { 258 L_UNKNOWN = 0, /*!< character set type is not specified */ 259 L_ARABIC_NUMERALS = 1, /*!< 10 digits */ 260 L_LC_ROMAN_NUMERALS = 2, /*!< 7 lower-case letters (i,v,x,l,c,d,m) */ 261 L_UC_ROMAN_NUMERALS = 3, /*!< 7 upper-case letters (I,V,X,L,C,D,M) */ 262 L_LC_ALPHA = 4, /*!< 26 lower-case letters */ 263 L_UC_ALPHA = 5 /*!< 26 upper-case letters */ 264 }; 265 266 #endif /* LEPTONICA_RECOG_H */ 267