1 /*====================================================================* 2 - Copyright (C) 2001 Leptonica. All rights reserved. 3 - 4 - Redistribution and use in source and binary forms, with or without 5 - modification, are permitted provided that the following conditions 6 - are met: 7 - 1. Redistributions of source code must retain the above copyright 8 - notice, this list of conditions and the following disclaimer. 9 - 2. Redistributions in binary form must reproduce the above 10 - copyright notice, this list of conditions and the following 11 - disclaimer in the documentation and/or other materials 12 - provided with the distribution. 13 - 14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY 18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 *====================================================================*/ 26 27 #ifndef LEPTONICA_RECOG_H 28 #define LEPTONICA_RECOG_H 29 30 /*! 31 * \file recog.h 32 * 33 * <pre> 34 * This is a simple utility for training and recognizing individual 35 * machine-printed text characters. It is designed to be adapted 36 * to a particular set of character images; e.g., from a book. 37 * 38 * There are two methods of training the recognizer. In the most 39 * simple, a set of bitmaps has been labeled by some means, such 40 * a generic OCR program. This is input either one template at a time 41 * or as a pixa of templates, to a function that creates a recog. 42 * If in a pixa, the text string label must be embedded in the 43 * text field of each pix. 44 * 45 * If labeled data is not available, we start with a bootstrap 46 * recognizer (BSR) that has labeled data from a variety of sources. 47 * These images are scaled, typically to a fixed height, and then 48 * fed similarly scaled unlabeled images from the source (e.g., book), 49 * and the BSR attempts to identify them. All images that have 50 * a high enough correlation score with one of the templates in the 51 * BSR are emitted in a pixa, which now holds unscaled and labeled 52 * templates from the source. This is the generator for a book adapted 53 * recognizer (BAR). 54 * 55 * The pixa should always be thought of as the primary structure. 56 * It is the generator for the recog, because a recog is built 57 * from a pixa of unscaled images. 58 * 59 * New image templates can be added to a recog as long as it is 60 * in training mode. Once training is finished, to add templates 61 * it is necessary to extract the generating pixa, add templates 62 * to that pixa, and make a new recog. Similarly, we do not 63 * join two recog; instead, we simply join their generating pixa, 64 * and make a recog from that. 65 * 66 * To remove outliers from a pixa of labeled pix, make a recog, 67 * determine the outliers, and generate a new pixa with the 68 * outliers removed. The outliers are determined by building 69 * special templates for each character set that are scaled averages 70 * of the individual templates. Then a correlation score is found 71 * between each template and the averaged templates. There are 72 * two implementations; outliers are determined as either: 73 * (1) a template having a correlation score with its class average 74 * that is below a threshold, or 75 * (2) a template having a correlation score with its class average 76 * that is smaller than the correlation score with the average 77 * of another class. 78 * Outliers are removed from the generating pixa. Scaled averaging 79 * is only performed for determining outliers and for splitting 80 * characters; it is never used in a trained recognizer for identifying 81 * unlabeled samples. 82 * 83 * Two methods using averaged templates are provided for splitting 84 * touching characters: 85 * (1) greedy matching 86 * (2) document image decoding (DID) 87 * The DID method is the default. It is about 5x faster and 88 * possibly more accurate. 89 * 90 * Once a BAR has been made, unlabeled sample images are identified 91 * by finding the individual template in the BAR with highest 92 * correlation. The input images and images in the BAR can be 93 * represented in two ways: 94 * (1) as scanned, binarized to 1 bpp 95 * (2) as a width-normalized outline formed by thinning to a 96 * skeleton and then dilating by a fixed amount. 97 * 98 * The recog can be serialized to file and read back. The serialized 99 * version holds the templates used for correlation (which may have 100 * been modified by scaling and turning into lines from the unscaled 101 * templates), plus, for arbitrary character sets, the UTF8 102 * representation and the lookup table mapping from the character 103 * representation to index. 104 * 105 * Why do we not use averaged templates for recognition? 106 * Letterforms can take on significantly different shapes (eg., 107 * the letters 'a' and 'g'), and it makes no sense to average these. 108 * The previous version of this utility allowed multiple recognizers 109 * to exist, but this is an unnecessary complication if recognition 110 * is done on all samples instead of on averages. 111 * </pre> 112 */ 113 114 #define RECOG_VERSION_NUMBER 2 115 116 struct L_Recog { 117 l_int32 scalew; /*!< scale all examples to this width; */ 118 /*!< use 0 prevent horizontal scaling */ 119 l_int32 scaleh; /*!< scale all examples to this height; */ 120 /*!< use 0 prevent vertical scaling */ 121 l_int32 linew; /*!< use a value > 0 to convert the bitmap */ 122 /*!< to lines of fixed width; 0 to skip */ 123 l_int32 templ_use; /*!< template use: use either the average */ 124 /*!< or all temmplates (L_USE_AVERAGE or */ 125 /*!< L_USE_ALL) */ 126 l_int32 maxarraysize; /*!< initialize container arrays to this */ 127 l_int32 setsize; /*!< size of character set */ 128 l_int32 threshold; /*!< for binarizing if depth > 1 */ 129 l_int32 maxyshift; /*!< vertical jiggle on nominal centroid */ 130 /*!< alignment; typically 0 or 1 */ 131 l_int32 charset_type; /*!< one of L_ARABIC_NUMERALS, etc. */ 132 l_int32 charset_size; /*!< expected number of classes in charset */ 133 l_int32 min_nopad; /*!< min number of samples without padding */ 134 l_int32 num_samples; /*!< number of training samples */ 135 l_int32 minwidth_u; /*!< min width averaged unscaled templates */ 136 l_int32 maxwidth_u; /*!< max width averaged unscaled templates */ 137 l_int32 minheight_u; /*!< min height averaged unscaled templates */ 138 l_int32 maxheight_u; /*!< max height averaged unscaled templates */ 139 l_int32 minwidth; /*!< min width averaged scaled templates */ 140 l_int32 maxwidth; /*!< max width averaged scaled templates */ 141 l_int32 ave_done; /*!< set to 1 when averaged bitmaps are made */ 142 l_int32 train_done; /*!< set to 1 when training is complete or */ 143 /*!< identification has started */ 144 l_float32 max_wh_ratio; /*!< max width/height ratio to split */ 145 l_float32 max_ht_ratio; /*!< max of max/min template height ratio */ 146 l_int32 min_splitw; /*!< min component width kept in splitting */ 147 l_int32 max_splith; /*!< max component height kept in splitting */ 148 struct Sarray *sa_text; /*!< text array for arbitrary char set */ 149 struct L_Dna *dna_tochar; /*!< index-to-char lut for arbitrary charset */ 150 l_int32 *centtab; /*!< table for finding centroids */ 151 l_int32 *sumtab; /*!< table for finding pixel sums */ 152 struct Pixaa *pixaa_u; /*!< all unscaled templates for each class */ 153 struct Ptaa *ptaa_u; /*!< centroids of all unscaled templates */ 154 struct Numaa *naasum_u; /*!< area of all unscaled templates */ 155 struct Pixaa *pixaa; /*!< all (scaled) templates for each class */ 156 struct Ptaa *ptaa; /*!< centroids of all (scaledl) templates */ 157 struct Numaa *naasum; /*!< area of all (scaled) templates */ 158 struct Pixa *pixa_u; /*!< averaged unscaled templates per class */ 159 struct Pta *pta_u; /*!< centroids of unscaled ave. templates */ 160 struct Numa *nasum_u; /*!< area of unscaled averaged templates */ 161 struct Pixa *pixa; /*!< averaged (scaled) templates per class */ 162 struct Pta *pta; /*!< centroids of (scaled) ave. templates */ 163 struct Numa *nasum; /*!< area of (scaled) averaged templates */ 164 struct Pixa *pixa_tr; /*!< all input training images */ 165 struct Pixa *pixadb_ave; /*!< unscaled and scaled averaged bitmaps */ 166 struct Pixa *pixa_id; /*!< input images for identifying */ 167 struct Pix *pixdb_ave; /*!< debug: best match of input against ave. */ 168 struct Pix *pixdb_range; /*!< debug: best matches within range */ 169 struct Pixa *pixadb_boot; /*!< debug: bootstrap training results */ 170 struct Pixa *pixadb_split; /*!< debug: splitting results */ 171 struct L_Bmf *bmf; /*!< bmf fonts */ 172 l_int32 bmf_size; /*!< font size of bmf; default is 6 pt */ 173 struct L_Rdid *did; /*!< temp data used for image decoding */ 174 struct L_Rch *rch; /*!< temp data used for holding best char */ 175 struct L_Rcha *rcha; /*!< temp data used for array of best chars */ 176 }; 177 typedef struct L_Recog L_RECOG; 178 179 /*! 180 * Data returned from correlation matching on a single character 181 */ 182 struct L_Rch { 183 l_int32 index; /*!< index of best template */ 184 l_float32 score; /*!< correlation score of best template */ 185 char *text; /*!< character string of best template */ 186 l_int32 sample; /*!< index of best sample (within the best */ 187 /*!< template class, if all samples are used) */ 188 l_int32 xloc; /*!< x-location of template (delx + shiftx) */ 189 l_int32 yloc; /*!< y-location of template (dely + shifty) */ 190 l_int32 width; /*!< width of best template */ 191 }; 192 typedef struct L_Rch L_RCH; 193 194 /*! 195 * Data returned from correlation matching on an array of characters 196 */ 197 struct L_Rcha { 198 struct Numa *naindex; /*!< indices of best templates */ 199 struct Numa *nascore; /*!< correlation scores of best templates */ 200 struct Sarray *satext; /*!< character strings of best templates */ 201 struct Numa *nasample; /*!< indices of best samples */ 202 struct Numa *naxloc; /*!< x-locations of templates (delx + shiftx) */ 203 struct Numa *nayloc; /*!< y-locations of templates (dely + shifty) */ 204 struct Numa *nawidth; /*!< widths of best templates */ 205 }; 206 typedef struct L_Rcha L_RCHA; 207 208 /*! 209 * Data used for decoding a line of characters. 210 */ 211 struct L_Rdid { 212 struct Pix *pixs; /*!< clone of pix to be decoded */ 213 l_int32 **counta; /*!< count array for each averaged template */ 214 l_int32 **delya; /*!< best y-shift array per average template */ 215 l_int32 narray; /*!< number of averaged templates */ 216 l_int32 size; /*!< size of count array (width of pixs) */ 217 l_int32 *setwidth; /*!< setwidths for each template */ 218 struct Numa *nasum; /*!< pixel count in pixs by column */ 219 struct Numa *namoment; /*!< first moment of pixels in pixs by cols */ 220 l_int32 fullarrays; /*!< 1 if full arrays are made; 0 otherwise */ 221 l_float32 *beta; /*!< channel coeffs for template fg term */ 222 l_float32 *gamma; /*!< channel coeffs for bit-and term */ 223 l_float32 *trellisscore; /*!< score on trellis */ 224 l_int32 *trellistempl; /*!< template on trellis (for backtrack) */ 225 struct Numa *natempl; /*!< indices of best path templates */ 226 struct Numa *naxloc; /*!< x locations of best path templates */ 227 struct Numa *nadely; /*!< y locations of best path templates */ 228 struct Numa *nawidth; /*!< widths of best path templates */ 229 struct Boxa *boxa; /*!< Viterbi result for splitting input pixs */ 230 struct Numa *nascore; /*!< correlation scores: best path templates */ 231 struct Numa *natempl_r; /*!< indices of best rescored templates */ 232 struct Numa *nasample_r; /*!< samples of best scored templates */ 233 struct Numa *naxloc_r; /*!< x locations of best rescoredtemplates */ 234 struct Numa *nadely_r; /*!< y locations of best rescoredtemplates */ 235 struct Numa *nawidth_r; /*!< widths of best rescoredtemplates */ 236 struct Numa *nascore_r; /*!< correlation scores: rescored templates */ 237 }; 238 typedef struct L_Rdid L_RDID; 239 240 241 /*-------------------------------------------------------------------------* 242 * Flags for describing limited character sets * 243 *-------------------------------------------------------------------------*/ 244 /*! Flags for describing limited character sets */ 245 enum { 246 L_UNKNOWN = 0, /*!< character set type is not specified */ 247 L_ARABIC_NUMERALS = 1, /*!< 10 digits */ 248 L_LC_ROMAN_NUMERALS = 2, /*!< 7 lower-case letters (i,v,x,l,c,d,m) */ 249 L_UC_ROMAN_NUMERALS = 3, /*!< 7 upper-case letters (I,V,X,L,C,D,M) */ 250 L_LC_ALPHA = 4, /*!< 26 lower-case letters */ 251 L_UC_ALPHA = 5 /*!< 26 upper-case letters */ 252 }; 253 254 /*-------------------------------------------------------------------------* 255 * Flags for selecting between using average and all templates * 256 *-------------------------------------------------------------------------*/ 257 /*! Flags for selecting average or all templates: recog->templ_use */ 258 enum { 259 L_USE_ALL_TEMPLATES = 0, /*!< use all templates; default */ 260 L_USE_AVERAGE_TEMPLATES = 1 /*!< use average templates; special cases */ 261 }; 262 263 #endif /* LEPTONICA_RECOG_H */ 264