/**********************************************************************
 * File:        rejctmap.h  (Formerly rejmap.h)
 * Description: REJ and REJMAP class functions.
 * Author:      Phil Cheatle
 *
 * (C) Copyright 1994, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *

This module may look unnecessarily verbose, but here's the philosophy...

ALL processing of the reject map is done in this module. There are lots of
separate calls to set reject/accept flags. These have DELIBERATELY been kept
distinct so that this module can decide what to do.

Basically, there is a flag for each sort of rejection or acceptance. This
provides a history of what has happened to EACH character.

Determining whether a character is CURRENTLY rejected depends on implicit
understanding of the SEQUENCE of possible calls. The flags are defined and
grouped in the REJ_FLAGS enum. These groupings are used in determining a
character's CURRENT rejection status. Basically, a character is ACCEPTED if

    none of the permanent rej flags are set
  AND (    the character has never been rejected
        OR an accept flag is set which is LATER than the latest reject flag )

IT IS FUNDAMENTAL THAT ANYONE HACKING THIS CODE UNDERSTANDS THE SIGNIFICANCE
OF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!!
38 **********************************************************************/ 39 40 #ifndef REJCTMAP_H 41 #define REJCTMAP_H 42 43 #include "errcode.h" 44 #include "params.h" 45 46 #include <bitset> 47 #include <memory> 48 49 namespace tesseract { 50 51 enum REJ_FLAGS { 52 /* Reject modes which are NEVER overridden */ 53 R_TESS_FAILURE, // PERM Tess didn't classify 54 R_SMALL_XHT, // PERM Xht too small 55 R_EDGE_CHAR, // PERM Too close to edge of image 56 R_1IL_CONFLICT, // PERM 1Il confusion 57 R_POSTNN_1IL, // PERM 1Il unrejected by NN 58 R_REJ_CBLOB, // PERM Odd blob 59 R_MM_REJECT, // PERM Matrix match rejection (m's) 60 R_BAD_REPETITION, // TEMP Repeated char which doesn't match trend 61 62 /* Initial reject modes (pre NN_ACCEPT) */ 63 R_POOR_MATCH, // TEMP Ray's original heuristic (Not used) 64 R_NOT_TESS_ACCEPTED, // TEMP Tess didn't accept WERD 65 R_CONTAINS_BLANKS, // TEMP Tess failed on other chs in WERD 66 R_BAD_PERMUTER, // POTENTIAL Bad permuter for WERD 67 68 /* Reject modes generated after NN_ACCEPT but before MM_ACCEPT */ 69 R_HYPHEN, // TEMP Post NN dodgy hyphen or full stop 70 R_DUBIOUS, // TEMP Post NN dodgy chars 71 R_NO_ALPHANUMS, // TEMP No alphanumerics in word after NN 72 R_MOSTLY_REJ, // TEMP Most of word rejected so rej the rest 73 R_XHT_FIXUP, // TEMP Xht tests unsure 74 75 /* Reject modes generated after MM_ACCEPT but before QUALITY_ACCEPT */ 76 R_BAD_QUALITY, // TEMP Quality metrics bad for WERD 77 78 /* Reject modes generated after QUALITY_ACCEPT but before MINIMAL_REJ accep*/ 79 R_DOC_REJ, // TEMP Document rejection 80 R_BLOCK_REJ, // TEMP Block rejection 81 R_ROW_REJ, // TEMP Row rejection 82 R_UNLV_REJ, // TEMP ~ turned to - or ^ turned to space 83 84 /* Accept modes which occur between the above rejection groups */ 85 R_NN_ACCEPT, // NN acceptance 86 R_HYPHEN_ACCEPT, // Hyphen acceptance 87 R_MM_ACCEPT, // Matrix match acceptance 88 R_QUALITY_ACCEPT, // Accept word in good quality doc 89 R_MINIMAL_REJ_ACCEPT // Accept EVERYTHING 
except tess failures 90 }; 91 92 /* REJECT MAP VALUES */ 93 94 #define MAP_ACCEPT '1' 95 #define MAP_REJECT_PERM '0' 96 #define MAP_REJECT_TEMP '2' 97 #define MAP_REJECT_POTENTIAL '3' 98 99 class REJ { 100 std::bitset<32> flags; 101 set_flag(REJ_FLAGS rej_flag)102 void set_flag(REJ_FLAGS rej_flag) { 103 flags.set(rej_flag); 104 } 105 106 public: 107 REJ() = default; 108 REJ(const REJ & source)109 REJ( // classwise copy 110 const REJ &source) { 111 flags = source.flags; 112 } 113 114 REJ &operator=( // assign REJ 115 const REJ &source) = default; 116 flag(REJ_FLAGS rej_flag)117 bool flag(REJ_FLAGS rej_flag) const { 118 return flags[rej_flag]; 119 } 120 display_char()121 char display_char() const { 122 if (perm_rejected()) { 123 return MAP_REJECT_PERM; 124 } else if (accept_if_good_quality()) { 125 return MAP_REJECT_POTENTIAL; 126 } else if (rejected()) { 127 return MAP_REJECT_TEMP; 128 } else { 129 return MAP_ACCEPT; 130 } 131 } 132 perm_rejected()133 bool perm_rejected() const { // Is char perm reject? 
134 return (flag(R_TESS_FAILURE) || flag(R_SMALL_XHT) || flag(R_EDGE_CHAR) || 135 flag(R_1IL_CONFLICT) || flag(R_POSTNN_1IL) || flag(R_REJ_CBLOB) || 136 flag(R_BAD_REPETITION) || flag(R_MM_REJECT)); 137 } 138 139 private: rej_before_nn_accept()140 bool rej_before_nn_accept() const { 141 return flag(R_POOR_MATCH) || flag(R_NOT_TESS_ACCEPTED) || 142 flag(R_CONTAINS_BLANKS) || flag(R_BAD_PERMUTER); 143 } 144 rej_between_nn_and_mm()145 bool rej_between_nn_and_mm() const { 146 return flag(R_HYPHEN) || flag(R_DUBIOUS) || flag(R_NO_ALPHANUMS) || 147 flag(R_MOSTLY_REJ) || flag(R_XHT_FIXUP); 148 } 149 rej_between_mm_and_quality_accept()150 bool rej_between_mm_and_quality_accept() const { 151 return flag(R_BAD_QUALITY); 152 } 153 rej_between_quality_and_minimal_rej_accept()154 bool rej_between_quality_and_minimal_rej_accept() const { 155 return flag(R_DOC_REJ) || flag(R_BLOCK_REJ) || flag(R_ROW_REJ) || 156 flag(R_UNLV_REJ); 157 } 158 rej_before_mm_accept()159 bool rej_before_mm_accept() const { 160 return rej_between_nn_and_mm() || 161 (rej_before_nn_accept() && !flag(R_NN_ACCEPT) && 162 !flag(R_HYPHEN_ACCEPT)); 163 } 164 rej_before_quality_accept()165 bool rej_before_quality_accept() const { 166 return rej_between_mm_and_quality_accept() || 167 (!flag(R_MM_ACCEPT) && rej_before_mm_accept()); 168 } 169 170 public: rejected()171 bool rejected() const { // Is char rejected? 172 if (flag(R_MINIMAL_REJ_ACCEPT)) { 173 return false; 174 } else { 175 return (perm_rejected() || rej_between_quality_and_minimal_rej_accept() || 176 (!flag(R_QUALITY_ACCEPT) && rej_before_quality_accept())); 177 } 178 } 179 accept_if_good_quality()180 bool accept_if_good_quality() const { // potential rej? 
181 return (rejected() && !perm_rejected() && flag(R_BAD_PERMUTER) && 182 !flag(R_POOR_MATCH) && !flag(R_NOT_TESS_ACCEPTED) && 183 !flag(R_CONTAINS_BLANKS) && 184 (!rej_between_nn_and_mm() && !rej_between_mm_and_quality_accept() && 185 !rej_between_quality_and_minimal_rej_accept())); 186 } 187 setrej_tess_failure()188 void setrej_tess_failure() { // Tess generated blank 189 set_flag(R_TESS_FAILURE); 190 } 191 setrej_small_xht()192 void setrej_small_xht() { // Small xht char/wd 193 set_flag(R_SMALL_XHT); 194 } 195 setrej_edge_char()196 void setrej_edge_char() { // Close to image edge 197 set_flag(R_EDGE_CHAR); 198 } 199 setrej_1Il_conflict()200 void setrej_1Il_conflict() { // Initial reject map 201 set_flag(R_1IL_CONFLICT); 202 } 203 setrej_postNN_1Il()204 void setrej_postNN_1Il() { // 1Il after NN 205 set_flag(R_POSTNN_1IL); 206 } 207 setrej_rej_cblob()208 void setrej_rej_cblob() { // Insert duff blob 209 set_flag(R_REJ_CBLOB); 210 } 211 setrej_mm_reject()212 void setrej_mm_reject() { // Matrix matcher 213 set_flag(R_MM_REJECT); 214 } 215 setrej_bad_repetition()216 void setrej_bad_repetition() { // Odd repeated char 217 set_flag(R_BAD_REPETITION); 218 } 219 setrej_poor_match()220 void setrej_poor_match() { // Failed Rays heuristic 221 set_flag(R_POOR_MATCH); 222 } 223 setrej_not_tess_accepted()224 void setrej_not_tess_accepted() { 225 // TEMP reject_word 226 set_flag(R_NOT_TESS_ACCEPTED); 227 } 228 setrej_contains_blanks()229 void setrej_contains_blanks() { 230 // TEMP reject_word 231 set_flag(R_CONTAINS_BLANKS); 232 } 233 setrej_bad_permuter()234 void setrej_bad_permuter() { // POTENTIAL reject_word 235 set_flag(R_BAD_PERMUTER); 236 } 237 setrej_hyphen()238 void setrej_hyphen() { // PostNN dubious hyphen or . 
239 set_flag(R_HYPHEN); 240 } 241 setrej_dubious()242 void setrej_dubious() { // PostNN dubious limit 243 set_flag(R_DUBIOUS); 244 } 245 setrej_no_alphanums()246 void setrej_no_alphanums() { // TEMP reject_word 247 set_flag(R_NO_ALPHANUMS); 248 } 249 setrej_mostly_rej()250 void setrej_mostly_rej() { // TEMP reject_word 251 set_flag(R_MOSTLY_REJ); 252 } 253 setrej_xht_fixup()254 void setrej_xht_fixup() { // xht fixup 255 set_flag(R_XHT_FIXUP); 256 } 257 setrej_bad_quality()258 void setrej_bad_quality() { // TEMP reject_word 259 set_flag(R_BAD_QUALITY); 260 } 261 setrej_doc_rej()262 void setrej_doc_rej() { // TEMP reject_word 263 set_flag(R_DOC_REJ); 264 } 265 setrej_block_rej()266 void setrej_block_rej() { // TEMP reject_word 267 set_flag(R_BLOCK_REJ); 268 } 269 setrej_row_rej()270 void setrej_row_rej() { // TEMP reject_word 271 set_flag(R_ROW_REJ); 272 } 273 setrej_unlv_rej()274 void setrej_unlv_rej() { // TEMP reject_word 275 set_flag(R_UNLV_REJ); 276 } 277 setrej_hyphen_accept()278 void setrej_hyphen_accept() { // NN Flipped a char 279 set_flag(R_HYPHEN_ACCEPT); 280 } 281 setrej_nn_accept()282 void setrej_nn_accept() { // NN Flipped a char 283 set_flag(R_NN_ACCEPT); 284 } 285 setrej_mm_accept()286 void setrej_mm_accept() { // Matrix matcher 287 set_flag(R_MM_ACCEPT); 288 } 289 setrej_quality_accept()290 void setrej_quality_accept() { // Quality flip a char 291 set_flag(R_QUALITY_ACCEPT); 292 } 293 setrej_minimal_rej_accept()294 void setrej_minimal_rej_accept() { 295 // Accept all except blank 296 set_flag(R_MINIMAL_REJ_ACCEPT); 297 } 298 accepted()299 bool accepted() const { // Is char accepted? 
300 return !rejected(); 301 } 302 recoverable()303 bool recoverable() const { 304 return (rejected() && !perm_rejected()); 305 } 306 307 void full_print(FILE *fp) const; 308 }; 309 310 class REJMAP { 311 std::unique_ptr<REJ[]> ptr; // ptr to the chars 312 uint16_t len = 0; // Number of chars 313 314 public: 315 REJMAP() = default; 316 REJMAP(const REJMAP & rejmap)317 REJMAP(const REJMAP &rejmap) { 318 *this = rejmap; 319 } 320 321 REJMAP &operator=(const REJMAP &source); 322 323 // Sets up the ptr array to length, whatever it was before. 324 void initialise(uint16_t length); 325 326 REJ &operator[]( // access function 327 uint16_t index) const // map index 328 { 329 ASSERT_HOST(index < len); 330 return ptr[index]; // no bounds checks 331 } 332 length()333 uint16_t length() const { // map length 334 return len; 335 } 336 337 int16_t accept_count() const; // How many accepted? 338 reject_count()339 int16_t reject_count() const { // How many rejects? 340 return len - accept_count(); 341 } 342 343 // Cut out an element. 344 void remove_pos(uint16_t pos); 345 346 void print(FILE *fp) const; 347 348 void full_print(FILE *fp) const; 349 350 bool recoverable_rejects() const; // Any non perm rejs? 351 352 bool quality_recoverable_rejects() const; 353 // Any potential rejs? 
354 355 void rej_word_small_xht(); // Reject whole word 356 // Reject whole word 357 void rej_word_tess_failure(); 358 void rej_word_not_tess_accepted(); 359 // Reject whole word 360 // Reject whole word 361 void rej_word_contains_blanks(); 362 // Reject whole word 363 void rej_word_bad_permuter(); 364 void rej_word_xht_fixup(); // Reject whole word 365 // Reject whole word 366 void rej_word_no_alphanums(); 367 void rej_word_mostly_rej(); // Reject whole word 368 void rej_word_bad_quality(); // Reject whole word 369 void rej_word_doc_rej(); // Reject whole word 370 void rej_word_block_rej(); // Reject whole word 371 void rej_word_row_rej(); // Reject whole word 372 }; 373 374 } // namespace tesseract 375 376 #endif 377