1 /**********************************************************************
2  * File:        rejctmap.h  (Formerly rejmap.h)
3  * Description: REJ and REJMAP class functions.
4  * Author:    Phil Cheatle
5  *
6  * (C) Copyright 1994, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17 
18 This module may look unnecessarily verbose, but here's the philosophy...
19 
20 ALL processing of the reject map is done in this module. There are lots of
21 separate calls to set reject/accept flags. These have DELIBERATELY been kept
22 distinct so that this module can decide what to do.
23 
24 Basically, there is a flag for each sort of rejection or acceptance. This
25 provides a history of what has happened to EACH character.
26 
27 Determining whether a character is CURRENTLY rejected depends on implicit
28 understanding of the SEQUENCE of possible calls. The flags are defined and
grouped in the REJ_FLAGS enum. These groupings are used in determining a
character's CURRENT rejection status. Basically, a character is ACCEPTED if
31 
32     none of the permanent rej flags are set
33   AND (    the character has never been rejected
34       OR an accept flag is set which is LATER than the latest reject flag )
35 
36 IT IS FUNDAMENTAL THAT ANYONE HACKING THIS CODE UNDERSTANDS THE SIGNIFICANCE
37 OF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!!
38 **********************************************************************/
39 
40 #ifndef REJCTMAP_H
41 #define REJCTMAP_H
42 
43 #include "errcode.h"
44 #include "params.h"
45 
46 #include <bitset>
47 #include <memory>
48 
49 namespace tesseract {
50 
/// One enumerator per kind of rejection or acceptance event that can be
/// recorded for a character. REJ uses each value as a bit index into its flag
/// set, so the numeric values (0..26, in declaration order) must stay stable.
/// The groups below follow the processing sequence described in the file
/// header comment.
enum REJ_FLAGS {
  // ---- Reject modes which are NEVER overridden ----
  R_TESS_FAILURE = 0, ///< PERM Tess didn't classify
  R_SMALL_XHT,        ///< PERM Xht too small
  R_EDGE_CHAR,        ///< PERM Too close to edge of image
  R_1IL_CONFLICT,     ///< PERM 1Il confusion
  R_POSTNN_1IL,       ///< PERM 1Il unrejected by NN
  R_REJ_CBLOB,        ///< PERM Odd blob
  R_MM_REJECT,        ///< PERM Matrix match rejection (m's)
  R_BAD_REPETITION,   ///< Repeated char which doesn't match trend (historically
                      ///< labelled TEMP, but REJ::perm_rejected() tests it)

  // ---- Initial reject modes (pre NN_ACCEPT) ----
  R_POOR_MATCH,        ///< TEMP Ray's original heuristic (Not used)
  R_NOT_TESS_ACCEPTED, ///< TEMP Tess didn't accept WERD
  R_CONTAINS_BLANKS,   ///< TEMP Tess failed on other chs in WERD
  R_BAD_PERMUTER,      ///< POTENTIAL Bad permuter for WERD

  // ---- Reject modes generated after NN_ACCEPT but before MM_ACCEPT ----
  R_HYPHEN,       ///< TEMP Post NN dodgy hyphen or full stop
  R_DUBIOUS,      ///< TEMP Post NN dodgy chars
  R_NO_ALPHANUMS, ///< TEMP No alphanumerics in word after NN
  R_MOSTLY_REJ,   ///< TEMP Most of word rejected so rej the rest
  R_XHT_FIXUP,    ///< TEMP Xht tests unsure

  // ---- Reject modes generated after MM_ACCEPT but before QUALITY_ACCEPT ----
  R_BAD_QUALITY, ///< TEMP Quality metrics bad for WERD

  // ---- Reject modes generated after QUALITY_ACCEPT but before
  //      MINIMAL_REJ_ACCEPT ----
  R_DOC_REJ,   ///< TEMP Document rejection
  R_BLOCK_REJ, ///< TEMP Block rejection
  R_ROW_REJ,   ///< TEMP Row rejection
  R_UNLV_REJ,  ///< TEMP ~ turned to - or ^ turned to space

  // ---- Accept modes which occur between the above rejection groups ----
  R_NN_ACCEPT,          ///< NN acceptance
  R_HYPHEN_ACCEPT,      ///< Hyphen acceptance
  R_MM_ACCEPT,          ///< Matrix match acceptance
  R_QUALITY_ACCEPT,     ///< Accept word in good quality doc
  R_MINIMAL_REJ_ACCEPT, ///< Accept EVERYTHING except tess failures
};
91 
/* REJECT MAP VALUES */
// One-character status codes, as returned by REJ::display_char().

// Returned when none of the reject states below applies.
#define MAP_ACCEPT '1'
// Returned when REJ::perm_rejected() is true (checked first).
#define MAP_REJECT_PERM '0'
// Returned when REJ::rejected() is true but neither of the other two reject
// codes applies.
#define MAP_REJECT_TEMP '2'
// Returned when REJ::accept_if_good_quality() is true (and not perm rejected).
#define MAP_REJECT_POTENTIAL '3'
98 
99 class REJ {
100   std::bitset<32> flags;
101 
set_flag(REJ_FLAGS rej_flag)102   void set_flag(REJ_FLAGS rej_flag) {
103     flags.set(rej_flag);
104   }
105 
106 public:
107   REJ() = default;
108 
REJ(const REJ & source)109   REJ( // classwise copy
110       const REJ &source) {
111     flags = source.flags;
112   }
113 
114   REJ &operator=( // assign REJ
115       const REJ &source) = default;
116 
flag(REJ_FLAGS rej_flag)117   bool flag(REJ_FLAGS rej_flag) const {
118     return flags[rej_flag];
119   }
120 
display_char()121   char display_char() const {
122     if (perm_rejected()) {
123       return MAP_REJECT_PERM;
124     } else if (accept_if_good_quality()) {
125       return MAP_REJECT_POTENTIAL;
126     } else if (rejected()) {
127       return MAP_REJECT_TEMP;
128     } else {
129       return MAP_ACCEPT;
130     }
131   }
132 
perm_rejected()133   bool perm_rejected() const { // Is char perm reject?
134     return (flag(R_TESS_FAILURE) || flag(R_SMALL_XHT) || flag(R_EDGE_CHAR) ||
135             flag(R_1IL_CONFLICT) || flag(R_POSTNN_1IL) || flag(R_REJ_CBLOB) ||
136             flag(R_BAD_REPETITION) || flag(R_MM_REJECT));
137   }
138 
139 private:
rej_before_nn_accept()140   bool rej_before_nn_accept() const {
141     return flag(R_POOR_MATCH) || flag(R_NOT_TESS_ACCEPTED) ||
142            flag(R_CONTAINS_BLANKS) || flag(R_BAD_PERMUTER);
143   }
144 
rej_between_nn_and_mm()145   bool rej_between_nn_and_mm() const {
146     return flag(R_HYPHEN) || flag(R_DUBIOUS) || flag(R_NO_ALPHANUMS) ||
147            flag(R_MOSTLY_REJ) || flag(R_XHT_FIXUP);
148   }
149 
rej_between_mm_and_quality_accept()150   bool rej_between_mm_and_quality_accept() const {
151     return flag(R_BAD_QUALITY);
152   }
153 
rej_between_quality_and_minimal_rej_accept()154   bool rej_between_quality_and_minimal_rej_accept() const {
155     return flag(R_DOC_REJ) || flag(R_BLOCK_REJ) || flag(R_ROW_REJ) ||
156            flag(R_UNLV_REJ);
157   }
158 
rej_before_mm_accept()159   bool rej_before_mm_accept() const {
160     return rej_between_nn_and_mm() ||
161            (rej_before_nn_accept() && !flag(R_NN_ACCEPT) &&
162             !flag(R_HYPHEN_ACCEPT));
163   }
164 
rej_before_quality_accept()165   bool rej_before_quality_accept() const {
166     return rej_between_mm_and_quality_accept() ||
167            (!flag(R_MM_ACCEPT) && rej_before_mm_accept());
168   }
169 
170 public:
rejected()171   bool rejected() const { // Is char rejected?
172     if (flag(R_MINIMAL_REJ_ACCEPT)) {
173       return false;
174     } else {
175       return (perm_rejected() || rej_between_quality_and_minimal_rej_accept() ||
176               (!flag(R_QUALITY_ACCEPT) && rej_before_quality_accept()));
177     }
178   }
179 
accept_if_good_quality()180   bool accept_if_good_quality() const { // potential rej?
181     return (rejected() && !perm_rejected() && flag(R_BAD_PERMUTER) &&
182             !flag(R_POOR_MATCH) && !flag(R_NOT_TESS_ACCEPTED) &&
183             !flag(R_CONTAINS_BLANKS) &&
184             (!rej_between_nn_and_mm() && !rej_between_mm_and_quality_accept() &&
185              !rej_between_quality_and_minimal_rej_accept()));
186   }
187 
setrej_tess_failure()188   void setrej_tess_failure() { // Tess generated blank
189     set_flag(R_TESS_FAILURE);
190   }
191 
setrej_small_xht()192   void setrej_small_xht() { // Small xht char/wd
193     set_flag(R_SMALL_XHT);
194   }
195 
setrej_edge_char()196   void setrej_edge_char() { // Close to image edge
197     set_flag(R_EDGE_CHAR);
198   }
199 
setrej_1Il_conflict()200   void setrej_1Il_conflict() { // Initial reject map
201     set_flag(R_1IL_CONFLICT);
202   }
203 
setrej_postNN_1Il()204   void setrej_postNN_1Il() { // 1Il after NN
205     set_flag(R_POSTNN_1IL);
206   }
207 
setrej_rej_cblob()208   void setrej_rej_cblob() { // Insert duff blob
209     set_flag(R_REJ_CBLOB);
210   }
211 
setrej_mm_reject()212   void setrej_mm_reject() { // Matrix matcher
213     set_flag(R_MM_REJECT);
214   }
215 
setrej_bad_repetition()216   void setrej_bad_repetition() { // Odd repeated char
217     set_flag(R_BAD_REPETITION);
218   }
219 
setrej_poor_match()220   void setrej_poor_match() { // Failed Rays heuristic
221     set_flag(R_POOR_MATCH);
222   }
223 
setrej_not_tess_accepted()224   void setrej_not_tess_accepted() {
225     // TEMP reject_word
226     set_flag(R_NOT_TESS_ACCEPTED);
227   }
228 
setrej_contains_blanks()229   void setrej_contains_blanks() {
230     // TEMP reject_word
231     set_flag(R_CONTAINS_BLANKS);
232   }
233 
setrej_bad_permuter()234   void setrej_bad_permuter() { // POTENTIAL reject_word
235     set_flag(R_BAD_PERMUTER);
236   }
237 
setrej_hyphen()238   void setrej_hyphen() { // PostNN dubious hyphen or .
239     set_flag(R_HYPHEN);
240   }
241 
setrej_dubious()242   void setrej_dubious() { // PostNN dubious limit
243     set_flag(R_DUBIOUS);
244   }
245 
setrej_no_alphanums()246   void setrej_no_alphanums() { // TEMP reject_word
247     set_flag(R_NO_ALPHANUMS);
248   }
249 
setrej_mostly_rej()250   void setrej_mostly_rej() { // TEMP reject_word
251     set_flag(R_MOSTLY_REJ);
252   }
253 
setrej_xht_fixup()254   void setrej_xht_fixup() { // xht fixup
255     set_flag(R_XHT_FIXUP);
256   }
257 
setrej_bad_quality()258   void setrej_bad_quality() { // TEMP reject_word
259     set_flag(R_BAD_QUALITY);
260   }
261 
setrej_doc_rej()262   void setrej_doc_rej() { // TEMP reject_word
263     set_flag(R_DOC_REJ);
264   }
265 
setrej_block_rej()266   void setrej_block_rej() { // TEMP reject_word
267     set_flag(R_BLOCK_REJ);
268   }
269 
setrej_row_rej()270   void setrej_row_rej() { // TEMP reject_word
271     set_flag(R_ROW_REJ);
272   }
273 
setrej_unlv_rej()274   void setrej_unlv_rej() { // TEMP reject_word
275     set_flag(R_UNLV_REJ);
276   }
277 
setrej_hyphen_accept()278   void setrej_hyphen_accept() { // NN Flipped a char
279     set_flag(R_HYPHEN_ACCEPT);
280   }
281 
setrej_nn_accept()282   void setrej_nn_accept() { // NN Flipped a char
283     set_flag(R_NN_ACCEPT);
284   }
285 
setrej_mm_accept()286   void setrej_mm_accept() { // Matrix matcher
287     set_flag(R_MM_ACCEPT);
288   }
289 
setrej_quality_accept()290   void setrej_quality_accept() { // Quality flip a char
291     set_flag(R_QUALITY_ACCEPT);
292   }
293 
setrej_minimal_rej_accept()294   void setrej_minimal_rej_accept() {
295     // Accept all except blank
296     set_flag(R_MINIMAL_REJ_ACCEPT);
297   }
298 
accepted()299   bool accepted() const { // Is char accepted?
300     return !rejected();
301   }
302 
recoverable()303   bool recoverable() const {
304     return (rejected() && !perm_rejected());
305   }
306 
307   void full_print(FILE *fp) const;
308 };
309 
310 class REJMAP {
311   std::unique_ptr<REJ[]> ptr; // ptr to the chars
312   uint16_t len = 0;           // Number of chars
313 
314 public:
315   REJMAP() = default;
316 
REJMAP(const REJMAP & rejmap)317   REJMAP(const REJMAP &rejmap) {
318     *this = rejmap;
319   }
320 
321   REJMAP &operator=(const REJMAP &source);
322 
323   // Sets up the ptr array to length, whatever it was before.
324   void initialise(uint16_t length);
325 
326   REJ &operator[](         // access function
327       uint16_t index) const // map index
328   {
329     ASSERT_HOST(index < len);
330     return ptr[index]; // no bounds checks
331   }
332 
length()333   uint16_t length() const { // map length
334     return len;
335   }
336 
337   int16_t accept_count() const; // How many accepted?
338 
reject_count()339   int16_t reject_count() const { // How many rejects?
340     return len - accept_count();
341   }
342 
343   // Cut out an element.
344   void remove_pos(uint16_t pos);
345 
346   void print(FILE *fp) const;
347 
348   void full_print(FILE *fp) const;
349 
350   bool recoverable_rejects() const; // Any non perm rejs?
351 
352   bool quality_recoverable_rejects() const;
353   // Any potential rejs?
354 
355   void rej_word_small_xht(); // Reject whole word
356                              // Reject whole word
357   void rej_word_tess_failure();
358   void rej_word_not_tess_accepted();
359   // Reject whole word
360   // Reject whole word
361   void rej_word_contains_blanks();
362   // Reject whole word
363   void rej_word_bad_permuter();
364   void rej_word_xht_fixup(); // Reject whole word
365                              // Reject whole word
366   void rej_word_no_alphanums();
367   void rej_word_mostly_rej();  // Reject whole word
368   void rej_word_bad_quality(); // Reject whole word
369   void rej_word_doc_rej();     // Reject whole word
370   void rej_word_block_rej();   // Reject whole word
371   void rej_word_row_rej();     // Reject whole word
372 };
373 
374 } // namespace tesseract
375 
376 #endif
377