1 /********************************************************************** 2 * File: werd.h 3 * Description: Code for the WERD class. 4 * Author: Ray Smith 5 * 6 * (C) Copyright 1991, Hewlett-Packard Ltd. 7 ** Licensed under the Apache License, Version 2.0 (the "License"); 8 ** you may not use this file except in compliance with the License. 9 ** You may obtain a copy of the License at 10 ** http://www.apache.org/licenses/LICENSE-2.0 11 ** Unless required by applicable law or agreed to in writing, software 12 ** distributed under the License is distributed on an "AS IS" BASIS, 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 ** See the License for the specific language governing permissions and 15 ** limitations under the License. 16 * 17 **********************************************************************/ 18 19 #ifndef WERD_H 20 #define WERD_H 21 22 #include "elst2.h" 23 #include "params.h" 24 #include "stepblob.h" 25 26 #include <bitset> 27 28 namespace tesseract { 29 30 enum WERD_FLAGS { 31 W_SEGMENTED, ///< correctly segmented 32 W_ITALIC, ///< italic text 33 W_BOLD, ///< bold text 34 W_BOL, ///< start of line 35 W_EOL, ///< end of line 36 W_NORMALIZED, ///< flags 37 W_SCRIPT_HAS_XHEIGHT, ///< x-height concept makes sense. 38 W_SCRIPT_IS_LATIN, ///< Special case latin for y. splitting. 39 W_DONT_CHOP, ///< fixed pitch chopped 40 W_REP_CHAR, ///< repeated character 41 W_FUZZY_SP, ///< fuzzy space 42 W_FUZZY_NON, ///< fuzzy nonspace 43 W_INVERSE ///< white on black 44 }; 45 46 enum DISPLAY_FLAGS { 47 /* Display flags bit number allocations */ 48 DF_BOX, ///< Bounding box 49 DF_TEXT, ///< Correct ascii 50 DF_POLYGONAL, ///< Polyg approx 51 DF_EDGE_STEP, ///< Edge steps 52 DF_BN_POLYGONAL, ///< BL normalisd polyapx 53 DF_BLAMER ///< Blamer information 54 }; 55 56 class ROW; // forward decl 57 58 class TESS_API WERD : public ELIST2_LINK { 59 public: 60 WERD() = default; 61 // WERD constructed with: 62 // blob_list - blobs of the word (we take this list's contents) 63 // blanks - number of blanks before the word 64 // text - correct text (outlives WERD) 65 WERD(C_BLOB_LIST *blob_list, uint8_t blanks, const char *text); 66 67 // WERD constructed from: 68 // blob_list - blobs in the word 69 // clone - werd to clone flags, etc from. 70 WERD(C_BLOB_LIST *blob_list, WERD *clone); 71 72 // Construct a WERD from a single_blob and clone the flags from this. 73 // W_BOL and W_EOL flags are set according to the given values. 74 WERD *ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob); 75 76 ~WERD() = default; 77 78 // assignment 79 WERD &operator=(const WERD &source); 80 81 // This method returns a new werd constructed using the blobs in the input 82 // all_blobs list, which correspond to the blobs in this werd object. The 83 // blobs used to construct the new word are consumed and removed from the 84 // input all_blobs list. 85 // Returns nullptr if the word couldn't be constructed. 86 // Returns original blobs for which no matches were found in the output list 87 // orphan_blobs (appends). 88 WERD *ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs, C_BLOB_LIST *orphan_blobs); 89 90 // Accessors for reject / DUFF blobs in various formats rej_cblob_list()91 C_BLOB_LIST *rej_cblob_list() { // compact format 92 return &rej_cblobs; 93 } 94 95 // Accessors for good blobs in various formats. cblob_list()96 C_BLOB_LIST *cblob_list() { // get compact blobs 97 return &cblobs; 98 } 99 space()100 uint8_t space() const { // access function 101 return blanks; 102 } set_blanks(uint8_t new_blanks)103 void set_blanks(uint8_t new_blanks) { 104 blanks = new_blanks; 105 } script_id()106 int script_id() const { 107 return script_id_; 108 } set_script_id(int id)109 void set_script_id(int id) { 110 script_id_ = id; 111 } 112 113 // Returns the (default) bounding box including all the dots. 114 TBOX bounding_box() const; // compute bounding box 115 // Returns the bounding box including the desired combination of upper and 116 // lower noise/diacritic elements. 117 TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const; 118 // Returns the bounding box of only the good blobs. 119 TBOX true_bounding_box() const; 120 text()121 const char *text() const { 122 return correct.c_str(); 123 } set_text(const char * new_text)124 void set_text(const char *new_text) { 125 correct = new_text; 126 } 127 flag(WERD_FLAGS mask)128 bool flag(WERD_FLAGS mask) const { 129 return flags[mask]; 130 } set_flag(WERD_FLAGS mask,bool value)131 void set_flag(WERD_FLAGS mask, bool value) { 132 flags.set(mask, value); 133 } 134 display_flag(uint8_t flag)135 bool display_flag(uint8_t flag) const { 136 return disp_flags[flag]; 137 } set_display_flag(uint8_t flag,bool value)138 void set_display_flag(uint8_t flag, bool value) { 139 disp_flags.set(flag, value); 140 } 141 142 WERD *shallow_copy(); // shallow copy word 143 144 // reposition word by vector 145 void move(const ICOORD vec); 146 147 // join other's blobs onto this werd, emptying out other. 148 void join_on(WERD *other); 149 150 // copy other's blobs onto this word, leaving other intact. 151 void copy_on(WERD *other); 152 153 // tprintf word metadata (but not blob innards) 154 void print() const; 155 156 #ifndef GRAPHICS_DISABLED 157 // plot word on window in a uniform colour 158 void plot(ScrollView *window, ScrollView::Color colour); 159 160 // Get the next color in the (looping) rainbow. 161 static ScrollView::Color NextColor(ScrollView::Color colour); 162 163 // plot word on window in a rainbow of colours 164 void plot(ScrollView *window); 165 166 // plot rejected blobs in a rainbow of colours 167 void plot_rej_blobs(ScrollView *window); 168 #endif // !GRAPHICS_DISABLED 169 170 // Removes noise from the word by moving small outlines to the rej_cblobs 171 // list, based on the size_threshold. 172 void CleanNoise(float size_threshold); 173 174 // Extracts all the noise outlines and stuffs the pointers into the given 175 // vector of outlines. Afterwards, the outlines vector owns the pointers. 176 void GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines); 177 // Adds the selected outlines to the indcated real blobs, and puts the rest 178 // back in rej_cblobs where they came from. Where the target_blobs entry is 179 // nullptr, a run of wanted outlines is put into a single new blob. 180 // Ownership of the outlines is transferred back to the word. (Hence 181 // vector and not PointerVector.) 182 // Returns true if any new blob was added to the start of the word, which 183 // suggests that it might need joining to the word before it, and likewise 184 // sets make_next_word_fuzzy true if any new blob was added to the end. 185 bool AddSelectedOutlines(const std::vector<bool> &wanted, 186 const std::vector<C_BLOB *> &target_blobs, 187 const std::vector<C_OUTLINE *> &outlines, bool *make_next_word_fuzzy); 188 189 private: 190 uint8_t blanks = 0; // no of blanks 191 std::bitset<16> flags; // flags about word 192 std::bitset<16> disp_flags; // display flags 193 int16_t script_id_ = 0; // From unicharset. 194 std::string correct; // correct text 195 C_BLOB_LIST cblobs; // compacted blobs 196 C_BLOB_LIST rej_cblobs; // DUFF blobs 197 }; 198 199 ELIST2IZEH(WERD) 200 201 } // namespace tesseract 202 203 #include "ocrrow.h" // placed here due to 204 205 namespace tesseract { 206 207 // compare words by increasing order of left edge, suitable for qsort(3) 208 int word_comparator(const void *word1p, const void *word2p); 209 210 } // namespace tesseract 211 212 #endif 213