1 /**********************************************************************
2  * File:        werd.h
3  * Description: Code for the WERD class.
4  * Author:      Ray Smith
5  *
6  * (C) Copyright 1991, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #ifndef WERD_H
20 #define WERD_H
21 
22 #include "elst2.h"
23 #include "params.h"
24 #include "stepblob.h"
25 
26 #include <bitset>
27 
28 namespace tesseract {
29 
/// Bit indices of the per-word flags stored in WERD (read with WERD::flag()
/// and written with WERD::set_flag()). The values are used directly as
/// std::bitset positions, so any new flag must stay below the bitset size.
enum WERD_FLAGS {
  W_SEGMENTED = 0,          ///< correctly segmented
  W_ITALIC = 1,             ///< italic text
  W_BOLD = 2,               ///< bold text
  W_BOL = 3,                ///< start of line
  W_EOL = 4,                ///< end of line
  W_NORMALIZED = 5,         ///< normalized word (TODO confirm: original comment said only "flags")
  W_SCRIPT_HAS_XHEIGHT = 6, ///< x-height concept makes sense.
  W_SCRIPT_IS_LATIN = 7,    ///< Special case latin for y. splitting.
  W_DONT_CHOP = 8,          ///< fixed pitch chopped
  W_REP_CHAR = 9,           ///< repeated character
  W_FUZZY_SP = 10,          ///< fuzzy space
  W_FUZZY_NON = 11,         ///< fuzzy nonspace
  W_INVERSE = 12            ///< white on black
};
45 
/// Bit indices of the display flags stored in WERD (read with
/// WERD::display_flag() and written with WERD::set_display_flag()).
enum DISPLAY_FLAGS {
  /* Display flags bit number allocations */
  DF_BOX = 0,          ///< Bounding box
  DF_TEXT = 1,         ///< Correct ASCII text
  DF_POLYGONAL = 2,    ///< Polygonal approximation
  DF_EDGE_STEP = 3,    ///< Edge steps
  DF_BN_POLYGONAL = 4, ///< Baseline-normalised polygonal approximation
  DF_BLAMER = 5        ///< Blamer information
};
55 
56 class ROW; // forward decl
57 
58 class TESS_API WERD : public ELIST2_LINK {
59 public:
60   WERD() = default;
61   // WERD constructed with:
62   //   blob_list - blobs of the word (we take this list's contents)
63   //   blanks - number of blanks before the word
64   //   text - correct text (outlives WERD)
65   WERD(C_BLOB_LIST *blob_list, uint8_t blanks, const char *text);
66 
67   // WERD constructed from:
68   //   blob_list - blobs in the word
69   //   clone - werd to clone flags, etc from.
70   WERD(C_BLOB_LIST *blob_list, WERD *clone);
71 
72   // Construct a WERD from a single_blob and clone the flags from this.
73   // W_BOL and W_EOL flags are set according to the given values.
74   WERD *ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob);
75 
76   ~WERD() = default;
77 
78   // assignment
79   WERD &operator=(const WERD &source);
80 
81   // This method returns a new werd constructed using the blobs in the input
82   // all_blobs list, which correspond to the blobs in this werd object. The
83   // blobs used to construct the new word are consumed and removed from the
84   // input all_blobs list.
85   // Returns nullptr if the word couldn't be constructed.
86   // Returns original blobs for which no matches were found in the output list
87   // orphan_blobs (appends).
88   WERD *ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs, C_BLOB_LIST *orphan_blobs);
89 
90   // Accessors for reject / DUFF blobs in various formats
rej_cblob_list()91   C_BLOB_LIST *rej_cblob_list() { // compact format
92     return &rej_cblobs;
93   }
94 
95   // Accessors for good blobs in various formats.
cblob_list()96   C_BLOB_LIST *cblob_list() { // get compact blobs
97     return &cblobs;
98   }
99 
space()100   uint8_t space() const { // access function
101     return blanks;
102   }
set_blanks(uint8_t new_blanks)103   void set_blanks(uint8_t new_blanks) {
104     blanks = new_blanks;
105   }
script_id()106   int script_id() const {
107     return script_id_;
108   }
set_script_id(int id)109   void set_script_id(int id) {
110     script_id_ = id;
111   }
112 
113   // Returns the (default) bounding box including all the dots.
114   TBOX bounding_box() const; // compute bounding box
115   // Returns the bounding box including the desired combination of upper and
116   // lower noise/diacritic elements.
117   TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;
118   // Returns the bounding box of only the good blobs.
119   TBOX true_bounding_box() const;
120 
text()121   const char *text() const {
122     return correct.c_str();
123   }
set_text(const char * new_text)124   void set_text(const char *new_text) {
125     correct = new_text;
126   }
127 
flag(WERD_FLAGS mask)128   bool flag(WERD_FLAGS mask) const {
129     return flags[mask];
130   }
set_flag(WERD_FLAGS mask,bool value)131   void set_flag(WERD_FLAGS mask, bool value) {
132     flags.set(mask, value);
133   }
134 
display_flag(uint8_t flag)135   bool display_flag(uint8_t flag) const {
136     return disp_flags[flag];
137   }
set_display_flag(uint8_t flag,bool value)138   void set_display_flag(uint8_t flag, bool value) {
139     disp_flags.set(flag, value);
140   }
141 
142   WERD *shallow_copy(); // shallow copy word
143 
144   // reposition word by vector
145   void move(const ICOORD vec);
146 
147   // join other's blobs onto this werd, emptying out other.
148   void join_on(WERD *other);
149 
150   // copy other's blobs onto this word, leaving other intact.
151   void copy_on(WERD *other);
152 
153   // tprintf word metadata (but not blob innards)
154   void print() const;
155 
156 #ifndef GRAPHICS_DISABLED
157   // plot word on window in a uniform colour
158   void plot(ScrollView *window, ScrollView::Color colour);
159 
160   // Get the next color in the (looping) rainbow.
161   static ScrollView::Color NextColor(ScrollView::Color colour);
162 
163   // plot word on window in a rainbow of colours
164   void plot(ScrollView *window);
165 
166   // plot rejected blobs in a rainbow of colours
167   void plot_rej_blobs(ScrollView *window);
168 #endif // !GRAPHICS_DISABLED
169 
170   // Removes noise from the word by moving small outlines to the rej_cblobs
171   // list, based on the size_threshold.
172   void CleanNoise(float size_threshold);
173 
174   // Extracts all the noise outlines and stuffs the pointers into the given
175   // vector of outlines. Afterwards, the outlines vector owns the pointers.
176   void GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines);
177   // Adds the selected outlines to the indcated real blobs, and puts the rest
178   // back in rej_cblobs where they came from. Where the target_blobs entry is
179   // nullptr, a run of wanted outlines is put into a single new blob.
180   // Ownership of the outlines is transferred back to the word. (Hence
181   // vector and not PointerVector.)
182   // Returns true if any new blob was added to the start of the word, which
183   // suggests that it might need joining to the word before it, and likewise
184   // sets make_next_word_fuzzy true if any new blob was added to the end.
185   bool AddSelectedOutlines(const std::vector<bool> &wanted,
186                            const std::vector<C_BLOB *> &target_blobs,
187                            const std::vector<C_OUTLINE *> &outlines, bool *make_next_word_fuzzy);
188 
189 private:
190   uint8_t blanks = 0;     // no of blanks
191   std::bitset<16> flags;  // flags about word
192   std::bitset<16> disp_flags; // display flags
193   int16_t script_id_ = 0; // From unicharset.
194   std::string correct;    // correct text
195   C_BLOB_LIST cblobs;     // compacted blobs
196   C_BLOB_LIST rej_cblobs; // DUFF blobs
197 };
198 
199 ELIST2IZEH(WERD)
200 
201 } // namespace tesseract
202 
203 #include "ocrrow.h" // placed here due to
204 
205 namespace tesseract {
206 
// Comparison callback with the qsort(3) signature that orders words by
// increasing left edge.
// NOTE(review): the definition is not visible in this header; word1p and
// word2p are presumably pointers to the array elements being sorted -
// confirm the element type against the implementation.
int word_comparator(const void *word1p, const void *word2p);
209 
210 } // namespace tesseract
211 
212 #endif
213