1 /**********************************************************************
2 * File: tfacepp.cpp (Formerly tface++.c)
3 * Description: C++ side of the C/C++ Tess/Editor interface.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1992, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 #include <cmath>
20
21 #include "blamer.h"
22 #include "errcode.h"
23 #include "ratngs.h"
24 #include "reject.h"
25 #include "tesseractclass.h"
26 #include "werd.h"
27
// Largest number of chopped blobs that recog_word_recursive() passes to the
// recognizer in one piece; a word with more blobs than this is split first.
// Declared as a typed constant instead of a macro so that it is scoped and
// type-checked like any other name. The value is unchanged.
constexpr int MAX_UNDIVIDED_LENGTH = 24;
29
30 /**********************************************************************
31 * recog_word
32 *
33 * Convert the word to tess form and pass it to the tess segmenter.
34 * Convert the output back to editor form.
35 **********************************************************************/
36 namespace tesseract {
recog_word(WERD_RES * word)37 void Tesseract::recog_word(WERD_RES *word) {
38 if (wordrec_skip_no_truth_words &&
39 (word->blamer_bundle == nullptr ||
40 word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH)) {
41 if (classify_debug_level) {
42 tprintf("No truth for word - skipping\n");
43 }
44 word->tess_failed = true;
45 return;
46 }
47 ASSERT_HOST(!word->chopped_word->blobs.empty());
48 recog_word_recursive(word);
49 word->SetupBoxWord();
50 ASSERT_HOST(static_cast<unsigned>(word->best_choice->length()) == word->box_word->length());
51 // Check that the ratings matrix size matches the sum of all the
52 // segmentation states.
53 if (!word->StatesAllValid()) {
54 tprintf("Not all words have valid states relative to ratings matrix!!");
55 word->DebugWordChoices(true, nullptr);
56 ASSERT_HOST(word->StatesAllValid());
57 }
58 if (tessedit_override_permuter) {
59 /* Override the permuter type if a straight dictionary check disagrees. */
60 uint8_t perm_type = word->best_choice->permuter();
61 if ((perm_type != SYSTEM_DAWG_PERM) && (perm_type != FREQ_DAWG_PERM) &&
62 (perm_type != USER_DAWG_PERM)) {
63 uint8_t real_dict_perm_type = dict_word(*word->best_choice);
64 if (((real_dict_perm_type == SYSTEM_DAWG_PERM) || (real_dict_perm_type == FREQ_DAWG_PERM) ||
65 (real_dict_perm_type == USER_DAWG_PERM)) &&
66 (alpha_count(word->best_choice->unichar_string().c_str(),
67 word->best_choice->unichar_lengths().c_str()) > 0)) {
68 word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
69 }
70 }
71 if (tessedit_rejection_debug && perm_type != word->best_choice->permuter()) {
72 tprintf("Permuter Type Flipped from %d to %d\n", perm_type, word->best_choice->permuter());
73 }
74 }
75 // Factored out from control.cpp
76 ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr));
77 if (word->best_choice == nullptr || word->best_choice->empty() ||
78 strspn(word->best_choice->unichar_string().c_str(), " ") ==
79 word->best_choice->length()) {
80 word->tess_failed = true;
81 word->reject_map.initialise(word->box_word->length());
82 word->reject_map.rej_word_tess_failure();
83 } else {
84 word->tess_failed = false;
85 }
86 }
87
88 /**********************************************************************
89 * recog_word_recursive
90 *
91 * Convert the word to tess form and pass it to the tess segmenter.
92 * Convert the output back to editor form.
93 **********************************************************************/
recog_word_recursive(WERD_RES * word)94 void Tesseract::recog_word_recursive(WERD_RES *word) {
95 auto word_length = word->chopped_word->NumBlobs(); // no of blobs
96 if (word_length > MAX_UNDIVIDED_LENGTH) {
97 return split_and_recog_word(word);
98 }
99 cc_recog(word);
100 word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
101
102 // Do sanity checks and minor fixes on best_choice.
103 if (word->best_choice->length() > word_length) {
104 word->best_choice->make_bad(); // should never happen
105 tprintf(
106 "recog_word: Discarded long string \"%s\""
107 " (%d characters vs %d blobs)\n",
108 word->best_choice->unichar_string().c_str(), word->best_choice->length(), word_length);
109 tprintf("Word is at:");
110 word->word->bounding_box().print();
111 }
112 if (word->best_choice->length() < word_length) {
113 UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
114 while (word->best_choice->length() < word_length) {
115 word->best_choice->append_unichar_id(space_id, 1, 0.0, word->best_choice->certainty());
116 }
117 }
118 }
119
120 /**********************************************************************
121 * split_and_recog_word
122 *
123 * Split the word into 2 smaller pieces at the largest gap.
124 * Recognize the pieces and stick the results back together.
125 **********************************************************************/
split_and_recog_word(WERD_RES * word)126 void Tesseract::split_and_recog_word(WERD_RES *word) {
127 // Find the biggest blob gap in the chopped_word.
128 int bestgap = -INT32_MAX;
129 int split_index = 0;
130 for (unsigned b = 1; b < word->chopped_word->NumBlobs(); ++b) {
131 TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
132 TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
133 int gap = blob_box.left() - prev_box.right();
134 if (gap > bestgap) {
135 bestgap = gap;
136 split_index = b;
137 }
138 }
139 ASSERT_HOST(split_index > 0);
140
141 WERD_RES *word2 = nullptr;
142 BlamerBundle *orig_bb = nullptr;
143 split_word(word, split_index, &word2, &orig_bb);
144
145 // Recognize the first part of the word.
146 recog_word_recursive(word);
147 // Recognize the second part of the word.
148 recog_word_recursive(word2);
149
150 join_words(word, word2, orig_bb);
151 }
152
153 /**********************************************************************
154 * split_word
155 *
156 * Split a given WERD_RES in place into two smaller words for recognition.
157 * split_pt is the index of the first blob to go in the second word.
158 * The underlying word is left alone, only the TWERD (and subsequent data)
159 * are split up. orig_blamer_bundle is set to the original blamer bundle,
160 * and will now be owned by the caller. New blamer bundles are forged for the
161 * two pieces.
162 **********************************************************************/
split_word(WERD_RES * word,unsigned split_pt,WERD_RES ** right_piece,BlamerBundle ** orig_blamer_bundle) const163 void Tesseract::split_word(WERD_RES *word, unsigned split_pt, WERD_RES **right_piece,
164 BlamerBundle **orig_blamer_bundle) const {
165 ASSERT_HOST(split_pt > 0 && split_pt < word->chopped_word->NumBlobs());
166
167 // Save a copy of the blamer bundle so we can try to reconstruct it below.
168 BlamerBundle *orig_bb = word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : nullptr;
169
170 auto *word2 = new WERD_RES(*word);
171
172 // blow away the copied chopped_word, as we want to work with
173 // the blobs from the input chopped_word so seam_arrays can be merged.
174 TWERD *chopped = word->chopped_word;
175 auto *chopped2 = new TWERD;
176 chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
177 for (auto i = split_pt; i < chopped->NumBlobs(); ++i) {
178 chopped2->blobs.push_back(chopped->blobs[i]);
179 }
180 chopped->blobs.resize(split_pt);
181 word->chopped_word = nullptr;
182 delete word2->chopped_word;
183 word2->chopped_word = nullptr;
184
185 const UNICHARSET &unicharset = *word->uch_set;
186 word->ClearResults();
187 word2->ClearResults();
188 word->chopped_word = chopped;
189 word2->chopped_word = chopped2;
190 word->SetupBasicsFromChoppedWord(unicharset);
191 word2->SetupBasicsFromChoppedWord(unicharset);
192
193 // Try to adjust the blamer bundle.
194 if (orig_bb != nullptr) {
195 // TODO(rays) Looks like a leak to me.
196 // orig_bb should take, rather than copy.
197 word->blamer_bundle = new BlamerBundle();
198 word2->blamer_bundle = new BlamerBundle();
199 orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
200 word2->chopped_word->blobs[0]->bounding_box().left(), wordrec_debug_blamer,
201 word->blamer_bundle, word2->blamer_bundle);
202 }
203
204 *right_piece = word2;
205 *orig_blamer_bundle = orig_bb;
206 }
207
208 /**********************************************************************
209 * join_words
210 *
211 * The opposite of split_word():
212 * join word2 (including any recognized data / seam array / etc)
213 * onto the right of word and then delete word2.
214 * Also, if orig_bb is provided, stitch it back into word.
215 **********************************************************************/
join_words(WERD_RES * word,WERD_RES * word2,BlamerBundle * orig_bb) const216 void Tesseract::join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const {
217 TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
218 TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
219 // Tack the word2 outputs onto the end of the word outputs.
220 word->chopped_word->blobs.insert(word->chopped_word->blobs.end(), word2->chopped_word->blobs.begin(), word2->chopped_word->blobs.end());
221 word->rebuild_word->blobs.insert(word->rebuild_word->blobs.end(), word2->rebuild_word->blobs.begin(), word2->rebuild_word->blobs.end());
222 word2->chopped_word->blobs.clear();
223 word2->rebuild_word->blobs.clear();
224 TPOINT split_pt;
225 split_pt.x = (prev_box.right() + blob_box.left()) / 2;
226 split_pt.y = (prev_box.top() + prev_box.bottom() + blob_box.top() + blob_box.bottom()) / 4;
227 // Move the word2 seams onto the end of the word1 seam_array.
228 // Since the seam list is one element short, an empty seam marking the
229 // end of the last blob in the first word is needed first.
230 word->seam_array.push_back(new SEAM(0.0f, split_pt));
231 word->seam_array.insert(word->seam_array.end(), word2->seam_array.begin(), word2->seam_array.end());
232 word2->seam_array.clear();
233 // Fix widths and gaps.
234 word->blob_widths.insert(word->blob_widths.end(), word2->blob_widths.begin(), word2->blob_widths.end());
235 word->blob_gaps.insert(word->blob_gaps.end(), word2->blob_gaps.begin(), word2->blob_gaps.end());
236 // Fix the ratings matrix.
237 int rat1 = word->ratings->dimension();
238 int rat2 = word2->ratings->dimension();
239 word->ratings->AttachOnCorner(word2->ratings);
240 ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
241 word->best_state.insert(word->best_state.end(), word2->best_state.begin(), word2->best_state.end());
242 // Append the word choices.
243 *word->raw_choice += *word2->raw_choice;
244
245 // How many alt choices from each should we try to get?
246 const int kAltsPerPiece = 2;
247 // When do we start throwing away extra alt choices?
248 const int kTooManyAltChoices = 100;
249
250 // Construct the cartesian product of the best_choices of word(1) and word2.
251 WERD_CHOICE_LIST joined_choices;
252 WERD_CHOICE_IT jc_it(&joined_choices);
253 WERD_CHOICE_IT bc1_it(&word->best_choices);
254 WERD_CHOICE_IT bc2_it(&word2->best_choices);
255 int num_word1_choices = word->best_choices.length();
256 int total_joined_choices = num_word1_choices;
257 // Nota Bene: For the main loop here, we operate only on the 2nd and greater
258 // word2 choices, and put them in the joined_choices list. The 1st word2
259 // choice gets added to the original word1 choices in-place after we have
260 // finished with them.
261 int bc2_index = 1;
262 for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
263 if (total_joined_choices >= kTooManyAltChoices && bc2_index > kAltsPerPiece) {
264 break;
265 }
266 int bc1_index = 0;
267 for (bc1_it.move_to_first(); bc1_index < num_word1_choices; ++bc1_index, bc1_it.forward()) {
268 if (total_joined_choices >= kTooManyAltChoices && bc1_index > kAltsPerPiece) {
269 break;
270 }
271 auto *wc = new WERD_CHOICE(*bc1_it.data());
272 *wc += *bc2_it.data();
273 jc_it.add_after_then_move(wc);
274 ++total_joined_choices;
275 }
276 }
277 // Now that we've filled in as many alternates as we want, paste the best
278 // choice for word2 onto the original word alt_choices.
279 bc1_it.move_to_first();
280 bc2_it.move_to_first();
281 for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
282 *bc1_it.data() += *bc2_it.data();
283 }
284 bc1_it.move_to_last();
285 bc1_it.add_list_after(&joined_choices);
286
287 // Restore the pointer to original blamer bundle and combine blamer
288 // information recorded in the splits.
289 if (orig_bb != nullptr) {
290 orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle, wordrec_debug_blamer);
291 delete word->blamer_bundle;
292 word->blamer_bundle = orig_bb;
293 }
294 word->SetupBoxWord();
295 word->reject_map.initialise(word->box_word->length());
296 delete word2;
297 }
298
299 } // namespace tesseract
300