1 ///////////////////////////////////////////////////////////////////////
2 // File:        ambigs.h
3 // Description: Constants, flags, functions for dealing with
4 //              ambiguities (training and recognition).
5 // Author:      Daria Antonova
6 //
7 // (C) Copyright 2008, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ///////////////////////////////////////////////////////////////////////
19 
20 #ifndef TESSERACT_CCUTIL_AMBIGS_H_
21 #define TESSERACT_CCUTIL_AMBIGS_H_
22 
23 #ifdef HAVE_CONFIG_H
24 #  include "config_auto.h" // DISABLED_LEGACY_ENGINE
25 #endif
26 
27 #if !defined(DISABLED_LEGACY_ENGINE)
28 
29 #  include <tesseract/unichar.h>
30 #  include "elst.h"
31 #  include "tprintf.h"
32 #  include "unicharset.h"
33 
34 #  define MAX_AMBIG_SIZE 10
35 
36 namespace tesseract {
37 
38 using UnicharIdVector = std::vector<UNICHAR_ID>;
39 
// Classification of an (ocred ngram, correct ngram) pair.
// The numeric values are written out explicitly; they are the same values
// the enumerators would receive implicitly.
enum AmbigType {
  NOT_AMBIG = 0,      // the ngram pair is not ambiguous
  REPLACE_AMBIG = 1,  // ocred ngram should always be substituted with correct
  DEFINITE_AMBIG = 2, // add correct ngram to the classifier results (1-1)
  SIMILAR_AMBIG = 3,  // use pairwise classifier for ocred/correct pair (1-1)
  CASE_AMBIG = 4,     // this is a case ambiguity (1-1)

  // Number of ambiguity types listed above; must stay the last enumerator.
  AMBIG_TYPE_COUNT = CASE_AMBIG + 1
};
49 
50 // A collection of utility functions for arrays of UNICHAR_IDs that are
51 // terminated by INVALID_UNICHAR_ID.
52 class UnicharIdArrayUtils {
53 public:
54   // Compares two arrays of unichar ids. Returns -1 if the length of array1 is
55   // less than length of array2, if any array1[i] is less than array2[i].
56   // Returns 0 if the arrays are equal, 1 otherwise.
57   // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID.
compare(const UNICHAR_ID * ptr1,const UNICHAR_ID * ptr2)58   static inline int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2) {
59     for (;;) {
60       const UNICHAR_ID val1 = *ptr1++;
61       const UNICHAR_ID val2 = *ptr2++;
62       if (val1 != val2) {
63         if (val1 == INVALID_UNICHAR_ID) {
64           return -1;
65         }
66         if (val2 == INVALID_UNICHAR_ID) {
67           return 1;
68         }
69         if (val1 < val2) {
70           return -1;
71         }
72         return 1;
73       }
74       if (val1 == INVALID_UNICHAR_ID) {
75         return 0;
76       }
77     }
78   }
79 
80   // Copies UNICHAR_IDs from dst to src. Returns the number of ids copied.
81   // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID
82   // and that dst has enough space for all the elements from src.
copy(const UNICHAR_ID src[],UNICHAR_ID dst[])83   static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
84     int i = 0;
85     do {
86       dst[i] = src[i];
87     } while (dst[i++] != INVALID_UNICHAR_ID);
88     return i - 1;
89   }
90 
91   // Prints unichars corresponding to the unichar_ids in the given array.
92   // The function assumes that array is terminated by INVALID_UNICHAR_ID.
print(const UNICHAR_ID array[],const UNICHARSET & unicharset)93   static inline void print(const UNICHAR_ID array[], const UNICHARSET &unicharset) {
94     const UNICHAR_ID *ptr = array;
95     if (*ptr == INVALID_UNICHAR_ID) {
96       tprintf("[Empty]");
97     }
98     while (*ptr != INVALID_UNICHAR_ID) {
99       tprintf("%s ", unicharset.id_to_unichar(*ptr++));
100     }
101     tprintf("( ");
102     ptr = array;
103     while (*ptr != INVALID_UNICHAR_ID) {
104       tprintf("%d ", *ptr++);
105     }
106     tprintf(")\n");
107   }
108 };
109 
// An AmbigSpec_LIST stores a list of dangerous ambigs that
// start with the same unichar (e.g. r->t rn->m rr1->m).
112 class AmbigSpec : public ELIST_LINK {
113 public:
114   AmbigSpec();
115   ~AmbigSpec() = default;
116 
117   // Comparator function for sorting AmbigSpec_LISTs. The lists will
118   // be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors
119   // in a a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9, 8 1].
compare_ambig_specs(const void * spec1,const void * spec2)120   static int compare_ambig_specs(const void *spec1, const void *spec2) {
121     const AmbigSpec *s1 = *static_cast<const AmbigSpec *const *>(spec1);
122     const AmbigSpec *s2 = *static_cast<const AmbigSpec *const *>(spec2);
123     int result = UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
124     if (result != 0) {
125       return result;
126     }
127     return UnicharIdArrayUtils::compare(s1->correct_fragments, s2->correct_fragments);
128   }
129 
130   UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
131   UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1];
132   UNICHAR_ID correct_ngram_id;
133   AmbigType type;
134   int wrong_ngram_size;
135 };
136 ELISTIZEH(AmbigSpec)
137 
// Entry i of a UnicharAmbigsVector stores the list of ambiguities whose
// wrong ngram starts with unichar id i.
140 using UnicharAmbigsVector = std::vector<AmbigSpec_LIST *>;
141 
142 class UnicharAmbigs {
143 public:
144   UnicharAmbigs() = default;
~UnicharAmbigs()145   ~UnicharAmbigs() {
146     for (auto data : replace_ambigs_) {
147       delete data;
148     }
149     for (auto data : dang_ambigs_) {
150       delete data;
151     }
152     for (auto data : one_to_one_definite_ambigs_) {
153       delete data;
154     }
155   }
156 
dang_ambigs()157   const UnicharAmbigsVector &dang_ambigs() const {
158     return dang_ambigs_;
159   }
replace_ambigs()160   const UnicharAmbigsVector &replace_ambigs() const {
161     return replace_ambigs_;
162   }
163 
164   // Initializes the ambigs by adding a nullptr pointer to each table.
165   void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption);
166 
167   // Loads the universal ambigs that are useful for any language.
168   void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset);
169 
170   // Fills in two ambiguity tables (replaceable and dangerous) with information
171   // read from the ambigs file. An ambiguity table is an array of lists.
172   // The array is indexed by a class id. Each entry in the table provides
173   // a list of potential ambiguities which can start with the corresponding
174   // character. For example the ambiguity "rn -> m", would be located in the
175   // table at index of unicharset.unichar_to_id('r').
176   // In 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in
177   // one_to_one_definite_ambigs_. This vector is also indexed by the class id
178   // of the wrong part of the ambiguity and each entry contains a vector of
179   // unichar ids that are ambiguous to it.
180   // encoder_set is used to encode the ambiguity strings, undisturbed by new
181   // unichar_ids that may be created by adding the ambigs.
182   void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level,
183                          bool use_ambigs_for_adaption, UNICHARSET *unicharset);
184 
185   // Returns definite 1-1 ambigs for the given unichar id.
OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id)186   inline const UnicharIdVector *OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const {
187     if (one_to_one_definite_ambigs_.empty()) {
188       return nullptr;
189     }
190     return one_to_one_definite_ambigs_[unichar_id];
191   }
192 
193   // Returns a pointer to the vector with all unichar ids that appear in the
194   // 'correct' part of the ambiguity pair when the given unichar id appears
195   // in the 'wrong' part of the ambiguity. E.g. if DangAmbigs file consist of
196   // m->rn,rn->m,m->iii, UnicharAmbigsForAdaption() called with unichar id of
197   // m will return a pointer to a vector with unichar ids of r,n,i.
AmbigsForAdaption(UNICHAR_ID unichar_id)198   inline const UnicharIdVector *AmbigsForAdaption(UNICHAR_ID unichar_id) const {
199     if (ambigs_for_adaption_.empty()) {
200       return nullptr;
201     }
202     return ambigs_for_adaption_[unichar_id];
203   }
204 
205   // Similar to the above, but return the vector of unichar ids for which
206   // the given unichar_id is an ambiguity (appears in the 'wrong' part of
207   // some ambiguity pair).
ReverseAmbigsForAdaption(UNICHAR_ID unichar_id)208   inline const UnicharIdVector *ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const {
209     if (reverse_ambigs_for_adaption_.empty()) {
210       return nullptr;
211     }
212     return reverse_ambigs_for_adaption_[unichar_id];
213   }
214 
215 private:
216   bool ParseAmbiguityLine(int line_num, int version, int debug_level, const UNICHARSET &unicharset,
217                           char *buffer, int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids,
218                           int *replacement_ambig_part_size, char *replacement_string, int *type);
219   bool InsertIntoTable(UnicharAmbigsVector &table, int test_ambig_part_size,
220                        UNICHAR_ID *test_unichar_ids, int replacement_ambig_part_size,
221                        const char *replacement_string, int type, AmbigSpec *ambig_spec,
222                        UNICHARSET *unicharset);
223 
224   UnicharAmbigsVector dang_ambigs_;
225   UnicharAmbigsVector replace_ambigs_;
226   std::vector<UnicharIdVector *> one_to_one_definite_ambigs_;
227   std::vector<UnicharIdVector *> ambigs_for_adaption_;
228   std::vector<UnicharIdVector *> reverse_ambigs_for_adaption_;
229 };
230 
231 } // namespace tesseract
232 
233 #endif // !defined(DISABLED_LEGACY_ENGINE)
234 
235 #endif // TESSERACT_CCUTIL_AMBIGS_H_
236