1 /////////////////////////////////////////////////////////////////////// 2 // File: ambigs.h 3 // Description: Constants, flags, functions for dealing with 4 // ambiguities (training and recognition). 5 // Author: Daria Antonova 6 // 7 // (C) Copyright 2008, Google Inc. 8 // Licensed under the Apache License, Version 2.0 (the "License"); 9 // you may not use this file except in compliance with the License. 10 // You may obtain a copy of the License at 11 // http://www.apache.org/licenses/LICENSE-2.0 12 // Unless required by applicable law or agreed to in writing, software 13 // distributed under the License is distributed on an "AS IS" BASIS, 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 // See the License for the specific language governing permissions and 16 // limitations under the License. 17 // 18 /////////////////////////////////////////////////////////////////////// 19 20 #ifndef TESSERACT_CCUTIL_AMBIGS_H_ 21 #define TESSERACT_CCUTIL_AMBIGS_H_ 22 23 #ifdef HAVE_CONFIG_H 24 # include "config_auto.h" // DISABLED_LEGACY_ENGINE 25 #endif 26 27 #if !defined(DISABLED_LEGACY_ENGINE) 28 29 # include <tesseract/unichar.h> 30 # include "elst.h" 31 # include "tprintf.h" 32 # include "unicharset.h" 33 34 # define MAX_AMBIG_SIZE 10 35 36 namespace tesseract { 37 38 using UnicharIdVector = std::vector<UNICHAR_ID>; 39 40 enum AmbigType { 41 NOT_AMBIG, // the ngram pair is not ambiguous 42 REPLACE_AMBIG, // ocred ngram should always be substituted with correct 43 DEFINITE_AMBIG, // add correct ngram to the classifier results (1-1) 44 SIMILAR_AMBIG, // use pairwise classifier for ocred/correct pair (1-1) 45 CASE_AMBIG, // this is a case ambiguity (1-1) 46 47 AMBIG_TYPE_COUNT // number of enum entries 48 }; 49 50 // A collection of utility functions for arrays of UNICHAR_IDs that are 51 // terminated by INVALID_UNICHAR_ID. 52 class UnicharIdArrayUtils { 53 public: 54 // Compares two arrays of unichar ids. Returns -1 if the length of array1 is 55 // less than length of array2, if any array1[i] is less than array2[i]. 56 // Returns 0 if the arrays are equal, 1 otherwise. 57 // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID. compare(const UNICHAR_ID * ptr1,const UNICHAR_ID * ptr2)58 static inline int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2) { 59 for (;;) { 60 const UNICHAR_ID val1 = *ptr1++; 61 const UNICHAR_ID val2 = *ptr2++; 62 if (val1 != val2) { 63 if (val1 == INVALID_UNICHAR_ID) { 64 return -1; 65 } 66 if (val2 == INVALID_UNICHAR_ID) { 67 return 1; 68 } 69 if (val1 < val2) { 70 return -1; 71 } 72 return 1; 73 } 74 if (val1 == INVALID_UNICHAR_ID) { 75 return 0; 76 } 77 } 78 } 79 80 // Copies UNICHAR_IDs from dst to src. Returns the number of ids copied. 81 // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID 82 // and that dst has enough space for all the elements from src. copy(const UNICHAR_ID src[],UNICHAR_ID dst[])83 static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) { 84 int i = 0; 85 do { 86 dst[i] = src[i]; 87 } while (dst[i++] != INVALID_UNICHAR_ID); 88 return i - 1; 89 } 90 91 // Prints unichars corresponding to the unichar_ids in the given array. 92 // The function assumes that array is terminated by INVALID_UNICHAR_ID. print(const UNICHAR_ID array[],const UNICHARSET & unicharset)93 static inline void print(const UNICHAR_ID array[], const UNICHARSET &unicharset) { 94 const UNICHAR_ID *ptr = array; 95 if (*ptr == INVALID_UNICHAR_ID) { 96 tprintf("[Empty]"); 97 } 98 while (*ptr != INVALID_UNICHAR_ID) { 99 tprintf("%s ", unicharset.id_to_unichar(*ptr++)); 100 } 101 tprintf("( "); 102 ptr = array; 103 while (*ptr != INVALID_UNICHAR_ID) { 104 tprintf("%d ", *ptr++); 105 } 106 tprintf(")\n"); 107 } 108 }; 109 110 // AMBIG_SPEC_LIST stores a list of dangerous ambigs that 111 // start with the same unichar (e.g. r->t rn->m rr1->m). 112 class AmbigSpec : public ELIST_LINK { 113 public: 114 AmbigSpec(); 115 ~AmbigSpec() = default; 116 117 // Comparator function for sorting AmbigSpec_LISTs. The lists will 118 // be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors 119 // in a a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9, 8 1]. compare_ambig_specs(const void * spec1,const void * spec2)120 static int compare_ambig_specs(const void *spec1, const void *spec2) { 121 const AmbigSpec *s1 = *static_cast<const AmbigSpec *const *>(spec1); 122 const AmbigSpec *s2 = *static_cast<const AmbigSpec *const *>(spec2); 123 int result = UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram); 124 if (result != 0) { 125 return result; 126 } 127 return UnicharIdArrayUtils::compare(s1->correct_fragments, s2->correct_fragments); 128 } 129 130 UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1]; 131 UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1]; 132 UNICHAR_ID correct_ngram_id; 133 AmbigType type; 134 int wrong_ngram_size; 135 }; 136 ELISTIZEH(AmbigSpec) 137 138 // AMBIG_TABLE[i] stores a set of ambiguities whose 139 // wrong ngram starts with unichar id i. 140 using UnicharAmbigsVector = std::vector<AmbigSpec_LIST *>; 141 142 class UnicharAmbigs { 143 public: 144 UnicharAmbigs() = default; ~UnicharAmbigs()145 ~UnicharAmbigs() { 146 for (auto data : replace_ambigs_) { 147 delete data; 148 } 149 for (auto data : dang_ambigs_) { 150 delete data; 151 } 152 for (auto data : one_to_one_definite_ambigs_) { 153 delete data; 154 } 155 } 156 dang_ambigs()157 const UnicharAmbigsVector &dang_ambigs() const { 158 return dang_ambigs_; 159 } replace_ambigs()160 const UnicharAmbigsVector &replace_ambigs() const { 161 return replace_ambigs_; 162 } 163 164 // Initializes the ambigs by adding a nullptr pointer to each table. 165 void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption); 166 167 // Loads the universal ambigs that are useful for any language. 168 void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset); 169 170 // Fills in two ambiguity tables (replaceable and dangerous) with information 171 // read from the ambigs file. An ambiguity table is an array of lists. 172 // The array is indexed by a class id. Each entry in the table provides 173 // a list of potential ambiguities which can start with the corresponding 174 // character. For example the ambiguity "rn -> m", would be located in the 175 // table at index of unicharset.unichar_to_id('r'). 176 // In 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in 177 // one_to_one_definite_ambigs_. This vector is also indexed by the class id 178 // of the wrong part of the ambiguity and each entry contains a vector of 179 // unichar ids that are ambiguous to it. 180 // encoder_set is used to encode the ambiguity strings, undisturbed by new 181 // unichar_ids that may be created by adding the ambigs. 182 void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, 183 bool use_ambigs_for_adaption, UNICHARSET *unicharset); 184 185 // Returns definite 1-1 ambigs for the given unichar id. OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id)186 inline const UnicharIdVector *OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const { 187 if (one_to_one_definite_ambigs_.empty()) { 188 return nullptr; 189 } 190 return one_to_one_definite_ambigs_[unichar_id]; 191 } 192 193 // Returns a pointer to the vector with all unichar ids that appear in the 194 // 'correct' part of the ambiguity pair when the given unichar id appears 195 // in the 'wrong' part of the ambiguity. E.g. if DangAmbigs file consist of 196 // m->rn,rn->m,m->iii, UnicharAmbigsForAdaption() called with unichar id of 197 // m will return a pointer to a vector with unichar ids of r,n,i. AmbigsForAdaption(UNICHAR_ID unichar_id)198 inline const UnicharIdVector *AmbigsForAdaption(UNICHAR_ID unichar_id) const { 199 if (ambigs_for_adaption_.empty()) { 200 return nullptr; 201 } 202 return ambigs_for_adaption_[unichar_id]; 203 } 204 205 // Similar to the above, but return the vector of unichar ids for which 206 // the given unichar_id is an ambiguity (appears in the 'wrong' part of 207 // some ambiguity pair). ReverseAmbigsForAdaption(UNICHAR_ID unichar_id)208 inline const UnicharIdVector *ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const { 209 if (reverse_ambigs_for_adaption_.empty()) { 210 return nullptr; 211 } 212 return reverse_ambigs_for_adaption_[unichar_id]; 213 } 214 215 private: 216 bool ParseAmbiguityLine(int line_num, int version, int debug_level, const UNICHARSET &unicharset, 217 char *buffer, int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids, 218 int *replacement_ambig_part_size, char *replacement_string, int *type); 219 bool InsertIntoTable(UnicharAmbigsVector &table, int test_ambig_part_size, 220 UNICHAR_ID *test_unichar_ids, int replacement_ambig_part_size, 221 const char *replacement_string, int type, AmbigSpec *ambig_spec, 222 UNICHARSET *unicharset); 223 224 UnicharAmbigsVector dang_ambigs_; 225 UnicharAmbigsVector replace_ambigs_; 226 std::vector<UnicharIdVector *> one_to_one_definite_ambigs_; 227 std::vector<UnicharIdVector *> ambigs_for_adaption_; 228 std::vector<UnicharIdVector *> reverse_ambigs_for_adaption_; 229 }; 230 231 } // namespace tesseract 232 233 #endif // !defined(DISABLED_LEGACY_ENGINE) 234 235 #endif // TESSERACT_CCUTIL_AMBIGS_H_ 236