1 /////////////////////////////////////////////////////////////////////// 2 // File: unicharset.h 3 // Description: Unicode character/ligature set class. 4 // Author: Thomas Kielbus 5 // 6 // (C) Copyright 2006, Google Inc. 7 // Licensed under the Apache License, Version 2.0 (the "License"); 8 // you may not use this file except in compliance with the License. 9 // You may obtain a copy of the License at 10 // http://www.apache.org/licenses/LICENSE-2.0 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 // 17 /////////////////////////////////////////////////////////////////////// 18 19 #ifndef TESSERACT_CCUTIL_UNICHARSET_H_ 20 #define TESSERACT_CCUTIL_UNICHARSET_H_ 21 22 #include "errcode.h" 23 #include "unicharmap.h" 24 25 #include <tesseract/unichar.h> 26 #include "helpers.h" 27 #include "serialis.h" 28 29 #include <functional> // for std::function 30 31 namespace tesseract { 32 33 // Enum holding special values of unichar_id. Every unicharset has these. 34 // Warning! Keep in sync with kSpecialUnicharCodes. 35 enum SpecialUnicharCodes { 36 UNICHAR_SPACE, 37 UNICHAR_JOINED, 38 UNICHAR_BROKEN, 39 40 SPECIAL_UNICHAR_CODES_COUNT 41 }; 42 43 // Boolean flag for unichar_insert. It's a bit of a double negative to allow 44 // the default value to be false. 45 enum class OldUncleanUnichars { 46 kFalse, 47 kTrue, 48 }; 49 50 class TESS_API CHAR_FRAGMENT { 51 public: 52 // Minimum number of characters used for fragment representation. 53 static const int kMinLen = 6; 54 // Maximum number of characters used for fragment representation. 55 static const int kMaxLen = 3 + UNICHAR_LEN + 2; 56 // Maximum number of fragments per character. 
57 static const int kMaxChunks = 5; 58 59 // Setters and Getters. set_all(const char * unichar,int pos,int total,bool natural)60 inline void set_all(const char *unichar, int pos, int total, bool natural) { 61 set_unichar(unichar); 62 set_pos(pos); 63 set_total(total); 64 set_natural(natural); 65 } set_unichar(const char * uch)66 inline void set_unichar(const char *uch) { 67 strncpy(this->unichar, uch, sizeof(this->unichar)); 68 this->unichar[UNICHAR_LEN] = '\0'; 69 } set_pos(int p)70 inline void set_pos(int p) { 71 this->pos = p; 72 } set_total(int t)73 inline void set_total(int t) { 74 this->total = t; 75 } get_unichar()76 inline const char *get_unichar() const { 77 return this->unichar; 78 } get_pos()79 inline int get_pos() const { 80 return this->pos; 81 } get_total()82 inline int get_total() const { 83 return this->total; 84 } 85 86 // Returns the string that represents a fragment 87 // with the given unichar, pos and total. 88 static std::string to_string(const char *unichar, int pos, int total, 89 bool natural); 90 // Returns the string that represents this fragment. to_string()91 std::string to_string() const { 92 return to_string(unichar, pos, total, natural); 93 } 94 95 // Checks whether a fragment has the same unichar, 96 // position and total as the given inputs. equals(const char * other_unichar,int other_pos,int other_total)97 inline bool equals(const char *other_unichar, int other_pos, 98 int other_total) const { 99 return (strcmp(this->unichar, other_unichar) == 0 && 100 this->pos == other_pos && this->total == other_total); 101 } equals(const CHAR_FRAGMENT * other)102 inline bool equals(const CHAR_FRAGMENT *other) const { 103 return this->equals(other->get_unichar(), other->get_pos(), 104 other->get_total()); 105 } 106 107 // Checks whether a given fragment is a continuation of this fragment. 108 // Assumes that the given fragment pointer is not nullptr. 
is_continuation_of(const CHAR_FRAGMENT * fragment)109 inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const { 110 return (strcmp(this->unichar, fragment->get_unichar()) == 0 && 111 this->total == fragment->get_total() && 112 this->pos == fragment->get_pos() + 1); 113 } 114 115 // Returns true if this fragment is a beginning fragment. is_beginning()116 inline bool is_beginning() const { 117 return this->pos == 0; 118 } 119 120 // Returns true if this fragment is an ending fragment. is_ending()121 inline bool is_ending() const { 122 return this->pos == this->total - 1; 123 } 124 125 // Returns true if the fragment was a separate component to begin with, 126 // ie did not need chopping to be isolated, but may have been separated 127 // out from a multi-outline blob. is_natural()128 inline bool is_natural() const { 129 return natural; 130 } set_natural(bool value)131 void set_natural(bool value) { 132 natural = value; 133 } 134 135 // Parses the string to see whether it represents a character fragment 136 // (rather than a regular character). If so, allocates memory for a new 137 // CHAR_FRAGMENT instance and fills it in with the corresponding fragment 138 // information. Fragments are of the form: 139 // |m|1|2, meaning chunk 1 of 2 of character m, or 140 // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed 141 // to divide the parts, as they were already separate connected components. 142 // 143 // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT 144 // instance, otherwise (if the string does not represent a fragment or it 145 // looks like it does, but parsing it as a fragment fails) returns nullptr. 146 // 147 // Note: The caller is responsible for deallocating memory 148 // associated with the returned pointer. 
149 static CHAR_FRAGMENT *parse_from_string(const char *str); 150 151 private: 152 char unichar[UNICHAR_LEN + 1]; 153 // True if the fragment was a separate component to begin with, 154 // ie did not need chopping to be isolated, but may have been separated 155 // out from a multi-outline blob. 156 bool natural; 157 int16_t pos; // fragment position in the character 158 int16_t total; // total number of fragments in the character 159 }; 160 161 // The UNICHARSET class is an utility class for Tesseract that holds the 162 // set of characters that are used by the engine. Each character is identified 163 // by a unique number, from 0 to (size - 1). 164 class TESS_API UNICHARSET { 165 public: 166 // Custom list of characters and their ligature forms (UTF8) 167 // These map to unicode values in the private use area (PUC) and are supported 168 // by only few font families (eg. Wyld, Adobe Caslon Pro). 169 static const char *kCustomLigatures[][2]; 170 171 // List of strings for the SpecialUnicharCodes. Keep in sync with the enum. 
static const char *kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT];

// ICU 2.0 UCharDirection enum (from icu/include/unicode/uchar.h)
// Every enumerator is given an explicit value except U_CHAR_DIRECTION_COUNT,
// which is only declared when U_HIDE_DEPRECATED_API is not defined and then
// takes the value following U_POP_DIRECTIONAL_ISOLATE.
enum Direction {
  U_LEFT_TO_RIGHT = 0,
  U_RIGHT_TO_LEFT = 1,
  U_EUROPEAN_NUMBER = 2,
  U_EUROPEAN_NUMBER_SEPARATOR = 3,
  U_EUROPEAN_NUMBER_TERMINATOR = 4,
  U_ARABIC_NUMBER = 5,
  U_COMMON_NUMBER_SEPARATOR = 6,
  U_BLOCK_SEPARATOR = 7,
  U_SEGMENT_SEPARATOR = 8,
  U_WHITE_SPACE_NEUTRAL = 9,
  U_OTHER_NEUTRAL = 10,
  U_LEFT_TO_RIGHT_EMBEDDING = 11,
  U_LEFT_TO_RIGHT_OVERRIDE = 12,
  U_RIGHT_TO_LEFT_ARABIC = 13,
  U_RIGHT_TO_LEFT_EMBEDDING = 14,
  U_RIGHT_TO_LEFT_OVERRIDE = 15,
  U_POP_DIRECTIONAL_FORMAT = 16,
  U_DIR_NON_SPACING_MARK = 17,
  U_BOUNDARY_NEUTRAL = 18,
  U_FIRST_STRONG_ISOLATE = 19,
  U_LEFT_TO_RIGHT_ISOLATE = 20,
  U_RIGHT_TO_LEFT_ISOLATE = 21,
  U_POP_DIRECTIONAL_ISOLATE = 22,
#ifndef U_HIDE_DEPRECATED_API
  U_CHAR_DIRECTION_COUNT
#endif // U_HIDE_DEPRECATED_API
};

// Create an empty UNICHARSET
UNICHARSET();

// Destroys the UNICHARSET.
~UNICHARSET();

// Return the UNICHAR_ID of a given unichar representation within the
// UNICHARSET.
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const;

// Return the UNICHAR_ID of a given unichar representation within the
// UNICHARSET. Only the first length characters from unichar_repr are used.
UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const;

// Return the minimum number of bytes that matches a legal UNICHAR_ID,
// while leaving the rest of the string encodable. Returns 0 if the
// beginning of the string is not encodable.
// WARNING: this function now encodes the whole string for precision.
// Use encode_string in preference to repeatedly calling step.
int step(const char *str) const;

// Returns true if the given UTF-8 string is encodable with this UNICHARSET.
// If not encodable, write the first byte offset which cannot be converted
// into the second (return) argument.
bool encodable_string(const char *str, unsigned *first_bad_position) const;

// Encodes the given UTF-8 string with this UNICHARSET.
// If any part of the string cannot be encoded (because the utf8 can't
// be broken up into pieces that are in the unicharset) then:
//   if give_up_on_failure, stops and returns a partial encoding,
//   else continues and inserts an INVALID_UNICHAR_ID in the returned encoding.
// Returns true if the encoding succeeds completely, false if there is at
// least one failure.
// If lengths is not nullptr, then it is filled with the corresponding
// byte length of each encoded UNICHAR_ID.
// If encoded_length is not nullptr then on return it contains the length of
// str that was encoded. (if give_up_on_failure the location of the first
// failure, otherwise strlen(str).)
// WARNING: Caller must guarantee that str has already been cleaned of codes
// that do not belong in the unicharset, or encoding may fail.
// Use CleanupString to perform the cleaning.
bool encode_string(const char *str, bool give_up_on_failure,
                   std::vector<UNICHAR_ID> *encoding,
                   std::vector<char> *lengths,
                   unsigned *encoded_length) const;

// Return the unichar representation corresponding to the given UNICHAR_ID
// within the UNICHARSET.
const char *id_to_unichar(UNICHAR_ID id) const;

// Return the UTF8 representation corresponding to the given UNICHAR_ID after
// resolving any private encodings internal to Tesseract. This method is
// preferable to id_to_unichar for outputting text that will be visible to
// external applications.
const char *id_to_unichar_ext(UNICHAR_ID id) const;

// Return a string that reformats the utf8 str into the str followed
// by its hex unicodes.
261 static std::string debug_utf8_str(const char *str); 262 263 // Removes/replaces content that belongs in rendered text, but not in the 264 // unicharset. CleanupString(const char * utf8_str)265 static std::string CleanupString(const char *utf8_str) { 266 return CleanupString(utf8_str, strlen(utf8_str)); 267 } 268 static std::string CleanupString(const char *utf8_str, size_t length); 269 270 // Return a string containing debug information on the unichar, including 271 // the id_to_unichar, its hex unicodes and the properties. 272 std::string debug_str(UNICHAR_ID id) const; debug_str(const char * unichar_repr)273 std::string debug_str(const char *unichar_repr) const { 274 return debug_str(unichar_to_id(unichar_repr)); 275 } 276 277 // Adds a unichar representation to the set. If old_style is true, then 278 // TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL 279 // characters are ignored/skipped as if they don't exist and n-grams that 280 // can already be encoded are not added. 281 void unichar_insert(const char *const unichar_repr, 282 OldUncleanUnichars old_style); unichar_insert(const char * const unichar_repr)283 void unichar_insert(const char *const unichar_repr) { 284 unichar_insert(unichar_repr, OldUncleanUnichars::kFalse); 285 } 286 // Adds a unichar representation to the set. Avoids setting old_style to true, 287 // unless it is necessary to make the new unichar get added. 
unichar_insert_backwards_compatible(const char * const unichar_repr)288 void unichar_insert_backwards_compatible(const char *const unichar_repr) { 289 std::string cleaned = CleanupString(unichar_repr); 290 if (cleaned != unichar_repr) { 291 unichar_insert(unichar_repr, OldUncleanUnichars::kTrue); 292 } else { 293 auto old_size = size(); 294 unichar_insert(unichar_repr, OldUncleanUnichars::kFalse); 295 if (size() == old_size) { 296 unichar_insert(unichar_repr, OldUncleanUnichars::kTrue); 297 } 298 } 299 } 300 301 // Return true if the given unichar id exists within the set. 302 // Relies on the fact that unichar ids are contiguous in the unicharset. contains_unichar_id(UNICHAR_ID unichar_id)303 bool contains_unichar_id(UNICHAR_ID unichar_id) const { 304 return static_cast<size_t>(unichar_id) < unichars.size(); 305 } 306 307 // Return true if the given unichar representation exists within the set. 308 bool contains_unichar(const char *const unichar_repr) const; 309 bool contains_unichar(const char *const unichar_repr, int length) const; 310 311 // Return true if the given unichar representation corresponds to the given 312 // UNICHAR_ID within the set. 313 bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const; 314 315 // Delete CHAR_FRAGMENTs stored in properties of unichars array. delete_pointers_in_unichars()316 void delete_pointers_in_unichars() { 317 for (auto &unichar : unichars) { 318 delete unichar.properties.fragment; 319 unichar.properties.fragment = nullptr; 320 } 321 } 322 323 // Clear the UNICHARSET (all the previous data is lost). 
clear()324 void clear() { 325 if (script_table != nullptr) { 326 for (int i = 0; i < script_table_size_used; ++i) { 327 delete[] script_table[i]; 328 } 329 delete[] script_table; 330 script_table = nullptr; 331 script_table_size_used = 0; 332 } 333 script_table_size_reserved = 0; 334 delete_pointers_in_unichars(); 335 unichars.clear(); 336 ids.clear(); 337 top_bottom_set_ = false; 338 script_has_upper_lower_ = false; 339 script_has_xheight_ = false; 340 old_style_included_ = false; 341 null_sid_ = 0; 342 common_sid_ = 0; 343 latin_sid_ = 0; 344 cyrillic_sid_ = 0; 345 greek_sid_ = 0; 346 han_sid_ = 0; 347 hiragana_sid_ = 0; 348 katakana_sid_ = 0; 349 thai_sid_ = 0; 350 hangul_sid_ = 0; 351 default_sid_ = 0; 352 } 353 354 // Return the size of the set (the number of different UNICHAR it holds). size()355 size_t size() const { 356 return unichars.size(); 357 } 358 359 // Opens the file indicated by filename and saves unicharset to that file. 360 // Returns true if the operation is successful. save_to_file(const char * const filename)361 bool save_to_file(const char *const filename) const { 362 FILE *file = fopen(filename, "w+b"); 363 if (file == nullptr) { 364 return false; 365 } 366 bool result = save_to_file(file); 367 fclose(file); 368 return result; 369 } 370 371 // Saves the content of the UNICHARSET to the given file. 372 // Returns true if the operation is successful. save_to_file(FILE * file)373 bool save_to_file(FILE *file) const { 374 std::string str; 375 return save_to_string(str) && 376 tesseract::Serialize(file, &str[0], str.length()); 377 } 378 save_to_file(tesseract::TFile * file)379 bool save_to_file(tesseract::TFile *file) const { 380 std::string str; 381 return save_to_string(str) && file->Serialize(&str[0], str.length()); 382 } 383 384 // Saves the content of the UNICHARSET to the given string. 385 // Returns true if the operation is successful. 
386 bool save_to_string(std::string &str) const; 387 388 // Opens the file indicated by filename and loads the UNICHARSET 389 // from the given file. The previous data is lost. 390 // Returns true if the operation is successful. load_from_file(const char * const filename,bool skip_fragments)391 bool load_from_file(const char *const filename, bool skip_fragments) { 392 FILE *file = fopen(filename, "rb"); 393 if (file == nullptr) { 394 return false; 395 } 396 bool result = load_from_file(file, skip_fragments); 397 fclose(file); 398 return result; 399 } 400 // returns true if the operation is successful. load_from_file(const char * const filename)401 bool load_from_file(const char *const filename) { 402 return load_from_file(filename, false); 403 } 404 405 // Loads the UNICHARSET from the given file. The previous data is lost. 406 // Returns true if the operation is successful. 407 bool load_from_file(FILE *file, bool skip_fragments); load_from_file(FILE * file)408 bool load_from_file(FILE *file) { 409 return load_from_file(file, false); 410 } 411 bool load_from_file(tesseract::TFile *file, bool skip_fragments); 412 413 // Sets up internal data after loading the file, based on the char 414 // properties. Called from load_from_file, but also needs to be run 415 // during set_unicharset_properties. 416 void post_load_setup(); 417 418 // Returns true if right_to_left scripts are significant in the unicharset, 419 // but without being so sensitive that "universal" unicharsets containing 420 // characters from many scripts, like orientation and script detection, 421 // look like they are right_to_left. 422 bool major_right_to_left() const; 423 424 // Set a whitelist and/or blacklist of characters to recognize. 425 // An empty or nullptr whitelist enables everything (minus any blacklist). 426 // An empty or nullptr blacklist disables nothing. 427 // An empty or nullptr unblacklist has no effect. 428 // The blacklist overrides the whitelist. 
429 // The unblacklist overrides the blacklist. 430 // Each list is a string of utf8 character strings. Boundaries between 431 // unicharset units are worked out automatically, and characters not in 432 // the unicharset are silently ignored. 433 void set_black_and_whitelist(const char *blacklist, const char *whitelist, 434 const char *unblacklist); 435 436 // Set the isalpha property of the given unichar to the given value. set_isalpha(UNICHAR_ID unichar_id,bool value)437 void set_isalpha(UNICHAR_ID unichar_id, bool value) { 438 unichars[unichar_id].properties.isalpha = value; 439 } 440 441 // Set the islower property of the given unichar to the given value. set_islower(UNICHAR_ID unichar_id,bool value)442 void set_islower(UNICHAR_ID unichar_id, bool value) { 443 unichars[unichar_id].properties.islower = value; 444 } 445 446 // Set the isupper property of the given unichar to the given value. set_isupper(UNICHAR_ID unichar_id,bool value)447 void set_isupper(UNICHAR_ID unichar_id, bool value) { 448 unichars[unichar_id].properties.isupper = value; 449 } 450 451 // Set the isdigit property of the given unichar to the given value. set_isdigit(UNICHAR_ID unichar_id,bool value)452 void set_isdigit(UNICHAR_ID unichar_id, bool value) { 453 unichars[unichar_id].properties.isdigit = value; 454 } 455 456 // Set the ispunctuation property of the given unichar to the given value. set_ispunctuation(UNICHAR_ID unichar_id,bool value)457 void set_ispunctuation(UNICHAR_ID unichar_id, bool value) { 458 unichars[unichar_id].properties.ispunctuation = value; 459 } 460 461 // Set the isngram property of the given unichar to the given value. set_isngram(UNICHAR_ID unichar_id,bool value)462 void set_isngram(UNICHAR_ID unichar_id, bool value) { 463 unichars[unichar_id].properties.isngram = value; 464 } 465 466 // Set the script name of the given unichar to the given value. 
467 // Value is copied and thus can be a temporary; set_script(UNICHAR_ID unichar_id,const char * value)468 void set_script(UNICHAR_ID unichar_id, const char *value) { 469 unichars[unichar_id].properties.script_id = add_script(value); 470 } 471 472 // Set other_case unichar id in the properties for the given unichar id. set_other_case(UNICHAR_ID unichar_id,UNICHAR_ID other_case)473 void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) { 474 unichars[unichar_id].properties.other_case = other_case; 475 } 476 477 // Set the direction property of the given unichar to the given value. set_direction(UNICHAR_ID unichar_id,UNICHARSET::Direction value)478 void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) { 479 unichars[unichar_id].properties.direction = value; 480 } 481 482 // Set mirror unichar id in the properties for the given unichar id. set_mirror(UNICHAR_ID unichar_id,UNICHAR_ID mirror)483 void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) { 484 unichars[unichar_id].properties.mirror = mirror; 485 } 486 487 // Record normalized version of unichar with the given unichar_id. set_normed(UNICHAR_ID unichar_id,const char * normed)488 void set_normed(UNICHAR_ID unichar_id, const char *normed) { 489 unichars[unichar_id].properties.normed = normed; 490 unichars[unichar_id].properties.normed_ids.clear(); 491 } 492 // Sets the normed_ids vector from the normed string. normed_ids is not 493 // stored in the file, and needs to be set when the UNICHARSET is loaded. 494 void set_normed_ids(UNICHAR_ID unichar_id); 495 496 // Return the isalpha property of the given unichar. get_isalpha(UNICHAR_ID unichar_id)497 bool get_isalpha(UNICHAR_ID unichar_id) const { 498 if (INVALID_UNICHAR_ID == unichar_id) { 499 return false; 500 } 501 ASSERT_HOST(contains_unichar_id(unichar_id)); 502 return unichars[unichar_id].properties.isalpha; 503 } 504 505 // Return the islower property of the given unichar. 
get_islower(UNICHAR_ID unichar_id)506 bool get_islower(UNICHAR_ID unichar_id) const { 507 if (INVALID_UNICHAR_ID == unichar_id) { 508 return false; 509 } 510 ASSERT_HOST(contains_unichar_id(unichar_id)); 511 return unichars[unichar_id].properties.islower; 512 } 513 514 // Return the isupper property of the given unichar. get_isupper(UNICHAR_ID unichar_id)515 bool get_isupper(UNICHAR_ID unichar_id) const { 516 if (INVALID_UNICHAR_ID == unichar_id) { 517 return false; 518 } 519 ASSERT_HOST(contains_unichar_id(unichar_id)); 520 return unichars[unichar_id].properties.isupper; 521 } 522 523 // Return the isdigit property of the given unichar. get_isdigit(UNICHAR_ID unichar_id)524 bool get_isdigit(UNICHAR_ID unichar_id) const { 525 if (INVALID_UNICHAR_ID == unichar_id) { 526 return false; 527 } 528 ASSERT_HOST(contains_unichar_id(unichar_id)); 529 return unichars[unichar_id].properties.isdigit; 530 } 531 532 // Return the ispunctuation property of the given unichar. get_ispunctuation(UNICHAR_ID unichar_id)533 bool get_ispunctuation(UNICHAR_ID unichar_id) const { 534 if (INVALID_UNICHAR_ID == unichar_id) { 535 return false; 536 } 537 ASSERT_HOST(contains_unichar_id(unichar_id)); 538 return unichars[unichar_id].properties.ispunctuation; 539 } 540 541 // Return the isngram property of the given unichar. get_isngram(UNICHAR_ID unichar_id)542 bool get_isngram(UNICHAR_ID unichar_id) const { 543 if (INVALID_UNICHAR_ID == unichar_id) { 544 return false; 545 } 546 ASSERT_HOST(contains_unichar_id(unichar_id)); 547 return unichars[unichar_id].properties.isngram; 548 } 549 550 // Returns whether the unichar id represents a unicode value in the private 551 // use area. 552 bool get_isprivate(UNICHAR_ID unichar_id) const; 553 554 // Returns true if the ids have useful min/max top/bottom values. top_bottom_useful()555 bool top_bottom_useful() const { 556 return top_bottom_set_; 557 } 558 // Sets all ranges to empty, so they can be expanded to set the values. 
void set_ranges_empty();
// Sets all the properties for this unicharset given a src_unicharset with
// everything set. The unicharsets don't have to be the same, and graphemes
// are correctly accounted for.
// Implemented as PartialSetPropertiesFromOther starting at index 0.
void SetPropertiesFromOther(const UNICHARSET &src) {
  PartialSetPropertiesFromOther(0, src);
}
// Sets properties from Other, starting only at the given index.
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src);
// Expands the tops and bottoms and widths for this unicharset given a
// src_unicharset with ranges in it. The unicharsets don't have to be the
// same, and graphemes are correctly accounted for.
void ExpandRangesFromOther(const UNICHARSET &src);
// Makes this a copy of src. Clears this completely first, so the automatic
// ids will not be present in this if not in src.
void CopyFrom(const UNICHARSET &src);
// For each id in src, if it does not occur in this, add it, as in
// SetPropertiesFromOther, otherwise expand the ranges, as in
// ExpandRangesFromOther.
void AppendOtherUnicharset(const UNICHARSET &src);
// Returns true if the acceptable ranges of the tops of the characters do
// not overlap, making their x-height calculations distinct.
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const;
// Returns the min and max bottom and top of the given unichar in
// baseline-normalized coordinates, ie, where the baseline is
// kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
// (See normalis.h for the definitions).
get_top_bottom(UNICHAR_ID unichar_id,int * min_bottom,int * max_bottom,int * min_top,int * max_top)586 void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, 587 int *min_top, int *max_top) const { 588 if (INVALID_UNICHAR_ID == unichar_id) { 589 *min_bottom = *min_top = 0; 590 *max_bottom = *max_top = 256; // kBlnCellHeight 591 return; 592 } 593 ASSERT_HOST(contains_unichar_id(unichar_id)); 594 *min_bottom = unichars[unichar_id].properties.min_bottom; 595 *max_bottom = unichars[unichar_id].properties.max_bottom; 596 *min_top = unichars[unichar_id].properties.min_top; 597 *max_top = unichars[unichar_id].properties.max_top; 598 } set_top_bottom(UNICHAR_ID unichar_id,int min_bottom,int max_bottom,int min_top,int max_top)599 void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, 600 int min_top, int max_top) { 601 unichars[unichar_id].properties.min_bottom = 602 ClipToRange<int>(min_bottom, 0, UINT8_MAX); 603 unichars[unichar_id].properties.max_bottom = 604 ClipToRange<int>(max_bottom, 0, UINT8_MAX); 605 unichars[unichar_id].properties.min_top = 606 ClipToRange<int>(min_top, 0, UINT8_MAX); 607 unichars[unichar_id].properties.max_top = 608 ClipToRange<int>(max_top, 0, UINT8_MAX); 609 } 610 // Returns the width stats (as mean, sd) of the given unichar relative to the 611 // median advance of all characters in the character set. 
get_width_stats(UNICHAR_ID unichar_id,float * width,float * width_sd)612 void get_width_stats(UNICHAR_ID unichar_id, float *width, 613 float *width_sd) const { 614 if (INVALID_UNICHAR_ID == unichar_id) { 615 *width = 0.0f; 616 *width_sd = 0.0f; 617 ; 618 return; 619 } 620 ASSERT_HOST(contains_unichar_id(unichar_id)); 621 *width = unichars[unichar_id].properties.width; 622 *width_sd = unichars[unichar_id].properties.width_sd; 623 } set_width_stats(UNICHAR_ID unichar_id,float width,float width_sd)624 void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) { 625 unichars[unichar_id].properties.width = width; 626 unichars[unichar_id].properties.width_sd = width_sd; 627 } 628 // Returns the stats of the x-bearing (as mean, sd) of the given unichar 629 // relative to the median advance of all characters in the character set. get_bearing_stats(UNICHAR_ID unichar_id,float * bearing,float * bearing_sd)630 void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, 631 float *bearing_sd) const { 632 if (INVALID_UNICHAR_ID == unichar_id) { 633 *bearing = *bearing_sd = 0.0f; 634 return; 635 } 636 ASSERT_HOST(contains_unichar_id(unichar_id)); 637 *bearing = unichars[unichar_id].properties.bearing; 638 *bearing_sd = unichars[unichar_id].properties.bearing_sd; 639 } set_bearing_stats(UNICHAR_ID unichar_id,float bearing,float bearing_sd)640 void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, 641 float bearing_sd) { 642 unichars[unichar_id].properties.bearing = bearing; 643 unichars[unichar_id].properties.bearing_sd = bearing_sd; 644 } 645 // Returns the stats of the x-advance of the given unichar (as mean, sd) 646 // relative to the median advance of all characters in the character set. 
get_advance_stats(UNICHAR_ID unichar_id,float * advance,float * advance_sd)647 void get_advance_stats(UNICHAR_ID unichar_id, float *advance, 648 float *advance_sd) const { 649 if (INVALID_UNICHAR_ID == unichar_id) { 650 *advance = *advance_sd = 0; 651 return; 652 } 653 ASSERT_HOST(contains_unichar_id(unichar_id)); 654 *advance = unichars[unichar_id].properties.advance; 655 *advance_sd = unichars[unichar_id].properties.advance_sd; 656 } set_advance_stats(UNICHAR_ID unichar_id,float advance,float advance_sd)657 void set_advance_stats(UNICHAR_ID unichar_id, float advance, 658 float advance_sd) { 659 unichars[unichar_id].properties.advance = advance; 660 unichars[unichar_id].properties.advance_sd = advance_sd; 661 } 662 // Returns true if the font metrics properties are empty. PropertiesIncomplete(UNICHAR_ID unichar_id)663 bool PropertiesIncomplete(UNICHAR_ID unichar_id) const { 664 return unichars[unichar_id].properties.AnyRangeEmpty(); 665 } 666 667 // Returns true if the script of the given id is space delimited. 668 // Returns false for Han and Thai scripts. IsSpaceDelimited(UNICHAR_ID unichar_id)669 bool IsSpaceDelimited(UNICHAR_ID unichar_id) const { 670 if (INVALID_UNICHAR_ID == unichar_id) { 671 return true; 672 } 673 int script_id = get_script(unichar_id); 674 return script_id != han_sid_ && script_id != thai_sid_ && 675 script_id != hangul_sid_ && script_id != hiragana_sid_ && 676 script_id != katakana_sid_; 677 } 678 679 // Return the script name of the given unichar. 680 // The returned pointer will always be the same for the same script, it's 681 // managed by unicharset and thus MUST NOT be deleted get_script(UNICHAR_ID unichar_id)682 int get_script(UNICHAR_ID unichar_id) const { 683 if (INVALID_UNICHAR_ID == unichar_id) { 684 return null_sid_; 685 } 686 ASSERT_HOST(contains_unichar_id(unichar_id)); 687 return unichars[unichar_id].properties.script_id; 688 } 689 690 // Return the character properties, eg. 
alpha/upper/lower/digit/punct, 691 // as a bit field of unsigned int. 692 unsigned int get_properties(UNICHAR_ID unichar_id) const; 693 694 // Return the character property as a single char. If a character has 695 // multiple attributes, the main property is defined by the following order: 696 // upper_case : 'A' 697 // lower_case : 'a' 698 // alpha : 'x' 699 // digit : '0' 700 // punctuation: 'p' 701 char get_chartype(UNICHAR_ID unichar_id) const; 702 703 // Get other_case unichar id in the properties for the given unichar id. get_other_case(UNICHAR_ID unichar_id)704 UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const { 705 if (INVALID_UNICHAR_ID == unichar_id) { 706 return INVALID_UNICHAR_ID; 707 } 708 ASSERT_HOST(contains_unichar_id(unichar_id)); 709 return unichars[unichar_id].properties.other_case; 710 } 711 712 // Returns the direction property of the given unichar. get_direction(UNICHAR_ID unichar_id)713 Direction get_direction(UNICHAR_ID unichar_id) const { 714 if (INVALID_UNICHAR_ID == unichar_id) { 715 return UNICHARSET::U_OTHER_NEUTRAL; 716 } 717 ASSERT_HOST(contains_unichar_id(unichar_id)); 718 return unichars[unichar_id].properties.direction; 719 } 720 721 // Get mirror unichar id in the properties for the given unichar id. get_mirror(UNICHAR_ID unichar_id)722 UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const { 723 if (INVALID_UNICHAR_ID == unichar_id) { 724 return INVALID_UNICHAR_ID; 725 } 726 ASSERT_HOST(contains_unichar_id(unichar_id)); 727 return unichars[unichar_id].properties.mirror; 728 } 729 730 // Returns UNICHAR_ID of the corresponding lower-case unichar. 
to_lower(UNICHAR_ID unichar_id)731 UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const { 732 if (INVALID_UNICHAR_ID == unichar_id) { 733 return INVALID_UNICHAR_ID; 734 } 735 ASSERT_HOST(contains_unichar_id(unichar_id)); 736 if (unichars[unichar_id].properties.islower) { 737 return unichar_id; 738 } 739 return unichars[unichar_id].properties.other_case; 740 } 741 742 // Returns UNICHAR_ID of the corresponding upper-case unichar. to_upper(UNICHAR_ID unichar_id)743 UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const { 744 if (INVALID_UNICHAR_ID == unichar_id) { 745 return INVALID_UNICHAR_ID; 746 } 747 ASSERT_HOST(contains_unichar_id(unichar_id)); 748 if (unichars[unichar_id].properties.isupper) { 749 return unichar_id; 750 } 751 return unichars[unichar_id].properties.other_case; 752 } 753 754 // Returns true if this UNICHARSET has the special codes in 755 // SpecialUnicharCodes available. If false then there are normal unichars 756 // at these codes and they should not be used. has_special_codes()757 bool has_special_codes() const { 758 return get_fragment(UNICHAR_BROKEN) != nullptr && 759 strcmp(id_to_unichar(UNICHAR_BROKEN), 760 kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0; 761 } 762 763 // Returns true if there are any repeated unicodes in the normalized 764 // text of any unichar-id in the unicharset. 765 bool AnyRepeatedUnicodes() const; 766 767 // Return a pointer to the CHAR_FRAGMENT class if the given 768 // unichar id represents a character fragment. get_fragment(UNICHAR_ID unichar_id)769 const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const { 770 if (INVALID_UNICHAR_ID == unichar_id) { 771 return nullptr; 772 } 773 ASSERT_HOST(contains_unichar_id(unichar_id)); 774 return unichars[unichar_id].properties.fragment; 775 } 776 777 // Return the isalpha property of the given unichar representation. 
get_isalpha(const char * const unichar_repr)778 bool get_isalpha(const char *const unichar_repr) const { 779 return get_isalpha(unichar_to_id(unichar_repr)); 780 } 781 782 // Return the islower property of the given unichar representation. get_islower(const char * const unichar_repr)783 bool get_islower(const char *const unichar_repr) const { 784 return get_islower(unichar_to_id(unichar_repr)); 785 } 786 787 // Return the isupper property of the given unichar representation. get_isupper(const char * const unichar_repr)788 bool get_isupper(const char *const unichar_repr) const { 789 return get_isupper(unichar_to_id(unichar_repr)); 790 } 791 792 // Return the isdigit property of the given unichar representation. get_isdigit(const char * const unichar_repr)793 bool get_isdigit(const char *const unichar_repr) const { 794 return get_isdigit(unichar_to_id(unichar_repr)); 795 } 796 797 // Return the ispunctuation property of the given unichar representation. get_ispunctuation(const char * const unichar_repr)798 bool get_ispunctuation(const char *const unichar_repr) const { 799 return get_ispunctuation(unichar_to_id(unichar_repr)); 800 } 801 802 // Return the character properties, eg. alpha/upper/lower/digit/punct, 803 // of the given unichar representation get_properties(const char * const unichar_repr)804 unsigned int get_properties(const char *const unichar_repr) const { 805 return get_properties(unichar_to_id(unichar_repr)); 806 } 807 get_chartype(const char * const unichar_repr)808 char get_chartype(const char *const unichar_repr) const { 809 return get_chartype(unichar_to_id(unichar_repr)); 810 } 811 812 // Return the script name of the given unichar representation. 
813 // The returned pointer will always be the same for the same script, it's 814 // managed by unicharset and thus MUST NOT be deleted get_script(const char * const unichar_repr)815 int get_script(const char *const unichar_repr) const { 816 return get_script(unichar_to_id(unichar_repr)); 817 } 818 819 // Return a pointer to the CHAR_FRAGMENT class struct if the given 820 // unichar representation represents a character fragment. get_fragment(const char * const unichar_repr)821 const CHAR_FRAGMENT *get_fragment(const char *const unichar_repr) const { 822 if (unichar_repr == nullptr || unichar_repr[0] == '\0' || 823 !ids.contains(unichar_repr, false)) { 824 return nullptr; 825 } 826 return get_fragment(unichar_to_id(unichar_repr)); 827 } 828 829 // Return the isalpha property of the given unichar representation. 830 // Only the first length characters from unichar_repr are used. get_isalpha(const char * const unichar_repr,int length)831 bool get_isalpha(const char *const unichar_repr, int length) const { 832 return get_isalpha(unichar_to_id(unichar_repr, length)); 833 } 834 835 // Return the islower property of the given unichar representation. 836 // Only the first length characters from unichar_repr are used. get_islower(const char * const unichar_repr,int length)837 bool get_islower(const char *const unichar_repr, int length) const { 838 return get_islower(unichar_to_id(unichar_repr, length)); 839 } 840 841 // Return the isupper property of the given unichar representation. 842 // Only the first length characters from unichar_repr are used. get_isupper(const char * const unichar_repr,int length)843 bool get_isupper(const char *const unichar_repr, int length) const { 844 return get_isupper(unichar_to_id(unichar_repr, length)); 845 } 846 847 // Return the isdigit property of the given unichar representation. 848 // Only the first length characters from unichar_repr are used. 
get_isdigit(const char * const unichar_repr,int length)849 bool get_isdigit(const char *const unichar_repr, int length) const { 850 return get_isdigit(unichar_to_id(unichar_repr, length)); 851 } 852 853 // Return the ispunctuation property of the given unichar representation. 854 // Only the first length characters from unichar_repr are used. get_ispunctuation(const char * const unichar_repr,int length)855 bool get_ispunctuation(const char *const unichar_repr, int length) const { 856 return get_ispunctuation(unichar_to_id(unichar_repr, length)); 857 } 858 859 // Returns normalized version of unichar with the given unichar_id. get_normed_unichar(UNICHAR_ID unichar_id)860 const char *get_normed_unichar(UNICHAR_ID unichar_id) const { 861 if (unichar_id == UNICHAR_SPACE) { 862 return " "; 863 } 864 return unichars[unichar_id].properties.normed.c_str(); 865 } 866 // Returns a vector of UNICHAR_IDs that represent the ids of the normalized 867 // version of the given id. There may be more than one UNICHAR_ID in the 868 // vector if unichar_id represents a ligature. normed_ids(UNICHAR_ID unichar_id)869 const std::vector<UNICHAR_ID> &normed_ids(UNICHAR_ID unichar_id) const { 870 return unichars[unichar_id].properties.normed_ids; 871 } 872 873 // Return the script name of the given unichar representation. 874 // Only the first length characters from unichar_repr are used. 
875 // The returned pointer will always be the same for the same script, it's 876 // managed by unicharset and thus MUST NOT be deleted get_script(const char * const unichar_repr,int length)877 int get_script(const char *const unichar_repr, int length) const { 878 return get_script(unichar_to_id(unichar_repr, length)); 879 } 880 881 // Return the (current) number of scripts in the script table get_script_table_size()882 int get_script_table_size() const { 883 return script_table_size_used; 884 } 885 886 // Return the script string from its id get_script_from_script_id(int id)887 const char *get_script_from_script_id(int id) const { 888 if (id >= script_table_size_used || id < 0) { 889 return null_script; 890 } 891 return script_table[id]; 892 } 893 894 // Returns the id from the name of the script, or 0 if script is not found. 895 // Note that this is an expensive operation since it involves iteratively 896 // comparing strings in the script table. To avoid dependency on STL, we 897 // won't use a hash. Instead, the calling function can use this to lookup 898 // and save the ID for relevant scripts for fast comparisons later. 899 int get_script_id_from_name(const char *script_name) const; 900 901 // Return true if the given script is the null script is_null_script(const char * script)902 bool is_null_script(const char *script) const { 903 return script == null_script; 904 } 905 906 // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0, 907 // then the returned pointer will be the same. 908 // The script parameter is copied and thus can be a temporary. 909 int add_script(const char *script); 910 911 // Return the enabled property of the given unichar. 
get_enabled(UNICHAR_ID unichar_id)912 bool get_enabled(UNICHAR_ID unichar_id) const { 913 ASSERT_HOST(contains_unichar_id(unichar_id)); 914 return unichars[unichar_id].properties.enabled; 915 } 916 null_sid()917 int null_sid() const { 918 return null_sid_; 919 } common_sid()920 int common_sid() const { 921 return common_sid_; 922 } latin_sid()923 int latin_sid() const { 924 return latin_sid_; 925 } cyrillic_sid()926 int cyrillic_sid() const { 927 return cyrillic_sid_; 928 } greek_sid()929 int greek_sid() const { 930 return greek_sid_; 931 } han_sid()932 int han_sid() const { 933 return han_sid_; 934 } hiragana_sid()935 int hiragana_sid() const { 936 return hiragana_sid_; 937 } katakana_sid()938 int katakana_sid() const { 939 return katakana_sid_; 940 } thai_sid()941 int thai_sid() const { 942 return thai_sid_; 943 } hangul_sid()944 int hangul_sid() const { 945 return hangul_sid_; 946 } default_sid()947 int default_sid() const { 948 return default_sid_; 949 } 950 951 // Returns true if the unicharset has the concept of upper/lower case. script_has_upper_lower()952 bool script_has_upper_lower() const { 953 return script_has_upper_lower_; 954 } 955 // Returns true if the unicharset has the concept of x-height. 956 // script_has_xheight can be true even if script_has_upper_lower is not, 957 // when the script has a sufficiently predominant top line with ascenders, 958 // such as Devanagari and Thai. script_has_xheight()959 bool script_has_xheight() const { 960 return script_has_xheight_; 961 } 962 963 private: 964 struct TESS_API UNICHAR_PROPERTIES { 965 UNICHAR_PROPERTIES(); 966 // Initializes all properties to sensible default values. 967 void Init(); 968 // Sets all ranges wide open. Initialization default in case there are 969 // no useful values available. 970 void SetRangesOpen(); 971 // Sets all ranges to empty. Used before expanding with font-based data. 
972 void SetRangesEmpty(); 973 // Returns true if any of the top/bottom/width/bearing/advance ranges/stats 974 // is empty. 975 bool AnyRangeEmpty() const; 976 // Expands the ranges with the ranges from the src properties. 977 void ExpandRangesFrom(const UNICHAR_PROPERTIES &src); 978 // Copies the properties from src into this. 979 void CopyFrom(const UNICHAR_PROPERTIES &src); 980 981 bool isalpha; 982 bool islower; 983 bool isupper; 984 bool isdigit; 985 bool ispunctuation; 986 bool isngram; 987 bool enabled; 988 // Possible limits of the top and bottom of the bounding box in 989 // baseline-normalized coordinates, ie, where the baseline is 990 // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight 991 // (See normalis.h for the definitions). 992 uint8_t min_bottom; 993 uint8_t max_bottom; 994 uint8_t min_top; 995 uint8_t max_top; 996 // Statistics of the widths of bounding box, relative to the median advance. 997 float width; 998 float width_sd; 999 // Stats of the x-bearing and advance, also relative to the median advance. 1000 float bearing; 1001 float bearing_sd; 1002 float advance; 1003 float advance_sd; 1004 int script_id; 1005 UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar 1006 Direction direction; // direction of this unichar 1007 // Mirror property is useful for reverse DAWG lookup for words in 1008 // right-to-left languages (e.g. "(word)" would be in 1009 // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string. 1010 // However, what we want in our DAWG is 1011 // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not 1012 // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'. 1013 UNICHAR_ID mirror; 1014 // A string of unichar_ids that represent the corresponding normed string. 1015 // For awkward characters like em-dash, this gives hyphen. 1016 // For ligatures, this gives the string of normal unichars. 
1017 std::vector<UNICHAR_ID> normed_ids; 1018 std::string normed; // normalized version of this unichar 1019 // Contains meta information about the fragment if a unichar represents 1020 // a fragment of a character, otherwise should be set to nullptr. 1021 // It is assumed that character fragments are added to the unicharset 1022 // after the corresponding 'base' characters. 1023 CHAR_FRAGMENT *fragment; 1024 }; 1025 1026 struct UNICHAR_SLOT { 1027 char representation[UNICHAR_LEN + 1]; 1028 UNICHAR_PROPERTIES properties; 1029 }; 1030 1031 // Internal recursive version of encode_string above. 1032 // str is the start of the whole string. 1033 // str_index is the current position in str. 1034 // str_length is the length of str. 1035 // encoding is a working encoding of str. 1036 // lengths is a working set of lengths of each element of encoding. 1037 // best_total_length is the longest length of str that has been successfully 1038 // encoded so far. 1039 // On return: 1040 // best_encoding contains the encoding that used the longest part of str. 1041 // best_lengths (may be null) contains the lengths of best_encoding. 1042 void encode_string(const char *str, int str_index, int str_length, 1043 std::vector<UNICHAR_ID> *encoding, 1044 std::vector<char> *lengths, unsigned *best_total_length, 1045 std::vector<UNICHAR_ID> *best_encoding, 1046 std::vector<char> *best_lengths) const; 1047 1048 // Gets the properties for a grapheme string, combining properties for 1049 // multiple characters in a meaningful way where possible. 1050 // Returns false if no valid match was found in the unicharset. 1051 // NOTE that script_id, mirror, and other_case refer to this unicharset on 1052 // return and will need redirecting if the target unicharset is different. 1053 bool GetStrProperties(const char *utf8_str, UNICHAR_PROPERTIES *props) const; 1054 1055 // Load ourselves from a "file" where our only interface to the file is 1056 // an implementation of fgets(). 
This is the parsing primitive accessed by 1057 // the public routines load_from_file(). 1058 bool load_via_fgets(const std::function<char *(char *, int)> &fgets_cb, 1059 bool skip_fragments); 1060 1061 // List of mappings to make when ingesting strings from the outside. 1062 // The substitutions clean up text that should exists for rendering of 1063 // synthetic data, but not in the recognition set. 1064 static const char *kCleanupMaps[][2]; 1065 static const char *null_script; 1066 1067 std::vector<UNICHAR_SLOT> unichars; 1068 UNICHARMAP ids; 1069 char **script_table; 1070 int script_table_size_used; 1071 int script_table_size_reserved; 1072 // True if the unichars have their tops/bottoms set. 1073 bool top_bottom_set_; 1074 // True if the unicharset has significant upper/lower case chars. 1075 bool script_has_upper_lower_; 1076 // True if the unicharset has a significant mean-line with significant 1077 // ascenders above that. 1078 bool script_has_xheight_; 1079 // True if the set contains chars that would be changed by the cleanup. 1080 bool old_style_included_; 1081 1082 // A few convenient script name-to-id mapping without using hash. 1083 // These are initialized when unicharset file is loaded. Anything 1084 // missing from this list can be looked up using get_script_id_from_name. 1085 int null_sid_; 1086 int common_sid_; 1087 int latin_sid_; 1088 int cyrillic_sid_; 1089 int greek_sid_; 1090 int han_sid_; 1091 int hiragana_sid_; 1092 int katakana_sid_; 1093 int thai_sid_; 1094 int hangul_sid_; 1095 // The most frequently occurring script in the charset. 1096 int default_sid_; 1097 }; 1098 1099 } // namespace tesseract 1100 1101 #endif // TESSERACT_CCUTIL_UNICHARSET_H_ 1102