1 ///////////////////////////////////////////////////////////////////////
2 // File:        unicharset.h
3 // Description: Unicode character/ligature set class.
4 // Author:      Thomas Kielbus
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18 
19 #ifndef TESSERACT_CCUTIL_UNICHARSET_H_
20 #define TESSERACT_CCUTIL_UNICHARSET_H_
21 
22 #include "errcode.h"
23 #include "unicharmap.h"
24 
25 #include <tesseract/unichar.h>
26 #include "helpers.h"
27 #include "serialis.h"
28 
29 #include <functional> // for std::function
30 
31 namespace tesseract {
32 
33 // Enum holding special values of unichar_id. Every unicharset has these.
34 // Warning! Keep in sync with kSpecialUnicharCodes.
enum SpecialUnicharCodes {
  // The numeric values double as indices, so they are spelled out here.
  UNICHAR_SPACE = 0,
  UNICHAR_JOINED = 1,
  UNICHAR_BROKEN = 2,

  // Number of special codes above; has to stay the last enumerator.
  SPECIAL_UNICHAR_CODES_COUNT
};
42 
43 // Boolean flag for unichar_insert. It's a bit of a double negative to allow
44 // the default value to be false.
enum class OldUncleanUnichars {
  kFalse = 0, // The default: new-style insertion.
  kTrue = 1,  // Old-style insertion was requested.
};
49 
50 class TESS_API CHAR_FRAGMENT {
51 public:
52   // Minimum number of characters used for fragment representation.
53   static const int kMinLen = 6;
54   // Maximum number of characters used for fragment representation.
55   static const int kMaxLen = 3 + UNICHAR_LEN + 2;
56   // Maximum number of fragments per character.
57   static const int kMaxChunks = 5;
58 
59   // Setters and Getters.
set_all(const char * unichar,int pos,int total,bool natural)60   inline void set_all(const char *unichar, int pos, int total, bool natural) {
61     set_unichar(unichar);
62     set_pos(pos);
63     set_total(total);
64     set_natural(natural);
65   }
set_unichar(const char * uch)66   inline void set_unichar(const char *uch) {
67     strncpy(this->unichar, uch, sizeof(this->unichar));
68     this->unichar[UNICHAR_LEN] = '\0';
69   }
set_pos(int p)70   inline void set_pos(int p) {
71     this->pos = p;
72   }
set_total(int t)73   inline void set_total(int t) {
74     this->total = t;
75   }
get_unichar()76   inline const char *get_unichar() const {
77     return this->unichar;
78   }
get_pos()79   inline int get_pos() const {
80     return this->pos;
81   }
get_total()82   inline int get_total() const {
83     return this->total;
84   }
85 
86   // Returns the string that represents a fragment
87   // with the given unichar, pos and total.
88   static std::string to_string(const char *unichar, int pos, int total,
89                                bool natural);
90   // Returns the string that represents this fragment.
to_string()91   std::string to_string() const {
92     return to_string(unichar, pos, total, natural);
93   }
94 
95   // Checks whether a fragment has the same unichar,
96   // position and total as the given inputs.
equals(const char * other_unichar,int other_pos,int other_total)97   inline bool equals(const char *other_unichar, int other_pos,
98                      int other_total) const {
99     return (strcmp(this->unichar, other_unichar) == 0 &&
100             this->pos == other_pos && this->total == other_total);
101   }
equals(const CHAR_FRAGMENT * other)102   inline bool equals(const CHAR_FRAGMENT *other) const {
103     return this->equals(other->get_unichar(), other->get_pos(),
104                         other->get_total());
105   }
106 
107   // Checks whether a given fragment is a continuation of this fragment.
108   // Assumes that the given fragment pointer is not nullptr.
is_continuation_of(const CHAR_FRAGMENT * fragment)109   inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
110     return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
111             this->total == fragment->get_total() &&
112             this->pos == fragment->get_pos() + 1);
113   }
114 
115   // Returns true if this fragment is a beginning fragment.
is_beginning()116   inline bool is_beginning() const {
117     return this->pos == 0;
118   }
119 
120   // Returns true if this fragment is an ending fragment.
is_ending()121   inline bool is_ending() const {
122     return this->pos == this->total - 1;
123   }
124 
125   // Returns true if the fragment was a separate component to begin with,
126   // ie did not need chopping to be isolated, but may have been separated
127   // out from a multi-outline blob.
is_natural()128   inline bool is_natural() const {
129     return natural;
130   }
set_natural(bool value)131   void set_natural(bool value) {
132     natural = value;
133   }
134 
135   // Parses the string to see whether it represents a character fragment
136   // (rather than a regular character). If so, allocates memory for a new
137   // CHAR_FRAGMENT instance and fills it in with the corresponding fragment
138   // information. Fragments are of the form:
139   // |m|1|2, meaning chunk 1 of 2 of character m, or
140   // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed
141   // to divide the parts, as they were already separate connected components.
142   //
143   // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
144   // instance, otherwise (if the string does not represent a fragment or it
145   // looks like it does, but parsing it as a fragment fails) returns nullptr.
146   //
147   // Note: The caller is responsible for deallocating memory
148   // associated with the returned pointer.
149   static CHAR_FRAGMENT *parse_from_string(const char *str);
150 
151 private:
152   char unichar[UNICHAR_LEN + 1];
153   // True if the fragment was a separate component to begin with,
154   // ie did not need chopping to be isolated, but may have been separated
155   // out from a multi-outline blob.
156   bool natural;
157   int16_t pos;   // fragment position in the character
158   int16_t total; // total number of fragments in the character
159 };
160 
// The UNICHARSET class is a utility class for Tesseract that holds the
// set of characters that are used by the engine. Each character is identified
// by a unique number, from 0 to (size - 1).
164 class TESS_API UNICHARSET {
165 public:
166   // Custom list of characters and their ligature forms (UTF8)
167   // These map to unicode values in the private use area (PUC) and are supported
168   // by only few font families (eg. Wyld, Adobe Caslon Pro).
169   static const char *kCustomLigatures[][2];
170 
171   // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
172   static const char *kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT];
173 
174   // ICU 2.0 UCharDirection enum (from icu/include/unicode/uchar.h)
  enum Direction {
    // Every value is written out explicitly, so it does not depend on the
    // order of the enumerators.
    U_LEFT_TO_RIGHT = 0,
    U_RIGHT_TO_LEFT = 1,
    U_EUROPEAN_NUMBER = 2,
    U_EUROPEAN_NUMBER_SEPARATOR = 3,
    U_EUROPEAN_NUMBER_TERMINATOR = 4,
    U_ARABIC_NUMBER = 5,
    U_COMMON_NUMBER_SEPARATOR = 6,
    U_BLOCK_SEPARATOR = 7,
    U_SEGMENT_SEPARATOR = 8,
    U_WHITE_SPACE_NEUTRAL = 9,
    U_OTHER_NEUTRAL = 10,
    U_LEFT_TO_RIGHT_EMBEDDING = 11,
    U_LEFT_TO_RIGHT_OVERRIDE = 12,
    U_RIGHT_TO_LEFT_ARABIC = 13,
    U_RIGHT_TO_LEFT_EMBEDDING = 14,
    U_RIGHT_TO_LEFT_OVERRIDE = 15,
    U_POP_DIRECTIONAL_FORMAT = 16,
    U_DIR_NON_SPACING_MARK = 17,
    U_BOUNDARY_NEUTRAL = 18,
    U_FIRST_STRONG_ISOLATE = 19,
    U_LEFT_TO_RIGHT_ISOLATE = 20,
    U_RIGHT_TO_LEFT_ISOLATE = 21,
    U_POP_DIRECTIONAL_ISOLATE = 22,
    // Trailing count enumerator (implicitly 23). It is compiled out when
    // U_HIDE_DEPRECATED_API is defined.
#ifndef U_HIDE_DEPRECATED_API
    U_CHAR_DIRECTION_COUNT
#endif // U_HIDE_DEPRECATED_API
  };
203 
204   // Create an empty UNICHARSET
205   UNICHARSET();
206 
207   ~UNICHARSET();
208 
209   // Return the UNICHAR_ID of a given unichar representation within the
210   // UNICHARSET.
211   UNICHAR_ID unichar_to_id(const char *const unichar_repr) const;
212 
213   // Return the UNICHAR_ID of a given unichar representation within the
214   // UNICHARSET. Only the first length characters from unichar_repr are used.
215   UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const;
216 
217   // Return the minimum number of bytes that matches a legal UNICHAR_ID,
218   // while leaving the rest of the string encodable. Returns 0 if the
219   // beginning of the string is not encodable.
220   // WARNING: this function now encodes the whole string for precision.
221   // Use encode_string in preference to repeatedly calling step.
222   int step(const char *str) const;
223 
224   // Returns true if the given UTF-8 string is encodable with this UNICHARSET.
225   // If not encodable, write the first byte offset which cannot be converted
226   // into the second (return) argument.
227   bool encodable_string(const char *str, unsigned *first_bad_position) const;
228 
229   // Encodes the given UTF-8 string with this UNICHARSET.
230   // Any part of the string that cannot be encoded (because the utf8 can't
231   // be broken up into pieces that are in the unicharset) then:
232   // if give_up_on_failure, stops and returns a partial encoding,
233   // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding.
234   // Returns true if the encoding succeeds completely, false if there is at
235   // least one failure.
236   // If lengths is not nullptr, then it is filled with the corresponding
237   // byte length of each encoded UNICHAR_ID.
238   // If encoded_length is not nullptr then on return it contains the length of
239   // str that was encoded. (if give_up_on_failure the location of the first
240   // failure, otherwise strlen(str).)
241   // WARNING: Caller must guarantee that str has already been cleaned of codes
242   // that do not belong in the unicharset, or encoding may fail.
243   // Use CleanupString to perform the cleaning.
244   bool encode_string(const char *str, bool give_up_on_failure,
245                      std::vector<UNICHAR_ID> *encoding,
246                      std::vector<char> *lengths,
247                      unsigned *encoded_length) const;
248 
249   // Return the unichar representation corresponding to the given UNICHAR_ID
250   // within the UNICHARSET.
251   const char *id_to_unichar(UNICHAR_ID id) const;
252 
253   // Return the UTF8 representation corresponding to the given UNICHAR_ID after
254   // resolving any private encodings internal to Tesseract. This method is
255   // preferable to id_to_unichar for outputting text that will be visible to
256   // external applications.
257   const char *id_to_unichar_ext(UNICHAR_ID id) const;
258 
259   // Return a string that reformats the utf8 str into the str followed
260   // by its hex unicodes.
261   static std::string debug_utf8_str(const char *str);
262 
263   // Removes/replaces content that belongs in rendered text, but not in the
264   // unicharset.
CleanupString(const char * utf8_str)265   static std::string CleanupString(const char *utf8_str) {
266     return CleanupString(utf8_str, strlen(utf8_str));
267   }
268   static std::string CleanupString(const char *utf8_str, size_t length);
269 
270   // Return a string containing debug information on the unichar, including
271   // the id_to_unichar, its hex unicodes and the properties.
272   std::string debug_str(UNICHAR_ID id) const;
debug_str(const char * unichar_repr)273   std::string debug_str(const char *unichar_repr) const {
274     return debug_str(unichar_to_id(unichar_repr));
275   }
276 
277   // Adds a unichar representation to the set. If old_style is true, then
278   // TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL
279   // characters are ignored/skipped as if they don't exist and n-grams that
280   // can already be encoded are not added.
281   void unichar_insert(const char *const unichar_repr,
282                       OldUncleanUnichars old_style);
unichar_insert(const char * const unichar_repr)283   void unichar_insert(const char *const unichar_repr) {
284     unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
285   }
286   // Adds a unichar representation to the set. Avoids setting old_style to true,
287   // unless it is necessary to make the new unichar get added.
unichar_insert_backwards_compatible(const char * const unichar_repr)288   void unichar_insert_backwards_compatible(const char *const unichar_repr) {
289     std::string cleaned = CleanupString(unichar_repr);
290     if (cleaned != unichar_repr) {
291       unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
292     } else {
293       auto old_size = size();
294       unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
295       if (size() == old_size) {
296         unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
297       }
298     }
299   }
300 
301   // Return true if the given unichar id exists within the set.
302   // Relies on the fact that unichar ids are contiguous in the unicharset.
contains_unichar_id(UNICHAR_ID unichar_id)303   bool contains_unichar_id(UNICHAR_ID unichar_id) const {
304     return static_cast<size_t>(unichar_id) < unichars.size();
305   }
306 
307   // Return true if the given unichar representation exists within the set.
308   bool contains_unichar(const char *const unichar_repr) const;
309   bool contains_unichar(const char *const unichar_repr, int length) const;
310 
311   // Return true if the given unichar representation corresponds to the given
312   // UNICHAR_ID within the set.
313   bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const;
314 
315   // Delete CHAR_FRAGMENTs stored in properties of unichars array.
delete_pointers_in_unichars()316   void delete_pointers_in_unichars() {
317     for (auto &unichar : unichars) {
318       delete unichar.properties.fragment;
319       unichar.properties.fragment = nullptr;
320     }
321   }
322 
323   // Clear the UNICHARSET (all the previous data is lost).
clear()324   void clear() {
325     if (script_table != nullptr) {
326       for (int i = 0; i < script_table_size_used; ++i) {
327         delete[] script_table[i];
328       }
329       delete[] script_table;
330       script_table = nullptr;
331       script_table_size_used = 0;
332     }
333     script_table_size_reserved = 0;
334     delete_pointers_in_unichars();
335     unichars.clear();
336     ids.clear();
337     top_bottom_set_ = false;
338     script_has_upper_lower_ = false;
339     script_has_xheight_ = false;
340     old_style_included_ = false;
341     null_sid_ = 0;
342     common_sid_ = 0;
343     latin_sid_ = 0;
344     cyrillic_sid_ = 0;
345     greek_sid_ = 0;
346     han_sid_ = 0;
347     hiragana_sid_ = 0;
348     katakana_sid_ = 0;
349     thai_sid_ = 0;
350     hangul_sid_ = 0;
351     default_sid_ = 0;
352   }
353 
354   // Return the size of the set (the number of different UNICHAR it holds).
size()355   size_t size() const {
356     return unichars.size();
357   }
358 
359   // Opens the file indicated by filename and saves unicharset to that file.
360   // Returns true if the operation is successful.
save_to_file(const char * const filename)361   bool save_to_file(const char *const filename) const {
362     FILE *file = fopen(filename, "w+b");
363     if (file == nullptr) {
364       return false;
365     }
366     bool result = save_to_file(file);
367     fclose(file);
368     return result;
369   }
370 
371   // Saves the content of the UNICHARSET to the given file.
372   // Returns true if the operation is successful.
save_to_file(FILE * file)373   bool save_to_file(FILE *file) const {
374     std::string str;
375     return save_to_string(str) &&
376            tesseract::Serialize(file, &str[0], str.length());
377   }
378 
save_to_file(tesseract::TFile * file)379   bool save_to_file(tesseract::TFile *file) const {
380     std::string str;
381     return save_to_string(str) && file->Serialize(&str[0], str.length());
382   }
383 
384   // Saves the content of the UNICHARSET to the given string.
385   // Returns true if the operation is successful.
386   bool save_to_string(std::string &str) const;
387 
388   // Opens the file indicated by filename and loads the UNICHARSET
389   // from the given file. The previous data is lost.
390   // Returns true if the operation is successful.
load_from_file(const char * const filename,bool skip_fragments)391   bool load_from_file(const char *const filename, bool skip_fragments) {
392     FILE *file = fopen(filename, "rb");
393     if (file == nullptr) {
394       return false;
395     }
396     bool result = load_from_file(file, skip_fragments);
397     fclose(file);
398     return result;
399   }
400   // returns true if the operation is successful.
load_from_file(const char * const filename)401   bool load_from_file(const char *const filename) {
402     return load_from_file(filename, false);
403   }
404 
405   // Loads the UNICHARSET from the given file. The previous data is lost.
406   // Returns true if the operation is successful.
407   bool load_from_file(FILE *file, bool skip_fragments);
load_from_file(FILE * file)408   bool load_from_file(FILE *file) {
409     return load_from_file(file, false);
410   }
411   bool load_from_file(tesseract::TFile *file, bool skip_fragments);
412 
413   // Sets up internal data after loading the file, based on the char
414   // properties. Called from load_from_file, but also needs to be run
415   // during set_unicharset_properties.
416   void post_load_setup();
417 
418   // Returns true if right_to_left scripts are significant in the unicharset,
419   // but without being so sensitive that "universal" unicharsets containing
420   // characters from many scripts, like orientation and script detection,
421   // look like they are right_to_left.
422   bool major_right_to_left() const;
423 
424   // Set a whitelist and/or blacklist of characters to recognize.
425   // An empty or nullptr whitelist enables everything (minus any blacklist).
426   // An empty or nullptr blacklist disables nothing.
427   // An empty or nullptr unblacklist has no effect.
428   // The blacklist overrides the whitelist.
429   // The unblacklist overrides the blacklist.
430   // Each list is a string of utf8 character strings. Boundaries between
431   // unicharset units are worked out automatically, and characters not in
432   // the unicharset are silently ignored.
433   void set_black_and_whitelist(const char *blacklist, const char *whitelist,
434                                const char *unblacklist);
435 
436   // Set the isalpha property of the given unichar to the given value.
set_isalpha(UNICHAR_ID unichar_id,bool value)437   void set_isalpha(UNICHAR_ID unichar_id, bool value) {
438     unichars[unichar_id].properties.isalpha = value;
439   }
440 
441   // Set the islower property of the given unichar to the given value.
set_islower(UNICHAR_ID unichar_id,bool value)442   void set_islower(UNICHAR_ID unichar_id, bool value) {
443     unichars[unichar_id].properties.islower = value;
444   }
445 
446   // Set the isupper property of the given unichar to the given value.
set_isupper(UNICHAR_ID unichar_id,bool value)447   void set_isupper(UNICHAR_ID unichar_id, bool value) {
448     unichars[unichar_id].properties.isupper = value;
449   }
450 
451   // Set the isdigit property of the given unichar to the given value.
set_isdigit(UNICHAR_ID unichar_id,bool value)452   void set_isdigit(UNICHAR_ID unichar_id, bool value) {
453     unichars[unichar_id].properties.isdigit = value;
454   }
455 
456   // Set the ispunctuation property of the given unichar to the given value.
set_ispunctuation(UNICHAR_ID unichar_id,bool value)457   void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
458     unichars[unichar_id].properties.ispunctuation = value;
459   }
460 
461   // Set the isngram property of the given unichar to the given value.
set_isngram(UNICHAR_ID unichar_id,bool value)462   void set_isngram(UNICHAR_ID unichar_id, bool value) {
463     unichars[unichar_id].properties.isngram = value;
464   }
465 
466   // Set the script name of the given unichar to the given value.
467   // Value is copied and thus can be a temporary;
set_script(UNICHAR_ID unichar_id,const char * value)468   void set_script(UNICHAR_ID unichar_id, const char *value) {
469     unichars[unichar_id].properties.script_id = add_script(value);
470   }
471 
472   // Set other_case unichar id in the properties for the given unichar id.
set_other_case(UNICHAR_ID unichar_id,UNICHAR_ID other_case)473   void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
474     unichars[unichar_id].properties.other_case = other_case;
475   }
476 
477   // Set the direction property of the given unichar to the given value.
set_direction(UNICHAR_ID unichar_id,UNICHARSET::Direction value)478   void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) {
479     unichars[unichar_id].properties.direction = value;
480   }
481 
482   // Set mirror unichar id in the properties for the given unichar id.
set_mirror(UNICHAR_ID unichar_id,UNICHAR_ID mirror)483   void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {
484     unichars[unichar_id].properties.mirror = mirror;
485   }
486 
487   // Record normalized version of unichar with the given unichar_id.
set_normed(UNICHAR_ID unichar_id,const char * normed)488   void set_normed(UNICHAR_ID unichar_id, const char *normed) {
489     unichars[unichar_id].properties.normed = normed;
490     unichars[unichar_id].properties.normed_ids.clear();
491   }
492   // Sets the normed_ids vector from the normed string. normed_ids is not
493   // stored in the file, and needs to be set when the UNICHARSET is loaded.
494   void set_normed_ids(UNICHAR_ID unichar_id);
495 
496   // Return the isalpha property of the given unichar.
get_isalpha(UNICHAR_ID unichar_id)497   bool get_isalpha(UNICHAR_ID unichar_id) const {
498     if (INVALID_UNICHAR_ID == unichar_id) {
499       return false;
500     }
501     ASSERT_HOST(contains_unichar_id(unichar_id));
502     return unichars[unichar_id].properties.isalpha;
503   }
504 
505   // Return the islower property of the given unichar.
get_islower(UNICHAR_ID unichar_id)506   bool get_islower(UNICHAR_ID unichar_id) const {
507     if (INVALID_UNICHAR_ID == unichar_id) {
508       return false;
509     }
510     ASSERT_HOST(contains_unichar_id(unichar_id));
511     return unichars[unichar_id].properties.islower;
512   }
513 
514   // Return the isupper property of the given unichar.
get_isupper(UNICHAR_ID unichar_id)515   bool get_isupper(UNICHAR_ID unichar_id) const {
516     if (INVALID_UNICHAR_ID == unichar_id) {
517       return false;
518     }
519     ASSERT_HOST(contains_unichar_id(unichar_id));
520     return unichars[unichar_id].properties.isupper;
521   }
522 
523   // Return the isdigit property of the given unichar.
get_isdigit(UNICHAR_ID unichar_id)524   bool get_isdigit(UNICHAR_ID unichar_id) const {
525     if (INVALID_UNICHAR_ID == unichar_id) {
526       return false;
527     }
528     ASSERT_HOST(contains_unichar_id(unichar_id));
529     return unichars[unichar_id].properties.isdigit;
530   }
531 
532   // Return the ispunctuation property of the given unichar.
get_ispunctuation(UNICHAR_ID unichar_id)533   bool get_ispunctuation(UNICHAR_ID unichar_id) const {
534     if (INVALID_UNICHAR_ID == unichar_id) {
535       return false;
536     }
537     ASSERT_HOST(contains_unichar_id(unichar_id));
538     return unichars[unichar_id].properties.ispunctuation;
539   }
540 
541   // Return the isngram property of the given unichar.
get_isngram(UNICHAR_ID unichar_id)542   bool get_isngram(UNICHAR_ID unichar_id) const {
543     if (INVALID_UNICHAR_ID == unichar_id) {
544       return false;
545     }
546     ASSERT_HOST(contains_unichar_id(unichar_id));
547     return unichars[unichar_id].properties.isngram;
548   }
549 
550   // Returns whether the unichar id represents a unicode value in the private
551   // use area.
552   bool get_isprivate(UNICHAR_ID unichar_id) const;
553 
554   // Returns true if the ids have useful min/max top/bottom values.
top_bottom_useful()555   bool top_bottom_useful() const {
556     return top_bottom_set_;
557   }
558   // Sets all ranges to empty, so they can be expanded to set the values.
559   void set_ranges_empty();
560   // Sets all the properties for this unicharset given a src_unicharset with
561   // everything set. The unicharsets don't have to be the same, and graphemes
562   // are correctly accounted for.
SetPropertiesFromOther(const UNICHARSET & src)563   void SetPropertiesFromOther(const UNICHARSET &src) {
564     PartialSetPropertiesFromOther(0, src);
565   }
566   // Sets properties from Other, starting only at the given index.
567   void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src);
568   // Expands the tops and bottoms and widths for this unicharset given a
569   // src_unicharset with ranges in it. The unicharsets don't have to be the
570   // same, and graphemes are correctly accounted for.
571   void ExpandRangesFromOther(const UNICHARSET &src);
  // Makes this a copy of src. Clears this completely first, so the automatic
  // ids will not be present in this if not in src.
574   void CopyFrom(const UNICHARSET &src);
575   // For each id in src, if it does not occur in this, add it, as in
576   // SetPropertiesFromOther, otherwise expand the ranges, as in
577   // ExpandRangesFromOther.
578   void AppendOtherUnicharset(const UNICHARSET &src);
579   // Returns true if the acceptable ranges of the tops of the characters do
580   // not overlap, making their x-height calculations distinct.
581   bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const;
582   // Returns the min and max bottom and top of the given unichar in
583   // baseline-normalized coordinates, ie, where the baseline is
584   // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
585   // (See normalis.h for the definitions).
get_top_bottom(UNICHAR_ID unichar_id,int * min_bottom,int * max_bottom,int * min_top,int * max_top)586   void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom,
587                       int *min_top, int *max_top) const {
588     if (INVALID_UNICHAR_ID == unichar_id) {
589       *min_bottom = *min_top = 0;
590       *max_bottom = *max_top = 256; // kBlnCellHeight
591       return;
592     }
593     ASSERT_HOST(contains_unichar_id(unichar_id));
594     *min_bottom = unichars[unichar_id].properties.min_bottom;
595     *max_bottom = unichars[unichar_id].properties.max_bottom;
596     *min_top = unichars[unichar_id].properties.min_top;
597     *max_top = unichars[unichar_id].properties.max_top;
598   }
set_top_bottom(UNICHAR_ID unichar_id,int min_bottom,int max_bottom,int min_top,int max_top)599   void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom,
600                       int min_top, int max_top) {
601     unichars[unichar_id].properties.min_bottom =
602         ClipToRange<int>(min_bottom, 0, UINT8_MAX);
603     unichars[unichar_id].properties.max_bottom =
604         ClipToRange<int>(max_bottom, 0, UINT8_MAX);
605     unichars[unichar_id].properties.min_top =
606         ClipToRange<int>(min_top, 0, UINT8_MAX);
607     unichars[unichar_id].properties.max_top =
608         ClipToRange<int>(max_top, 0, UINT8_MAX);
609   }
610   // Returns the width stats (as mean, sd) of the given unichar relative to the
611   // median advance of all characters in the character set.
get_width_stats(UNICHAR_ID unichar_id,float * width,float * width_sd)612   void get_width_stats(UNICHAR_ID unichar_id, float *width,
613                        float *width_sd) const {
614     if (INVALID_UNICHAR_ID == unichar_id) {
615       *width = 0.0f;
616       *width_sd = 0.0f;
617       ;
618       return;
619     }
620     ASSERT_HOST(contains_unichar_id(unichar_id));
621     *width = unichars[unichar_id].properties.width;
622     *width_sd = unichars[unichar_id].properties.width_sd;
623   }
set_width_stats(UNICHAR_ID unichar_id,float width,float width_sd)624   void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd) {
625     unichars[unichar_id].properties.width = width;
626     unichars[unichar_id].properties.width_sd = width_sd;
627   }
628   // Returns the stats of the x-bearing (as mean, sd) of the given unichar
629   // relative to the median advance of all characters in the character set.
get_bearing_stats(UNICHAR_ID unichar_id,float * bearing,float * bearing_sd)630   void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing,
631                          float *bearing_sd) const {
632     if (INVALID_UNICHAR_ID == unichar_id) {
633       *bearing = *bearing_sd = 0.0f;
634       return;
635     }
636     ASSERT_HOST(contains_unichar_id(unichar_id));
637     *bearing = unichars[unichar_id].properties.bearing;
638     *bearing_sd = unichars[unichar_id].properties.bearing_sd;
639   }
set_bearing_stats(UNICHAR_ID unichar_id,float bearing,float bearing_sd)640   void set_bearing_stats(UNICHAR_ID unichar_id, float bearing,
641                          float bearing_sd) {
642     unichars[unichar_id].properties.bearing = bearing;
643     unichars[unichar_id].properties.bearing_sd = bearing_sd;
644   }
645   // Returns the stats of the x-advance of the given unichar (as mean, sd)
646   // relative to the median advance of all characters in the character set.
get_advance_stats(UNICHAR_ID unichar_id,float * advance,float * advance_sd)647   void get_advance_stats(UNICHAR_ID unichar_id, float *advance,
648                          float *advance_sd) const {
649     if (INVALID_UNICHAR_ID == unichar_id) {
650       *advance = *advance_sd = 0;
651       return;
652     }
653     ASSERT_HOST(contains_unichar_id(unichar_id));
654     *advance = unichars[unichar_id].properties.advance;
655     *advance_sd = unichars[unichar_id].properties.advance_sd;
656   }
set_advance_stats(UNICHAR_ID unichar_id,float advance,float advance_sd)657   void set_advance_stats(UNICHAR_ID unichar_id, float advance,
658                          float advance_sd) {
659     unichars[unichar_id].properties.advance = advance;
660     unichars[unichar_id].properties.advance_sd = advance_sd;
661   }
662   // Returns true if the font metrics properties are empty.
PropertiesIncomplete(UNICHAR_ID unichar_id)663   bool PropertiesIncomplete(UNICHAR_ID unichar_id) const {
664     return unichars[unichar_id].properties.AnyRangeEmpty();
665   }
666 
  // Returns true if the script of the given id is space delimited.
  // Returns false for Han, Thai, Hangul, Hiragana and Katakana scripts.
IsSpaceDelimited(UNICHAR_ID unichar_id)669   bool IsSpaceDelimited(UNICHAR_ID unichar_id) const {
670     if (INVALID_UNICHAR_ID == unichar_id) {
671       return true;
672     }
673     int script_id = get_script(unichar_id);
674     return script_id != han_sid_ && script_id != thai_sid_ &&
675            script_id != hangul_sid_ && script_id != hiragana_sid_ &&
676            script_id != katakana_sid_;
677   }
678 
  // Return the script id of the given unichar.
  // The id is the one stored for the unichar by set_script; the null script
  // id is returned for INVALID_UNICHAR_ID.
get_script(UNICHAR_ID unichar_id)682   int get_script(UNICHAR_ID unichar_id) const {
683     if (INVALID_UNICHAR_ID == unichar_id) {
684       return null_sid_;
685     }
686     ASSERT_HOST(contains_unichar_id(unichar_id));
687     return unichars[unichar_id].properties.script_id;
688   }
689 
690   // Return the character properties, eg. alpha/upper/lower/digit/punct,
691   // as a bit field of unsigned int.
692   unsigned int get_properties(UNICHAR_ID unichar_id) const;
693 
694   // Return the character property as a single char.  If a character has
695   // multiple attributes, the main property is defined by the following order:
696   //   upper_case : 'A'
697   //   lower_case : 'a'
698   //   alpha      : 'x'
699   //   digit      : '0'
700   //   punctuation: 'p'
701   char get_chartype(UNICHAR_ID unichar_id) const;
702 
703   // Get other_case unichar id in the properties for the given unichar id.
get_other_case(UNICHAR_ID unichar_id)704   UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
705     if (INVALID_UNICHAR_ID == unichar_id) {
706       return INVALID_UNICHAR_ID;
707     }
708     ASSERT_HOST(contains_unichar_id(unichar_id));
709     return unichars[unichar_id].properties.other_case;
710   }
711 
712   // Returns the direction property of the given unichar.
get_direction(UNICHAR_ID unichar_id)713   Direction get_direction(UNICHAR_ID unichar_id) const {
714     if (INVALID_UNICHAR_ID == unichar_id) {
715       return UNICHARSET::U_OTHER_NEUTRAL;
716     }
717     ASSERT_HOST(contains_unichar_id(unichar_id));
718     return unichars[unichar_id].properties.direction;
719   }
720 
721   // Get mirror unichar id in the properties for the given unichar id.
get_mirror(UNICHAR_ID unichar_id)722   UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {
723     if (INVALID_UNICHAR_ID == unichar_id) {
724       return INVALID_UNICHAR_ID;
725     }
726     ASSERT_HOST(contains_unichar_id(unichar_id));
727     return unichars[unichar_id].properties.mirror;
728   }
729 
730   // Returns UNICHAR_ID of the corresponding lower-case unichar.
to_lower(UNICHAR_ID unichar_id)731   UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
732     if (INVALID_UNICHAR_ID == unichar_id) {
733       return INVALID_UNICHAR_ID;
734     }
735     ASSERT_HOST(contains_unichar_id(unichar_id));
736     if (unichars[unichar_id].properties.islower) {
737       return unichar_id;
738     }
739     return unichars[unichar_id].properties.other_case;
740   }
741 
742   // Returns UNICHAR_ID of the corresponding upper-case unichar.
to_upper(UNICHAR_ID unichar_id)743   UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
744     if (INVALID_UNICHAR_ID == unichar_id) {
745       return INVALID_UNICHAR_ID;
746     }
747     ASSERT_HOST(contains_unichar_id(unichar_id));
748     if (unichars[unichar_id].properties.isupper) {
749       return unichar_id;
750     }
751     return unichars[unichar_id].properties.other_case;
752   }
753 
754   // Returns true if this UNICHARSET has the special codes in
755   // SpecialUnicharCodes available. If false then there are normal unichars
756   // at these codes and they should not be used.
has_special_codes()757   bool has_special_codes() const {
758     return get_fragment(UNICHAR_BROKEN) != nullptr &&
759            strcmp(id_to_unichar(UNICHAR_BROKEN),
760                   kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
761   }
762 
763   // Returns true if there are any repeated unicodes in the normalized
764   // text of any unichar-id in the unicharset.
765   bool AnyRepeatedUnicodes() const;
766 
767   // Return a pointer to the CHAR_FRAGMENT class if the given
768   // unichar id represents a character fragment.
get_fragment(UNICHAR_ID unichar_id)769   const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
770     if (INVALID_UNICHAR_ID == unichar_id) {
771       return nullptr;
772     }
773     ASSERT_HOST(contains_unichar_id(unichar_id));
774     return unichars[unichar_id].properties.fragment;
775   }
776 
777   // Return the isalpha property of the given unichar representation.
get_isalpha(const char * const unichar_repr)778   bool get_isalpha(const char *const unichar_repr) const {
779     return get_isalpha(unichar_to_id(unichar_repr));
780   }
781 
782   // Return the islower property of the given unichar representation.
get_islower(const char * const unichar_repr)783   bool get_islower(const char *const unichar_repr) const {
784     return get_islower(unichar_to_id(unichar_repr));
785   }
786 
787   // Return the isupper property of the given unichar representation.
get_isupper(const char * const unichar_repr)788   bool get_isupper(const char *const unichar_repr) const {
789     return get_isupper(unichar_to_id(unichar_repr));
790   }
791 
792   // Return the isdigit property of the given unichar representation.
get_isdigit(const char * const unichar_repr)793   bool get_isdigit(const char *const unichar_repr) const {
794     return get_isdigit(unichar_to_id(unichar_repr));
795   }
796 
797   // Return the ispunctuation property of the given unichar representation.
get_ispunctuation(const char * const unichar_repr)798   bool get_ispunctuation(const char *const unichar_repr) const {
799     return get_ispunctuation(unichar_to_id(unichar_repr));
800   }
801 
802   // Return the character properties, eg. alpha/upper/lower/digit/punct,
803   // of the given unichar representation
get_properties(const char * const unichar_repr)804   unsigned int get_properties(const char *const unichar_repr) const {
805     return get_properties(unichar_to_id(unichar_repr));
806   }
807 
get_chartype(const char * const unichar_repr)808   char get_chartype(const char *const unichar_repr) const {
809     return get_chartype(unichar_to_id(unichar_repr));
810   }
811 
  // Return the script id of the given unichar representation.
  // The matching script name can be obtained with get_script_from_script_id;
  // that string is managed by unicharset and thus MUST NOT be deleted.
get_script(const char * const unichar_repr)815   int get_script(const char *const unichar_repr) const {
816     return get_script(unichar_to_id(unichar_repr));
817   }
818 
819   // Return a pointer to the CHAR_FRAGMENT class struct if the given
820   // unichar representation represents a character fragment.
get_fragment(const char * const unichar_repr)821   const CHAR_FRAGMENT *get_fragment(const char *const unichar_repr) const {
822     if (unichar_repr == nullptr || unichar_repr[0] == '\0' ||
823         !ids.contains(unichar_repr, false)) {
824       return nullptr;
825     }
826     return get_fragment(unichar_to_id(unichar_repr));
827   }
828 
829   // Return the isalpha property of the given unichar representation.
830   // Only the first length characters from unichar_repr are used.
get_isalpha(const char * const unichar_repr,int length)831   bool get_isalpha(const char *const unichar_repr, int length) const {
832     return get_isalpha(unichar_to_id(unichar_repr, length));
833   }
834 
835   // Return the islower property of the given unichar representation.
836   // Only the first length characters from unichar_repr are used.
get_islower(const char * const unichar_repr,int length)837   bool get_islower(const char *const unichar_repr, int length) const {
838     return get_islower(unichar_to_id(unichar_repr, length));
839   }
840 
841   // Return the isupper property of the given unichar representation.
842   // Only the first length characters from unichar_repr are used.
get_isupper(const char * const unichar_repr,int length)843   bool get_isupper(const char *const unichar_repr, int length) const {
844     return get_isupper(unichar_to_id(unichar_repr, length));
845   }
846 
847   // Return the isdigit property of the given unichar representation.
848   // Only the first length characters from unichar_repr are used.
get_isdigit(const char * const unichar_repr,int length)849   bool get_isdigit(const char *const unichar_repr, int length) const {
850     return get_isdigit(unichar_to_id(unichar_repr, length));
851   }
852 
853   // Return the ispunctuation property of the given unichar representation.
854   // Only the first length characters from unichar_repr are used.
get_ispunctuation(const char * const unichar_repr,int length)855   bool get_ispunctuation(const char *const unichar_repr, int length) const {
856     return get_ispunctuation(unichar_to_id(unichar_repr, length));
857   }
858 
859   // Returns normalized version of unichar with the given unichar_id.
get_normed_unichar(UNICHAR_ID unichar_id)860   const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
861     if (unichar_id == UNICHAR_SPACE) {
862       return " ";
863     }
864     return unichars[unichar_id].properties.normed.c_str();
865   }
866   // Returns a vector of UNICHAR_IDs that represent the ids of the normalized
867   // version of the given id. There may be more than one UNICHAR_ID in the
868   // vector if unichar_id represents a ligature.
normed_ids(UNICHAR_ID unichar_id)869   const std::vector<UNICHAR_ID> &normed_ids(UNICHAR_ID unichar_id) const {
870     return unichars[unichar_id].properties.normed_ids;
871   }
872 
  // Return the script id of the given unichar representation.
  // Only the first length characters from unichar_repr are used.
  // The matching script name can be obtained with get_script_from_script_id;
  // that string is managed by unicharset and thus MUST NOT be deleted.
get_script(const char * const unichar_repr,int length)877   int get_script(const char *const unichar_repr, int length) const {
878     return get_script(unichar_to_id(unichar_repr, length));
879   }
880 
881   // Return the (current) number of scripts in the script table
get_script_table_size()882   int get_script_table_size() const {
883     return script_table_size_used;
884   }
885 
886   // Return the script string from its id
get_script_from_script_id(int id)887   const char *get_script_from_script_id(int id) const {
888     if (id >= script_table_size_used || id < 0) {
889       return null_script;
890     }
891     return script_table[id];
892   }
893 
894   // Returns the id from the name of the script, or 0 if script is not found.
895   // Note that this is an expensive operation since it involves iteratively
896   // comparing strings in the script table.  To avoid dependency on STL, we
897   // won't use a hash.  Instead, the calling function can use this to lookup
898   // and save the ID for relevant scripts for fast comparisons later.
899   int get_script_id_from_name(const char *script_name) const;
900 
901   // Return true if the given script is the null script
is_null_script(const char * script)902   bool is_null_script(const char *script) const {
903     return script == null_script;
904   }
905 
906   // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
  // then the returned script id will be the same.
908   // The script parameter is copied and thus can be a temporary.
909   int add_script(const char *script);
910 
911   // Return the enabled property of the given unichar.
get_enabled(UNICHAR_ID unichar_id)912   bool get_enabled(UNICHAR_ID unichar_id) const {
913     ASSERT_HOST(contains_unichar_id(unichar_id));
914     return unichars[unichar_id].properties.enabled;
915   }
916 
null_sid()917   int null_sid() const {
918     return null_sid_;
919   }
common_sid()920   int common_sid() const {
921     return common_sid_;
922   }
latin_sid()923   int latin_sid() const {
924     return latin_sid_;
925   }
cyrillic_sid()926   int cyrillic_sid() const {
927     return cyrillic_sid_;
928   }
greek_sid()929   int greek_sid() const {
930     return greek_sid_;
931   }
han_sid()932   int han_sid() const {
933     return han_sid_;
934   }
hiragana_sid()935   int hiragana_sid() const {
936     return hiragana_sid_;
937   }
katakana_sid()938   int katakana_sid() const {
939     return katakana_sid_;
940   }
thai_sid()941   int thai_sid() const {
942     return thai_sid_;
943   }
hangul_sid()944   int hangul_sid() const {
945     return hangul_sid_;
946   }
default_sid()947   int default_sid() const {
948     return default_sid_;
949   }
950 
951   // Returns true if the unicharset has the concept of upper/lower case.
script_has_upper_lower()952   bool script_has_upper_lower() const {
953     return script_has_upper_lower_;
954   }
955   // Returns true if the unicharset has the concept of x-height.
956   // script_has_xheight can be true even if script_has_upper_lower is not,
957   // when the script has a sufficiently predominant top line with ascenders,
958   // such as Devanagari and Thai.
script_has_xheight()959   bool script_has_xheight() const {
960     return script_has_xheight_;
961   }
962 
963 private:
  // Property record kept for a single unichar; one instance is embedded in
  // every UNICHAR_SLOT.
  struct TESS_API UNICHAR_PROPERTIES {
    UNICHAR_PROPERTIES();
    // Initializes all properties to sensible default values.
    void Init();
    // Sets all ranges wide open. Initialization default in case there are
    // no useful values available.
    void SetRangesOpen();
    // Sets all ranges to empty. Used before expanding with font-based data.
    void SetRangesEmpty();
    // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
    // is empty.
    bool AnyRangeEmpty() const;
    // Expands the ranges with the ranges from the src properties.
    void ExpandRangesFrom(const UNICHAR_PROPERTIES &src);
    // Copies the properties from src into this.
    void CopyFrom(const UNICHAR_PROPERTIES &src);

    // Character-class flags. islower/isupper are consulted by
    // to_lower()/to_upper(); enabled is what get_enabled() returns.
    bool isalpha;
    bool islower;
    bool isupper;
    bool isdigit;
    bool ispunctuation;
    bool isngram;
    bool enabled;
    // Possible limits of the top and bottom of the bounding box in
    // baseline-normalized coordinates, ie, where the baseline is
    // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
    // (See normalis.h for the definitions).
    uint8_t min_bottom;
    uint8_t max_bottom;
    uint8_t min_top;
    uint8_t max_top;
    // Statistics of the widths of bounding box, relative to the median advance.
    float width;
    float width_sd;
    // Stats of the x-bearing and advance, also relative to the median advance.
    // advance/advance_sd are read and written through
    // get_advance_stats()/set_advance_stats().
    float bearing;
    float bearing_sd;
    float advance;
    float advance_sd;
    // Script id of this unichar, as returned by get_script().
    int script_id;
    UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar
    Direction direction;   // direction of this unichar
    // Mirror property is useful for reverse DAWG lookup for words in
    // right-to-left languages (e.g. "(word)" would be
    // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string).
    // However, what we want in our DAWG is
    // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not
    // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.
    UNICHAR_ID mirror;
    // A string of unichar_ids that represent the corresponding normed string.
    // For awkward characters like em-dash, this gives hyphen.
    // For ligatures, this gives the string of normal unichars.
    std::vector<UNICHAR_ID> normed_ids;
    std::string normed; // normalized version of this unichar
    // Contains meta information about the fragment if a unichar represents
    // a fragment of a character, otherwise should be set to nullptr.
    // It is assumed that character fragments are added to the unicharset
    // after the corresponding 'base' characters.
    CHAR_FRAGMENT *fragment;
  };
1025 
  // One entry of the unichars vector: the unichar's representation buffer
  // together with its property record.
  struct UNICHAR_SLOT {
    // Fixed-size buffer of UNICHAR_LEN + 1 chars; presumably the
    // null-terminated string form of the unichar (TODO confirm).
    char representation[UNICHAR_LEN + 1];
    // Properties looked up by the accessors via unichars[id].properties.
    UNICHAR_PROPERTIES properties;
  };
1030 
1031   // Internal recursive version of encode_string above.
1032   // str is the start of the whole string.
1033   // str_index is the current position in str.
1034   // str_length is the length of str.
1035   // encoding is a working encoding of str.
1036   // lengths is a working set of lengths of each element of encoding.
1037   // best_total_length is the longest length of str that has been successfully
1038   // encoded so far.
1039   // On return:
1040   // best_encoding contains the encoding that used the longest part of str.
1041   // best_lengths (may be null) contains the lengths of best_encoding.
1042   void encode_string(const char *str, int str_index, int str_length,
1043                      std::vector<UNICHAR_ID> *encoding,
1044                      std::vector<char> *lengths, unsigned *best_total_length,
1045                      std::vector<UNICHAR_ID> *best_encoding,
1046                      std::vector<char> *best_lengths) const;
1047 
1048   // Gets the properties for a grapheme string, combining properties for
1049   // multiple characters in a meaningful way where possible.
1050   // Returns false if no valid match was found in the unicharset.
1051   // NOTE that script_id, mirror, and other_case refer to this unicharset on
1052   // return and will need redirecting if the target unicharset is different.
1053   bool GetStrProperties(const char *utf8_str, UNICHAR_PROPERTIES *props) const;
1054 
1055   // Load ourselves from a "file" where our only interface to the file is
1056   // an implementation of fgets().  This is the parsing primitive accessed by
1057   // the public routines load_from_file().
1058   bool load_via_fgets(const std::function<char *(char *, int)> &fgets_cb,
1059                       bool skip_fragments);
1060 
1061   // List of mappings to make when ingesting strings from the outside.
  // The substitutions clean up text that should exist for rendering of
1063   // synthetic data, but not in the recognition set.
1064   static const char *kCleanupMaps[][2];
1065   static const char *null_script;
1066 
1067   std::vector<UNICHAR_SLOT> unichars;
1068   UNICHARMAP ids;
1069   char **script_table;
1070   int script_table_size_used;
1071   int script_table_size_reserved;
1072   // True if the unichars have their tops/bottoms set.
1073   bool top_bottom_set_;
1074   // True if the unicharset has significant upper/lower case chars.
1075   bool script_has_upper_lower_;
1076   // True if the unicharset has a significant mean-line with significant
1077   // ascenders above that.
1078   bool script_has_xheight_;
1079   // True if the set contains chars that would be changed by the cleanup.
1080   bool old_style_included_;
1081 
  // A few convenient script name-to-id mappings that avoid using a hash.
1083   // These are initialized when unicharset file is loaded.  Anything
1084   // missing from this list can be looked up using get_script_id_from_name.
1085   int null_sid_;
1086   int common_sid_;
1087   int latin_sid_;
1088   int cyrillic_sid_;
1089   int greek_sid_;
1090   int han_sid_;
1091   int hiragana_sid_;
1092   int katakana_sid_;
1093   int thai_sid_;
1094   int hangul_sid_;
1095   // The most frequently occurring script in the charset.
1096   int default_sid_;
1097 };
1098 
1099 } // namespace tesseract
1100 
1101 #endif // TESSERACT_CCUTIL_UNICHARSET_H_
1102