1 ///////////////////////////////////////////////////////////////////////
2 // File:        unicharset.cpp
3 // Description: Unicode character/ligature set class.
4 // Author:      Thomas Kielbus
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18 
19 #include "unicharset.h"
20 
21 #include "params.h"
22 
23 #include <tesseract/unichar.h>
24 #include "serialis.h"
25 
26 #include <algorithm>
27 #include <cassert>
28 #include <cstdio>
29 #include <cstring>
30 #include <iomanip> // for std::setw
31 #include <locale>  // for std::locale::classic
32 #include <sstream> // for std::istringstream, std::ostringstream
33 
34 namespace tesseract {
35 
// Separator character used in the textual form of character fragments.
static constexpr char kSeparator = '|';
// Marker character used in the textual form of 'natural' character fragments.
static constexpr char kNaturalFlag = 'n';

// Bit masks for the boolean character properties packed into one integer
// (see UNICHARSET::get_properties).
static constexpr int ISALPHA_MASK = 0x1;
static constexpr int ISLOWER_MASK = 0x2;
static constexpr int ISUPPER_MASK = 0x4;
static constexpr int ISDIGIT_MASK = 0x8;
static constexpr int ISPUNCTUATION_MASK = 0x10;

// Y coordinate threshold for determining cap-height vs x-height.
// TODO(rays) Bring the global definition down to the ccutil library level,
// so this constant is relative to some other constants.
static constexpr int kMeanlineThreshold = 220;

// Let C be the number of alpha chars whose tops all exceed
// kMeanlineThreshold, and X the number of alpha chars whose tops are all
// below kMeanlineThreshold. The unicharset "has x-height" if
//   X > C * kMinXHeightFraction and C > X * kMinCapHeightFraction,
// or if more than half the alpha characters have upper or lower case.
const double kMinXHeightFraction = 0.25;
const double kMinCapHeightFraction = 0.05;
59 
60 /*static */
61 const char *UNICHARSET::kCustomLigatures[][2] = {
62     {"ct", "\uE003"}, // c + t -> U+E003
63     {"ſh", "\uE006"}, // long-s + h -> U+E006
64     {"ſi", "\uE007"}, // long-s + i -> U+E007
65     {"ſl", "\uE008"}, // long-s + l -> U+E008
66     {"ſſ", "\uE009"}, // long-s + long-s -> U+E009
67     {nullptr, nullptr}};
68 
69 // List of mappings to make when ingesting strings from the outside.
70 // The substitutions clean up text that should exist for rendering of
71 // synthetic data, but not in the recognition set.
72 const char *UNICHARSET::kCleanupMaps[][2] = {
73     {"\u0640", ""},   // TATWEEL is deleted.
74     {"\ufb01", "fi"}, // fi ligature->fi pair.
75     {"\ufb02", "fl"}, // fl ligature->fl pair.
76     {nullptr, nullptr}};
77 
78 // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
79 const char *UNICHARSET::kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT] = {
80     " ", "Joined", "|Broken|0|1"};
81 
82 const char *UNICHARSET::null_script = "NULL";
83 
UNICHAR_PROPERTIES()84 UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() {
85   Init();
86 }
87 
88 // Initialize all properties to sensible default values.
Init()89 void UNICHARSET::UNICHAR_PROPERTIES::Init() {
90   isalpha = false;
91   islower = false;
92   isupper = false;
93   isdigit = false;
94   ispunctuation = false;
95   isngram = false;
96   enabled = false;
97   SetRangesOpen();
98   script_id = 0;
99   other_case = 0;
100   mirror = 0;
101   normed = "";
102   direction = UNICHARSET::U_LEFT_TO_RIGHT;
103   fragment = nullptr;
104 }
105 
106 // Sets all ranges wide open. Initialization default in case there are
107 // no useful values available.
SetRangesOpen()108 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() {
109   min_bottom = 0;
110   max_bottom = UINT8_MAX;
111   min_top = 0;
112   max_top = UINT8_MAX;
113   width = 0.0f;
114   width_sd = 0.0f;
115   bearing = 0.0f;
116   bearing_sd = 0.0f;
117   advance = 0.0f;
118   advance_sd = 0.0f;
119 }
120 
121 // Sets all ranges to empty. Used before expanding with font-based data.
SetRangesEmpty()122 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() {
123   min_bottom = UINT8_MAX;
124   max_bottom = 0;
125   min_top = UINT8_MAX;
126   max_top = 0;
127   width = 0.0f;
128   width_sd = 0.0f;
129   bearing = 0.0f;
130   bearing_sd = 0.0f;
131   advance = 0.0f;
132   advance_sd = 0.0f;
133 }
134 
135 // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
136 // is empty.
AnyRangeEmpty() const137 bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const {
138   return width == 0.0f || advance == 0.0f;
139 }
140 
141 // Expands the ranges with the ranges from the src properties.
ExpandRangesFrom(const UNICHAR_PROPERTIES & src)142 void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom(
143     const UNICHAR_PROPERTIES &src) {
144   UpdateRange(src.min_bottom, &min_bottom, &max_bottom);
145   UpdateRange(src.max_bottom, &min_bottom, &max_bottom);
146   UpdateRange(src.min_top, &min_top, &max_top);
147   UpdateRange(src.max_top, &min_top, &max_top);
148   if (src.width_sd > width_sd) {
149     width = src.width;
150     width_sd = src.width_sd;
151   }
152   if (src.bearing_sd > bearing_sd) {
153     bearing = src.bearing;
154     bearing_sd = src.bearing_sd;
155   }
156   if (src.advance_sd > advance_sd) {
157     advance = src.advance;
158     advance_sd = src.advance_sd;
159   }
160 }
161 
162 // Copies the properties from src into this.
CopyFrom(const UNICHAR_PROPERTIES & src)163 void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES &src) {
164   // Apart from the fragment, everything else can be done with a default copy.
165   CHAR_FRAGMENT *saved_fragment = fragment;
166   *this = src; // Bitwise copy.
167   fragment = saved_fragment;
168 }
169 
UNICHARSET()170 UNICHARSET::UNICHARSET()
171     : ids(), script_table(nullptr), script_table_size_used(0) {
172   clear();
173   for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
174     unichar_insert(kSpecialUnicharCodes[i]);
175     if (i == UNICHAR_JOINED) {
176       set_isngram(i, true);
177     }
178   }
179 }
180 
~UNICHARSET()181 UNICHARSET::~UNICHARSET() {
182   clear();
183 }
184 
185 UNICHAR_ID
unichar_to_id(const char * const unichar_repr) const186 UNICHARSET::unichar_to_id(const char *const unichar_repr) const {
187   std::string cleaned =
188       old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
189   return ids.contains(cleaned.data(), cleaned.size())
190              ? ids.unichar_to_id(cleaned.data(), cleaned.size())
191              : INVALID_UNICHAR_ID;
192 }
193 
unichar_to_id(const char * const unichar_repr,int length) const194 UNICHAR_ID UNICHARSET::unichar_to_id(const char *const unichar_repr,
195                                      int length) const {
196   assert(length > 0 && length <= UNICHAR_LEN);
197   std::string cleaned(unichar_repr, length);
198   if (!old_style_included_) {
199     cleaned = CleanupString(unichar_repr, length);
200   }
201   return ids.contains(cleaned.data(), cleaned.size())
202              ? ids.unichar_to_id(cleaned.data(), cleaned.size())
203              : INVALID_UNICHAR_ID;
204 }
205 
206 // Return the minimum number of bytes that matches a legal UNICHAR_ID,
207 // while leaving the rest of the string encodable. Returns 0 if the
208 // beginning of the string is not encodable.
209 // WARNING: this function now encodes the whole string for precision.
210 // Use encode_string in preference to repeatedly calling step.
step(const char * str) const211 int UNICHARSET::step(const char *str) const {
212   std::vector<UNICHAR_ID> encoding;
213   std::vector<char> lengths;
214   encode_string(str, true, &encoding, &lengths, nullptr);
215   if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) {
216     return 0;
217   }
218   return lengths[0];
219 }
220 
221 // Return whether the given UTF-8 string is encodable with this UNICHARSET.
222 // If not encodable, write the first byte offset which cannot be converted
223 // into the second (return) argument.
encodable_string(const char * str,unsigned * first_bad_position) const224 bool UNICHARSET::encodable_string(const char *str,
225                                   unsigned *first_bad_position) const {
226   std::vector<UNICHAR_ID> encoding;
227   return encode_string(str, true, &encoding, nullptr, first_bad_position);
228 }
229 
230 // Encodes the given UTF-8 string with this UNICHARSET.
231 // Returns true if the encoding succeeds completely, false if there is at
232 // least one INVALID_UNICHAR_ID in the returned encoding, but in this case
233 // the rest of the string is still encoded.
234 // If lengths is not nullptr, then it is filled with the corresponding
235 // byte length of each encoded UNICHAR_ID.
236 // WARNING: Caller must guarantee that str has already been cleaned of codes
237 // that do not belong in the unicharset, or encoding may fail.
238 // Use CleanupString to perform the cleaning.
encode_string(const char * str,bool give_up_on_failure,std::vector<UNICHAR_ID> * encoding,std::vector<char> * lengths,unsigned * encoded_length) const239 bool UNICHARSET::encode_string(const char *str, bool give_up_on_failure,
240                                std::vector<UNICHAR_ID> *encoding,
241                                std::vector<char> *lengths,
242                                unsigned *encoded_length) const {
243   std::vector<UNICHAR_ID> working_encoding;
244   std::vector<char> working_lengths;
245   std::vector<char> best_lengths;
246   encoding->clear(); // Just in case str is empty.
247   auto str_length = strlen(str);
248   unsigned str_pos = 0;
249   bool perfect = true;
250   while (str_pos < str_length) {
251     encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
252                   &str_pos, encoding, &best_lengths);
253     if (str_pos < str_length) {
254       // This is a non-match. Skip one utf-8 character.
255       perfect = false;
256       if (give_up_on_failure) {
257         break;
258       }
259       int step = UNICHAR::utf8_step(str + str_pos);
260       if (step == 0) {
261         step = 1;
262       }
263       encoding->push_back(INVALID_UNICHAR_ID);
264       best_lengths.push_back(step);
265       str_pos += step;
266       working_encoding = *encoding;
267       working_lengths = best_lengths;
268     }
269   }
270   if (lengths != nullptr) {
271     *lengths = best_lengths;
272   }
273   if (encoded_length != nullptr) {
274     *encoded_length = str_pos;
275   }
276   return perfect;
277 }
278 
id_to_unichar(UNICHAR_ID id) const279 const char *UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
280   if (id == INVALID_UNICHAR_ID) {
281     return INVALID_UNICHAR;
282   }
283   ASSERT_HOST(static_cast<unsigned>(id) < this->size());
284   return unichars[id].representation;
285 }
286 
id_to_unichar_ext(UNICHAR_ID id) const287 const char *UNICHARSET::id_to_unichar_ext(UNICHAR_ID id) const {
288   if (id == INVALID_UNICHAR_ID) {
289     return INVALID_UNICHAR;
290   }
291   ASSERT_HOST(static_cast<unsigned>(id) < this->size());
292   // Resolve from the kCustomLigatures table if this is a private encoding.
293   if (get_isprivate(id)) {
294     const char *ch = id_to_unichar(id);
295     for (int i = 0; kCustomLigatures[i][0] != nullptr; ++i) {
296       if (!strcmp(ch, kCustomLigatures[i][1])) {
297         return kCustomLigatures[i][0];
298       }
299     }
300   }
301   // Otherwise return the stored representation.
302   return unichars[id].representation;
303 }
304 
305 // Return a string that reformats the utf8 str into the str followed
306 // by its hex unicodes.
debug_utf8_str(const char * str)307 std::string UNICHARSET::debug_utf8_str(const char *str) {
308   std::string result = str;
309   result += " [";
310   int step = 1;
311   // Chop into unicodes and code each as hex.
312   for (int i = 0; str[i] != '\0'; i += step) {
313     char hex[sizeof(int) * 2 + 1];
314     step = UNICHAR::utf8_step(str + i);
315     if (step == 0) {
316       step = 1;
317       sprintf(hex, "%x", str[i]);
318     } else {
319       UNICHAR ch(str + i, step);
320       sprintf(hex, "%x", ch.first_uni());
321     }
322     result += hex;
323     result += " ";
324   }
325   result += "]";
326   return result;
327 }
328 
329 // Return a string containing debug information on the unichar, including
330 // the id_to_unichar, its hex unicodes and the properties.
debug_str(UNICHAR_ID id) const331 std::string UNICHARSET::debug_str(UNICHAR_ID id) const {
332   if (id == INVALID_UNICHAR_ID) {
333     return std::string(id_to_unichar(id));
334   }
335   const CHAR_FRAGMENT *fragment = this->get_fragment(id);
336   if (fragment) {
337     return fragment->to_string();
338   }
339   const char *str = id_to_unichar(id);
340   std::string result = debug_utf8_str(str);
341   // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
342   if (get_isalpha(id)) {
343     if (get_islower(id)) {
344       result += "a";
345     } else if (get_isupper(id)) {
346       result += "A";
347     } else {
348       result += "x";
349     }
350   }
351   // Append 0 if a digit.
352   if (get_isdigit(id)) {
353     result += "0";
354   }
355   // Append p is a punctuation symbol.
356   if (get_ispunctuation(id)) {
357     result += "p";
358   }
359   return result;
360 }
361 
362 // Sets the normed_ids vector from the normed string. normed_ids is not
363 // stored in the file, and needs to be set when the UNICHARSET is loaded.
set_normed_ids(UNICHAR_ID unichar_id)364 void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) {
365   unichars[unichar_id].properties.normed_ids.clear();
366   if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
367     unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
368   } else if (!encode_string(unichars[unichar_id].properties.normed.c_str(),
369                             true, &unichars[unichar_id].properties.normed_ids,
370                             nullptr, nullptr)) {
371     unichars[unichar_id].properties.normed_ids.clear();
372     unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
373   }
374 }
375 
376 // Returns whether the unichar id represents a unicode value in the private use
377 // area. We use this range only internally to represent uncommon ligatures
378 // (eg. 'ct') that do not have regular unicode values.
get_isprivate(UNICHAR_ID unichar_id) const379 bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const {
380   UNICHAR uc(id_to_unichar(unichar_id), -1);
381   int uni = uc.first_uni();
382   return (uni >= 0xE000 && uni <= 0xF8FF);
383 }
384 
385 // Sets all ranges to empty, so they can be expanded to set the values.
set_ranges_empty()386 void UNICHARSET::set_ranges_empty() {
387   for (auto &uc : unichars) {
388     uc.properties.SetRangesEmpty();
389   }
390 }
391 
392 // Sets all the properties for this unicharset given a src unicharset with
393 // everything set. The unicharsets don't have to be the same, and graphemes
394 // are correctly accounted for.
PartialSetPropertiesFromOther(int start_index,const UNICHARSET & src)395 void UNICHARSET::PartialSetPropertiesFromOther(int start_index,
396                                                const UNICHARSET &src) {
397   for (unsigned ch = start_index; ch < unichars.size(); ++ch) {
398     const char *utf8 = id_to_unichar(ch);
399     UNICHAR_PROPERTIES properties;
400     if (src.GetStrProperties(utf8, &properties)) {
401       // Setup the script_id, other_case, and mirror properly.
402       const char *script = src.get_script_from_script_id(properties.script_id);
403       properties.script_id = add_script(script);
404       const char *other_case = src.id_to_unichar(properties.other_case);
405       if (contains_unichar(other_case)) {
406         properties.other_case = unichar_to_id(other_case);
407       } else {
408         properties.other_case = ch;
409       }
410       const char *mirror_str = src.id_to_unichar(properties.mirror);
411       if (contains_unichar(mirror_str)) {
412         properties.mirror = unichar_to_id(mirror_str);
413       } else {
414         properties.mirror = ch;
415       }
416       unichars[ch].properties.CopyFrom(properties);
417       set_normed_ids(ch);
418     }
419   }
420 }
421 
422 // Expands the tops and bottoms and widths for this unicharset given a
423 // src unicharset with ranges in it. The unicharsets don't have to be the
424 // same, and graphemes are correctly accounted for.
ExpandRangesFromOther(const UNICHARSET & src)425 void UNICHARSET::ExpandRangesFromOther(const UNICHARSET &src) {
426   for (unsigned ch = 0; ch < unichars.size(); ++ch) {
427     const char *utf8 = id_to_unichar(ch);
428     UNICHAR_PROPERTIES properties;
429     if (src.GetStrProperties(utf8, &properties)) {
430       // Expand just the ranges from properties.
431       unichars[ch].properties.ExpandRangesFrom(properties);
432     }
433   }
434 }
435 
436 // Makes this a copy of src. Clears this completely first, so the automatic
437 // ids will not be present in this if not in src. Does NOT reorder the set!
CopyFrom(const UNICHARSET & src)438 void UNICHARSET::CopyFrom(const UNICHARSET &src) {
439   clear();
440   for (unsigned ch = 0; ch < src.unichars.size(); ++ch) {
441     const UNICHAR_PROPERTIES &src_props = src.unichars[ch].properties;
442     const char *utf8 = src.id_to_unichar(ch);
443     unichar_insert_backwards_compatible(utf8);
444     unichars[ch].properties.ExpandRangesFrom(src_props);
445   }
446   // Set properties, including mirror and other_case, WITHOUT reordering
447   // the unicharset.
448   PartialSetPropertiesFromOther(0, src);
449 }
450 
451 // For each id in src, if it does not occur in this, add it, as in
452 // SetPropertiesFromOther, otherwise expand the ranges, as in
453 // ExpandRangesFromOther.
AppendOtherUnicharset(const UNICHARSET & src)454 void UNICHARSET::AppendOtherUnicharset(const UNICHARSET &src) {
455   int initial_used = unichars.size();
456   for (unsigned ch = 0; ch < src.unichars.size(); ++ch) {
457     const UNICHAR_PROPERTIES &src_props = src.unichars[ch].properties;
458     const char *utf8 = src.id_to_unichar(ch);
459     int id = unichars.size();
460     if (contains_unichar(utf8)) {
461       id = unichar_to_id(utf8);
462       // Just expand current ranges.
463       unichars[id].properties.ExpandRangesFrom(src_props);
464     } else {
465       unichar_insert_backwards_compatible(utf8);
466       unichars[id].properties.SetRangesEmpty();
467     }
468   }
469   // Set properties, including mirror and other_case, WITHOUT reordering
470   // the unicharset.
471   PartialSetPropertiesFromOther(initial_used, src);
472 }
473 
474 // Returns true if the acceptable ranges of the tops of the characters do
475 // not overlap, making their x-height calculations distinct.
SizesDistinct(UNICHAR_ID id1,UNICHAR_ID id2) const476 bool UNICHARSET::SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const {
477   int overlap = std::min(unichars[id1].properties.max_top,
478                          unichars[id2].properties.max_top) -
479                 std::max(unichars[id1].properties.min_top,
480                          unichars[id2].properties.min_top);
481   return overlap <= 0;
482 }
483 
484 // Internal recursive version of encode_string above.
485 // Seeks to encode the given string as a sequence of UNICHAR_IDs such that
486 // each UNICHAR_ID uses the least possible part of the utf8 str.
487 // It does this by depth-first tail recursion on increasing length matches
488 // to the UNICHARSET, saving the first encountered result that encodes the
489 // maximum total length of str. It stops on a failure to encode to make
490 // the overall process of encoding a partially failed string more efficient.
491 // See unicharset.h for definition of the args.
encode_string(const char * str,int str_index,int str_length,std::vector<UNICHAR_ID> * encoding,std::vector<char> * lengths,unsigned * best_total_length,std::vector<UNICHAR_ID> * best_encoding,std::vector<char> * best_lengths) const492 void UNICHARSET::encode_string(const char *str, int str_index, int str_length,
493                                std::vector<UNICHAR_ID> *encoding,
494                                std::vector<char> *lengths,
495                                unsigned *best_total_length,
496                                std::vector<UNICHAR_ID> *best_encoding,
497                                std::vector<char> *best_lengths) const {
498   if (str_index > static_cast<int>(*best_total_length)) {
499     // This is the best result so far.
500     *best_total_length = str_index;
501     *best_encoding = *encoding;
502     if (best_lengths != nullptr) {
503       *best_lengths = *lengths;
504     }
505   }
506   if (str_index == str_length) {
507     return;
508   }
509   int encoding_index = encoding->size();
510   // Find the length of the first matching unicharset member.
511   int length = ids.minmatch(str + str_index);
512   if (length == 0 || str_index + length > str_length) {
513     return;
514   }
515   do {
516     if (ids.contains(str + str_index, length)) {
517       // Successful encoding so far.
518       UNICHAR_ID id = ids.unichar_to_id(str + str_index, length);
519       encoding->push_back(id);
520       lengths->push_back(length);
521       encode_string(str, str_index + length, str_length, encoding, lengths,
522                     best_total_length, best_encoding, best_lengths);
523       if (static_cast<int>(*best_total_length) == str_length) {
524         return; // Tail recursion success!
525       }
526       // Failed with that length, truncate back and try again.
527       encoding->resize(encoding_index);
528       lengths->resize(encoding_index);
529     }
530     int step = UNICHAR::utf8_step(str + str_index + length);
531     if (step == 0) {
532       step = 1;
533     }
534     length += step;
535   } while (length <= UNICHAR_LEN && str_index + length <= str_length);
536 }
537 
538 // Gets the properties for a grapheme string, combining properties for
539 // multiple characters in a meaningful way where possible.
540 // Returns false if no valid match was found in the unicharset.
541 // NOTE that script_id, mirror, and other_case refer to this unicharset on
542 // return and will need translation if the target unicharset is different.
GetStrProperties(const char * utf8_str,UNICHAR_PROPERTIES * props) const543 bool UNICHARSET::GetStrProperties(const char *utf8_str,
544                                   UNICHAR_PROPERTIES *props) const {
545   props->Init();
546   props->SetRangesEmpty();
547   int total_unicodes = 0;
548   std::vector<UNICHAR_ID> encoding;
549   if (!encode_string(utf8_str, true, &encoding, nullptr, nullptr)) {
550     return false; // Some part was invalid.
551   }
552   for (auto it : encoding) {
553     int id = it;
554     const UNICHAR_PROPERTIES &src_props = unichars[id].properties;
555     // Logical OR all the bools.
556     if (src_props.isalpha) {
557       props->isalpha = true;
558     }
559     if (src_props.islower) {
560       props->islower = true;
561     }
562     if (src_props.isupper) {
563       props->isupper = true;
564     }
565     if (src_props.isdigit) {
566       props->isdigit = true;
567     }
568     if (src_props.ispunctuation) {
569       props->ispunctuation = true;
570     }
571     if (src_props.isngram) {
572       props->isngram = true;
573     }
574     if (src_props.enabled) {
575       props->enabled = true;
576     }
577     // Min/max the tops/bottoms.
578     UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom);
579     UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom);
580     UpdateRange(src_props.min_top, &props->min_top, &props->max_top);
581     UpdateRange(src_props.max_top, &props->min_top, &props->max_top);
582     float bearing = props->advance + src_props.bearing;
583     if (total_unicodes == 0 || bearing < props->bearing) {
584       props->bearing = bearing;
585       props->bearing_sd = props->advance_sd + src_props.bearing_sd;
586     }
587     props->advance += src_props.advance;
588     props->advance_sd += src_props.advance_sd;
589     // With a single width, just use the widths stored in the unicharset.
590     props->width = src_props.width;
591     props->width_sd = src_props.width_sd;
592     // Use the first script id, other_case, mirror, direction.
593     // Note that these will need translation, except direction.
594     if (total_unicodes == 0) {
595       props->script_id = src_props.script_id;
596       props->other_case = src_props.other_case;
597       props->mirror = src_props.mirror;
598       props->direction = src_props.direction;
599     }
600     // The normed string for the compound character is the concatenation of
601     // the normed versions of the individual characters.
602     props->normed += src_props.normed;
603     ++total_unicodes;
604   }
605   if (total_unicodes > 1) {
606     // Estimate the total widths from the advance - bearing.
607     props->width = props->advance - props->bearing;
608     props->width_sd = props->advance_sd + props->bearing_sd;
609   }
610   return total_unicodes > 0;
611 }
612 
613 // TODO(rays) clean-up the order of functions to match unicharset.h.
614 
get_properties(UNICHAR_ID id) const615 unsigned int UNICHARSET::get_properties(UNICHAR_ID id) const {
616   unsigned int properties = 0;
617   if (this->get_isalpha(id)) {
618     properties |= ISALPHA_MASK;
619   }
620   if (this->get_islower(id)) {
621     properties |= ISLOWER_MASK;
622   }
623   if (this->get_isupper(id)) {
624     properties |= ISUPPER_MASK;
625   }
626   if (this->get_isdigit(id)) {
627     properties |= ISDIGIT_MASK;
628   }
629   if (this->get_ispunctuation(id)) {
630     properties |= ISPUNCTUATION_MASK;
631   }
632   return properties;
633 }
634 
get_chartype(UNICHAR_ID id) const635 char UNICHARSET::get_chartype(UNICHAR_ID id) const {
636   if (this->get_isupper(id)) {
637     return 'A';
638   }
639   if (this->get_islower(id)) {
640     return 'a';
641   }
642   if (this->get_isalpha(id)) {
643     return 'x';
644   }
645   if (this->get_isdigit(id)) {
646     return '0';
647   }
648   if (this->get_ispunctuation(id)) {
649     return 'p';
650   }
651   return 0;
652 }
653 
unichar_insert(const char * const unichar_repr,OldUncleanUnichars old_style)654 void UNICHARSET::unichar_insert(const char *const unichar_repr,
655                                 OldUncleanUnichars old_style) {
656   if (old_style == OldUncleanUnichars::kTrue) {
657     old_style_included_ = true;
658   }
659   std::string cleaned =
660       old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
661   if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
662     const char *str = cleaned.c_str();
663     std::vector<int> encoding;
664     if (!old_style_included_ &&
665         encode_string(str, true, &encoding, nullptr, nullptr)) {
666       return;
667     }
668     unichars.emplace_back();
669     auto &u = unichars.back();
670     int index = 0;
671     do {
672       if (index >= UNICHAR_LEN) {
673         fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
674                 unichar_repr);
675         return;
676       }
677       u.representation[index++] = *str++;
678     } while (*str != '\0');
679     u.representation[index] = '\0';
680     this->set_script(unichars.size() - 1, null_script);
681     // If the given unichar_repr represents a fragmented character, set
682     // fragment property to a pointer to CHAR_FRAGMENT class instance with
683     // information parsed from the unichar representation. Use the script
684     // of the base unichar for the fragmented character if possible.
685     CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(u.representation);
686     u.properties.fragment = frag;
687     if (frag != nullptr && this->contains_unichar(frag->get_unichar())) {
688       u.properties.script_id = this->get_script(frag->get_unichar());
689     }
690     u.properties.enabled = true;
691     ids.insert(u.representation, unichars.size() - 1);
692   }
693 }
694 
contains_unichar(const char * const unichar_repr) const695 bool UNICHARSET::contains_unichar(const char *const unichar_repr) const {
696   std::string cleaned =
697       old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
698   return ids.contains(cleaned.data(), cleaned.size());
699 }
700 
contains_unichar(const char * const unichar_repr,int length) const701 bool UNICHARSET::contains_unichar(const char *const unichar_repr,
702                                   int length) const {
703   if (length == 0) {
704     return false;
705   }
706   std::string cleaned(unichar_repr, length);
707   if (!old_style_included_) {
708     cleaned = CleanupString(unichar_repr, length);
709   }
710   return ids.contains(cleaned.data(), cleaned.size());
711 }
712 
eq(UNICHAR_ID unichar_id,const char * const unichar_repr) const713 bool UNICHARSET::eq(UNICHAR_ID unichar_id,
714                     const char *const unichar_repr) const {
715   return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
716 }
717 
save_to_string(std::string & str) const718 bool UNICHARSET::save_to_string(std::string &str) const {
719   const int kFileBufSize = 1024;
720   char buffer[kFileBufSize + 1];
721   snprintf(buffer, kFileBufSize, "%zu\n", this->size());
722   str = buffer;
723   for (unsigned id = 0; id < this->size(); ++id) {
724     int min_bottom, max_bottom, min_top, max_top;
725     get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
726     float width, width_sd;
727     get_width_stats(id, &width, &width_sd);
728     float bearing, bearing_sd;
729     get_bearing_stats(id, &bearing, &bearing_sd);
730     float advance, advance_sd;
731     get_advance_stats(id, &advance, &advance_sd);
732     unsigned int properties = this->get_properties(id);
733     if (strcmp(this->id_to_unichar(id), " ") == 0) {
734       snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
735                this->get_script_from_script_id(this->get_script(id)),
736                this->get_other_case(id));
737       str += buffer;
738     } else {
739       std::ostringstream stream;
740       stream.imbue(std::locale::classic());
741       stream << this->id_to_unichar(id) << ' ' << properties << ' '
742              << min_bottom << ',' << max_bottom << ',' << min_top << ','
743              << max_top << ',' << width << ',' << width_sd << ',' << bearing
744              << ',' << bearing_sd << ',' << advance << ',' << advance_sd << ' '
745              << this->get_script_from_script_id(this->get_script(id)) << ' '
746              << this->get_other_case(id) << ' ' << this->get_direction(id)
747              << ' ' << this->get_mirror(id) << ' '
748              << this->get_normed_unichar(id) << "\t# "
749              << this->debug_str(id).c_str() << '\n';
750       str += stream.str().c_str();
751     }
752   }
753   return true;
754 }
755 
// Small adapter that gives a C FILE stream an fgets(dst, size) member
// function. It only stores the pointer; it never opens or closes the stream.
class LocalFilePointer {
  FILE *fp_;

public:
  LocalFilePointer(FILE *stream) : fp_{stream} {}

  // Forwards to ::fgets on the wrapped stream and returns its result.
  char *fgets(char *dst, int size) {
    char *const line = ::fgets(dst, size, fp_);
    return line;
  }
};
766 
load_from_file(FILE * file,bool skip_fragments)767 bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
768   LocalFilePointer lfp(file);
769   using namespace std::placeholders; // for _1, _2
770   std::function<char *(char *, int)> fgets_cb =
771       std::bind(&LocalFilePointer::fgets, &lfp, _1, _2);
772   bool success = load_via_fgets(fgets_cb, skip_fragments);
773   return success;
774 }
775 
load_from_file(tesseract::TFile * file,bool skip_fragments)776 bool UNICHARSET::load_from_file(tesseract::TFile *file, bool skip_fragments) {
777   using namespace std::placeholders; // for _1, _2
778   std::function<char *(char *, int)> fgets_cb =
779       std::bind(&tesseract::TFile::FGets, file, _1, _2);
780   bool success = load_via_fgets(fgets_cb, skip_fragments);
781   return success;
782 }
783 
load_via_fgets(const std::function<char * (char *,int)> & fgets_cb,bool skip_fragments)784 bool UNICHARSET::load_via_fgets(
785     const std::function<char *(char *, int)> &fgets_cb, bool skip_fragments) {
786   int unicharset_size;
787   char buffer[256];
788 
789   this->clear();
790   if (fgets_cb(buffer, sizeof(buffer)) == nullptr ||
791       sscanf(buffer, "%d", &unicharset_size) != 1) {
792     return false;
793   }
794   for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
795     char unichar[256];
796     unsigned int properties;
797     char script[64];
798 
799     strncpy(script, null_script, sizeof(script) - 1);
800     int min_bottom = 0;
801     int max_bottom = UINT8_MAX;
802     int min_top = 0;
803     int max_top = UINT8_MAX;
804     float width = 0.0f;
805     float width_sd = 0.0f;
806     float bearing = 0.0f;
807     float bearing_sd = 0.0f;
808     float advance = 0.0f;
809     float advance_sd = 0.0f;
810     // TODO(eger): check that this default it ok
811     // after enabling BiDi iterator for Arabic.
812     int direction = UNICHARSET::U_LEFT_TO_RIGHT;
813     UNICHAR_ID other_case = unicharset_size;
814     UNICHAR_ID mirror = unicharset_size;
815     if (fgets_cb(buffer, sizeof(buffer)) == nullptr) {
816       return false;
817     }
818     char normed[64];
819     normed[0] = '\0';
820     std::istringstream stream(buffer);
821     stream.imbue(std::locale::classic());
822     // 标 1 0,255,0,255,0,0,0,0,0,0 Han 68 0 68 标  # 标 [6807 ]x
823     // stream.flags(std::ios::hex);
824     stream >> std::setw(255) >> unichar >> std::hex >> properties >> std::dec;
825     // stream.flags(std::ios::dec);
826     if (stream.fail()) {
827       fprintf(stderr, "%s:%u failed\n", __FILE__, __LINE__);
828       return false;
829     }
830     auto position = stream.tellg();
831     stream.seekg(position);
832     char c1, c2, c3, c4, c5, c6, c7, c8, c9;
833     stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >>
834         max_top >> c4 >> width >> c5 >> width_sd >> c6 >> bearing >> c7 >>
835         bearing_sd >> c8 >> advance >> c9 >> advance_sd >> std::setw(63) >>
836         script >> other_case >> direction >> mirror >> std::setw(63) >> normed;
837     if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
838         c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
839       stream.clear();
840       stream.seekg(position);
841       stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >>
842           max_top >> c4 >> width >> c5 >> width_sd >> c6 >> bearing >> c7 >>
843           bearing_sd >> c8 >> advance >> c9 >> advance_sd >> std::setw(63) >>
844           script >> other_case >> direction >> mirror;
845       if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
846           c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
847         stream.clear();
848         stream.seekg(position);
849         stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >>
850             max_top >> std::setw(63) >> script >> other_case >> direction >>
851             mirror;
852         if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
853           stream.clear();
854           stream.seekg(position);
855           stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >>
856               max_top >> std::setw(63) >> script >> other_case;
857           if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
858             stream.clear();
859             stream.seekg(position);
860             stream >> std::setw(63) >> script >> other_case;
861             if (stream.fail()) {
862               stream.clear();
863               stream.seekg(position);
864               stream >> std::setw(63) >> script;
865             }
866           }
867         }
868       }
869     }
870 
871     // Skip fragments if needed.
872     CHAR_FRAGMENT *frag = nullptr;
873     if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {
874       int num_pieces = frag->get_total();
875       delete frag;
876       // Skip multi-element fragments, but keep singles like UNICHAR_BROKEN in.
877       if (num_pieces > 1) {
878         continue;
879       }
880     }
881     // Insert unichar into unicharset and set its properties.
882     if (strcmp(unichar, "NULL") == 0) {
883       this->unichar_insert(" ");
884     } else {
885       this->unichar_insert_backwards_compatible(unichar);
886     }
887 
888     this->set_isalpha(id, properties & ISALPHA_MASK);
889     this->set_islower(id, properties & ISLOWER_MASK);
890     this->set_isupper(id, properties & ISUPPER_MASK);
891     this->set_isdigit(id, properties & ISDIGIT_MASK);
892     this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK);
893     this->set_isngram(id, false);
894     this->set_script(id, script);
895     this->unichars[id].properties.enabled = true;
896     this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top);
897     this->set_width_stats(id, width, width_sd);
898     this->set_bearing_stats(id, bearing, bearing_sd);
899     this->set_advance_stats(id, advance, advance_sd);
900     this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
901     this->set_other_case(id, (other_case < unicharset_size) ? other_case : id);
902     this->set_mirror(id, (mirror < unicharset_size) ? mirror : id);
903     this->set_normed(id, normed[0] != '\0' ? normed : unichar);
904   }
905   post_load_setup();
906   return true;
907 }
908 
909 // Sets up internal data after loading the file, based on the char
910 // properties. Called from load_from_file, but also needs to be run
911 // during set_unicharset_properties.
post_load_setup()912 void UNICHARSET::post_load_setup() {
913   // Number of alpha chars with the case property minus those without,
914   // in order to determine that half the alpha chars have case.
915   int net_case_alphas = 0;
916   int x_height_alphas = 0;
917   int cap_height_alphas = 0;
918   top_bottom_set_ = false;
919   for (unsigned id = 0; id < unichars.size(); ++id) {
920     int min_bottom = 0;
921     int max_bottom = UINT8_MAX;
922     int min_top = 0;
923     int max_top = UINT8_MAX;
924     get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
925     if (min_top > 0) {
926       top_bottom_set_ = true;
927     }
928     if (get_isalpha(id)) {
929       if (get_islower(id) || get_isupper(id)) {
930         ++net_case_alphas;
931       } else {
932         --net_case_alphas;
933       }
934       if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold) {
935         ++x_height_alphas;
936       } else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold) {
937         ++cap_height_alphas;
938       }
939     }
940     set_normed_ids(id);
941   }
942 
943   script_has_upper_lower_ = net_case_alphas > 0;
944   script_has_xheight_ =
945       script_has_upper_lower_ ||
946       (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
947        cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
948 
949   null_sid_ = get_script_id_from_name(null_script);
950   ASSERT_HOST(null_sid_ == 0);
951   common_sid_ = get_script_id_from_name("Common");
952   latin_sid_ = get_script_id_from_name("Latin");
953   cyrillic_sid_ = get_script_id_from_name("Cyrillic");
954   greek_sid_ = get_script_id_from_name("Greek");
955   han_sid_ = get_script_id_from_name("Han");
956   hiragana_sid_ = get_script_id_from_name("Hiragana");
957   katakana_sid_ = get_script_id_from_name("Katakana");
958   thai_sid_ = get_script_id_from_name("Thai");
959   hangul_sid_ = get_script_id_from_name("Hangul");
960 
961   // Compute default script. Use the highest-counting alpha script, that is
962   // not the common script, as that still contains some "alphas".
963   int *script_counts = new int[script_table_size_used];
964   memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
965   for (unsigned id = 0; id < unichars.size(); ++id) {
966     if (get_isalpha(id)) {
967       ++script_counts[get_script(id)];
968     }
969   }
970   default_sid_ = 0;
971   for (int s = 1; s < script_table_size_used; ++s) {
972     if (script_counts[s] > script_counts[default_sid_] && s != common_sid_) {
973       default_sid_ = s;
974     }
975   }
976   delete[] script_counts;
977 }
978 
979 // Returns true if right_to_left scripts are significant in the unicharset,
980 // but without being so sensitive that "universal" unicharsets containing
981 // characters from many scripts, like orientation and script detection,
982 // look like they are right_to_left.
major_right_to_left() const983 bool UNICHARSET::major_right_to_left() const {
984   int ltr_count = 0;
985   int rtl_count = 0;
986   for (unsigned id = 0; id < unichars.size(); ++id) {
987     int dir = get_direction(id);
988     if (dir == UNICHARSET::U_LEFT_TO_RIGHT) {
989       ltr_count++;
990     }
991     if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
992         dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
993         dir == UNICHARSET::U_ARABIC_NUMBER) {
994       rtl_count++;
995     }
996   }
997   return rtl_count > ltr_count;
998 }
999 
1000 // Set a whitelist and/or blacklist of characters to recognize.
1001 // An empty or nullptr whitelist enables everything (minus any blacklist).
1002 // An empty or nullptr blacklist disables nothing.
1003 // An empty or nullptr blacklist has no effect.
set_black_and_whitelist(const char * blacklist,const char * whitelist,const char * unblacklist)1004 void UNICHARSET::set_black_and_whitelist(const char *blacklist,
1005                                          const char *whitelist,
1006                                          const char *unblacklist) {
1007   bool def_enabled = whitelist == nullptr || whitelist[0] == '\0';
1008   // Set everything to default
1009   for (auto &uc : unichars) {
1010     uc.properties.enabled = def_enabled;
1011   }
1012   if (!def_enabled) {
1013     // Enable the whitelist.
1014     std::vector<UNICHAR_ID> encoding;
1015     encode_string(whitelist, false, &encoding, nullptr, nullptr);
1016     for (auto it : encoding) {
1017       if (it != INVALID_UNICHAR_ID) {
1018         unichars[it].properties.enabled = true;
1019       }
1020     }
1021   }
1022   if (blacklist != nullptr && blacklist[0] != '\0') {
1023     // Disable the blacklist.
1024     std::vector<UNICHAR_ID> encoding;
1025     encode_string(blacklist, false, &encoding, nullptr, nullptr);
1026     for (auto it : encoding) {
1027       if (it != INVALID_UNICHAR_ID) {
1028         unichars[it].properties.enabled = false;
1029       }
1030     }
1031   }
1032   if (unblacklist != nullptr && unblacklist[0] != '\0') {
1033     // Re-enable the unblacklist.
1034     std::vector<UNICHAR_ID> encoding;
1035     encode_string(unblacklist, false, &encoding, nullptr, nullptr);
1036     for (auto it : encoding) {
1037       if (it != INVALID_UNICHAR_ID) {
1038         unichars[it].properties.enabled = true;
1039       }
1040     }
1041   }
1042 }
1043 
1044 // Returns true if there are any repeated unicodes in the normalized
1045 // text of any unichar-id in the unicharset.
AnyRepeatedUnicodes() const1046 bool UNICHARSET::AnyRepeatedUnicodes() const {
1047   int start_id = 0;
1048   if (has_special_codes()) {
1049     start_id = SPECIAL_UNICHAR_CODES_COUNT;
1050   }
1051   for (unsigned id = start_id; id < unichars.size(); ++id) {
1052     // Convert to unicodes.
1053     std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(get_normed_unichar(id));
1054     for (size_t u = 1; u < unicodes.size(); ++u) {
1055       if (unicodes[u - 1] == unicodes[u]) {
1056         return true;
1057       }
1058     }
1059   }
1060   return false;
1061 }
1062 
add_script(const char * script)1063 int UNICHARSET::add_script(const char *script) {
1064   for (int i = 0; i < script_table_size_used; ++i) {
1065     if (strcmp(script, script_table[i]) == 0) {
1066       return i;
1067     }
1068   }
1069   if (script_table_size_reserved == 0) {
1070     script_table_size_reserved = 8;
1071     script_table = new char *[script_table_size_reserved];
1072   } else if (script_table_size_used >= script_table_size_reserved) {
1073     assert(script_table_size_used == script_table_size_reserved);
1074     script_table_size_reserved += script_table_size_reserved;
1075     char **new_script_table = new char *[script_table_size_reserved];
1076     memcpy(new_script_table, script_table,
1077            script_table_size_used * sizeof(char *));
1078     delete[] script_table;
1079     script_table = new_script_table;
1080   }
1081   script_table[script_table_size_used] = new char[strlen(script) + 1];
1082   strcpy(script_table[script_table_size_used], script);
1083   return script_table_size_used++;
1084 }
1085 
1086 // Returns the string that represents a fragment
1087 // with the given unichar, pos and total.
to_string(const char * unichar,int pos,int total,bool natural)1088 std::string CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total,
1089                                      bool natural) {
1090   if (total == 1) {
1091     return std::string(unichar);
1092   }
1093   std::string result;
1094   result += kSeparator;
1095   result += unichar;
1096   char buffer[kMaxLen];
1097   snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos,
1098            natural ? kNaturalFlag : kSeparator, total);
1099   result += buffer;
1100   return result;
1101 }
1102 
parse_from_string(const char * string)1103 CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
1104   const char *ptr = string;
1105   int len = strlen(string);
1106   if (len < kMinLen || *ptr != kSeparator) {
1107     return nullptr; // this string can not represent a fragment
1108   }
1109   ptr++; // move to the next character
1110   int step = 0;
1111   while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
1112     step += UNICHAR::utf8_step(ptr + step);
1113   }
1114   if (step == 0 || step > UNICHAR_LEN) {
1115     return nullptr; // no character for unichar or the character is too long
1116   }
1117   char unichar[UNICHAR_LEN + 1];
1118   strncpy(unichar, ptr, step);
1119   unichar[step] = '\0'; // null terminate unichar
1120   ptr += step;          // move to the next fragment separator
1121   int pos = 0;
1122   int total = 0;
1123   bool natural = false;
1124   char *end_ptr = nullptr;
1125   for (int i = 0; i < 2; i++) {
1126     if (ptr > string + len || *ptr != kSeparator) {
1127       if (i == 1 && *ptr == kNaturalFlag) {
1128         natural = true;
1129       } else {
1130         return nullptr; // Failed to parse fragment representation.
1131       }
1132     }
1133     ptr++; // move to the next character
1134     i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10))
1135            : total = static_cast<int>(strtol(ptr, &end_ptr, 10));
1136     ptr = end_ptr;
1137   }
1138   if (ptr != string + len) {
1139     return nullptr; // malformed fragment representation
1140   }
1141   auto *fragment = new CHAR_FRAGMENT();
1142   fragment->set_all(unichar, pos, total, natural);
1143   return fragment;
1144 }
1145 
get_script_id_from_name(const char * script_name) const1146 int UNICHARSET::get_script_id_from_name(const char *script_name) const {
1147   for (int i = 0; i < script_table_size_used; ++i) {
1148     if (strcmp(script_name, script_table[i]) == 0) {
1149       return i;
1150     }
1151   }
1152   return 0; // 0 is always the null_script
1153 }
1154 
1155 // Removes/replaces content that belongs in rendered text, but not in the
1156 // unicharset.
1157 /* static */
CleanupString(const char * utf8_str,size_t length)1158 std::string UNICHARSET::CleanupString(const char *utf8_str, size_t length) {
1159   std::string result;
1160   result.reserve(length);
1161   char ch;
1162   while ((ch = *utf8_str) != '\0' && length-- > 0) {
1163     int key_index = 0;
1164     const char *key;
1165     while ((key = kCleanupMaps[key_index][0]) != nullptr) {
1166       int match = 0;
1167       while (key[match] != '\0' && key[match] == utf8_str[match]) {
1168         ++match;
1169       }
1170       if (key[match] == '\0') {
1171         utf8_str += match;
1172         break;
1173       }
1174       ++key_index;
1175     }
1176     if (key == nullptr) {
1177       result.push_back(ch);
1178       ++utf8_str;
1179     } else {
1180       result.append(kCleanupMaps[key_index][1]);
1181     }
1182   }
1183   return result;
1184 }
1185 
1186 } // namespace tesseract
1187