1 ///////////////////////////////////////////////////////////////////////
2 // File: unicharset.cpp
3 // Description: Unicode character/ligature set class.
4 // Author: Thomas Kielbus
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18
19 #include "unicharset.h"
20
21 #include "params.h"
22
23 #include <tesseract/unichar.h>
24 #include "serialis.h"
25
26 #include <algorithm>
27 #include <cassert>
28 #include <cstdio>
29 #include <cstring>
30 #include <iomanip> // for std::setw
31 #include <locale> // for std::locale::classic
32 #include <sstream> // for std::istringstream, std::ostringstream
33
34 namespace tesseract {
35
// Delimiter character used when spelling out character fragments.
static constexpr char kSeparator = '|';
// Marker character that flags a 'natural' character fragment.
static constexpr char kNaturalFlag = 'n';

// Bit flags used to pack the boolean character-class properties of a
// unichar into a single integer (one bit per property).
static constexpr int ISALPHA_MASK = 1 << 0;
static constexpr int ISLOWER_MASK = 1 << 1;
static constexpr int ISUPPER_MASK = 1 << 2;
static constexpr int ISDIGIT_MASK = 1 << 3;
static constexpr int ISPUNCTUATION_MASK = 1 << 4;

// Y coordinate that separates cap-height tops from x-height tops.
// TODO(rays) Bring the global definition down to the ccutil library level,
// so this constant is relative to some other constants.
static constexpr int kMeanlineThreshold = 220;
// With C = number of alpha chars whose tops all exceed kMeanlineThreshold
// and X = number of alpha chars whose tops are all below it, the unicharset
// "has x-height" if X > C * kMinXHeightFraction and
// C > X * kMinCapHeightFraction, or if more than half of the alpha
// characters have upper or lower case.
constexpr double kMinXHeightFraction = 0.25;
constexpr double kMinCapHeightFraction = 0.05;
59
60 /*static */
61 const char *UNICHARSET::kCustomLigatures[][2] = {
62 {"ct", "\uE003"}, // c + t -> U+E003
63 {"ſh", "\uE006"}, // long-s + h -> U+E006
64 {"ſi", "\uE007"}, // long-s + i -> U+E007
65 {"ſl", "\uE008"}, // long-s + l -> U+E008
66 {"ſſ", "\uE009"}, // long-s + long-s -> U+E009
67 {nullptr, nullptr}};
68
69 // List of mappings to make when ingesting strings from the outside.
70 // The substitutions clean up text that should exist for rendering of
71 // synthetic data, but not in the recognition set.
72 const char *UNICHARSET::kCleanupMaps[][2] = {
73 {"\u0640", ""}, // TATWEEL is deleted.
74 {"\ufb01", "fi"}, // fi ligature->fi pair.
75 {"\ufb02", "fl"}, // fl ligature->fl pair.
76 {nullptr, nullptr}};
77
78 // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
79 const char *UNICHARSET::kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT] = {
80 " ", "Joined", "|Broken|0|1"};
81
82 const char *UNICHARSET::null_script = "NULL";
83
UNICHAR_PROPERTIES()84 UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() {
85 Init();
86 }
87
88 // Initialize all properties to sensible default values.
Init()89 void UNICHARSET::UNICHAR_PROPERTIES::Init() {
90 isalpha = false;
91 islower = false;
92 isupper = false;
93 isdigit = false;
94 ispunctuation = false;
95 isngram = false;
96 enabled = false;
97 SetRangesOpen();
98 script_id = 0;
99 other_case = 0;
100 mirror = 0;
101 normed = "";
102 direction = UNICHARSET::U_LEFT_TO_RIGHT;
103 fragment = nullptr;
104 }
105
106 // Sets all ranges wide open. Initialization default in case there are
107 // no useful values available.
SetRangesOpen()108 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() {
109 min_bottom = 0;
110 max_bottom = UINT8_MAX;
111 min_top = 0;
112 max_top = UINT8_MAX;
113 width = 0.0f;
114 width_sd = 0.0f;
115 bearing = 0.0f;
116 bearing_sd = 0.0f;
117 advance = 0.0f;
118 advance_sd = 0.0f;
119 }
120
121 // Sets all ranges to empty. Used before expanding with font-based data.
SetRangesEmpty()122 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() {
123 min_bottom = UINT8_MAX;
124 max_bottom = 0;
125 min_top = UINT8_MAX;
126 max_top = 0;
127 width = 0.0f;
128 width_sd = 0.0f;
129 bearing = 0.0f;
130 bearing_sd = 0.0f;
131 advance = 0.0f;
132 advance_sd = 0.0f;
133 }
134
135 // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
136 // is empty.
AnyRangeEmpty() const137 bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const {
138 return width == 0.0f || advance == 0.0f;
139 }
140
141 // Expands the ranges with the ranges from the src properties.
ExpandRangesFrom(const UNICHAR_PROPERTIES & src)142 void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom(
143 const UNICHAR_PROPERTIES &src) {
144 UpdateRange(src.min_bottom, &min_bottom, &max_bottom);
145 UpdateRange(src.max_bottom, &min_bottom, &max_bottom);
146 UpdateRange(src.min_top, &min_top, &max_top);
147 UpdateRange(src.max_top, &min_top, &max_top);
148 if (src.width_sd > width_sd) {
149 width = src.width;
150 width_sd = src.width_sd;
151 }
152 if (src.bearing_sd > bearing_sd) {
153 bearing = src.bearing;
154 bearing_sd = src.bearing_sd;
155 }
156 if (src.advance_sd > advance_sd) {
157 advance = src.advance;
158 advance_sd = src.advance_sd;
159 }
160 }
161
162 // Copies the properties from src into this.
CopyFrom(const UNICHAR_PROPERTIES & src)163 void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES &src) {
164 // Apart from the fragment, everything else can be done with a default copy.
165 CHAR_FRAGMENT *saved_fragment = fragment;
166 *this = src; // Bitwise copy.
167 fragment = saved_fragment;
168 }
169
UNICHARSET()170 UNICHARSET::UNICHARSET()
171 : ids(), script_table(nullptr), script_table_size_used(0) {
172 clear();
173 for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
174 unichar_insert(kSpecialUnicharCodes[i]);
175 if (i == UNICHAR_JOINED) {
176 set_isngram(i, true);
177 }
178 }
179 }
180
~UNICHARSET()181 UNICHARSET::~UNICHARSET() {
182 clear();
183 }
184
185 UNICHAR_ID
unichar_to_id(const char * const unichar_repr) const186 UNICHARSET::unichar_to_id(const char *const unichar_repr) const {
187 std::string cleaned =
188 old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
189 return ids.contains(cleaned.data(), cleaned.size())
190 ? ids.unichar_to_id(cleaned.data(), cleaned.size())
191 : INVALID_UNICHAR_ID;
192 }
193
unichar_to_id(const char * const unichar_repr,int length) const194 UNICHAR_ID UNICHARSET::unichar_to_id(const char *const unichar_repr,
195 int length) const {
196 assert(length > 0 && length <= UNICHAR_LEN);
197 std::string cleaned(unichar_repr, length);
198 if (!old_style_included_) {
199 cleaned = CleanupString(unichar_repr, length);
200 }
201 return ids.contains(cleaned.data(), cleaned.size())
202 ? ids.unichar_to_id(cleaned.data(), cleaned.size())
203 : INVALID_UNICHAR_ID;
204 }
205
206 // Return the minimum number of bytes that matches a legal UNICHAR_ID,
207 // while leaving the rest of the string encodable. Returns 0 if the
208 // beginning of the string is not encodable.
209 // WARNING: this function now encodes the whole string for precision.
210 // Use encode_string in preference to repeatedly calling step.
step(const char * str) const211 int UNICHARSET::step(const char *str) const {
212 std::vector<UNICHAR_ID> encoding;
213 std::vector<char> lengths;
214 encode_string(str, true, &encoding, &lengths, nullptr);
215 if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) {
216 return 0;
217 }
218 return lengths[0];
219 }
220
221 // Return whether the given UTF-8 string is encodable with this UNICHARSET.
222 // If not encodable, write the first byte offset which cannot be converted
223 // into the second (return) argument.
encodable_string(const char * str,unsigned * first_bad_position) const224 bool UNICHARSET::encodable_string(const char *str,
225 unsigned *first_bad_position) const {
226 std::vector<UNICHAR_ID> encoding;
227 return encode_string(str, true, &encoding, nullptr, first_bad_position);
228 }
229
230 // Encodes the given UTF-8 string with this UNICHARSET.
231 // Returns true if the encoding succeeds completely, false if there is at
232 // least one INVALID_UNICHAR_ID in the returned encoding, but in this case
233 // the rest of the string is still encoded.
234 // If lengths is not nullptr, then it is filled with the corresponding
235 // byte length of each encoded UNICHAR_ID.
236 // WARNING: Caller must guarantee that str has already been cleaned of codes
237 // that do not belong in the unicharset, or encoding may fail.
238 // Use CleanupString to perform the cleaning.
encode_string(const char * str,bool give_up_on_failure,std::vector<UNICHAR_ID> * encoding,std::vector<char> * lengths,unsigned * encoded_length) const239 bool UNICHARSET::encode_string(const char *str, bool give_up_on_failure,
240 std::vector<UNICHAR_ID> *encoding,
241 std::vector<char> *lengths,
242 unsigned *encoded_length) const {
243 std::vector<UNICHAR_ID> working_encoding;
244 std::vector<char> working_lengths;
245 std::vector<char> best_lengths;
246 encoding->clear(); // Just in case str is empty.
247 auto str_length = strlen(str);
248 unsigned str_pos = 0;
249 bool perfect = true;
250 while (str_pos < str_length) {
251 encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
252 &str_pos, encoding, &best_lengths);
253 if (str_pos < str_length) {
254 // This is a non-match. Skip one utf-8 character.
255 perfect = false;
256 if (give_up_on_failure) {
257 break;
258 }
259 int step = UNICHAR::utf8_step(str + str_pos);
260 if (step == 0) {
261 step = 1;
262 }
263 encoding->push_back(INVALID_UNICHAR_ID);
264 best_lengths.push_back(step);
265 str_pos += step;
266 working_encoding = *encoding;
267 working_lengths = best_lengths;
268 }
269 }
270 if (lengths != nullptr) {
271 *lengths = best_lengths;
272 }
273 if (encoded_length != nullptr) {
274 *encoded_length = str_pos;
275 }
276 return perfect;
277 }
278
id_to_unichar(UNICHAR_ID id) const279 const char *UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
280 if (id == INVALID_UNICHAR_ID) {
281 return INVALID_UNICHAR;
282 }
283 ASSERT_HOST(static_cast<unsigned>(id) < this->size());
284 return unichars[id].representation;
285 }
286
id_to_unichar_ext(UNICHAR_ID id) const287 const char *UNICHARSET::id_to_unichar_ext(UNICHAR_ID id) const {
288 if (id == INVALID_UNICHAR_ID) {
289 return INVALID_UNICHAR;
290 }
291 ASSERT_HOST(static_cast<unsigned>(id) < this->size());
292 // Resolve from the kCustomLigatures table if this is a private encoding.
293 if (get_isprivate(id)) {
294 const char *ch = id_to_unichar(id);
295 for (int i = 0; kCustomLigatures[i][0] != nullptr; ++i) {
296 if (!strcmp(ch, kCustomLigatures[i][1])) {
297 return kCustomLigatures[i][0];
298 }
299 }
300 }
301 // Otherwise return the stored representation.
302 return unichars[id].representation;
303 }
304
305 // Return a string that reformats the utf8 str into the str followed
306 // by its hex unicodes.
debug_utf8_str(const char * str)307 std::string UNICHARSET::debug_utf8_str(const char *str) {
308 std::string result = str;
309 result += " [";
310 int step = 1;
311 // Chop into unicodes and code each as hex.
312 for (int i = 0; str[i] != '\0'; i += step) {
313 char hex[sizeof(int) * 2 + 1];
314 step = UNICHAR::utf8_step(str + i);
315 if (step == 0) {
316 step = 1;
317 sprintf(hex, "%x", str[i]);
318 } else {
319 UNICHAR ch(str + i, step);
320 sprintf(hex, "%x", ch.first_uni());
321 }
322 result += hex;
323 result += " ";
324 }
325 result += "]";
326 return result;
327 }
328
329 // Return a string containing debug information on the unichar, including
330 // the id_to_unichar, its hex unicodes and the properties.
debug_str(UNICHAR_ID id) const331 std::string UNICHARSET::debug_str(UNICHAR_ID id) const {
332 if (id == INVALID_UNICHAR_ID) {
333 return std::string(id_to_unichar(id));
334 }
335 const CHAR_FRAGMENT *fragment = this->get_fragment(id);
336 if (fragment) {
337 return fragment->to_string();
338 }
339 const char *str = id_to_unichar(id);
340 std::string result = debug_utf8_str(str);
341 // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
342 if (get_isalpha(id)) {
343 if (get_islower(id)) {
344 result += "a";
345 } else if (get_isupper(id)) {
346 result += "A";
347 } else {
348 result += "x";
349 }
350 }
351 // Append 0 if a digit.
352 if (get_isdigit(id)) {
353 result += "0";
354 }
355 // Append p is a punctuation symbol.
356 if (get_ispunctuation(id)) {
357 result += "p";
358 }
359 return result;
360 }
361
362 // Sets the normed_ids vector from the normed string. normed_ids is not
363 // stored in the file, and needs to be set when the UNICHARSET is loaded.
set_normed_ids(UNICHAR_ID unichar_id)364 void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) {
365 unichars[unichar_id].properties.normed_ids.clear();
366 if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
367 unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
368 } else if (!encode_string(unichars[unichar_id].properties.normed.c_str(),
369 true, &unichars[unichar_id].properties.normed_ids,
370 nullptr, nullptr)) {
371 unichars[unichar_id].properties.normed_ids.clear();
372 unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
373 }
374 }
375
376 // Returns whether the unichar id represents a unicode value in the private use
377 // area. We use this range only internally to represent uncommon ligatures
378 // (eg. 'ct') that do not have regular unicode values.
get_isprivate(UNICHAR_ID unichar_id) const379 bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const {
380 UNICHAR uc(id_to_unichar(unichar_id), -1);
381 int uni = uc.first_uni();
382 return (uni >= 0xE000 && uni <= 0xF8FF);
383 }
384
385 // Sets all ranges to empty, so they can be expanded to set the values.
set_ranges_empty()386 void UNICHARSET::set_ranges_empty() {
387 for (auto &uc : unichars) {
388 uc.properties.SetRangesEmpty();
389 }
390 }
391
392 // Sets all the properties for this unicharset given a src unicharset with
393 // everything set. The unicharsets don't have to be the same, and graphemes
394 // are correctly accounted for.
PartialSetPropertiesFromOther(int start_index,const UNICHARSET & src)395 void UNICHARSET::PartialSetPropertiesFromOther(int start_index,
396 const UNICHARSET &src) {
397 for (unsigned ch = start_index; ch < unichars.size(); ++ch) {
398 const char *utf8 = id_to_unichar(ch);
399 UNICHAR_PROPERTIES properties;
400 if (src.GetStrProperties(utf8, &properties)) {
401 // Setup the script_id, other_case, and mirror properly.
402 const char *script = src.get_script_from_script_id(properties.script_id);
403 properties.script_id = add_script(script);
404 const char *other_case = src.id_to_unichar(properties.other_case);
405 if (contains_unichar(other_case)) {
406 properties.other_case = unichar_to_id(other_case);
407 } else {
408 properties.other_case = ch;
409 }
410 const char *mirror_str = src.id_to_unichar(properties.mirror);
411 if (contains_unichar(mirror_str)) {
412 properties.mirror = unichar_to_id(mirror_str);
413 } else {
414 properties.mirror = ch;
415 }
416 unichars[ch].properties.CopyFrom(properties);
417 set_normed_ids(ch);
418 }
419 }
420 }
421
422 // Expands the tops and bottoms and widths for this unicharset given a
423 // src unicharset with ranges in it. The unicharsets don't have to be the
424 // same, and graphemes are correctly accounted for.
ExpandRangesFromOther(const UNICHARSET & src)425 void UNICHARSET::ExpandRangesFromOther(const UNICHARSET &src) {
426 for (unsigned ch = 0; ch < unichars.size(); ++ch) {
427 const char *utf8 = id_to_unichar(ch);
428 UNICHAR_PROPERTIES properties;
429 if (src.GetStrProperties(utf8, &properties)) {
430 // Expand just the ranges from properties.
431 unichars[ch].properties.ExpandRangesFrom(properties);
432 }
433 }
434 }
435
436 // Makes this a copy of src. Clears this completely first, so the automatic
437 // ids will not be present in this if not in src. Does NOT reorder the set!
CopyFrom(const UNICHARSET & src)438 void UNICHARSET::CopyFrom(const UNICHARSET &src) {
439 clear();
440 for (unsigned ch = 0; ch < src.unichars.size(); ++ch) {
441 const UNICHAR_PROPERTIES &src_props = src.unichars[ch].properties;
442 const char *utf8 = src.id_to_unichar(ch);
443 unichar_insert_backwards_compatible(utf8);
444 unichars[ch].properties.ExpandRangesFrom(src_props);
445 }
446 // Set properties, including mirror and other_case, WITHOUT reordering
447 // the unicharset.
448 PartialSetPropertiesFromOther(0, src);
449 }
450
451 // For each id in src, if it does not occur in this, add it, as in
452 // SetPropertiesFromOther, otherwise expand the ranges, as in
453 // ExpandRangesFromOther.
AppendOtherUnicharset(const UNICHARSET & src)454 void UNICHARSET::AppendOtherUnicharset(const UNICHARSET &src) {
455 int initial_used = unichars.size();
456 for (unsigned ch = 0; ch < src.unichars.size(); ++ch) {
457 const UNICHAR_PROPERTIES &src_props = src.unichars[ch].properties;
458 const char *utf8 = src.id_to_unichar(ch);
459 int id = unichars.size();
460 if (contains_unichar(utf8)) {
461 id = unichar_to_id(utf8);
462 // Just expand current ranges.
463 unichars[id].properties.ExpandRangesFrom(src_props);
464 } else {
465 unichar_insert_backwards_compatible(utf8);
466 unichars[id].properties.SetRangesEmpty();
467 }
468 }
469 // Set properties, including mirror and other_case, WITHOUT reordering
470 // the unicharset.
471 PartialSetPropertiesFromOther(initial_used, src);
472 }
473
474 // Returns true if the acceptable ranges of the tops of the characters do
475 // not overlap, making their x-height calculations distinct.
SizesDistinct(UNICHAR_ID id1,UNICHAR_ID id2) const476 bool UNICHARSET::SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const {
477 int overlap = std::min(unichars[id1].properties.max_top,
478 unichars[id2].properties.max_top) -
479 std::max(unichars[id1].properties.min_top,
480 unichars[id2].properties.min_top);
481 return overlap <= 0;
482 }
483
484 // Internal recursive version of encode_string above.
485 // Seeks to encode the given string as a sequence of UNICHAR_IDs such that
486 // each UNICHAR_ID uses the least possible part of the utf8 str.
487 // It does this by depth-first tail recursion on increasing length matches
488 // to the UNICHARSET, saving the first encountered result that encodes the
489 // maximum total length of str. It stops on a failure to encode to make
490 // the overall process of encoding a partially failed string more efficient.
491 // See unicharset.h for definition of the args.
encode_string(const char * str,int str_index,int str_length,std::vector<UNICHAR_ID> * encoding,std::vector<char> * lengths,unsigned * best_total_length,std::vector<UNICHAR_ID> * best_encoding,std::vector<char> * best_lengths) const492 void UNICHARSET::encode_string(const char *str, int str_index, int str_length,
493 std::vector<UNICHAR_ID> *encoding,
494 std::vector<char> *lengths,
495 unsigned *best_total_length,
496 std::vector<UNICHAR_ID> *best_encoding,
497 std::vector<char> *best_lengths) const {
498 if (str_index > static_cast<int>(*best_total_length)) {
499 // This is the best result so far.
500 *best_total_length = str_index;
501 *best_encoding = *encoding;
502 if (best_lengths != nullptr) {
503 *best_lengths = *lengths;
504 }
505 }
506 if (str_index == str_length) {
507 return;
508 }
509 int encoding_index = encoding->size();
510 // Find the length of the first matching unicharset member.
511 int length = ids.minmatch(str + str_index);
512 if (length == 0 || str_index + length > str_length) {
513 return;
514 }
515 do {
516 if (ids.contains(str + str_index, length)) {
517 // Successful encoding so far.
518 UNICHAR_ID id = ids.unichar_to_id(str + str_index, length);
519 encoding->push_back(id);
520 lengths->push_back(length);
521 encode_string(str, str_index + length, str_length, encoding, lengths,
522 best_total_length, best_encoding, best_lengths);
523 if (static_cast<int>(*best_total_length) == str_length) {
524 return; // Tail recursion success!
525 }
526 // Failed with that length, truncate back and try again.
527 encoding->resize(encoding_index);
528 lengths->resize(encoding_index);
529 }
530 int step = UNICHAR::utf8_step(str + str_index + length);
531 if (step == 0) {
532 step = 1;
533 }
534 length += step;
535 } while (length <= UNICHAR_LEN && str_index + length <= str_length);
536 }
537
538 // Gets the properties for a grapheme string, combining properties for
539 // multiple characters in a meaningful way where possible.
540 // Returns false if no valid match was found in the unicharset.
541 // NOTE that script_id, mirror, and other_case refer to this unicharset on
542 // return and will need translation if the target unicharset is different.
GetStrProperties(const char * utf8_str,UNICHAR_PROPERTIES * props) const543 bool UNICHARSET::GetStrProperties(const char *utf8_str,
544 UNICHAR_PROPERTIES *props) const {
545 props->Init();
546 props->SetRangesEmpty();
547 int total_unicodes = 0;
548 std::vector<UNICHAR_ID> encoding;
549 if (!encode_string(utf8_str, true, &encoding, nullptr, nullptr)) {
550 return false; // Some part was invalid.
551 }
552 for (auto it : encoding) {
553 int id = it;
554 const UNICHAR_PROPERTIES &src_props = unichars[id].properties;
555 // Logical OR all the bools.
556 if (src_props.isalpha) {
557 props->isalpha = true;
558 }
559 if (src_props.islower) {
560 props->islower = true;
561 }
562 if (src_props.isupper) {
563 props->isupper = true;
564 }
565 if (src_props.isdigit) {
566 props->isdigit = true;
567 }
568 if (src_props.ispunctuation) {
569 props->ispunctuation = true;
570 }
571 if (src_props.isngram) {
572 props->isngram = true;
573 }
574 if (src_props.enabled) {
575 props->enabled = true;
576 }
577 // Min/max the tops/bottoms.
578 UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom);
579 UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom);
580 UpdateRange(src_props.min_top, &props->min_top, &props->max_top);
581 UpdateRange(src_props.max_top, &props->min_top, &props->max_top);
582 float bearing = props->advance + src_props.bearing;
583 if (total_unicodes == 0 || bearing < props->bearing) {
584 props->bearing = bearing;
585 props->bearing_sd = props->advance_sd + src_props.bearing_sd;
586 }
587 props->advance += src_props.advance;
588 props->advance_sd += src_props.advance_sd;
589 // With a single width, just use the widths stored in the unicharset.
590 props->width = src_props.width;
591 props->width_sd = src_props.width_sd;
592 // Use the first script id, other_case, mirror, direction.
593 // Note that these will need translation, except direction.
594 if (total_unicodes == 0) {
595 props->script_id = src_props.script_id;
596 props->other_case = src_props.other_case;
597 props->mirror = src_props.mirror;
598 props->direction = src_props.direction;
599 }
600 // The normed string for the compound character is the concatenation of
601 // the normed versions of the individual characters.
602 props->normed += src_props.normed;
603 ++total_unicodes;
604 }
605 if (total_unicodes > 1) {
606 // Estimate the total widths from the advance - bearing.
607 props->width = props->advance - props->bearing;
608 props->width_sd = props->advance_sd + props->bearing_sd;
609 }
610 return total_unicodes > 0;
611 }
612
613 // TODO(rays) clean-up the order of functions to match unicharset.h.
614
get_properties(UNICHAR_ID id) const615 unsigned int UNICHARSET::get_properties(UNICHAR_ID id) const {
616 unsigned int properties = 0;
617 if (this->get_isalpha(id)) {
618 properties |= ISALPHA_MASK;
619 }
620 if (this->get_islower(id)) {
621 properties |= ISLOWER_MASK;
622 }
623 if (this->get_isupper(id)) {
624 properties |= ISUPPER_MASK;
625 }
626 if (this->get_isdigit(id)) {
627 properties |= ISDIGIT_MASK;
628 }
629 if (this->get_ispunctuation(id)) {
630 properties |= ISPUNCTUATION_MASK;
631 }
632 return properties;
633 }
634
get_chartype(UNICHAR_ID id) const635 char UNICHARSET::get_chartype(UNICHAR_ID id) const {
636 if (this->get_isupper(id)) {
637 return 'A';
638 }
639 if (this->get_islower(id)) {
640 return 'a';
641 }
642 if (this->get_isalpha(id)) {
643 return 'x';
644 }
645 if (this->get_isdigit(id)) {
646 return '0';
647 }
648 if (this->get_ispunctuation(id)) {
649 return 'p';
650 }
651 return 0;
652 }
653
unichar_insert(const char * const unichar_repr,OldUncleanUnichars old_style)654 void UNICHARSET::unichar_insert(const char *const unichar_repr,
655 OldUncleanUnichars old_style) {
656 if (old_style == OldUncleanUnichars::kTrue) {
657 old_style_included_ = true;
658 }
659 std::string cleaned =
660 old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
661 if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
662 const char *str = cleaned.c_str();
663 std::vector<int> encoding;
664 if (!old_style_included_ &&
665 encode_string(str, true, &encoding, nullptr, nullptr)) {
666 return;
667 }
668 unichars.emplace_back();
669 auto &u = unichars.back();
670 int index = 0;
671 do {
672 if (index >= UNICHAR_LEN) {
673 fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
674 unichar_repr);
675 return;
676 }
677 u.representation[index++] = *str++;
678 } while (*str != '\0');
679 u.representation[index] = '\0';
680 this->set_script(unichars.size() - 1, null_script);
681 // If the given unichar_repr represents a fragmented character, set
682 // fragment property to a pointer to CHAR_FRAGMENT class instance with
683 // information parsed from the unichar representation. Use the script
684 // of the base unichar for the fragmented character if possible.
685 CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(u.representation);
686 u.properties.fragment = frag;
687 if (frag != nullptr && this->contains_unichar(frag->get_unichar())) {
688 u.properties.script_id = this->get_script(frag->get_unichar());
689 }
690 u.properties.enabled = true;
691 ids.insert(u.representation, unichars.size() - 1);
692 }
693 }
694
contains_unichar(const char * const unichar_repr) const695 bool UNICHARSET::contains_unichar(const char *const unichar_repr) const {
696 std::string cleaned =
697 old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
698 return ids.contains(cleaned.data(), cleaned.size());
699 }
700
contains_unichar(const char * const unichar_repr,int length) const701 bool UNICHARSET::contains_unichar(const char *const unichar_repr,
702 int length) const {
703 if (length == 0) {
704 return false;
705 }
706 std::string cleaned(unichar_repr, length);
707 if (!old_style_included_) {
708 cleaned = CleanupString(unichar_repr, length);
709 }
710 return ids.contains(cleaned.data(), cleaned.size());
711 }
712
eq(UNICHAR_ID unichar_id,const char * const unichar_repr) const713 bool UNICHARSET::eq(UNICHAR_ID unichar_id,
714 const char *const unichar_repr) const {
715 return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
716 }
717
save_to_string(std::string & str) const718 bool UNICHARSET::save_to_string(std::string &str) const {
719 const int kFileBufSize = 1024;
720 char buffer[kFileBufSize + 1];
721 snprintf(buffer, kFileBufSize, "%zu\n", this->size());
722 str = buffer;
723 for (unsigned id = 0; id < this->size(); ++id) {
724 int min_bottom, max_bottom, min_top, max_top;
725 get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
726 float width, width_sd;
727 get_width_stats(id, &width, &width_sd);
728 float bearing, bearing_sd;
729 get_bearing_stats(id, &bearing, &bearing_sd);
730 float advance, advance_sd;
731 get_advance_stats(id, &advance, &advance_sd);
732 unsigned int properties = this->get_properties(id);
733 if (strcmp(this->id_to_unichar(id), " ") == 0) {
734 snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
735 this->get_script_from_script_id(this->get_script(id)),
736 this->get_other_case(id));
737 str += buffer;
738 } else {
739 std::ostringstream stream;
740 stream.imbue(std::locale::classic());
741 stream << this->id_to_unichar(id) << ' ' << properties << ' '
742 << min_bottom << ',' << max_bottom << ',' << min_top << ','
743 << max_top << ',' << width << ',' << width_sd << ',' << bearing
744 << ',' << bearing_sd << ',' << advance << ',' << advance_sd << ' '
745 << this->get_script_from_script_id(this->get_script(id)) << ' '
746 << this->get_other_case(id) << ' ' << this->get_direction(id)
747 << ' ' << this->get_mirror(id) << ' '
748 << this->get_normed_unichar(id) << "\t# "
749 << this->debug_str(id).c_str() << '\n';
750 str += stream.str().c_str();
751 }
752 }
753 return true;
754 }
755
// Thin wrapper around a FILE* that offers fgets as a member function.
// The wrapper stores the pointer only; it never opens or closes the stream.
class LocalFilePointer {
public:
  LocalFilePointer(FILE *stream) : stream_{stream} {}

  // Reads a line of at most size - 1 characters into dst using ::fgets on
  // the wrapped stream, and returns what ::fgets returns.
  char *fgets(char *dst, int size) {
    char *const line = ::fgets(dst, size, stream_);
    return line;
  }

private:
  FILE *stream_;
};
766
load_from_file(FILE * file,bool skip_fragments)767 bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
768 LocalFilePointer lfp(file);
769 using namespace std::placeholders; // for _1, _2
770 std::function<char *(char *, int)> fgets_cb =
771 std::bind(&LocalFilePointer::fgets, &lfp, _1, _2);
772 bool success = load_via_fgets(fgets_cb, skip_fragments);
773 return success;
774 }
775
load_from_file(tesseract::TFile * file,bool skip_fragments)776 bool UNICHARSET::load_from_file(tesseract::TFile *file, bool skip_fragments) {
777 using namespace std::placeholders; // for _1, _2
778 std::function<char *(char *, int)> fgets_cb =
779 std::bind(&tesseract::TFile::FGets, file, _1, _2);
780 bool success = load_via_fgets(fgets_cb, skip_fragments);
781 return success;
782 }
783
load_via_fgets(const std::function<char * (char *,int)> & fgets_cb,bool skip_fragments)784 bool UNICHARSET::load_via_fgets(
785 const std::function<char *(char *, int)> &fgets_cb, bool skip_fragments) {
786 int unicharset_size;
787 char buffer[256];
788
789 this->clear();
790 if (fgets_cb(buffer, sizeof(buffer)) == nullptr ||
791 sscanf(buffer, "%d", &unicharset_size) != 1) {
792 return false;
793 }
794 for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
795 char unichar[256];
796 unsigned int properties;
797 char script[64];
798
799 strncpy(script, null_script, sizeof(script) - 1);
800 int min_bottom = 0;
801 int max_bottom = UINT8_MAX;
802 int min_top = 0;
803 int max_top = UINT8_MAX;
804 float width = 0.0f;
805 float width_sd = 0.0f;
806 float bearing = 0.0f;
807 float bearing_sd = 0.0f;
808 float advance = 0.0f;
809 float advance_sd = 0.0f;
810 // TODO(eger): check that this default it ok
811 // after enabling BiDi iterator for Arabic.
812 int direction = UNICHARSET::U_LEFT_TO_RIGHT;
813 UNICHAR_ID other_case = unicharset_size;
814 UNICHAR_ID mirror = unicharset_size;
815 if (fgets_cb(buffer, sizeof(buffer)) == nullptr) {
816 return false;
817 }
818 char normed[64];
819 normed[0] = '\0';
820 std::istringstream stream(buffer);
821 stream.imbue(std::locale::classic());
822 // 标 1 0,255,0,255,0,0,0,0,0,0 Han 68 0 68 标 # 标 [6807 ]x
823 // stream.flags(std::ios::hex);
824 stream >> std::setw(255) >> unichar >> std::hex >> properties >> std::dec;
825 // stream.flags(std::ios::dec);
826 if (stream.fail()) {
827 fprintf(stderr, "%s:%u failed\n", __FILE__, __LINE__);
828 return false;
829 }
830 auto position = stream.tellg();
831 stream.seekg(position);
832 char c1, c2, c3, c4, c5, c6, c7, c8, c9;
833 stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >>
834 max_top >> c4 >> width >> c5 >> width_sd >> c6 >> bearing >> c7 >>
835 bearing_sd >> c8 >> advance >> c9 >> advance_sd >> std::setw(63) >>
836 script >> other_case >> direction >> mirror >> std::setw(63) >> normed;
837 if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
838 c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
839 stream.clear();
840 stream.seekg(position);
841 stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >>
842 max_top >> c4 >> width >> c5 >> width_sd >> c6 >> bearing >> c7 >>
843 bearing_sd >> c8 >> advance >> c9 >> advance_sd >> std::setw(63) >>
844 script >> other_case >> direction >> mirror;
845 if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
846 c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
847 stream.clear();
848 stream.seekg(position);
849 stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >>
850 max_top >> std::setw(63) >> script >> other_case >> direction >>
851 mirror;
852 if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
853 stream.clear();
854 stream.seekg(position);
855 stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >>
856 max_top >> std::setw(63) >> script >> other_case;
857 if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
858 stream.clear();
859 stream.seekg(position);
860 stream >> std::setw(63) >> script >> other_case;
861 if (stream.fail()) {
862 stream.clear();
863 stream.seekg(position);
864 stream >> std::setw(63) >> script;
865 }
866 }
867 }
868 }
869 }
870
871 // Skip fragments if needed.
872 CHAR_FRAGMENT *frag = nullptr;
873 if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {
874 int num_pieces = frag->get_total();
875 delete frag;
876 // Skip multi-element fragments, but keep singles like UNICHAR_BROKEN in.
877 if (num_pieces > 1) {
878 continue;
879 }
880 }
881 // Insert unichar into unicharset and set its properties.
882 if (strcmp(unichar, "NULL") == 0) {
883 this->unichar_insert(" ");
884 } else {
885 this->unichar_insert_backwards_compatible(unichar);
886 }
887
888 this->set_isalpha(id, properties & ISALPHA_MASK);
889 this->set_islower(id, properties & ISLOWER_MASK);
890 this->set_isupper(id, properties & ISUPPER_MASK);
891 this->set_isdigit(id, properties & ISDIGIT_MASK);
892 this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK);
893 this->set_isngram(id, false);
894 this->set_script(id, script);
895 this->unichars[id].properties.enabled = true;
896 this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top);
897 this->set_width_stats(id, width, width_sd);
898 this->set_bearing_stats(id, bearing, bearing_sd);
899 this->set_advance_stats(id, advance, advance_sd);
900 this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
901 this->set_other_case(id, (other_case < unicharset_size) ? other_case : id);
902 this->set_mirror(id, (mirror < unicharset_size) ? mirror : id);
903 this->set_normed(id, normed[0] != '\0' ? normed : unichar);
904 }
905 post_load_setup();
906 return true;
907 }
908
909 // Sets up internal data after loading the file, based on the char
910 // properties. Called from load_from_file, but also needs to be run
911 // during set_unicharset_properties.
post_load_setup()912 void UNICHARSET::post_load_setup() {
913 // Number of alpha chars with the case property minus those without,
914 // in order to determine that half the alpha chars have case.
915 int net_case_alphas = 0;
916 int x_height_alphas = 0;
917 int cap_height_alphas = 0;
918 top_bottom_set_ = false;
919 for (unsigned id = 0; id < unichars.size(); ++id) {
920 int min_bottom = 0;
921 int max_bottom = UINT8_MAX;
922 int min_top = 0;
923 int max_top = UINT8_MAX;
924 get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
925 if (min_top > 0) {
926 top_bottom_set_ = true;
927 }
928 if (get_isalpha(id)) {
929 if (get_islower(id) || get_isupper(id)) {
930 ++net_case_alphas;
931 } else {
932 --net_case_alphas;
933 }
934 if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold) {
935 ++x_height_alphas;
936 } else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold) {
937 ++cap_height_alphas;
938 }
939 }
940 set_normed_ids(id);
941 }
942
943 script_has_upper_lower_ = net_case_alphas > 0;
944 script_has_xheight_ =
945 script_has_upper_lower_ ||
946 (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
947 cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
948
949 null_sid_ = get_script_id_from_name(null_script);
950 ASSERT_HOST(null_sid_ == 0);
951 common_sid_ = get_script_id_from_name("Common");
952 latin_sid_ = get_script_id_from_name("Latin");
953 cyrillic_sid_ = get_script_id_from_name("Cyrillic");
954 greek_sid_ = get_script_id_from_name("Greek");
955 han_sid_ = get_script_id_from_name("Han");
956 hiragana_sid_ = get_script_id_from_name("Hiragana");
957 katakana_sid_ = get_script_id_from_name("Katakana");
958 thai_sid_ = get_script_id_from_name("Thai");
959 hangul_sid_ = get_script_id_from_name("Hangul");
960
961 // Compute default script. Use the highest-counting alpha script, that is
962 // not the common script, as that still contains some "alphas".
963 int *script_counts = new int[script_table_size_used];
964 memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
965 for (unsigned id = 0; id < unichars.size(); ++id) {
966 if (get_isalpha(id)) {
967 ++script_counts[get_script(id)];
968 }
969 }
970 default_sid_ = 0;
971 for (int s = 1; s < script_table_size_used; ++s) {
972 if (script_counts[s] > script_counts[default_sid_] && s != common_sid_) {
973 default_sid_ = s;
974 }
975 }
976 delete[] script_counts;
977 }
978
979 // Returns true if right_to_left scripts are significant in the unicharset,
980 // but without being so sensitive that "universal" unicharsets containing
981 // characters from many scripts, like orientation and script detection,
982 // look like they are right_to_left.
major_right_to_left() const983 bool UNICHARSET::major_right_to_left() const {
984 int ltr_count = 0;
985 int rtl_count = 0;
986 for (unsigned id = 0; id < unichars.size(); ++id) {
987 int dir = get_direction(id);
988 if (dir == UNICHARSET::U_LEFT_TO_RIGHT) {
989 ltr_count++;
990 }
991 if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
992 dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
993 dir == UNICHARSET::U_ARABIC_NUMBER) {
994 rtl_count++;
995 }
996 }
997 return rtl_count > ltr_count;
998 }
999
1000 // Set a whitelist and/or blacklist of characters to recognize.
1001 // An empty or nullptr whitelist enables everything (minus any blacklist).
1002 // An empty or nullptr blacklist disables nothing.
// An empty or nullptr unblacklist has no effect.
set_black_and_whitelist(const char * blacklist,const char * whitelist,const char * unblacklist)1004 void UNICHARSET::set_black_and_whitelist(const char *blacklist,
1005 const char *whitelist,
1006 const char *unblacklist) {
1007 bool def_enabled = whitelist == nullptr || whitelist[0] == '\0';
1008 // Set everything to default
1009 for (auto &uc : unichars) {
1010 uc.properties.enabled = def_enabled;
1011 }
1012 if (!def_enabled) {
1013 // Enable the whitelist.
1014 std::vector<UNICHAR_ID> encoding;
1015 encode_string(whitelist, false, &encoding, nullptr, nullptr);
1016 for (auto it : encoding) {
1017 if (it != INVALID_UNICHAR_ID) {
1018 unichars[it].properties.enabled = true;
1019 }
1020 }
1021 }
1022 if (blacklist != nullptr && blacklist[0] != '\0') {
1023 // Disable the blacklist.
1024 std::vector<UNICHAR_ID> encoding;
1025 encode_string(blacklist, false, &encoding, nullptr, nullptr);
1026 for (auto it : encoding) {
1027 if (it != INVALID_UNICHAR_ID) {
1028 unichars[it].properties.enabled = false;
1029 }
1030 }
1031 }
1032 if (unblacklist != nullptr && unblacklist[0] != '\0') {
1033 // Re-enable the unblacklist.
1034 std::vector<UNICHAR_ID> encoding;
1035 encode_string(unblacklist, false, &encoding, nullptr, nullptr);
1036 for (auto it : encoding) {
1037 if (it != INVALID_UNICHAR_ID) {
1038 unichars[it].properties.enabled = true;
1039 }
1040 }
1041 }
1042 }
1043
1044 // Returns true if there are any repeated unicodes in the normalized
1045 // text of any unichar-id in the unicharset.
AnyRepeatedUnicodes() const1046 bool UNICHARSET::AnyRepeatedUnicodes() const {
1047 int start_id = 0;
1048 if (has_special_codes()) {
1049 start_id = SPECIAL_UNICHAR_CODES_COUNT;
1050 }
1051 for (unsigned id = start_id; id < unichars.size(); ++id) {
1052 // Convert to unicodes.
1053 std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(get_normed_unichar(id));
1054 for (size_t u = 1; u < unicodes.size(); ++u) {
1055 if (unicodes[u - 1] == unicodes[u]) {
1056 return true;
1057 }
1058 }
1059 }
1060 return false;
1061 }
1062
add_script(const char * script)1063 int UNICHARSET::add_script(const char *script) {
1064 for (int i = 0; i < script_table_size_used; ++i) {
1065 if (strcmp(script, script_table[i]) == 0) {
1066 return i;
1067 }
1068 }
1069 if (script_table_size_reserved == 0) {
1070 script_table_size_reserved = 8;
1071 script_table = new char *[script_table_size_reserved];
1072 } else if (script_table_size_used >= script_table_size_reserved) {
1073 assert(script_table_size_used == script_table_size_reserved);
1074 script_table_size_reserved += script_table_size_reserved;
1075 char **new_script_table = new char *[script_table_size_reserved];
1076 memcpy(new_script_table, script_table,
1077 script_table_size_used * sizeof(char *));
1078 delete[] script_table;
1079 script_table = new_script_table;
1080 }
1081 script_table[script_table_size_used] = new char[strlen(script) + 1];
1082 strcpy(script_table[script_table_size_used], script);
1083 return script_table_size_used++;
1084 }
1085
1086 // Returns the string that represents a fragment
1087 // with the given unichar, pos and total.
to_string(const char * unichar,int pos,int total,bool natural)1088 std::string CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total,
1089 bool natural) {
1090 if (total == 1) {
1091 return std::string(unichar);
1092 }
1093 std::string result;
1094 result += kSeparator;
1095 result += unichar;
1096 char buffer[kMaxLen];
1097 snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos,
1098 natural ? kNaturalFlag : kSeparator, total);
1099 result += buffer;
1100 return result;
1101 }
1102
parse_from_string(const char * string)1103 CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
1104 const char *ptr = string;
1105 int len = strlen(string);
1106 if (len < kMinLen || *ptr != kSeparator) {
1107 return nullptr; // this string can not represent a fragment
1108 }
1109 ptr++; // move to the next character
1110 int step = 0;
1111 while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
1112 step += UNICHAR::utf8_step(ptr + step);
1113 }
1114 if (step == 0 || step > UNICHAR_LEN) {
1115 return nullptr; // no character for unichar or the character is too long
1116 }
1117 char unichar[UNICHAR_LEN + 1];
1118 strncpy(unichar, ptr, step);
1119 unichar[step] = '\0'; // null terminate unichar
1120 ptr += step; // move to the next fragment separator
1121 int pos = 0;
1122 int total = 0;
1123 bool natural = false;
1124 char *end_ptr = nullptr;
1125 for (int i = 0; i < 2; i++) {
1126 if (ptr > string + len || *ptr != kSeparator) {
1127 if (i == 1 && *ptr == kNaturalFlag) {
1128 natural = true;
1129 } else {
1130 return nullptr; // Failed to parse fragment representation.
1131 }
1132 }
1133 ptr++; // move to the next character
1134 i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10))
1135 : total = static_cast<int>(strtol(ptr, &end_ptr, 10));
1136 ptr = end_ptr;
1137 }
1138 if (ptr != string + len) {
1139 return nullptr; // malformed fragment representation
1140 }
1141 auto *fragment = new CHAR_FRAGMENT();
1142 fragment->set_all(unichar, pos, total, natural);
1143 return fragment;
1144 }
1145
get_script_id_from_name(const char * script_name) const1146 int UNICHARSET::get_script_id_from_name(const char *script_name) const {
1147 for (int i = 0; i < script_table_size_used; ++i) {
1148 if (strcmp(script_name, script_table[i]) == 0) {
1149 return i;
1150 }
1151 }
1152 return 0; // 0 is always the null_script
1153 }
1154
1155 // Removes/replaces content that belongs in rendered text, but not in the
1156 // unicharset.
1157 /* static */
CleanupString(const char * utf8_str,size_t length)1158 std::string UNICHARSET::CleanupString(const char *utf8_str, size_t length) {
1159 std::string result;
1160 result.reserve(length);
1161 char ch;
1162 while ((ch = *utf8_str) != '\0' && length-- > 0) {
1163 int key_index = 0;
1164 const char *key;
1165 while ((key = kCleanupMaps[key_index][0]) != nullptr) {
1166 int match = 0;
1167 while (key[match] != '\0' && key[match] == utf8_str[match]) {
1168 ++match;
1169 }
1170 if (key[match] == '\0') {
1171 utf8_str += match;
1172 break;
1173 }
1174 ++key_index;
1175 }
1176 if (key == nullptr) {
1177 result.push_back(ch);
1178 ++utf8_str;
1179 } else {
1180 result.append(kCleanupMaps[key_index][1]);
1181 }
1182 }
1183 return result;
1184 }
1185
1186 } // namespace tesseract
1187