1 ///////////////////////////////////////////////////////////////////////
2 // File:        unichar.h
3 // Description: Unicode character/ligature class.
4 // Author:      Ray Smith
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18 
19 #ifndef TESSERACT_CCUTIL_UNICHAR_H_
20 #define TESSERACT_CCUTIL_UNICHAR_H_
21 
22 #include "export.h"
23 
24 #include <memory.h>
25 #include <cstring>
26 #include <string>
27 #include <vector>
28 
29 namespace tesseract {
30 
// Maximum number of bytes (chars of UTF-8) that can be stored in a UNICHAR.
// Must be at least 4, so that any single UTF-8 encoded character fits. Must
// not exceed 31 without changing the coding of length.
#define UNICHAR_LEN 30
// Enforce the documented bounds at compile time so that an accidental edit of
// the macro cannot silently break the length coding.
static_assert(UNICHAR_LEN >= 4 && UNICHAR_LEN <= 31,
              "UNICHAR_LEN must be in the range [4, 31]");
34 
// Integer type used for the unique id of a unichar.
using UNICHAR_ID = int;

// Sentinel id that marks an invalid or uninitialized unichar.
static constexpr UNICHAR_ID INVALID_UNICHAR_ID = -1;
// The special unichar string that corresponds to INVALID_UNICHAR_ID.
static constexpr char INVALID_UNICHAR[] = "__INVALID_UNICHAR__";
42 
// Classifies a piece of text by the strongly directional characters it
// contains.
enum StrongScriptDirection {
  // Text contains only neutral characters.
  DIR_NEUTRAL = 0,
  // Text contains no Right-to-Left characters.
  DIR_LEFT_TO_RIGHT = 1,
  // Text contains no Left-to-Right characters.
  DIR_RIGHT_TO_LEFT = 2,
  // Text contains a mixture of left-to-right and right-to-left characters.
  DIR_MIX = 3,
};
50 
// Signed integer type that holds one Unicode code point, as produced by
// UNICHAR::UTF8ToUTF32 and consumed by UNICHAR::UTF32ToUTF8.
using char32 = int;
52 
53 // The UNICHAR class holds a single classification result. This may be
54 // a single Unicode character (stored as between 1 and 4 utf8 bytes) or
55 // multiple Unicode characters representing the NFKC expansion of a ligature
56 // such as fi, ffl etc. These are also stored as utf8.
57 class TESS_API UNICHAR {
58 public:
UNICHAR()59   UNICHAR() {
60     memset(chars, 0, UNICHAR_LEN);
61   }
62 
63   // Construct from a utf8 string. If len<0 then the string is null terminated.
64   // If the string is too long to fit in the UNICHAR then it takes only what
65   // will fit.
66   UNICHAR(const char *utf8_str, int len);
67 
68   // Construct from a single UCS4 character.
69   explicit UNICHAR(int unicode);
70 
71   // Default copy constructor and operator= are OK.
72 
73   // Get the first character as UCS-4.
74   int first_uni() const;
75 
76   // Get the length of the UTF8 string.
utf8_len()77   int utf8_len() const {
78     int len = chars[UNICHAR_LEN - 1];
79     return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN;
80   }
81 
82   // Get a UTF8 string, but NOT nullptr terminated.
utf8()83   const char *utf8() const {
84     return chars;
85   }
86 
87   // Get a terminated UTF8 string: Must delete[] it after use.
88   char *utf8_str() const;
89 
90   // Get the number of bytes in the first character of the given utf8 string.
91   static int utf8_step(const char *utf8_str);
92 
93   // A class to simplify iterating over and accessing elements of a UTF8
94   // string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or
95   // take ownership of the underlying byte array. It also does not permit
96   // modification of the array (as the name suggests).
97   //
98   // Example:
99   //   for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len);
100   //        it != UNICHAR::end(str, len);
101   //        ++it) {
102   //     tprintf("UCS-4 symbol code = %d\n", *it);
103   //     char buf[5];
104   //     int char_len = it.get_utf8(buf); buf[char_len] = '\0';
105   //     tprintf("Char = %s\n", buf);
106   //   }
107   class TESS_API const_iterator {
108     using CI = const_iterator;
109 
110   public:
111     // Step to the next UTF8 character.
112     // If the current position is at an illegal UTF8 character, then print an
113     // error message and step by one byte. If the current position is at a
114     // nullptr value, don't step past it.
115     const_iterator &operator++();
116 
117     // Return the UCS-4 value at the current position.
118     // If the current position is at an illegal UTF8 value, return a single
119     // space character.
120     int operator*() const;
121 
122     // Store the UTF-8 encoding of the current codepoint into buf, which must be
123     // at least 4 bytes long. Return the number of bytes written.
124     // If the current position is at an illegal UTF8 value, writes a single
125     // space character and returns 1.
126     // Note that this method does not null-terminate the buffer.
127     int get_utf8(char *buf) const;
128     // Returns the number of bytes of the current codepoint. Returns 1 if the
129     // current position is at an illegal UTF8 value.
130     int utf8_len() const;
131     // Returns true if the UTF-8 encoding at the current position is legal.
132     bool is_legal() const;
133 
134     // Return the pointer into the string at the current position.
utf8_data()135     const char *utf8_data() const {
136       return it_;
137     }
138 
139     // Iterator equality operators.
140     friend bool operator==(const CI &lhs, const CI &rhs) {
141       return lhs.it_ == rhs.it_;
142     }
143     friend bool operator!=(const CI &lhs, const CI &rhs) {
144       return !(lhs == rhs);
145     }
146 
147   private:
148     friend class UNICHAR;
const_iterator(const char * it)149     explicit const_iterator(const char *it) : it_(it) {}
150 
151     const char *it_; // Pointer into the string.
152   };
153 
154   // Create a start/end iterator pointing to a string. Note that these methods
155   // are static and do NOT create a copy or take ownership of the underlying
156   // array.
157   static const_iterator begin(const char *utf8_str, int byte_length);
158   static const_iterator end(const char *utf8_str, int byte_length);
159 
160   // Converts a utf-8 string to a vector of unicodes.
161   // Returns an empty vector if the input contains invalid UTF-8.
162   static std::vector<char32> UTF8ToUTF32(const char *utf8_str);
163   // Converts a vector of unicodes to a utf8 string.
164   // Returns an empty string if the input contains an invalid unicode.
165   static std::string UTF32ToUTF8(const std::vector<char32> &str32);
166 
167 private:
168   // A UTF-8 representation of 1 or more Unicode characters.
169   // The last element (chars[UNICHAR_LEN - 1]) is a length if
170   // its value < UNICHAR_LEN, otherwise it is a genuine character.
171   char chars[UNICHAR_LEN]{};
172 };
173 
174 } // namespace tesseract
175 
176 #endif // TESSERACT_CCUTIL_UNICHAR_H_
177