1 /* Copyright (C) 2004 J.F.Dockes
2  *   This program is free software; you can redistribute it and/or modify
3  *   it under the terms of the GNU General Public License as published by
4  *   the Free Software Foundation; either version 2 of the License, or
5  *   (at your option) any later version.
6  *
7  *   This program is distributed in the hope that it will be useful,
8  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
9  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10  *   GNU General Public License for more details.
11  *
12  *   You should have received a copy of the GNU General Public License
13  *   along with this program; if not, write to the
14  *   Free Software Foundation, Inc.,
15  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
16  */
17 #ifndef _UTF8ITER_H_INCLUDED_
18 #define _UTF8ITER_H_INCLUDED_
19 
20 #ifdef UTF8ITER_CHECK
21 #include "assert.h"
22 #endif
23 #include <string>
24 #include <cstdint>
25 
/**
 * A small helper class to iterate over utf-8 strings. This is not an
 * STL iterator and it does not do much error checking. It is designed purely
 * for recoll usage, where the utf-8 string comes out of iconv in most cases
 * and is assumed to be legal. We just try to catch the cases where there
 * would be a risk of crash.
 */
class Utf8Iter {
public:
    /**
     * Build an iterator positioned on the first character of @p in.
     *
     * Only the address of @p in is stored, no copy is made: the string must
     * stay alive for as long as the iterator is used. The cached byte offset
     * and character length are not refreshed if the string changes.
     */
    Utf8Iter(const std::string &in)
        : m_sp(&in) {
        update_cl();
    }

    /** Return the string being iterated over. */
    const std::string& buffer() const {
        return *m_sp;
    }

    /** Go back to the first character of the string. */
    void rewind() {
        m_cl = 0;
        m_pos = 0;
        m_charpos = 0;
        update_cl();
    }

    /**
     * Skip a single byte and re-evaluate the character at the new offset.
     *
     * The character counter is not advanced. Does nothing at eof.
     * NOTE(review): presumably meant for resynchronizing after error().
     */
    void retryfurther() {
        if (eof())
            return;
        m_pos++;
        // Always refresh the cached length, including when the skip lands on
        // the end of the string: update_cl() then resets it to 0. Keeping a
        // stale non-zero length at eof would let operator*() and
        // appendchartostring() use the string terminator as data and let
        // operator++() move the offset past the end of the string.
        update_cl();
    }

    /**
     * "Direct" access. Awfully inefficient as we skip from the start or, at
     * best, from the current position. This can only be useful for a
     * lookahead from the current position.
     *
     * @param charpos character (not byte) index of the wanted character.
     * @return the code point value, or uint32_t(-1) if @p charpos is beyond
     *   the end of the string or if a badly encoded character is met on the
     *   way.
     */
    uint32_t operator[](std::string::size_type charpos) const {
        // Scan from the current position when the target is not behind us,
        // else restart from the beginning of the string.
        std::string::size_type mypos = 0;
        unsigned int mycp = 0;
        if (charpos >= m_charpos) {
            mypos = m_pos;
            mycp = m_charpos;
        }
        while (mypos < m_sp->length() && mycp != charpos) {
            const int l = get_cl(mypos);
            if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l))
                return uint32_t(-1);
            mypos += l;
            ++mycp;
        }
        if (mypos < m_sp->length() && mycp == charpos) {
            const int l = get_cl(mypos);
            if (poslok(mypos, l) && checkvalidat(mypos, l))
                return getvalueat(mypos, l);
        }
        return uint32_t(-1);
    }

    /**
     * Increment current position to next utf-8 char.
     *
     * @return the new byte offset, or std::string::npos if there is no valid
     *   current character to step over (error or eof), in which case the
     *   position is left unchanged.
     */
    std::string::size_type operator++(int) {
        // Note: m_cl may be zero at eof if user's test not right
        // this shouldn't crash the program until actual data access
#ifdef UTF8ITER_CHECK
        assert(m_cl != 0);
#endif
        if (m_cl == 0)
            return std::string::npos;

        m_pos += m_cl;
        m_charpos++;
        update_cl();
        return m_pos;
    }

    /**
     * operator* returns the ucs4 value as a machine integer, or
     * uint32_t(-1) if there is no valid current character.
     */
    uint32_t operator*() {
#ifdef UTF8ITER_CHECK
        assert(m_cl > 0);
#endif
        return m_cl == 0 ? uint32_t(-1) : getvalueat(m_pos, m_cl);
    }

    /**
     * Append current utf-8 possibly multi-byte character to string param.
     * This needs to be fast. No error checking.
     *
     * @param out string to append to.
     * @return the count of bytes appended (0 if no valid current character).
     */
    unsigned int appendchartostring(std::string &out) const {
#ifdef UTF8ITER_CHECK
        assert(m_cl != 0);
#endif
        out.append(&(*m_sp)[m_pos], m_cl);
        return m_cl;
    }

    /**
     * Return current character as string (empty if there is no valid
     * current character).
     */
    operator std::string() {
#ifdef UTF8ITER_CHECK
        assert(m_cl != 0);
#endif
        return m_cl > 0 ? m_sp->substr(m_pos, m_cl) : std::string();
    }

    /** True when the current byte offset is at (or beyond) the string end. */
    bool eof() const {
        // update_cl() uses the same >= test. m_pos is not expected to ever
        // exceed the length, this is just the safe form of the comparison.
        return m_pos >= m_sp->length();
    }

    /**
     * True when there is no valid character at the current position. Note
     * that this is also the case at eof: test eof() first to tell an
     * encoding error from the end of the string.
     */
    bool error() const {
        return m_cl == 0;
    }

    /** Return current byte offset in input string */
    std::string::size_type getBpos() const {
        return m_pos;
    }

    /** Return current character length in bytes (0 on error or at eof) */
    std::string::size_type getBlen() const {
        return m_cl;
    }

    /** Return current unicode character offset in input string */
    std::string::size_type getCpos() const {
        return m_charpos;
    }

private:
    // String we're working with. Not owned.
    const std::string*     m_sp;
    // Character length at current position. A value of zero indicates
    // an error (or eof).
    unsigned int m_cl{0};
    // Current byte offset in string.
    std::string::size_type m_pos{0};
    // Current character position
    unsigned int      m_charpos{0};

    // Byte at offset p, as an unsigned value. p may be equal to the string
    // length (const std::string::operator[] then yields the terminator).
    uint8_t bytev(std::string::size_type p) const {
        return static_cast<uint8_t>((*m_sp)[p]);
    }

    // Check position and cl against string length: true if the l bytes
    // starting at p all lie inside the string.
    bool poslok(std::string::size_type p, int l) const {
        return p != std::string::npos && l > 0 && p + l <= m_sp->length();
    }

    // Update current char length in object state, check for errors.
    // m_cl ends up as 0 at eof, if the lead byte is not acceptable, if the
    // sequence would run past the end of the string, or if a continuation
    // byte is wrong.
    inline void update_cl() {
        m_cl = 0;
        if (m_pos >= m_sp->length())
            return;
        const int len = get_cl(m_pos);
        // Used to set eof here for safety when the check failed, but this
        // is bad because it basically prevents the caller from
        // discriminating error and eof. So m_pos is left alone.
        if (poslok(m_pos, len) && checkvalidat(m_pos, len))
            m_cl = static_cast<unsigned int>(len);
    }

    // Check that the l bytes at p have the bit patterns of an l-byte utf-8
    // sequence (lead byte marker, then 10xxxxxx continuation bytes). The
    // caller must have checked that p + l does not exceed the string length.
    // Returns false for l outside 1..4.
    inline bool checkvalidat(std::string::size_type p, int l) const {
        switch (l) {
        case 1:
            return bytev(p) < 128;
        case 2:
            return (bytev(p) & 224) == 192
                && (bytev(p + 1) & 192) == 128;
        case 3:
            return (bytev(p) & 240) == 224
                && (bytev(p + 1) & 192) == 128
                && (bytev(p + 2) & 192) == 128;
        case 4:
            return (bytev(p) & 248) == 240
                && (bytev(p + 1) & 192) == 128
                && (bytev(p + 2) & 192) == 128
                && (bytev(p + 3) & 192) == 128;
        default:
            return false;
        }
    }

    // Get character byte length at specified position, from the lead byte
    // only. Returns 0 for error.
    inline int get_cl(std::string::size_type p) const {
        const unsigned int z = bytev(p);
        if (z <= 127) {
            return 1;
        } else if ((z & 224) == 192) {
            return 2;
        } else if ((z & 240) == 224) {
            return 3;
        } else if ((z & 248) == 240) {
            return 4;
        }
#ifdef UTF8ITER_CHECK
        // Only reached for an unacceptable lead byte: always fails.
        assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
               (z & 248) == 240);
#endif
        return 0;
    }

    // Compute value at given position. No error checking: the l bytes at p
    // must already have been accepted by poslok() and checkvalidat().
    // The payload bits of each byte are extracted with a mask, which for a
    // validated sequence is the same as subtracting the marker bits.
    // Returns uint32_t(-1) for l outside 1..4.
    inline unsigned int getvalueat(std::string::size_type p, int l) const {
        switch (l) {
        case 1:
#ifdef UTF8ITER_CHECK
            assert(checkvalidat(p, 1));
#endif
            return bytev(p);
        case 2:
#ifdef UTF8ITER_CHECK
            assert(checkvalidat(p, 2));
#endif
            return (bytev(p) & 31u) * 64
                + (bytev(p + 1) & 63u);
        case 3:
#ifdef UTF8ITER_CHECK
            assert(checkvalidat(p, 3));
#endif
            return (bytev(p) & 15u) * 4096
                + (bytev(p + 1) & 63u) * 64
                + (bytev(p + 2) & 63u);
        case 4:
#ifdef UTF8ITER_CHECK
            assert(checkvalidat(p, 4));
#endif
            return (bytev(p) & 7u) * 262144
                + (bytev(p + 1) & 63u) * 4096
                + (bytev(p + 2) & 63u) * 64
                + (bytev(p + 3) & 63u);
        default:
#ifdef UTF8ITER_CHECK
            assert(l <= 4);
#endif
            return uint32_t(-1);
        }
    }

};
282 
283 
/** Values for the flags parameter of utf8truncate(). */
enum Utf8TruncateFlag {
    UTF8T_NONE = 0,
    UTF8T_ATWORD = 1,
    UTF8T_ELLIPSIS = 2
};
285 
/** Truncate utf8 string, maintaining encoding integrity
 * @param s input string to be modified in place
 * @param maxlen maximum size after truncation in bytes
 * @param flags Specify cutting at word position, adding an ellipsis
 *   (see Utf8TruncateFlag). The default value of 0 is UTF8T_NONE.
 * @param ellipsis text of the ellipsis, "..." by default.
 *   NOTE(review): presumably only used with UTF8T_ELLIPSIS - confirm
 *   against the implementation.
 * @param ws set of bytes, by default space, tab, newline and carriage
 *   return. NOTE(review): presumably the word separators used with
 *   UTF8T_ATWORD - confirm against the implementation.
 */
void utf8truncate(std::string& s, int maxlen, int flags = 0,
                  const std::string& ellipsis = "...",
                  const std::string& ws = " \t\n\r");
294 
/** Compute length in characters of utf-8 string
 * @param s the utf-8 string to measure
 * @return the count of characters (as opposed to bytes) in s
 */
size_t utf8len(const std::string& s);
297 
/** @brief Check and possibly fix string by replacing badly encoded
 * characters with the standard question mark replacement character.
 *
 * @param in the string to check
 * @param fixit if true, copy a fixed string to out. Defaults to false.
 * @param[out] out if fixit is true, the fixed output string. Defaults to
 *   nullptr.
 * @param maxrepl maximum replacements before we bail out. Defaults to 100.
 * @return -1 for failure (fixit false or maxrepl reached).
 *   0 or positive: replacement count.
 */
int utf8check(
    const std::string& in, bool fixit=false, std::string* out = nullptr, int maxrepl=100);
310 
311 #endif /* _UTF8ITER_H_INCLUDED_ */
312