1 /***************************************************************************
2 * Copyright (C) 2010~2010 by CSSlayer *
3 * wengxt@gmail.com *
4 * *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU General Public License as published by *
7 * the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
9 * *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU General Public License for more details. *
14 * *
15 * You should have received a copy of the GNU General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. *
19 ***************************************************************************/
20
21 #include <string.h>
22 #include "fcitx/fcitx.h"
23 #include "fcitx-utils/utf8.h"
24
/*
 * Shorthands used by the decoders in this file. Both expect a byte pointer
 * (or array) named `in` to be visible where they are expanded.
 *
 * CONT(i)   - passes byte i of `in` to ISUTF8_CB (presumably the
 *             continuation-byte test from fcitx-utils/utf8.h; confirm there).
 * VAL(i, s) - low six bits of byte i of `in`, shifted left by s bits.
 *
 * The arguments are parenthesized so compound expressions expand safely.
 */
#define CONT(i) ISUTF8_CB(in[(i)])
#define VAL(i, s) ((in[(i)] & 0x3f) << (s))
27
/*
 * Number of bytes the (up to 6-byte) UTF-8 form of Char occupies: one byte,
 * plus one more for every length threshold that Char reaches.
 * Char is expanded several times, so it must be free of side effects.
 */
#define UTF8_LENGTH(Char)        \
    (1 + ((Char) >= 0x80)        \
       + ((Char) >= 0x800)       \
       + ((Char) >= 0x10000)     \
       + ((Char) >= 0x200000)    \
       + ((Char) >= 0x4000000))
34
/*
 * Non-zero when Char passes all of the checks below. A value is rejected if
 *   - it is 0x110000 or larger,
 *   - it lies in 0xD800-0xDFFF,
 *   - it lies in 0xFDD0-0xFDEF, or
 *   - its low 16 bits are 0xFFFE or 0xFFFF.
 * Char is expanded several times, so it must be free of side effects.
 */
#define UNICODE_VALID(Char)                        \
    (!((Char) >= 0x110000 ||                       \
       ((Char) & 0xFFFFF800) == 0xD800 ||          \
       ((Char) >= 0xFDD0 && (Char) <= 0xFDEF) ||   \
       ((Char) & 0xFFFE) == 0xFFFE))
40
41 FCITX_EXPORT_API
42 size_t
fcitx_utf8_strlen(const char * s)43 fcitx_utf8_strlen(const char *s)
44 {
45 unsigned int l = 0;
46
47 while (*s) {
48 uint32_t chr;
49
50 s = fcitx_utf8_get_char(s, &chr);
51 l++;
52 }
53
54 return l;
55 }
56
57 FCITX_EXPORT_API
fcitx_utf8_char_len(const char * in)58 int fcitx_utf8_char_len(const char *in)
59 {
60 if (!(in[0] & 0x80))
61 return 1;
62
63 /* 2-byte, 0x80-0x7ff */
64 if ((in[0] & 0xe0) == 0xc0 && CONT(1))
65 return 2;
66
67 /* 3-byte, 0x800-0xffff */
68 if ((in[0] & 0xf0) == 0xe0 && CONT(1) && CONT(2))
69 return 3;
70
71 /* 4-byte, 0x10000-0x1FFFFF */
72 if ((in[0] & 0xf8) == 0xf0 && CONT(1) && CONT(2) && CONT(3))
73 return 4;
74
75 /* 5-byte, 0x200000-0x3FFFFFF */
76 if ((in[0] & 0xfc) == 0xf8 && CONT(1) && CONT(2) && CONT(3) && CONT(4))
77 return 5;
78
79 /* 6-byte, 0x400000-0x7FFFFFF */
80 if ((in[0] & 0xfe) == 0xfc && CONT(1) && CONT(2) && CONT(3) && CONT(4) && CONT(5))
81 return 6;
82
83 return 1;
84 }
85
/*
 * Number of bytes fcitx_ucs4_to_utf8() writes for c (not counting the
 * terminating NUL).
 *
 * A 5-byte sequence carries 26 payload bits, so it covers values below
 * 0x4000000; anything larger needs the 6-byte form.
 */
FCITX_EXPORT_API
int fcitx_ucs4_char_len(uint32_t c)
{
    if (c < 0x00080) {
        return 1;
    } else if (c < 0x00800) {
        return 2;
    } else if (c < 0x10000) {
        return 3;
    } else if (c < 0x200000) {
        return 4;
        // below is not in UCS4 but in 32bit int.
    } else if (c < 0x4000000) {
        return 5;
    } else {
        return 6;
    }
}

/*
 * Encode c into output and append a terminating NUL.
 *
 * output must have room for up to 7 bytes (6 encoded bytes plus the NUL).
 * Returns the number of encoded bytes, which matches fcitx_ucs4_char_len(c).
 * The 6-byte form stores 31 bits, so bit 31 of c is not representable and
 * is dropped.
 */
FCITX_EXPORT_API
int fcitx_ucs4_to_utf8(uint32_t c, char *output)
{
    if (c < 0x00080) {
        output[0] = (char) (c & 0xFF);
        output[1] = '\0';
        return 1;
    } else if (c < 0x00800) {
        output[0] = (char) (0xC0 + ((c >> 6) & 0x1F));
        output[1] = (char) (0x80 + (c & 0x3F));
        output[2] = '\0';
        return 2;
    } else if (c < 0x10000) {
        output[0] = (char) (0xE0 + ((c >> 12) & 0x0F));
        output[1] = (char) (0x80 + ((c >> 6) & 0x3F));
        output[2] = (char) (0x80 + (c & 0x3F));
        output[3] = '\0';
        return 3;
    } else if (c < 0x200000) {
        output[0] = (char) (0xF0 + ((c >> 18) & 0x07));
        output[1] = (char) (0x80 + ((c >> 12) & 0x3F));
        output[2] = (char) (0x80 + ((c >> 6) & 0x3F));
        output[3] = (char) (0x80 + (c & 0x3F));
        output[4] = '\0';
        return 4;
        // below is not in UCS4 but in 32bit int.
    } else if (c < 0x4000000) {
        /* The lead byte holds only 2 payload bits, hence the 26-bit limit. */
        output[0] = (char) (0xF8 + ((c >> 24) & 0x03));
        output[1] = (char) (0x80 + ((c >> 18) & 0x3F));
        output[2] = (char) (0x80 + ((c >> 12) & 0x3F));
        output[3] = (char) (0x80 + ((c >> 6) & 0x3F));
        output[4] = (char) (0x80 + (c & 0x3F));
        output[5] = '\0';
        return 5;
    } else {
        output[0] = (char) (0xFC + ((c >> 30) & 0x01));
        output[1] = (char) (0x80 + ((c >> 24) & 0x3F));
        output[2] = (char) (0x80 + ((c >> 18) & 0x3F));
        output[3] = (char) (0x80 + ((c >> 12) & 0x3F));
        output[4] = (char) (0x80 + ((c >> 6) & 0x3F));
        output[5] = (char) (0x80 + (c & 0x3F));
        output[6] = '\0';
        return 6;
    }
}
151
152 FCITX_EXPORT_API
153 char *
fcitx_utf8_get_char(const char * i,uint32_t * chr)154 fcitx_utf8_get_char(const char *i, uint32_t *chr)
155 {
156 const unsigned char* in = (const unsigned char *)i;
157 if (!(in[0] & 0x80)) {
158 *(chr) = *(in);
159 return (char *)in + 1;
160 }
161
162 /* 2-byte, 0x80-0x7ff */
163 if ((in[0] & 0xe0) == 0xc0 && CONT(1)) {
164 *chr = ((in[0] & 0x1f) << 6) | VAL(1, 0);
165 return (char *)in + 2;
166 }
167
168 /* 3-byte, 0x800-0xffff */
169 if ((in[0] & 0xf0) == 0xe0 && CONT(1) && CONT(2)) {
170 *chr = ((in[0] & 0xf) << 12) | VAL(1, 6) | VAL(2, 0);
171 return (char *)in + 3;
172 }
173
174 /* 4-byte, 0x10000-0x1FFFFF */
175 if ((in[0] & 0xf8) == 0xf0 && CONT(1) && CONT(2) && CONT(3)) {
176 *chr = ((in[0] & 0x7) << 18) | VAL(1, 12) | VAL(2, 6) | VAL(3, 0);
177 return (char *)in + 4;
178 }
179
180 /* 5-byte, 0x200000-0x3FFFFFF */
181 if ((in[0] & 0xfc) == 0xf8 && CONT(1) && CONT(2) && CONT(3) && CONT(4)) {
182 *chr = ((in[0] & 0x3) << 24) | VAL(1, 18) | VAL(2, 12) | VAL(3, 6) | VAL(4, 0);
183 return (char *)in + 5;
184 }
185
186 /* 6-byte, 0x400000-0x7FFFFFF */
187 if ((in[0] & 0xfe) == 0xfc && CONT(1) && CONT(2) && CONT(3) && CONT(4) && CONT(5)) {
188 *chr = ((in[0] & 0x1) << 30) | VAL(1, 24) | VAL(2, 18) | VAL(3, 12) | VAL(4, 6) | VAL(5, 0);
189 return (char *)in + 6;
190 }
191
192 *chr = *in;
193
194 return (char *)in + 1;
195 }
196
197 FCITX_EXPORT_API
fcitx_utf8_strncmp(const char * s1,const char * s2,int n)198 int fcitx_utf8_strncmp(const char *s1, const char *s2, int n)
199 {
200 // Seems to work.
201 uint32_t c1, c2;
202 int i;
203
204 for (i = 0; i < n; i++) {
205 if (!(*s1 & 0x80)) {
206 if (*s1 != *s2)
207 return 1;
208
209 if (*s1 == 0)
210 return 0;
211
212 s1 ++;
213
214 s2 ++;
215 } else {
216 s1 = fcitx_utf8_get_char(s1, &c1);
217 s2 = fcitx_utf8_get_char(s2, &c2);
218
219 if (c1 != c2)
220 return 1;
221 }
222 }
223
224 return 0;
225 }
226
227 FCITX_EXPORT_API
fcitx_utf8_get_nth_char(const char * s,uint32_t n)228 char* fcitx_utf8_get_nth_char(const char* s, uint32_t n)
229 {
230 unsigned int l = 0;
231
232 while (*s && l < n) {
233 uint32_t chr;
234
235 s = fcitx_utf8_get_char(s, &chr);
236 l++;
237 }
238
239 return (char*)s;
240 }
241
242 FCITX_EXPORT_API
243 int
fcitx_utf8_get_char_extended(const char * s,int max_len)244 fcitx_utf8_get_char_extended(const char *s,
245 int max_len)
246 {
247 const unsigned char*p = (const unsigned char*)s;
248 int i, len;
249 unsigned int wc = (unsigned char) * p;
250
251 if (wc < 0x80) {
252 return wc;
253 } else if (wc < 0xc0) {
254 return (unsigned int) - 1;
255 } else if (wc < 0xe0) {
256 len = 2;
257 wc &= 0x1f;
258 } else if (wc < 0xf0) {
259 len = 3;
260 wc &= 0x0f;
261 } else if (wc < 0xf8) {
262 len = 4;
263 wc &= 0x07;
264 } else if (wc < 0xfc) {
265 len = 5;
266 wc &= 0x03;
267 } else if (wc < 0xfe) {
268 len = 6;
269 wc &= 0x01;
270 } else {
271 return (unsigned int) - 1;
272 }
273
274 if (max_len >= 0 && len > max_len) {
275 for (i = 1; i < max_len; i++) {
276 if ((((unsigned char *)p)[i] & 0xc0) != 0x80)
277 return (unsigned int) - 1;
278 }
279
280 return (unsigned int) - 2;
281 }
282
283 for (i = 1; i < len; ++i) {
284 unsigned int ch = ((unsigned char *)p)[i];
285
286 if ((ch & 0xc0) != 0x80) {
287 if (ch)
288 return (unsigned int) - 1;
289 else
290 return (unsigned int) - 2;
291 }
292
293 wc <<= 6;
294
295 wc |= (ch & 0x3f);
296 }
297
298 if (UTF8_LENGTH(wc) != len)
299 return (unsigned int) - 1;
300
301 return wc;
302 }
303
304 FCITX_EXPORT_API
fcitx_utf8_get_char_validated(const char * p,int max_len)305 int fcitx_utf8_get_char_validated(const char *p,
306 int max_len)
307 {
308 int result;
309
310 if (max_len == 0)
311 return -2;
312
313 result = fcitx_utf8_get_char_extended(p, max_len);
314
315 if (result & 0x80000000)
316 return result;
317 else if (!UNICODE_VALID(result))
318 return -1;
319 else
320 return result;
321 }
322
323 FCITX_EXPORT_API
fcitx_utf8_check_string(const char * s)324 int fcitx_utf8_check_string(const char *s)
325 {
326 while (*s) {
327 uint32_t chr;
328
329 if (fcitx_utf8_get_char_validated(s, 6) < 0)
330 return 0;
331
332 s = fcitx_utf8_get_char(s, &chr);
333 }
334
335 return 1;
336 }
337
338 FCITX_EXPORT_API
fcitx_utf8_strncpy(char * str,const char * s,size_t byte)339 void fcitx_utf8_strncpy(char* str, const char* s, size_t byte)
340 {
341 while (*s) {
342 uint32_t chr;
343
344 const char* next = fcitx_utf8_get_char(s, &chr);
345 size_t diff = next - s;
346 if (byte < diff)
347 break;
348
349 memcpy(str, s, diff);
350 str += diff;
351 byte -= diff;
352 s = next;
353 }
354
355 while(byte --) {
356 *str = '\0';
357 str++;
358 }
359 }
360
361 FCITX_EXPORT_API
fcitx_utf8_strnlen(const char * str,size_t byte)362 size_t fcitx_utf8_strnlen(const char* str, size_t byte)
363 {
364 size_t len = 0;
365 while (*str && byte > 0) {
366 uint32_t chr;
367
368 const char* next = fcitx_utf8_get_char(str, &chr);
369 size_t diff = next - str;
370 if (byte < diff)
371 break;
372
373 byte -= diff;
374 str = next;
375 len++;
376 }
377 return len;
378 }
379
/*
 * Find the start of the trailing run of bytes without the high bit set in
 * the first len bytes of string.
 *
 * Returns NULL when string is NULL. Otherwise returns a pointer just past
 * the last byte (within the first len bytes) that has the high bit set, or
 * string itself if no such byte exists. With len == 0 it returns string.
 *
 * The scan looks at the byte before the cursor instead of stepping the
 * cursor first, so the cursor never moves in front of `string`.
 */
FCITX_EXPORT_API
char*
fcitx_utils_get_ascii_partn(char *string, size_t len)
{
    if (!string)
        return NULL;

    char *s = string + len;
    while (s > string && !(s[-1] & 0x80)) {
        s--;
    }
    return s;
}
392
393 FCITX_EXPORT_API
394 char*
fcitx_utils_get_ascii_part(char * string)395 fcitx_utils_get_ascii_part(char *string)
396 {
397 if (!string)
398 return NULL;
399 return fcitx_utils_get_ascii_partn(string, strlen(string));
400 }
401
/* 1 if c is a non-NUL byte whose high bit (0x80) is clear, otherwise 0. */
static inline int
is_valid_ascii(char c)
{
    return c != '\0' && (c & 0x80) == 0;
}
407
408 FCITX_EXPORT_API
409 char*
fcitx_utils_get_ascii_endn(const char * string,size_t len)410 fcitx_utils_get_ascii_endn(const char *string, size_t len)
411 {
412 if (!string)
413 return NULL;
414 const char *end = string + len;
415 for (;string < end && is_valid_ascii(*string);string++) {
416 }
417 return (char*)string;
418 }
419
420 FCITX_EXPORT_API
421 char*
fcitx_utils_get_ascii_end(const char * string)422 fcitx_utils_get_ascii_end(const char *string)
423 {
424 if (!string)
425 return NULL;
426 for (;is_valid_ascii(*string);string++) {
427 }
428 return (char*)string;
429 }
430
431 // kate: indent-mode cstyle; space-indent on; indent-width 0;
432