1 /*
2 * gutf8.c: UTF-8 conversion
3 *
4 * Author:
5 * Atsushi Enomoto <atsushi@ximian.com>
6 *
7 * (C) 2006 Novell, Inc.
8 * Copyright 2012 Xamarin Inc
9 */
10
11 #include <stdio.h>
12 #include <glib.h>
13
14 /*
15 * Index into the table below with the first byte of a UTF-8 sequence to get
16 * the number of bytes that are supposed to follow it to complete the sequence.
17 *
18 * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is left
19 * as-is for anyone who may want to do such conversion, which was allowed in
20 * earlier algorithms.
21 */
22 const guchar g_utf8_jump_table[256] = {
23 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
24 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
25 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
26 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
27 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
28 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
29 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
30 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
31 };
32
33 static gchar *
utf8_case_conv(const gchar * str,gssize len,gboolean upper)34 utf8_case_conv (const gchar *str, gssize len, gboolean upper)
35 {
36 gunichar *ustr;
37 glong i, ulen;
38 gchar *utf8;
39
40 ustr = g_utf8_to_ucs4_fast (str, (glong) len, &ulen);
41 for (i = 0; i < ulen; i++)
42 ustr[i] = upper ? g_unichar_toupper (ustr[i]) : g_unichar_tolower (ustr[i]);
43 utf8 = g_ucs4_to_utf8 (ustr, ulen, NULL, NULL, NULL);
44 g_free (ustr);
45
46 return utf8;
47 }
48
49 gchar *
g_utf8_strup(const gchar * str,gssize len)50 g_utf8_strup (const gchar *str, gssize len)
51 {
52 return utf8_case_conv (str, len, TRUE);
53 }
54
55 gchar *
g_utf8_strdown(const gchar * str,gssize len)56 g_utf8_strdown (const gchar *str, gssize len)
57 {
58 return utf8_case_conv (str, len, FALSE);
59 }
60
61 static gboolean
utf8_validate(const unsigned char * inptr,size_t len)62 utf8_validate (const unsigned char *inptr, size_t len)
63 {
64 const unsigned char *ptr = inptr + len;
65 unsigned char c;
66
67 /* Everything falls through when TRUE... */
68 switch (len) {
69 default:
70 return FALSE;
71 case 4:
72 if ((c = (*--ptr)) < 0x80 || c > 0xBF)
73 return FALSE;
74
75 if ((c == 0xBF || c == 0xBE) && ptr[-1] == 0xBF) {
76 if (ptr[-2] == 0x8F || ptr[-2] == 0x9F ||
77 ptr[-2] == 0xAF || ptr[-2] == 0xBF)
78 return FALSE;
79 }
80 case 3:
81 if ((c = (*--ptr)) < 0x80 || c > 0xBF)
82 return FALSE;
83 case 2:
84 if ((c = (*--ptr)) < 0x80 || c > 0xBF)
85 return FALSE;
86
87 /* no fall-through in this inner switch */
88 switch (*inptr) {
89 case 0xE0: if (c < 0xA0) return FALSE; break;
90 case 0xED: if (c > 0x9F) return FALSE; break;
91 case 0xEF: if (c == 0xB7 && (ptr[1] > 0x8F && ptr[1] < 0xB0)) return FALSE;
92 if (c == 0xBF && (ptr[1] == 0xBE || ptr[1] == 0xBF)) return FALSE;
93 break;
94 case 0xF0: if (c < 0x90) return FALSE; break;
95 case 0xF4: if (c > 0x8F) return FALSE; break;
96 default: if (c < 0x80) return FALSE; break;
97 }
98 case 1: if (*inptr >= 0x80 && *inptr < 0xC2) return FALSE;
99 }
100
101 if (*inptr > 0xF4)
102 return FALSE;
103
104 return TRUE;
105 }
106
107 /**
108 * g_utf8_validate:
109 * @str: a utf-8 encoded string
110 * @max_len: max number of bytes to validate (or -1 to validate the entire null-terminated string)
111 * @end: output parameter to mark the end of the valid input
112 *
113 * Checks @utf for being valid UTF-8. @str is assumed to be
114 * null-terminated. This function is not super-strict, as it will
115 * allow longer UTF-8 sequences than necessary. Note that Java is
116 * capable of producing these sequences if provoked. Also note, this
117 * routine checks for the 4-byte maximum size, but does not check for
118 * 0x10ffff maximum value.
119 *
120 * Return value: %TRUE if @str is valid or %FALSE otherwise.
121 **/
122 gboolean
g_utf8_validate(const gchar * str,gssize max_len,const gchar ** end)123 g_utf8_validate (const gchar *str, gssize max_len, const gchar **end)
124 {
125 guchar *inptr = (guchar *) str;
126 gboolean valid = TRUE;
127 guint length, min;
128 gssize n = 0;
129
130 if (max_len == 0)
131 return FALSE;
132
133 if (max_len < 0) {
134 while (*inptr != 0) {
135 length = g_utf8_jump_table[*inptr];
136 if (!utf8_validate (inptr, length)) {
137 valid = FALSE;
138 break;
139 }
140
141 inptr += length;
142 }
143 } else {
144 while (n < max_len) {
145 if (*inptr == 0) {
146 /* Note: return FALSE if we encounter nul-byte
147 * before max_len is reached. */
148 valid = FALSE;
149 break;
150 }
151
152 length = g_utf8_jump_table[*inptr];
153 min = MIN (length, max_len - n);
154
155 if (!utf8_validate (inptr, min)) {
156 valid = FALSE;
157 break;
158 }
159
160 if (min < length) {
161 valid = FALSE;
162 break;
163 }
164
165 inptr += length;
166 n += length;
167 }
168 }
169
170 if (end != NULL)
171 *end = (gchar *) inptr;
172
173 return valid;
174 }
175
176 gunichar
g_utf8_get_char_validated(const gchar * str,gssize max_len)177 g_utf8_get_char_validated (const gchar *str, gssize max_len)
178 {
179 unsigned char *inptr = (unsigned char *) str;
180 gunichar u = *inptr;
181 int n, i;
182
183 if (max_len == 0)
184 return -2;
185
186 if (u < 0x80) {
187 /* simple ascii case */
188 return u;
189 } else if (u < 0xc2) {
190 return -1;
191 } else if (u < 0xe0) {
192 u &= 0x1f;
193 n = 2;
194 } else if (u < 0xf0) {
195 u &= 0x0f;
196 n = 3;
197 } else if (u < 0xf8) {
198 u &= 0x07;
199 n = 4;
200 } else if (u < 0xfc) {
201 u &= 0x03;
202 n = 5;
203 } else if (u < 0xfe) {
204 u &= 0x01;
205 n = 6;
206 } else {
207 return -1;
208 }
209
210 if (max_len > 0) {
211 if (!utf8_validate (inptr, MIN (max_len, n)))
212 return -1;
213
214 if (max_len < n)
215 return -2;
216 } else {
217 if (!utf8_validate (inptr, n))
218 return -1;
219 }
220
221 for (i = 1; i < n; i++)
222 u = (u << 6) | (*++inptr ^ 0x80);
223
224 return u;
225 }
226
227 glong
g_utf8_strlen(const gchar * str,gssize max_len)228 g_utf8_strlen (const gchar *str, gssize max_len)
229 {
230 const guchar *inptr = (const guchar *) str;
231 glong clen = 0, len = 0, n;
232
233 if (max_len == 0)
234 return 0;
235
236 if (max_len < 0) {
237 while (*inptr) {
238 inptr += g_utf8_jump_table[*inptr];
239 len++;
240 }
241 } else {
242 while (len < max_len && *inptr) {
243 n = g_utf8_jump_table[*inptr];
244 if ((clen + n) > max_len)
245 break;
246
247 inptr += n;
248 clen += n;
249 len++;
250 }
251 }
252
253 return len;
254 }
255
256 gunichar
g_utf8_get_char(const gchar * src)257 g_utf8_get_char (const gchar *src)
258 {
259 unsigned char *inptr = (unsigned char *) src;
260 gunichar u = *inptr;
261 int n, i;
262
263 if (u < 0x80) {
264 /* simple ascii case */
265 return u;
266 } else if (u < 0xe0) {
267 u &= 0x1f;
268 n = 2;
269 } else if (u < 0xf0) {
270 u &= 0x0f;
271 n = 3;
272 } else if (u < 0xf8) {
273 u &= 0x07;
274 n = 4;
275 } else if (u < 0xfc) {
276 u &= 0x03;
277 n = 5;
278 } else {
279 u &= 0x01;
280 n = 6;
281 }
282
283 for (i = 1; i < n; i++)
284 u = (u << 6) | (*++inptr ^ 0x80);
285
286 return u;
287 }
288
289 gchar *
g_utf8_find_prev_char(const gchar * str,const gchar * p)290 g_utf8_find_prev_char (const gchar *str, const gchar *p)
291 {
292 while (p > str) {
293 p--;
294 if ((*p & 0xc0) != 0xb0)
295 return (gchar *)p;
296 }
297 return NULL;
298 }
299
300 gchar *
g_utf8_prev_char(const gchar * str)301 g_utf8_prev_char (const gchar *str)
302 {
303 const gchar *p = str;
304 do {
305 p--;
306 } while ((*p & 0xc0) == 0xb0);
307
308 return (gchar *)p;
309 }
310
311 gchar *
g_utf8_offset_to_pointer(const gchar * str,glong offset)312 g_utf8_offset_to_pointer (const gchar *str, glong offset)
313 {
314 const gchar *p = str;
315
316 if (offset > 0) {
317 do {
318 p = g_utf8_next_char (p);
319 offset --;
320 } while (offset > 0);
321 }
322 else if (offset < 0) {
323 const gchar *jump = str;
324 do {
325 // since the minimum size of a character is 1
326 // we know we can step back at least offset bytes
327 jump = jump + offset;
328
329 // if we land in the middle of a character
330 // walk to the beginning
331 while ((*jump & 0xc0) == 0xb0)
332 jump --;
333
334 // count how many characters we've actually walked
335 // by going forward
336 p = jump;
337 do {
338 p = g_utf8_next_char (p);
339 offset ++;
340 } while (p < jump);
341
342 } while (offset < 0);
343 }
344
345 return (gchar *)p;
346 }
347
348 glong
g_utf8_pointer_to_offset(const gchar * str,const gchar * pos)349 g_utf8_pointer_to_offset (const gchar *str, const gchar *pos)
350 {
351 const gchar *inptr, *inend;
352 glong offset = 0;
353 glong sign = 1;
354
355 if (pos == str)
356 return 0;
357
358 if (str < pos) {
359 inptr = str;
360 inend = pos;
361 } else {
362 inptr = pos;
363 inend = str;
364 sign = -1;
365 }
366
367 do {
368 inptr = g_utf8_next_char (inptr);
369 offset++;
370 } while (inptr < inend);
371
372 return offset * sign;
373 }
374