1 /* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */
2 /* cairo - a vector graphics library with display and print output
3 *
4 * The code in this file is derived from GLib's gutf8.c and
5 * ultimately from libunicode. It is relicensed under the
6 * dual LGPL/MPL with permission of the original authors.
7 *
8 * Copyright © 1999 Tom Tromey
9 * Copyright © 2005 Red Hat, Inc
10 *
11 * This library is free software; you can redistribute it and/or
12 * modify it either under the terms of the GNU Lesser General Public
13 * License version 2.1 as published by the Free Software Foundation
14 * (the "LGPL") or, at your option, under the terms of the Mozilla
15 * Public License Version 1.1 (the "MPL"). If you do not alter this
16 * notice, a recipient may use your version of this file under either
17 * the MPL or the LGPL.
18 *
19 * You should have received a copy of the LGPL along with this library
20 * in the file COPYING-LGPL-2.1; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335, USA
22 * You should have received a copy of the MPL along with this library
23 * in the file COPYING-MPL-1.1
24 *
25 * The contents of this file are subject to the Mozilla Public License
26 * Version 1.1 (the "License"); you may not use this file except in
27 * compliance with the License. You may obtain a copy of the License at
28 * http://www.mozilla.org/MPL/
29 *
30 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY
31 * OF ANY KIND, either express or implied. See the LGPL or the MPL for
32 * the specific language governing rights and limitations.
33 *
34 * The Original Code is the cairo graphics library.
35 *
36 * The Initial Developer of the Original Code is Tom Tromey.
37 * and Red Hat, Inc.
38 *
39 * Contributor(s):
40 * Owen Taylor <otaylor@redhat.com>
41 */
42
43 #include "cairoint.h"
44 #include "cairo-error-private.h"
45
/* Classifies the UTF-8 lead byte @Char.
 *
 * On return @Len holds the length in bytes of the sequence announced by
 * the lead byte (1 to 6) and @Mask holds the bit mask selecting the
 * payload bits of that lead byte.  If @Char cannot start a sequence
 * (a continuation byte 0x80-0xbf, or 0xfe/0xff) @Len is set to -1 and
 * @Mask is left untouched.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a
 * single statement (safe under an unbraced if/else), and every argument
 * is parenthesized so that expression arguments are classified correctly.
 * @Char is evaluated more than once; do not pass an expression with
 * side effects.
 */
#define UTF8_COMPUTE(Char, Mask, Len)           \
    do {                                        \
        if ((Char) < 128) {                     \
            (Len) = 1;                          \
            (Mask) = 0x7f;                      \
        } else if (((Char) & 0xe0) == 0xc0) {   \
            (Len) = 2;                          \
            (Mask) = 0x1f;                      \
        } else if (((Char) & 0xf0) == 0xe0) {   \
            (Len) = 3;                          \
            (Mask) = 0x0f;                      \
        } else if (((Char) & 0xf8) == 0xf0) {   \
            (Len) = 4;                          \
            (Mask) = 0x07;                      \
        } else if (((Char) & 0xfc) == 0xf8) {   \
            (Len) = 5;                          \
            (Mask) = 0x03;                      \
        } else if (((Char) & 0xfe) == 0xfc) {   \
            (Len) = 6;                          \
            (Mask) = 0x01;                      \
        } else {                                \
            (Len) = -1;                         \
        }                                       \
    } while (0)
79
/* Number of bytes (1 to 6) that the original, six-byte form of UTF-8
 * uses to encode the code point @Char.  The thresholds are tested from
 * the top down; @Char is evaluated several times, so it must be free of
 * side effects.
 */
#define UTF8_LENGTH(Char)               \
    ((Char) >= 0x4000000 ? 6 :          \
     ((Char) >= 0x200000 ? 5 :          \
      ((Char) >= 0x10000 ? 4 :          \
       ((Char) >= 0x800 ? 3 :           \
        ((Char) >= 0x80 ? 2 : 1)))))
86
/* Decodes the @Len-byte UTF-8 sequence starting at @Chars into @Result.
 *
 * @Mask selects the payload bits of the lead byte (see UTF8_COMPUTE);
 * @Count is a caller-supplied integer lvalue used as the byte index.
 * If a byte after the lead byte is not a continuation byte (10xxxxxx),
 * @Result is set to -1 and decoding stops with @Count left at the index
 * of the offending byte.  No bounds, overlong or range checks are made.
 *
 * The body is wrapped in do { } while (0) so the macro expands to a
 * single statement; previously it expanded to an assignment followed by
 * a separate for loop, which misbehaves under an unbraced if/else.  The
 * break below leaves only the inner for loop.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)               \
    do {                                                        \
        (Result) = (Chars)[0] & (Mask);                         \
        for ((Count) = 1; (Count) < (Len); ++(Count)) {         \
            if (((Chars)[(Count)] & 0xc0) != 0x80) {            \
                (Result) = -1;                                  \
                break;                                          \
            }                                                   \
            (Result) <<= 6;                                     \
            (Result) |= ((Chars)[(Count)] & 0x3f);              \
        }                                                       \
    } while (0)
99
/* Non-zero when @Char is acceptable as a Unicode character.  Rejected are:
 * values of 0x110000 and above, the surrogate block 0xD800-0xDFFF, the
 * range 0xFDD0-0xFDEF, and any value whose low 16 bits are 0xFFFE or
 * 0xFFFF.  @Char is evaluated several times.
 */
#define UNICODE_VALID(Char)                             \
    (!((Char) >= 0x110000 ||                            \
       (((Char) & 0xFFFFF800) == 0xD800) ||             \
       ((Char) >= 0xFDD0 && (Char) <= 0xFDEF) ||        \
       ((Char) & 0xFFFE) == 0xFFFE))
105
/* Number of bytes to advance past a character, indexed by its first
 * byte.  Lead bytes map to the length of the sequence they announce
 * (2 to 6); ASCII bytes, continuation bytes (0x80-0xbf) and 0xfe/0xff
 * all map to 1.
 */
static const char utf8_skip_data[256] = {
    /* 0x00 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x10 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x20 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x30 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x40 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x50 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x60 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x70 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x80 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x90 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xa0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xb0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xc0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xd0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xe0 */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    /* 0xf0 */ 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

/* Pointer to the byte following the character that starts at @p, as
 * determined solely by the first byte.  @p is evaluated twice.
 */
#define UTF8_NEXT_CHAR(p) \
    ((p) + utf8_skip_data[*(const unsigned char *)(p)])
118
/* Decodes the UTF-8 sequence that starts at @p into a code point.
 *
 * The first byte decides how many bytes are read (1 to 6); no length
 * limit is applied, so @p is expected to have been validated already.
 * Returns (uint32_t)-1 when the first byte cannot start a sequence or
 * when one of the following bytes is not a continuation byte.  Overlong
 * forms and out-of-range values are not detected here.
 */
static uint32_t
_utf8_get_char (const unsigned char *p)
{
    const unsigned char lead = p[0];
    uint32_t code;
    int seq_len, pos;

    /* Plain ASCII needs no further work. */
    if (lead < 0x80)
        return lead;

    /* Continuation bytes and 0xfe/0xff can never start a sequence. */
    if (lead < 0xc0 || lead >= 0xfe)
        return (uint32_t) -1;

    if (lead < 0xe0) {
        seq_len = 2;
        code = lead & 0x1f;
    } else if (lead < 0xf0) {
        seq_len = 3;
        code = lead & 0x0f;
    } else if (lead < 0xf8) {
        seq_len = 4;
        code = lead & 0x07;
    } else if (lead < 0xfc) {
        seq_len = 5;
        code = lead & 0x03;
    } else {
        seq_len = 6;
        code = lead & 0x01;
    }

    /* Each continuation byte contributes its low six bits. */
    for (pos = 1; pos < seq_len; pos++) {
        if ((p[pos] & 0xc0) != 0x80)
            return (uint32_t) -1;
        code = (code << 6) | (p[pos] & 0x3f);
    }

    return code;
}
137
/* Decodes and checks the UTF-8 sequence that starts at @p.
 *
 * @max_len is the number of bytes available at @p; a negative value
 * means no limit, in which case a nul byte inside a sequence marks the
 * end of the input.
 *
 * Returns the decoded value on success, (uint32_t)-1 for a malformed
 * sequence (bad lead byte, bad continuation byte, or overlong form) and
 * (uint32_t)-2 when the input ends in the middle of a sequence that was
 * well-formed up to that point.
 */
static uint32_t
_utf8_get_char_extended (const unsigned char *p,
                         long                 max_len)
{
    /* Smallest value that genuinely needs a sequence of the given
     * length; a decoded value below it was encoded in overlong form. */
    static const uint32_t smallest_for_length[7] = {
        0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000
    };
    const unsigned char lead = p[0];
    uint32_t value;
    int seq_len, pos;

    if (lead < 0x80)
        return lead;

    /* Continuation bytes and 0xfe/0xff can never start a sequence. */
    if (lead < 0xc0 || lead >= 0xfe)
        return (uint32_t) -1;

    if (lead < 0xe0) {
        seq_len = 2;
        value = lead & 0x1f;
    } else if (lead < 0xf0) {
        seq_len = 3;
        value = lead & 0x0f;
    } else if (lead < 0xf8) {
        seq_len = 4;
        value = lead & 0x07;
    } else if (lead < 0xfc) {
        seq_len = 5;
        value = lead & 0x03;
    } else {
        seq_len = 6;
        value = lead & 0x01;
    }

    /* The sequence runs past the available bytes: report it as
     * incomplete, unless the part that is present is already bad. */
    if (max_len >= 0 && seq_len > max_len) {
        for (pos = 1; pos < max_len; pos++) {
            if ((p[pos] & 0xc0) != 0x80)
                return (uint32_t) -1;
        }
        return (uint32_t) -2;
    }

    for (pos = 1; pos < seq_len; pos++) {
        const uint32_t byte = p[pos];

        if ((byte & 0xc0) != 0x80) {
            /* A nul byte here means the string stopped early. */
            return byte != 0 ? (uint32_t) -1 : (uint32_t) -2;
        }

        value = (value << 6) | (byte & 0x3f);
    }

    if (value < smallest_for_length[seq_len])
        return (uint32_t) -1;

    return value;
}
198
/**
 * _cairo_utf8_get_char_validated:
 * @p: a UTF-8 string
 * @unicode: location to store one Unicode character, or %NULL
 *
 * Decodes the character at the start of @p and reports how many bytes
 * it occupies.  The string must already be known to be valid UTF-8:
 * no length limit is applied, and overlong or out-of-range encodings
 * are not detected.
 *
 * If the first byte cannot start a sequence, (uint32_t)-1 is stored in
 * @unicode and 1 is returned.  If a later byte is not a continuation
 * byte, (uint32_t)-1 is stored but the length announced by the first
 * byte is still returned.
 *
 * Returns: the number of bytes forming the character returned.
 **/
int
_cairo_utf8_get_char_validated (const char *p,
                                uint32_t   *unicode)
{
    const unsigned char *bytes = (const unsigned char *) p;
    const unsigned char lead = bytes[0];
    uint32_t code;
    int seq_len, pos;

    if (lead < 0x80) {
        seq_len = 1;
        code = lead;
    } else if (lead < 0xc0 || lead >= 0xfe) {
        /* Not a lead byte: flag it and step over this single byte. */
        if (unicode != NULL)
            *unicode = (uint32_t) -1;
        return 1;
    } else if (lead < 0xe0) {
        seq_len = 2;
        code = lead & 0x1f;
    } else if (lead < 0xf0) {
        seq_len = 3;
        code = lead & 0x0f;
    } else if (lead < 0xf8) {
        seq_len = 4;
        code = lead & 0x07;
    } else if (lead < 0xfc) {
        seq_len = 5;
        code = lead & 0x03;
    } else {
        seq_len = 6;
        code = lead & 0x01;
    }

    /* Each continuation byte contributes its low six bits. */
    for (pos = 1; pos < seq_len; pos++) {
        if ((bytes[pos] & 0xc0) != 0x80) {
            code = (uint32_t) -1;
            break;
        }
        code = (code << 6) | (bytes[pos] & 0x3f);
    }

    if (unicode != NULL)
        *unicode = code;

    return seq_len;
}
232
233 /**
234 * _cairo_utf8_to_ucs4:
235 * @str: an UTF-8 string
236 * @len: length of @str in bytes, or -1 if it is nul-terminated.
237 * If @len is supplied and the string has an embedded nul
238 * byte, only the portion before the nul byte is converted.
239 * @result: location to store a pointer to a newly allocated UTF-32
240 * string (always native endian), or %NULL. Free with free(). A 0
241 * word will be written after the last character.
242 * @items_written: location to store number of 32-bit words
243 * written. (Not including the trailing 0)
244 *
245 * Converts a UTF-8 string to UCS-4. UCS-4 is an encoding of Unicode
246 * with 1 32-bit word per character. The string is validated to
247 * consist entirely of valid Unicode characters.
248 *
249 * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
250 * successfully converted. %CAIRO_STATUS_INVALID_STRING if an
251 * invalid sequence was found.
252 **/
253 cairo_status_t
_cairo_utf8_to_ucs4(const char * str,int len,uint32_t ** result,int * items_written)254 _cairo_utf8_to_ucs4 (const char *str,
255 int len,
256 uint32_t **result,
257 int *items_written)
258 {
259 uint32_t *str32 = NULL;
260 int n_chars, i;
261 const unsigned char *in;
262 const unsigned char * const ustr = (const unsigned char *) str;
263
264 in = ustr;
265 n_chars = 0;
266 while ((len < 0 || ustr + len - in > 0) && *in)
267 {
268 uint32_t wc = _utf8_get_char_extended (in, ustr + len - in);
269 if (wc & 0x80000000 || !UNICODE_VALID (wc))
270 return _cairo_error (CAIRO_STATUS_INVALID_STRING);
271
272 n_chars++;
273 if (n_chars == INT_MAX)
274 return _cairo_error (CAIRO_STATUS_INVALID_STRING);
275
276 in = UTF8_NEXT_CHAR (in);
277 }
278
279 if (result) {
280 str32 = _cairo_malloc_ab (n_chars + 1, sizeof (uint32_t));
281 if (!str32)
282 return _cairo_error (CAIRO_STATUS_NO_MEMORY);
283
284 in = ustr;
285 for (i=0; i < n_chars; i++) {
286 str32[i] = _utf8_get_char (in);
287 in = UTF8_NEXT_CHAR (in);
288 }
289 str32[i] = 0;
290
291 *result = str32;
292 }
293
294 if (items_written)
295 *items_written = n_chars;
296
297 return CAIRO_STATUS_SUCCESS;
298 }
299
/**
 * _cairo_ucs4_to_utf8:
 * @unicode: a UCS-4 character
 * @utf8: buffer to write utf8 string into. Must have at least 4 bytes
 *   space available. Or %NULL.
 *
 * Encodes @unicode as UTF-8 using one to four bytes.  When @utf8 is
 * %NULL nothing is written and only the length is computed.  No nul
 * terminator is written.
 *
 * NOTE(review): the only range check is the upper bound below, so
 * surrogates and values in 0x110000-0x1fffff are encoded rather than
 * rejected.
 *
 * Return value: Number of bytes in the utf8 string or 0 if @unicode is
 *   0x200000 or larger.
 **/
int
_cairo_ucs4_to_utf8 (uint32_t unicode,
                     char    *utf8)
{
    /* Marker bits of the first byte, indexed by sequence length. */
    static const unsigned char lead_marker[5] = { 0, 0, 0xc0, 0xe0, 0xf0 };
    int length, idx;

    if (unicode >= 0x200000)
        return 0;

    if (unicode < 0x80) {
        if (utf8 != NULL)
            utf8[0] = unicode;
        return 1;
    }

    if (unicode < 0x800)
        length = 2;
    else if (unicode < 0x10000)
        length = 3;
    else
        length = 4;

    if (utf8 == NULL)
        return length;

    /* Fill the continuation bytes from the end, six bits at a time. */
    for (idx = length - 1; idx > 0; idx--) {
        utf8[idx] = 0x80 | (unicode & 0x3f);
        unicode >>= 6;
    }

    /* The remaining high bits share the first byte with the marker. */
    utf8[0] = lead_marker[length] | (unicode & 0x3f);

    return length;
}
344
#if CAIRO_HAS_UTF8_TO_UTF16
/**
 * _cairo_utf8_to_utf16:
 * @str: an UTF-8 string
 * @len: length of @str in bytes, or -1 if it is nul-terminated.
 *   If @len is supplied and the string has an embedded nul
 *   byte, only the portion before the nul byte is converted.
 * @result: location to store a pointer to a newly allocated UTF-16
 *   string (always native endian). Free with free(). A 0
 *   word will be written after the last character.  Must not be
 *   %NULL; it is written unconditionally on success.
 * @items_written: location to store number of 16-bit words
 *   written. (Not including the trailing 0), or %NULL.
 *
 * Converts a UTF-8 string to UTF-16. UTF-16 is an encoding of Unicode
 * where characters are represented either as a single 16-bit word, or
 * as a pair of 16-bit "surrogates". The string is validated to
 * consist entirely of valid Unicode characters.
 *
 * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
 *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
 *   invalid sequence was found (or the word count would reach
 *   INT_MAX - 1). %CAIRO_STATUS_NO_MEMORY if the output buffer could
 *   not be allocated.
 **/
cairo_status_t
_cairo_utf8_to_utf16 (const char *str,
                      int         len,
                      uint16_t  **result,
                      int        *items_written)
{
    const unsigned char * const ustr = (const unsigned char *) str;
    const unsigned char *in;
    uint16_t *str16 = NULL;
    int n16, i;

    /* First pass: validate every character and count the 16-bit words
     * needed to hold it. */
    in = ustr;
    n16 = 0;
    while ((len < 0 || in - ustr < len) && *in) {
        /* Bytes still available, or negative for "until the nul byte".
         * This is derived from offsets rather than from ustr + len, so
         * that no pointer before the start of the string is formed when
         * len is negative. */
        long remaining = len < 0 ? -1 : (long) (len - (in - ustr));
        uint32_t wc = _utf8_get_char_extended (in, remaining);

        /* The top bit is set on the decoder's error returns. */
        if (wc & 0x80000000 || !UNICODE_VALID (wc))
            return _cairo_error (CAIRO_STATUS_INVALID_STRING);

        /* Characters outside the BMP take a surrogate pair. */
        if (wc < 0x10000)
            n16 += 1;
        else
            n16 += 2;

        /* Stop before a further +2, or the +1 for the terminator,
         * could overflow. */
        if (n16 == INT_MAX - 1 || n16 == INT_MAX)
            return _cairo_error (CAIRO_STATUS_INVALID_STRING);

        in = UTF8_NEXT_CHAR (in);
    }

    str16 = _cairo_malloc_ab (n16 + 1, sizeof (uint16_t));
    if (!str16)
        return _cairo_error (CAIRO_STATUS_NO_MEMORY);

    /* Second pass: the input is now known to be valid, so the
     * unchecked decoder can be used to fill the output. */
    in = ustr;
    for (i = 0; i < n16;) {
        uint32_t wc = _utf8_get_char (in);

        if (wc < 0x10000) {
            str16[i++] = wc;
        } else {
            /* High surrogate, then low surrogate. */
            str16[i++] = (wc - 0x10000) / 0x400 + 0xd800;
            str16[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
        }

        in = UTF8_NEXT_CHAR (in);
    }

    str16[i] = 0;

    *result = str16;
    if (items_written)
        *items_written = n16;

    return CAIRO_STATUS_SUCCESS;
}
#endif
423