1 /* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */
2 /* cairo - a vector graphics library with display and print output
3  *
4  * The code in this file is derived from GLib's gutf8.c and
5  *   ultimately from libunicode. It is relicensed under the
6  *   dual LGPL/MPL with permission of the original authors.
7  *
8  * Copyright © 1999 Tom Tromey
9  * Copyright © 2005 Red Hat, Inc
10  *
11  * This library is free software; you can redistribute it and/or
12  * modify it either under the terms of the GNU Lesser General Public
13  * License version 2.1 as published by the Free Software Foundation
14  * (the "LGPL") or, at your option, under the terms of the Mozilla
15  * Public License Version 1.1 (the "MPL"). If you do not alter this
16  * notice, a recipient may use your version of this file under either
17  * the MPL or the LGPL.
18  *
19  * You should have received a copy of the LGPL along with this library
20  * in the file COPYING-LGPL-2.1; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335, USA
22  * You should have received a copy of the MPL along with this library
23  * in the file COPYING-MPL-1.1
24  *
25  * The contents of this file are subject to the Mozilla Public License
26  * Version 1.1 (the "License"); you may not use this file except in
27  * compliance with the License. You may obtain a copy of the License at
28  * http://www.mozilla.org/MPL/
29  *
30  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY
31  * OF ANY KIND, either express or implied. See the LGPL or the MPL for
32  * the specific language governing rights and limitations.
33  *
34  * The Original Code is the cairo graphics library.
35  *
36  * The Initial Developer of the Original Code is Tom Tromey.
37  *  and Red Hat, Inc.
38  *
39  * Contributor(s):
40  *	Owen Taylor <otaylor@redhat.com>
41  */
42 
43 #include "cairoint.h"
44 #include "cairo-error-private.h"
45 
/*
 * UTF8_COMPUTE:
 * @Char: first byte of a UTF-8 sequence, as an unsigned byte value
 * @Mask: lvalue that receives the mask selecting the payload bits of
 *   the first byte
 * @Len: lvalue that receives the length of the sequence in bytes
 *   (1 to 6), or -1 if @Char cannot start a sequence
 *
 * Classifies the first byte of a UTF-8 sequence.  @Mask is left
 * untouched when @Len is set to -1.  @Char is evaluated several times
 * and therefore must not have side effects.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and can be used safely in an unbraced if/else;
 * every argument is parenthesized so that expressions can be passed.
 */
#define UTF8_COMPUTE(Char, Mask, Len)					      \
  do									      \
    {									      \
      if ((Char) < 128)							      \
	{								      \
	  (Len) = 1;							      \
	  (Mask) = 0x7f;						      \
	}								      \
      else if (((Char) & 0xe0) == 0xc0)					      \
	{								      \
	  (Len) = 2;							      \
	  (Mask) = 0x1f;						      \
	}								      \
      else if (((Char) & 0xf0) == 0xe0)					      \
	{								      \
	  (Len) = 3;							      \
	  (Mask) = 0x0f;						      \
	}								      \
      else if (((Char) & 0xf8) == 0xf0)					      \
	{								      \
	  (Len) = 4;							      \
	  (Mask) = 0x07;						      \
	}								      \
      else if (((Char) & 0xfc) == 0xf8)					      \
	{								      \
	  (Len) = 5;							      \
	  (Mask) = 0x03;						      \
	}								      \
      else if (((Char) & 0xfe) == 0xfc)					      \
	{								      \
	  (Len) = 6;							      \
	  (Mask) = 0x01;						      \
	}								      \
      else								      \
	(Len) = -1;							      \
    }									      \
  while (0)
79 
/* Number of bytes used to encode the value @Char in UTF-8, counting the
 * 5- and 6-byte forms for values of 0x200000 and above.  @Char is
 * evaluated several times and must not have side effects. */
#define UTF8_LENGTH(Char)		\
  ((Char) >= 0x4000000 ? 6 :		\
   (Char) >= 0x200000  ? 5 :		\
   (Char) >= 0x10000   ? 4 :		\
   (Char) >= 0x800     ? 3 :		\
   (Char) >= 0x80      ? 2 : 1)
86 
/*
 * UTF8_GET:
 * @Result: lvalue that receives the decoded value, or -1 if one of the
 *   continuation bytes is not of the form 10xxxxxx
 * @Chars: indexable sequence of bytes holding the UTF-8 sequence
 * @Count: integer lvalue used as the loop counter; on exit it holds
 *   @Len, or the index of the offending byte when decoding stopped early
 * @Mask: mask selecting the payload bits of the first byte
 * @Len: length of the sequence in bytes
 *
 * Decodes a UTF-8 sequence whose length and first-byte mask have
 * already been determined (see UTF8_COMPUTE).  No check for overlong
 * encodings is made.
 *
 * The body is wrapped in do { } while (0) so that the assignment and
 * the loop form a single statement; without it, "if (x) UTF8_GET (...)"
 * would guard only the initial assignment.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)			      \
  do									      \
    {									      \
      (Result) = (Chars)[0] & (Mask);					      \
      for ((Count) = 1; (Count) < (Len); ++(Count))			      \
	{								      \
	  if (((Chars)[(Count)] & 0xc0) != 0x80)			      \
	    {								      \
	      (Result) = -1;						      \
	      break;							      \
	    }								      \
	  (Result) <<= 6;						      \
	  (Result) |= ((Chars)[(Count)] & 0x3f);			      \
	}								      \
    }									      \
  while (0)
99 
/* Evaluates to non-zero unless @Char is one of:
 *   - a value of 0x110000 or above,
 *   - a surrogate (0xD800 - 0xDFFF),
 *   - in the range 0xFDD0 - 0xFDEF,
 *   - a value whose low 16 bits are 0xFFFE or 0xFFFF.
 * @Char is evaluated several times and must not have side effects. */
#define UNICODE_VALID(Char)					\
    (!((Char) >= 0x110000 ||					\
       ((Char) & 0xFFFFF800) == 0xD800 ||			\
       ((Char) >= 0xFDD0 && (Char) <= 0xFDEF) ||		\
       ((Char) & 0xFFFE) == 0xFFFE))
105 
/* Number of bytes to advance past a UTF-8 sequence, indexed by the
 * value of its first byte.  Bytes that do not start a multi-byte
 * sequence (0x80 - 0xbf, 0xfe, 0xff) advance by one. */
static const char utf8_skip_data[256] = {
    /* 0x00 - 0x7f */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x80 - 0xbf */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xc0 - 0xdf */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xe0 - 0xef */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    /* 0xf0 - 0xff */
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

/* Pointer to the byte following the UTF-8 sequence that starts at @p;
 * only the first byte of the sequence is inspected. */
#define UTF8_NEXT_CHAR(p) \
    ((p) + utf8_skip_data[*(const unsigned char *)(p)])
118 
/* Decodes the UTF-8 sequence starting at @p and returns its value.
 *
 * Returns (uint32_t)-1 if the first byte cannot start a sequence or if
 * a continuation byte is not of the form 10xxxxxx.  Neither the
 * available length nor overlong encodings are checked, so @p is
 * expected to point at an already validated character.
 **/
static uint32_t
_utf8_get_char (const unsigned char *p)
{
    const unsigned char lead = p[0];
    int payload_mask = 0;
    int n_bytes;
    int idx;
    uint32_t wc;

    UTF8_COMPUTE (lead, payload_mask, n_bytes);
    if (n_bytes < 0)
	return (uint32_t) -1;

    UTF8_GET (wc, p, idx, payload_mask, n_bytes);
    return wc;
}
137 
/* Decodes the UTF-8 sequence starting at @p, reading at most @max_len
 * bytes; a negative @max_len means the length is not limited.
 *
 * Returns the decoded value, or
 *   (uint32_t)-1 for a malformed or overlong sequence,
 *   (uint32_t)-2 for a sequence that is cut short, either by @max_len
 *                or by a nul byte where a continuation byte was
 *                expected.
 */
static uint32_t
_utf8_get_char_extended (const unsigned char *p,
			 long		      max_len)
{
    const uint32_t malformed = (uint32_t) -1;
    const uint32_t truncated = (uint32_t) -2;
    uint32_t wc = p[0];
    int len;
    int i;

    /* Plain ASCII needs no further work. */
    if (wc < 0x80)
	return wc;

    /* A stray continuation byte, or 0xfe / 0xff, cannot start a
     * sequence. */
    if (wc < 0xc0 || wc >= 0xfe)
	return malformed;

    if (wc < 0xe0) {
	len = 2;
	wc &= 0x1f;
    } else if (wc < 0xf0) {
	len = 3;
	wc &= 0x0f;
    } else if (wc < 0xf8) {
	len = 4;
	wc &= 0x07;
    } else if (wc < 0xfc) {
	len = 5;
	wc &= 0x03;
    } else {
	len = 6;
	wc &= 0x01;
    }

    /* Not enough input for the whole sequence: report it as truncated,
     * unless the bytes that are available are already wrong. */
    if (max_len >= 0 && len > max_len) {
	for (i = 1; i < max_len; i++) {
	    if ((p[i] & 0xc0) != 0x80)
		return malformed;
	}
	return truncated;
    }

    for (i = 1; i < len; i++) {
	const uint32_t ch = p[i];

	/* A nul byte here means the string ended mid-sequence. */
	if ((ch & 0xc0) != 0x80)
	    return ch ? malformed : truncated;

	wc = (wc << 6) | (ch & 0x3f);
    }

    /* Reject values encoded with more bytes than necessary. */
    return UTF8_LENGTH (wc) == len ? wc : malformed;
}
198 
/**
 * _cairo_utf8_get_char_validated:
 * @p: a UTF-8 string
 * @unicode: location to store one Unicode character, or %NULL
 *
 * Decodes the first character of a UTF-8 string that has already been
 * validated, and reports how many bytes it occupies.
 *
 * No validation is performed here beyond what decoding requires: if
 * the first byte cannot start a sequence, (uint32_t)-1 is stored and 1
 * is returned; if a continuation byte is malformed, (uint32_t)-1 is
 * stored and the length implied by the first byte is returned.
 *
 * Returns: the number of bytes forming the character returned.
 **/
int
_cairo_utf8_get_char_validated (const char *p,
				uint32_t   *unicode)
{
    const unsigned char lead = (unsigned char) p[0];
    int payload_mask = 0;
    int n_bytes;
    int idx;
    uint32_t wc;

    UTF8_COMPUTE (lead, payload_mask, n_bytes);
    if (n_bytes == -1) {
	/* Not a valid first byte: consume it on its own. */
	wc = (uint32_t) -1;
	n_bytes = 1;
    } else {
	UTF8_GET (wc, p, idx, payload_mask, n_bytes);
    }

    if (unicode)
	*unicode = wc;

    return n_bytes;
}
232 
233 /**
234  * _cairo_utf8_to_ucs4:
235  * @str: an UTF-8 string
236  * @len: length of @str in bytes, or -1 if it is nul-terminated.
237  *   If @len is supplied and the string has an embedded nul
238  *   byte, only the portion before the nul byte is converted.
239  * @result: location to store a pointer to a newly allocated UTF-32
240  *   string (always native endian), or %NULL. Free with free(). A 0
241  *   word will be written after the last character.
242  * @items_written: location to store number of 32-bit words
243  *   written. (Not including the trailing 0)
244  *
245  * Converts a UTF-8 string to UCS-4. UCS-4 is an encoding of Unicode
246  * with 1 32-bit word per character. The string is validated to
247  * consist entirely of valid Unicode characters.
248  *
249  * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
250  *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
251  *   invalid sequence was found.
252  **/
253 cairo_status_t
_cairo_utf8_to_ucs4(const char * str,int len,uint32_t ** result,int * items_written)254 _cairo_utf8_to_ucs4 (const char *str,
255 		     int	 len,
256 		     uint32_t  **result,
257 		     int	*items_written)
258 {
259     uint32_t *str32 = NULL;
260     int n_chars, i;
261     const unsigned char *in;
262     const unsigned char * const ustr = (const unsigned char *) str;
263 
264     in = ustr;
265     n_chars = 0;
266     while ((len < 0 || ustr + len - in > 0) && *in)
267     {
268 	uint32_t wc = _utf8_get_char_extended (in, ustr + len - in);
269 	if (wc & 0x80000000 || !UNICODE_VALID (wc))
270 	    return _cairo_error (CAIRO_STATUS_INVALID_STRING);
271 
272 	n_chars++;
273 	if (n_chars == INT_MAX)
274 	    return _cairo_error (CAIRO_STATUS_INVALID_STRING);
275 
276 	in = UTF8_NEXT_CHAR (in);
277     }
278 
279     if (result) {
280 	str32 = _cairo_malloc_ab (n_chars + 1, sizeof (uint32_t));
281 	if (!str32)
282 	    return _cairo_error (CAIRO_STATUS_NO_MEMORY);
283 
284 	in = ustr;
285 	for (i=0; i < n_chars; i++) {
286 	    str32[i] = _utf8_get_char (in);
287 	    in = UTF8_NEXT_CHAR (in);
288 	}
289 	str32[i] = 0;
290 
291 	*result = str32;
292     }
293 
294     if (items_written)
295 	*items_written = n_chars;
296 
297     return CAIRO_STATUS_SUCCESS;
298 }
299 
/**
 * _cairo_ucs4_to_utf8:
 * @unicode: a UCS-4 character
 * @utf8: buffer to write the UTF-8 bytes into, or %NULL to only
 *   compute the length. Must have at least 4 bytes of space available.
 *
 * Encodes @unicode as a UTF-8 sequence of one to four bytes. No nul
 * terminator is written. Any value below 0x200000 is encoded; the
 * value is not otherwise checked, so surrogates and values above
 * 0x10FFFF are encoded like any other.
 *
 * Return value: Number of bytes in the UTF-8 sequence, or 0 if
 * @unicode is 0x200000 or greater.
 **/
int
_cairo_ucs4_to_utf8 (uint32_t  unicode,
		     char     *utf8)
{
    unsigned int lead_marker;
    int bytes;
    int i;

    /* ASCII is stored as is. */
    if (unicode < 0x80) {
	if (utf8)
	    utf8[0] = unicode;
	return 1;
    }

    /* Would need more than four bytes. */
    if (unicode >= 0x200000)
	return 0;

    if (unicode >= 0x10000) {
	bytes = 4;
	lead_marker = 0xf0;
    } else if (unicode >= 0x800) {
	bytes = 3;
	lead_marker = 0xe0;
    } else {
	bytes = 2;
	lead_marker = 0xc0;
    }

    if (utf8) {
	/* Fill in the continuation bytes from the end, six bits at a
	 * time; whatever is left belongs in the first byte. */
	for (i = bytes - 1; i > 0; i--) {
	    utf8[i] = 0x80 | (unicode & 0x3f);
	    unicode >>= 6;
	}
	utf8[0] = lead_marker | unicode;
    }

    return bytes;
}
344 
345 #if CAIRO_HAS_UTF8_TO_UTF16
346 /**
347  * _cairo_utf8_to_utf16:
348  * @str: an UTF-8 string
349  * @len: length of @str in bytes, or -1 if it is nul-terminated.
350  *   If @len is supplied and the string has an embedded nul
351  *   byte, only the portion before the nul byte is converted.
352  * @result: location to store a pointer to a newly allocated UTF-16
353  *   string (always native endian). Free with free(). A 0
354  *   word will be written after the last character.
355  * @items_written: location to store number of 16-bit words
356  *   written. (Not including the trailing 0)
357  *
358  * Converts a UTF-8 string to UTF-16. UTF-16 is an encoding of Unicode
359  * where characters are represented either as a single 16-bit word, or
360  * as a pair of 16-bit "surrogates". The string is validated to
361  * consist entirely of valid Unicode characters.
362  *
363  * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
364  *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
365  *   an invalid sequence was found.
366  **/
367 cairo_status_t
_cairo_utf8_to_utf16(const char * str,int len,uint16_t ** result,int * items_written)368 _cairo_utf8_to_utf16 (const char *str,
369 		      int	  len,
370 		      uint16_t **result,
371 		      int	*items_written)
372 {
373     uint16_t *str16 = NULL;
374     int n16, i;
375     const unsigned char *in;
376     const unsigned char * const ustr = (const unsigned char *) str;
377 
378     in = ustr;
379     n16 = 0;
380     while ((len < 0 || ustr + len - in > 0) && *in) {
381 	uint32_t wc = _utf8_get_char_extended (in, ustr + len - in);
382 	if (wc & 0x80000000 || !UNICODE_VALID (wc))
383 	    return _cairo_error (CAIRO_STATUS_INVALID_STRING);
384 
385 	if (wc < 0x10000)
386 	    n16 += 1;
387 	else
388 	    n16 += 2;
389 
390 	if (n16 == INT_MAX - 1 || n16 == INT_MAX)
391 	    return _cairo_error (CAIRO_STATUS_INVALID_STRING);
392 
393 	in = UTF8_NEXT_CHAR (in);
394     }
395 
396     str16 = _cairo_malloc_ab (n16 + 1, sizeof (uint16_t));
397     if (!str16)
398 	return _cairo_error (CAIRO_STATUS_NO_MEMORY);
399 
400     in = ustr;
401     for (i = 0; i < n16;) {
402 	uint32_t wc = _utf8_get_char (in);
403 
404 	if (wc < 0x10000) {
405 	    str16[i++] = wc;
406 	} else {
407 	    str16[i++] = (wc - 0x10000) / 0x400 + 0xd800;
408 	    str16[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
409 	}
410 
411 	in = UTF8_NEXT_CHAR (in);
412     }
413 
414     str16[i] = 0;
415 
416     *result = str16;
417     if (items_written)
418 	*items_written = n16;
419 
420     return CAIRO_STATUS_SUCCESS;
421 }
422 #endif
423