1 /* nfkc.c Unicode normalization utilities.
2 * Copyright (C) 2002, 2003, 2004, 2005 Simon Josefsson
3 *
4 * This file is part of GNU Libidn.
5 *
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 */
21
22 #include <stdlib.h>
23 #include <string.h>
24
25 #include "stringprep.h"
26
/* This file contains functions from GLIB, including gutf8.c and
 * gunidecomp.c, all licensed under LGPL and copyright held by:
 *
 * Copyright (C) 1999, 2000 Tom Tromey
 * Copyright 2000 Red Hat, Inc.
 */
33
/* Hacks to make syncing with GLIB code easier: the GLib type, allocator
 * and helper names used by the imported code are mapped onto plain C
 * equivalents so the GLib sources can be pasted in almost unchanged. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define glong long
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free
#define GError void
/* Error reporting is not supported here: the arguments are discarded
 * without being evaluated. */
#define g_set_error(a,b,c,d) ((void) 0)
/* Allocate an uninitialised array of n_structs objects of struct_type;
 * yields NULL when malloc fails. */
#define g_new(struct_type, n_structs) \
  ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))
/* Wrappers that make a braced block behave as a single statement. */
#  if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
#    define G_STMT_START (void)(
#    define G_STMT_END )
#  else
#    if (defined (sun) || defined (__sun__))
#      define G_STMT_START if (1)
#      define G_STMT_END else (void)0
#    else
#      define G_STMT_START do
#      define G_STMT_END while (0)
#    endif
#  endif
/* Precondition check: when expr is false the enclosing function returns
 * val.  Unlike GLib no diagnostic is logged.  (Previously this expanded
 * to a no-op, so the checks written in the imported code never fired.) */
#define g_return_val_if_fail(expr,val) \
  G_STMT_START{ if (!(expr)) return (val); }G_STMT_END
/* Number of elements of a true array (not of a pointer). */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
#define TRUE 1
#define FALSE 0
69
70 /* Code from GLIB gunicode.h starts here. */
71
/* Normalization forms understood by the GLib-derived normalizer.  Each
 * form has a GLib name and a Unicode alias with the same value:
 *   0  DEFAULT          / NFD   canonical decomposition
 *   1  DEFAULT_COMPOSE  / NFC   canonical decomposition, then composition
 *   2  ALL              / NFKD  compatibility decomposition
 *   3  ALL_COMPOSE      / NFKC  compatibility decomposition, then composition
 */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE = 3,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
}
GNormalizeMode;
84
85 /* Code from GLIB gutf8.c starts here. */
86
/* UTF8_COMPUTE: classify Char, the lead byte of a UTF-8 sequence.
 * Sets Len to the sequence length (1..6) and Mask to the bits of the
 * lead byte that carry payload.  When Char cannot start a sequence, Len
 * is set to -1 and Mask is left untouched.  Wrapped in do/while so that
 * it is a single statement and safe in an unbraced if/else. */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        (Len) = -1; \
    } \
  while (0)

/* UTF8_LENGTH: number of bytes needed to encode code point Char. */
#define UTF8_LENGTH(Char) \
  ((Char) < 0x80 ? 1 : \
   ((Char) < 0x800 ? 2 : \
    ((Char) < 0x10000 ? 3 : \
     ((Char) < 0x200000 ? 4 : \
      ((Char) < 0x4000000 ? 5 : 6)))))


/* UTF8_GET: decode the Len-byte sequence starting at Chars into Result,
 * using Mask (from UTF8_COMPUTE) on the lead byte and Count as the loop
 * index.  If any trailing byte is not of the form 10xxxxxx, Result is
 * set to -1 and decoding stops.  Single statement, like UTF8_COMPUTE. */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)

/* UNICODE_VALID: true when Char is below 0x110000, is not a surrogate
 * (0xD800..0xDFFF), is not in 0xFDD0..0xFDEF, and its low 16 bits are
 * neither 0xFFFE nor 0xFFFF. */
#define UNICODE_VALID(Char) \
  ((Char) < 0x110000 && \
   (((Char) & 0xFFFFF800) != 0xD800) && \
   ((Char) < 0xFDD0 || (Char) > 0xFDEF) && \
   ((Char) & 0xFFFE) != 0xFFFE)
147
148
149 static const gchar utf8_skip_data[256] = {
150 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
151 1, 1, 1, 1, 1, 1, 1,
152 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
153 1, 1, 1, 1, 1, 1, 1,
154 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
155 1, 1, 1, 1, 1, 1, 1,
156 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
157 1, 1, 1, 1, 1, 1, 1,
158 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
159 1, 1, 1, 1, 1, 1, 1,
160 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
161 1, 1, 1, 1, 1, 1, 1,
162 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
163 2, 2, 2, 2, 2, 2, 2,
164 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
165 5, 5, 5, 6, 6, 1, 1
166 };
167
168 static const gchar *const g_utf8_skip = utf8_skip_data;
169
170 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
171
172 /*
173 * g_utf8_strlen:
174 * @p: pointer to the start of a UTF-8 encoded string.
175 * @max: the maximum number of bytes to examine. If @max
176 * is less than 0, then the string is assumed to be
177 * nul-terminated. If @max is 0, @p will not be examined and
178 * may be %NULL.
179 *
180 * Returns the length of the string in characters.
181 *
182 * Return value: the length of the string in characters
183 **/
184 static glong
g_utf8_strlen(const gchar * p,gssize max)185 g_utf8_strlen (const gchar * p, gssize max)
186 {
187 glong len = 0;
188 const gchar *start = p;
189 g_return_val_if_fail (p != NULL || max == 0, 0);
190
191 if (max < 0)
192 {
193 while (*p)
194 {
195 p = g_utf8_next_char (p);
196 ++len;
197 }
198 }
199 else
200 {
201 if (max == 0 || !*p)
202 return 0;
203
204 p = g_utf8_next_char (p);
205
206 while (p - start < max && *p)
207 {
208 ++len;
209 p = g_utf8_next_char (p);
210 }
211
212 /* only do the last len increment if we got a complete
213 * char (don't count partial chars)
214 */
215 if (p - start == max)
216 ++len;
217 }
218
219 return len;
220 }
221
222 /*
223 * g_utf8_get_char:
224 * @p: a pointer to Unicode character encoded as UTF-8
225 *
226 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
227 * If @p does not point to a valid UTF-8 encoded character, results are
228 * undefined. If you are not sure that the bytes are complete
229 * valid Unicode characters, you should use g_utf8_get_char_validated()
230 * instead.
231 *
232 * Return value: the resulting character
233 **/
234 static gunichar
g_utf8_get_char(const gchar * p)235 g_utf8_get_char (const gchar * p)
236 {
237 int i, mask = 0, len;
238 gunichar result;
239 unsigned char c = (unsigned char) *p;
240
241 UTF8_COMPUTE (c, mask, len);
242 if (len == -1)
243 return (gunichar) - 1;
244 UTF8_GET (result, p, i, mask, len);
245
246 return result;
247 }
248
249 /*
250 * g_unichar_to_utf8:
251 * @c: a ISO10646 character code
252 * @outbuf: output buffer, must have at least 6 bytes of space.
253 * If %NULL, the length will be computed and returned
254 * and nothing will be written to @outbuf.
255 *
256 * Converts a single character to UTF-8.
257 *
258 * Return value: number of bytes written
259 **/
260 static int
g_unichar_to_utf8(gunichar c,gchar * outbuf)261 g_unichar_to_utf8 (gunichar c, gchar * outbuf)
262 {
263 guint len = 0;
264 int first;
265 int i;
266
267 if (c < 0x80)
268 {
269 first = 0;
270 len = 1;
271 }
272 else if (c < 0x800)
273 {
274 first = 0xc0;
275 len = 2;
276 }
277 else if (c < 0x10000)
278 {
279 first = 0xe0;
280 len = 3;
281 }
282 else if (c < 0x200000)
283 {
284 first = 0xf0;
285 len = 4;
286 }
287 else if (c < 0x4000000)
288 {
289 first = 0xf8;
290 len = 5;
291 }
292 else
293 {
294 first = 0xfc;
295 len = 6;
296 }
297
298 if (outbuf)
299 {
300 for (i = len - 1; i > 0; --i)
301 {
302 outbuf[i] = (c & 0x3f) | 0x80;
303 c >>= 6;
304 }
305 outbuf[0] = c | first;
306 }
307
308 return len;
309 }
310
311 /*
312 * g_utf8_to_ucs4_fast:
313 * @str: a UTF-8 encoded string
314 * @len: the maximum length of @str to use. If @len < 0, then
315 * the string is nul-terminated.
316 * @items_written: location to store the number of characters in the
317 * result, or %NULL.
318 *
319 * Convert a string from UTF-8 to a 32-bit fixed width
320 * representation as UCS-4, assuming valid UTF-8 input.
321 * This function is roughly twice as fast as g_utf8_to_ucs4()
322 * but does no error checking on the input.
323 *
324 * Return value: a pointer to a newly allocated UCS-4 string.
325 * This value must be freed with g_free().
326 **/
327 static gunichar *
g_utf8_to_ucs4_fast(const gchar * str,glong len,glong * items_written)328 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
329 {
330 gint j, charlen;
331 gunichar *result;
332 gint n_chars, i;
333 const gchar *p;
334
335 g_return_val_if_fail (str != NULL, NULL);
336
337 p = str;
338 n_chars = 0;
339 if (len < 0)
340 {
341 while (*p)
342 {
343 p = g_utf8_next_char (p);
344 ++n_chars;
345 }
346 }
347 else
348 {
349 while (p < str + len && *p)
350 {
351 p = g_utf8_next_char (p);
352 ++n_chars;
353 }
354 }
355
356 result = g_new (gunichar, n_chars + 1);
357 if (!result)
358 return NULL;
359
360 p = str;
361 for (i = 0; i < n_chars; i++)
362 {
363 gunichar wc = ((unsigned char *) p)[0];
364
365 if (wc < 0x80)
366 {
367 result[i] = wc;
368 p++;
369 }
370 else
371 {
372 if (wc < 0xe0)
373 {
374 charlen = 2;
375 wc &= 0x1f;
376 }
377 else if (wc < 0xf0)
378 {
379 charlen = 3;
380 wc &= 0x0f;
381 }
382 else if (wc < 0xf8)
383 {
384 charlen = 4;
385 wc &= 0x07;
386 }
387 else if (wc < 0xfc)
388 {
389 charlen = 5;
390 wc &= 0x03;
391 }
392 else
393 {
394 charlen = 6;
395 wc &= 0x01;
396 }
397
398 for (j = 1; j < charlen; j++)
399 {
400 wc <<= 6;
401 wc |= ((unsigned char *) p)[j] & 0x3f;
402 }
403
404 result[i] = wc;
405 p += charlen;
406 }
407 }
408 result[i] = 0;
409
410 if (items_written)
411 *items_written = i;
412
413 return result;
414 }
415
416 /*
417 * g_ucs4_to_utf8:
418 * @str: a UCS-4 encoded string
419 * @len: the maximum length of @str to use. If @len < 0, then
420 * the string is terminated with a 0 character.
421 * @items_read: location to store number of characters read read, or %NULL.
422 * @items_written: location to store number of bytes written or %NULL.
423 * The value here stored does not include the trailing 0
424 * byte.
425 * @error: location to store the error occuring, or %NULL to ignore
426 * errors. Any of the errors in #GConvertError other than
427 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
428 *
429 * Convert a string from a 32-bit fixed width representation as UCS-4.
430 * to UTF-8. The result will be terminated with a 0 byte.
431 *
432 * Return value: a pointer to a newly allocated UTF-8 string.
433 * This value must be freed with g_free(). If an
434 * error occurs, %NULL will be returned and
435 * @error set.
436 **/
437 static gchar *
g_ucs4_to_utf8(const gunichar * str,glong len,glong * items_read,glong * items_written,GError ** error)438 g_ucs4_to_utf8 (const gunichar * str,
439 glong len,
440 glong * items_read, glong * items_written, GError ** error)
441 {
442 gint result_length;
443 gchar *result = NULL;
444 gchar *p;
445 gint i;
446
447 result_length = 0;
448 for (i = 0; len < 0 || i < len; i++)
449 {
450 if (!str[i])
451 break;
452
453 if (str[i] >= 0x80000000)
454 {
455 if (items_read)
456 *items_read = i;
457
458 g_set_error (error, G_CONVERT_ERROR,
459 G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
460 _("Character out of range for UTF-8"));
461 goto err_out;
462 }
463
464 result_length += UTF8_LENGTH (str[i]);
465 }
466
467 result = g_malloc (result_length + 1);
468 if (!result)
469 return NULL;
470 p = result;
471
472 i = 0;
473 while (p < result + result_length)
474 p += g_unichar_to_utf8 (str[i++], p);
475
476 *p = '\0';
477
478 if (items_written)
479 *items_written = p - result;
480
481 err_out:
482 if (items_read)
483 *items_read = i;
484
485 return result;
486 }
487
488 /* Code from GLIB gunidecomp.c starts here. */
489
490 #include "gunidecomp.h"
491 #include "gunicomp.h"
492
/* Combining-class lookup through a two-level table.  A page entry at or
 * above G_UNICODE_MAX_TABLE_INDEX means every character of that page has
 * the class (entry - G_UNICODE_MAX_TABLE_INDEX); any other entry selects
 * a row of cclass_data that is indexed by the low byte of the character. */
#define CC_PART1(Page, Char) \
  ((combining_class_table_part1[(Page)] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part1[(Page)] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part1[(Page)]][(Char)]))

#define CC_PART2(Page, Char) \
  ((combining_class_table_part2[(Page)] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part2[(Page)] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part2[(Page)]][(Char)]))

/* Combining class of Char: part 1 covers 0..G_UNICODE_LAST_CHAR_PART1,
 * part 2 covers 0xe0000..G_UNICODE_LAST_CHAR, anything else is class 0.
 * Char is evaluated more than once. */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))

/* constants for hangul syllable [de]composition:
 * SBase is the first precomposed syllable; LBase and VBase are the first
 * L and V jamo; TBase is one below the first T jamo, so that a T index
 * of 0 stands for "no T".  NCount is the number of syllables per L,
 * SCount the total number of precomposed syllables. */
#define SBase 0xAC00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7
#define LCount 19
#define VCount 21
#define TCount 28
#define NCount (VCount * TCount)
#define SCount (LCount * NCount)
520
521 /*
522 * g_unicode_canonical_ordering:
523 * @string: a UCS-4 encoded string.
524 * @len: the maximum length of @string to use.
525 *
526 * Computes the canonical ordering of a string in-place.
527 * This rearranges decomposed characters in the string
528 * according to their combining classes. See the Unicode
529 * manual for more information.
530 **/
531 static void
g_unicode_canonical_ordering(gunichar * string,gsize len)532 g_unicode_canonical_ordering (gunichar * string, gsize len)
533 {
534 gsize i;
535 int swap = 1;
536
537 while (swap)
538 {
539 int last;
540 swap = 0;
541 last = COMBINING_CLASS (string[0]);
542 for (i = 0; i < len - 1; ++i)
543 {
544 int next = COMBINING_CLASS (string[i + 1]);
545 if (next != 0 && last > next)
546 {
547 gsize j;
548 /* Percolate item leftward through string. */
549 for (j = i + 1; j > 0; --j)
550 {
551 gunichar t;
552 if (COMBINING_CLASS (string[j - 1]) <= next)
553 break;
554 t = string[j];
555 string[j] = string[j - 1];
556 string[j - 1] = t;
557 swap = 1;
558 }
559 /* We're re-entering the loop looking at the old
560 character again. */
561 next = last;
562 }
563 last = next;
564 }
565 }
566 }
567
568 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
569 * r should be null or have sufficient space. Calling with r == NULL will
570 * only calculate the result_len; however, a buffer with space for three
571 * characters will always be big enough. */
572 static void
decompose_hangul(gunichar s,gunichar * r,gsize * result_len)573 decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
574 {
575 gint SIndex = s - SBase;
576
577 /* not a hangul syllable */
578 if (SIndex < 0 || SIndex >= SCount)
579 {
580 if (r)
581 r[0] = s;
582 *result_len = 1;
583 }
584 else
585 {
586 gunichar L = LBase + SIndex / NCount;
587 gunichar V = VBase + (SIndex % NCount) / TCount;
588 gunichar T = TBase + SIndex % TCount;
589
590 if (r)
591 {
592 r[0] = L;
593 r[1] = V;
594 }
595
596 if (T != TBase)
597 {
598 if (r)
599 r[2] = T;
600 *result_len = 3;
601 }
602 else
603 *result_len = 2;
604 }
605 }
606
607 /* returns a pointer to a null-terminated UTF-8 string */
608 static const gchar *
find_decomposition(gunichar ch,gboolean compat)609 find_decomposition (gunichar ch, gboolean compat)
610 {
611 int start = 0;
612 int end = G_N_ELEMENTS (decomp_table);
613
614 if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
615 {
616 while (TRUE)
617 {
618 int half = (start + end) / 2;
619 if (ch == decomp_table[half].ch)
620 {
621 int offset;
622
623 if (compat)
624 {
625 offset = decomp_table[half].compat_offset;
626 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
627 offset = decomp_table[half].canon_offset;
628 }
629 else
630 {
631 offset = decomp_table[half].canon_offset;
632 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
633 return NULL;
634 }
635
636 return &(decomp_expansion_string[offset]);
637 }
638 else if (half == start)
639 break;
640 else if (ch > decomp_table[half].ch)
641 start = half;
642 else
643 end = half;
644 }
645 }
646
647 return NULL;
648 }
649
650 /* L,V => LV and LV,T => LVT */
651 static gboolean
combine_hangul(gunichar a,gunichar b,gunichar * result)652 combine_hangul (gunichar a, gunichar b, gunichar * result)
653 {
654 gint LIndex = a - LBase;
655 gint SIndex = a - SBase;
656
657 gint VIndex = b - VBase;
658 gint TIndex = b - TBase;
659
660 if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
661 {
662 *result = SBase + (LIndex * VCount + VIndex) * TCount;
663 return TRUE;
664 }
665 else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
666 && 0 <= TIndex && TIndex <= TCount)
667 {
668 *result = a + TIndex;
669 return TRUE;
670 }
671
672 return FALSE;
673 }
674
675 #define CI(Page, Char) \
676 ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
677 ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
678 : (compose_data[compose_table[Page]][Char]))
679
680 #define COMPOSE_INDEX(Char) \
681 ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
682
683 static gboolean
combine(gunichar a,gunichar b,gunichar * result)684 combine (gunichar a, gunichar b, gunichar * result)
685 {
686 gushort index_a, index_b;
687
688 if (combine_hangul (a, b, result))
689 return TRUE;
690
691 index_a = COMPOSE_INDEX (a);
692
693 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
694 {
695 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
696 {
697 *result =
698 compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
699 return TRUE;
700 }
701 else
702 return FALSE;
703 }
704
705 index_b = COMPOSE_INDEX (b);
706
707 if (index_b >= COMPOSE_SECOND_SINGLE_START)
708 {
709 if (a ==
710 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
711 {
712 *result =
713 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
714 return TRUE;
715 }
716 else
717 return FALSE;
718 }
719
720 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
721 && index_b >= COMPOSE_SECOND_START
722 && index_b < COMPOSE_SECOND_SINGLE_START)
723 {
724 gunichar res =
725 compose_array[index_a - COMPOSE_FIRST_START][index_b -
726 COMPOSE_SECOND_START];
727
728 if (res)
729 {
730 *result = res;
731 return TRUE;
732 }
733 }
734
735 return FALSE;
736 }
737
738 static gunichar *
_g_utf8_normalize_wc(const gchar * str,gssize max_len,GNormalizeMode mode)739 _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
740 {
741 gsize n_wc;
742 gunichar *wc_buffer;
743 const char *p;
744 gsize last_start;
745 gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
746 gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
747
748 if (!str)
749 return NULL;
750
751 n_wc = 0;
752 p = str;
753 while ((max_len < 0 || p < str + max_len) && *p)
754 {
755 const gchar *decomp;
756 gunichar wc = g_utf8_get_char (p);
757
758 if (wc >= 0xac00 && wc <= 0xd7af)
759 {
760 gsize result_len;
761 decompose_hangul (wc, NULL, &result_len);
762 n_wc += result_len;
763 }
764 else
765 {
766 decomp = find_decomposition (wc, do_compat);
767
768 if (decomp)
769 n_wc += g_utf8_strlen (decomp, -1);
770 else
771 n_wc++;
772 }
773
774 p = g_utf8_next_char (p);
775 }
776
777 wc_buffer = g_new (gunichar, n_wc + 1);
778 if (!wc_buffer)
779 return NULL;
780
781 last_start = 0;
782 n_wc = 0;
783 p = str;
784 while ((max_len < 0 || p < str + max_len) && *p)
785 {
786 gunichar wc = g_utf8_get_char (p);
787 const gchar *decomp;
788 int cc;
789 gsize old_n_wc = n_wc;
790
791 if (wc >= 0xac00 && wc <= 0xd7af)
792 {
793 gsize result_len;
794 decompose_hangul (wc, wc_buffer + n_wc, &result_len);
795 n_wc += result_len;
796 }
797 else
798 {
799 decomp = find_decomposition (wc, do_compat);
800
801 if (decomp)
802 {
803 const char *pd;
804 for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
805 wc_buffer[n_wc++] = g_utf8_get_char (pd);
806 }
807 else
808 wc_buffer[n_wc++] = wc;
809 }
810
811 if (n_wc > 0)
812 {
813 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
814
815 if (cc == 0)
816 {
817 g_unicode_canonical_ordering (wc_buffer + last_start,
818 n_wc - last_start);
819 last_start = old_n_wc;
820 }
821 }
822
823 p = g_utf8_next_char (p);
824 }
825
826 if (n_wc > 0)
827 {
828 g_unicode_canonical_ordering (wc_buffer + last_start,
829 n_wc - last_start);
830 last_start = n_wc;
831 }
832
833 wc_buffer[n_wc] = 0;
834
835 /* All decomposed and reordered */
836
837 if (do_compose && n_wc > 0)
838 {
839 gsize i, j;
840 int last_cc = 0;
841 last_start = 0;
842
843 for (i = 0; i < n_wc; i++)
844 {
845 int cc = COMBINING_CLASS (wc_buffer[i]);
846
847 if (i > 0 &&
848 (last_cc == 0 || last_cc != cc) &&
849 combine (wc_buffer[last_start], wc_buffer[i],
850 &wc_buffer[last_start]))
851 {
852 for (j = i + 1; j < n_wc; j++)
853 wc_buffer[j - 1] = wc_buffer[j];
854 n_wc--;
855 i--;
856
857 if (i == last_start)
858 last_cc = 0;
859 else
860 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
861
862 continue;
863 }
864
865 if (cc == 0)
866 last_start = i;
867
868 last_cc = cc;
869 }
870 }
871
872 wc_buffer[n_wc] = 0;
873
874 return wc_buffer;
875 }
876
877 /*
878 * g_utf8_normalize:
879 * @str: a UTF-8 encoded string.
880 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
881 * @mode: the type of normalization to perform.
882 *
883 * Converts a string into canonical form, standardizing
884 * such issues as whether a character with an accent
885 * is represented as a base character and combining
886 * accent or as a single precomposed character. You
887 * should generally call g_utf8_normalize() before
888 * comparing two Unicode strings.
889 *
890 * The normalization mode %G_NORMALIZE_DEFAULT only
891 * standardizes differences that do not affect the
892 * text content, such as the above-mentioned accent
893 * representation. %G_NORMALIZE_ALL also standardizes
894 * the "compatibility" characters in Unicode, such
895 * as SUPERSCRIPT THREE to the standard forms
896 * (in this case DIGIT THREE). Formatting information
897 * may be lost but for most text operations such
898 * characters should be considered the same.
899 * For example, g_utf8_collate() normalizes
900 * with %G_NORMALIZE_ALL as its first step.
901 *
902 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
903 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
904 * but returned a result with composed forms rather
905 * than a maximally decomposed form. This is often
906 * useful if you intend to convert the string to
907 * a legacy encoding or pass it to a system with
908 * less capable Unicode handling.
909 *
910 * Return value: a newly allocated string, that is the
911 * normalized form of @str.
912 **/
913 static gchar *
g_utf8_normalize(const gchar * str,gssize len,GNormalizeMode mode)914 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
915 {
916 gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
917 gchar *result;
918
919 result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
920 g_free (result_wc);
921
922 return result;
923 }
924
925 /* Public Libidn API starts here. */
926
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 * If @p does not point to a valid UTF-8 encoded character, results are
 * undefined.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  uint32_t ch = g_utf8_get_char (p);

  return ch;
}
942
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Converts a single character to UTF-8.  No terminating zero byte is
 * stored in @outbuf.
 *
 * Return value: number of bytes written.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  int n_bytes = g_unichar_to_utf8 (c, outbuf);

  return n_bytes;
}
959
960 /**
961 * stringprep_utf8_to_ucs4:
962 * @str: a UTF-8 encoded string
963 * @len: the maximum length of @str to use. If @len < 0, then
964 * the string is nul-terminated.
965 * @items_written: location to store the number of characters in the
966 * result, or %NULL.
967 *
968 * Convert a string from UTF-8 to a 32-bit fixed width
969 * representation as UCS-4, assuming valid UTF-8 input.
970 * This function does no error checking on the input.
971 *
972 * Return value: a pointer to a newly allocated UCS-4 string.
973 * This value must be freed with free().
974 **/
975 uint32_t *
stringprep_utf8_to_ucs4(const char * str,ssize_t len,size_t * items_written)976 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
977 {
978 return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
979 }
980
981 /**
982 * stringprep_ucs4_to_utf8:
983 * @str: a UCS-4 encoded string
984 * @len: the maximum length of @str to use. If @len < 0, then
985 * the string is terminated with a 0 character.
986 * @items_read: location to store number of characters read read, or %NULL.
987 * @items_written: location to store number of bytes written or %NULL.
988 * The value here stored does not include the trailing 0
989 * byte.
990 *
991 * Convert a string from a 32-bit fixed width representation as UCS-4.
992 * to UTF-8. The result will be terminated with a 0 byte.
993 *
994 * Return value: a pointer to a newly allocated UTF-8 string.
995 * This value must be freed with free(). If an
996 * error occurs, %NULL will be returned and
997 * @error set.
998 **/
999 char *
stringprep_ucs4_to_utf8(const uint32_t * str,ssize_t len,size_t * items_read,size_t * items_written)1000 stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1001 size_t * items_read, size_t * items_written)
1002 {
1003 return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1004 (glong *) items_written, NULL);
1005 }
1006
1007 /**
1008 * stringprep_utf8_nfkc_normalize:
1009 * @str: a UTF-8 encoded string.
1010 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1011 *
1012 * Converts a string into canonical form, standardizing
1013 * such issues as whether a character with an accent
1014 * is represented as a base character and combining
1015 * accent or as a single precomposed character.
1016 *
1017 * The normalization mode is NFKC (ALL COMPOSE). It standardizes
1018 * differences that do not affect the text content, such as the
1019 * above-mentioned accent representation. It standardizes the
1020 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1021 * the standard forms (in this case DIGIT THREE). Formatting
1022 * information may be lost but for most text operations such
1023 * characters should be considered the same. It returns a result with
1024 * composed forms rather than a maximally decomposed form.
1025 *
1026 * Return value: a newly allocated string, that is the
1027 * NFKC normalized form of @str.
1028 **/
1029 char *
stringprep_utf8_nfkc_normalize(const char * str,ssize_t len)1030 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1031 {
1032 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1033 }
1034
1035 /**
1036 * stringprep_ucs4_nfkc_normalize:
1037 * @str: a Unicode string.
1038 * @len: length of @str array, or -1 if @str is nul-terminated.
1039 *
1040 * Converts UCS4 string into UTF-8 and runs
1041 * stringprep_utf8_nfkc_normalize().
1042 *
1043 * Return value: a newly allocated Unicode string, that is the NFKC
1044 * normalized form of @str.
1045 **/
1046 uint32_t *
stringprep_ucs4_nfkc_normalize(uint32_t * str,ssize_t len)1047 stringprep_ucs4_nfkc_normalize (uint32_t * str, ssize_t len)
1048 {
1049 char *p;
1050 uint32_t *result_wc;
1051
1052 p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1053 result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1054 free (p);
1055
1056 return result_wc;
1057 }
1058