1 /* nfkc.c Unicode normalization utilities.
2 * Copyright (C) 2002, 2003 Simon Josefsson
3 *
4 * This file is part of GNU Libidn.
5 *
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 */
21
22 #include "internal.h"
23
24 /* This file contains functions from GLIB, including gutf8.c and
 * gunidecomp.c, all licensed under LGPL and copyright held by:
26 *
27 * Copyright (C) 1999, 2000 Tom Tromey
28 * Copyright 2000 Red Hat, Inc.
29 */
30
/* GLIB compatibility layer.  The code further down is taken from GLIB;
 * these macros map the GLIB spellings it uses onto plain C so that the
 * copies stay easy to re-sync with upstream. */

/* Boolean constants. */
#define FALSE 0
#define TRUE 1

/* Scalar type names. */
#define gchar char
#define guchar unsigned char
#define gushort unsigned short
#define gint int
#define guint unsigned int
#define glong long
#define gboolean int
#define gint16 my_int16_t
#define guint16 my_uint16_t
#define gunichar my_uint32_t
#define gsize size_t
#define gssize ssize_t

/* Memory management goes straight to the C library. */
#define g_malloc malloc
#define g_free free
#define g_new(struct_type, n_structs) \
  ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))

/* GError reporting is compiled out: g_set_error() evaluates to 0. */
#define GError void
#define g_set_error(a,b,c,d) 0

/* Wrappers that turn a braced group into a single statement. */
#if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
# define G_STMT_START (void)(
# define G_STMT_END )
#elif defined (sun) || defined (__sun__)
# define G_STMT_START if (1)
# define G_STMT_END else (void)0
#else
# define G_STMT_START do
# define G_STMT_END while (0)
#endif

/* Precondition checks are no-ops here: neither argument is evaluated. */
#define g_return_val_if_fail(expr,val) G_STMT_START{ (void)0; }G_STMT_END

/* Number of elements in a true array (not a pointer). */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
66
67 /* Code from GLIB gunicode.h starts here. */
68
/* Normalization forms.  Each form has a GLIB-style name and a
 * Unicode-style alias that shares the same value. */
typedef enum
{
  /* GLIB names. */
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_ALL_COMPOSE = 3,

  /* Unicode names. */
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
}
GNormalizeMode;
81
82 /* Code from GLIB gutf8.c starts here. */
83
/* UTF8_COMPUTE: classify the lead byte Char of a UTF-8 sequence.
 * On return Len is the sequence length in bytes (1-6) and Mask selects
 * the payload bits of the lead byte.  For a byte that cannot start a
 * sequence (0x80-0xbf, 0xfe, 0xff) Len is set to -1 and Mask is left
 * untouched.
 *
 * The body is wrapped in do { } while (0) and every argument is
 * parenthesised so that the macro behaves as one statement, including
 * inside an unbraced if/else.  Invoke it with a trailing semicolon. */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        (Len) = -1; \
    } \
  while (0)

/* UTF8_LENGTH: number of bytes (1-6) needed to encode code point Char. */
#define UTF8_LENGTH(Char) \
  ((Char) < 0x80 ? 1 : \
   ((Char) < 0x800 ? 2 : \
    ((Char) < 0x10000 ? 3 : \
     ((Char) < 0x200000 ? 4 : \
      ((Char) < 0x4000000 ? 5 : 6)))))

/* UTF8_GET: decode the Len-byte sequence at Chars into Result, using the
 * Mask computed by UTF8_COMPUTE and Count as scratch loop index.  If a
 * trailing byte is not of the form 10xxxxxx, Result is set to -1 and
 * decoding stops.  Same single-statement wrapping as UTF8_COMPUTE. */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)

/* UNICODE_VALID: non-zero if Char is below 0x110000, is not in
 * 0xD800-0xDFFF, is not in 0xFDD0-0xFDEF, and its low 16 bits are
 * neither 0xFFFE nor 0xFFFF. */
#define UNICODE_VALID(Char) \
  ((Char) < 0x110000 && \
   (((Char) & 0xFFFFF800) != 0xD800) && \
   ((Char) < 0xFDD0 || (Char) > 0xFDEF) && \
   ((Char) & 0xFFFE) != 0xFFFE)
144
145
146 static const gchar utf8_skip_data[256] = {
147 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
148 1, 1, 1, 1, 1, 1, 1,
149 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
150 1, 1, 1, 1, 1, 1, 1,
151 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
152 1, 1, 1, 1, 1, 1, 1,
153 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
154 1, 1, 1, 1, 1, 1, 1,
155 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
156 1, 1, 1, 1, 1, 1, 1,
157 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
158 1, 1, 1, 1, 1, 1, 1,
159 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
160 2, 2, 2, 2, 2, 2, 2,
161 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
162 5, 5, 5, 6, 6, 1, 1
163 };
164
165 const gchar *const g_utf8_skip = utf8_skip_data;
166
167 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
168
169 /**
170 * g_utf8_strlen:
171 * @p: pointer to the start of a UTF-8 encoded string.
172 * @max: the maximum number of bytes to examine. If @max
173 * is less than 0, then the string is assumed to be
174 * nul-terminated. If @max is 0, @p will not be examined and
175 * may be %NULL.
176 *
177 * Returns the length of the string in characters.
178 *
179 * Return value: the length of the string in characters
180 **/
181 static glong
g_utf8_strlen(const gchar * p,gssize max)182 g_utf8_strlen (const gchar * p, gssize max)
183 {
184 glong len = 0;
185 const gchar *start = p;
186 g_return_val_if_fail (p != NULL || max == 0, 0);
187
188 if (max < 0)
189 {
190 while (*p)
191 {
192 p = g_utf8_next_char (p);
193 ++len;
194 }
195 }
196 else
197 {
198 if (max == 0 || !*p)
199 return 0;
200
201 p = g_utf8_next_char (p);
202
203 while (p - start < max && *p)
204 {
205 ++len;
206 p = g_utf8_next_char (p);
207 }
208
209 /* only do the last len increment if we got a complete
210 * char (don't count partial chars)
211 */
212 if (p - start == max)
213 ++len;
214 }
215
216 return len;
217 }
218
219 /**
220 * g_utf8_get_char:
221 * @p: a pointer to Unicode character encoded as UTF-8
222 *
223 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
224 * If @p does not point to a valid UTF-8 encoded character, results are
225 * undefined. If you are not sure that the bytes are complete
226 * valid Unicode characters, you should use g_utf8_get_char_validated()
227 * instead.
228 *
229 * Return value: the resulting character
230 **/
231 static gunichar
g_utf8_get_char(const gchar * p)232 g_utf8_get_char (const gchar * p)
233 {
234 int i, mask = 0, len;
235 gunichar result;
236 unsigned char c = (unsigned char) *p;
237
238 UTF8_COMPUTE (c, mask, len);
239 if (len == -1)
240 return (gunichar) - 1;
241 UTF8_GET (result, p, i, mask, len);
242
243 return result;
244 }
245
246 /**
247 * g_unichar_to_utf8:
248 * @c: a ISO10646 character code
249 * @outbuf: output buffer, must have at least 6 bytes of space.
250 * If %NULL, the length will be computed and returned
251 * and nothing will be written to @outbuf.
252 *
253 * Converts a single character to UTF-8.
254 *
255 * Return value: number of bytes written
256 **/
257 static int
g_unichar_to_utf8(gunichar c,gchar * outbuf)258 g_unichar_to_utf8 (gunichar c, gchar * outbuf)
259 {
260 guint len = 0;
261 int first;
262 int i;
263
264 if (c < 0x80)
265 {
266 first = 0;
267 len = 1;
268 }
269 else if (c < 0x800)
270 {
271 first = 0xc0;
272 len = 2;
273 }
274 else if (c < 0x10000)
275 {
276 first = 0xe0;
277 len = 3;
278 }
279 else if (c < 0x200000)
280 {
281 first = 0xf0;
282 len = 4;
283 }
284 else if (c < 0x4000000)
285 {
286 first = 0xf8;
287 len = 5;
288 }
289 else
290 {
291 first = 0xfc;
292 len = 6;
293 }
294
295 if (outbuf)
296 {
297 for (i = len - 1; i > 0; --i)
298 {
299 outbuf[i] = (c & 0x3f) | 0x80;
300 c >>= 6;
301 }
302 outbuf[0] = c | first;
303 }
304
305 return len;
306 }
307
308 /**
309 * g_utf8_to_ucs4_fast:
310 * @str: a UTF-8 encoded string
311 * @len: the maximum length of @str to use. If @len < 0, then
312 * the string is nul-terminated.
313 * @items_written: location to store the number of characters in the
314 * result, or %NULL.
315 *
316 * Convert a string from UTF-8 to a 32-bit fixed width
317 * representation as UCS-4, assuming valid UTF-8 input.
318 * This function is roughly twice as fast as g_utf8_to_ucs4()
319 * but does no error checking on the input.
320 *
321 * Return value: a pointer to a newly allocated UCS-4 string.
322 * This value must be freed with g_free().
323 **/
324 static gunichar *
g_utf8_to_ucs4_fast(const gchar * str,glong len,glong * items_written)325 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
326 {
327 gint j, charlen;
328 gunichar *result;
329 gint n_chars, i;
330 const gchar *p;
331
332 g_return_val_if_fail (str != NULL, NULL);
333
334 p = str;
335 n_chars = 0;
336 if (len < 0)
337 {
338 while (*p)
339 {
340 p = g_utf8_next_char (p);
341 ++n_chars;
342 }
343 }
344 else
345 {
346 while (p < str + len && *p)
347 {
348 p = g_utf8_next_char (p);
349 ++n_chars;
350 }
351 }
352
353 result = g_new (gunichar, n_chars + 1);
354
355 p = str;
356 for (i = 0; i < n_chars; i++)
357 {
358 gunichar wc = ((unsigned char *) p)[0];
359
360 if (wc < 0x80)
361 {
362 result[i] = wc;
363 p++;
364 }
365 else
366 {
367 if (wc < 0xe0)
368 {
369 charlen = 2;
370 wc &= 0x1f;
371 }
372 else if (wc < 0xf0)
373 {
374 charlen = 3;
375 wc &= 0x0f;
376 }
377 else if (wc < 0xf8)
378 {
379 charlen = 4;
380 wc &= 0x07;
381 }
382 else if (wc < 0xfc)
383 {
384 charlen = 5;
385 wc &= 0x03;
386 }
387 else
388 {
389 charlen = 6;
390 wc &= 0x01;
391 }
392
393 for (j = 1; j < charlen; j++)
394 {
395 wc <<= 6;
396 wc |= ((unsigned char *) p)[j] & 0x3f;
397 }
398
399 result[i] = wc;
400 p += charlen;
401 }
402 }
403 result[i] = 0;
404
405 if (items_written)
406 *items_written = i;
407
408 return result;
409 }
410
411 /**
412 * g_ucs4_to_utf8:
413 * @str: a UCS-4 encoded string
414 * @len: the maximum length of @str to use. If @len < 0, then
415 * the string is terminated with a 0 character.
416 * @items_read: location to store number of characters read read, or %NULL.
417 * @items_written: location to store number of bytes written or %NULL.
418 * The value here stored does not include the trailing 0
419 * byte.
420 * @error: location to store the error occuring, or %NULL to ignore
421 * errors. Any of the errors in #GConvertError other than
422 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
423 *
424 * Convert a string from a 32-bit fixed width representation as UCS-4.
425 * to UTF-8. The result will be terminated with a 0 byte.
426 *
427 * Return value: a pointer to a newly allocated UTF-8 string.
428 * This value must be freed with g_free(). If an
429 * error occurs, %NULL will be returned and
430 * @error set.
431 **/
432 static gchar *
g_ucs4_to_utf8(const gunichar * str,glong len,glong * items_read,glong * items_written,GError ** error)433 g_ucs4_to_utf8 (const gunichar * str,
434 glong len,
435 glong * items_read, glong * items_written, GError ** error)
436 {
437 gint result_length;
438 gchar *result = NULL;
439 gchar *p;
440 gint i;
441
442 result_length = 0;
443 for (i = 0; len < 0 || i < len; i++)
444 {
445 if (!str[i])
446 break;
447
448 if (str[i] >= 0x80000000)
449 {
450 if (items_read)
451 *items_read = i;
452
453 /*g_set_error (error, G_CONVERT_ERROR,
454 G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
455 _("Character out of range for UTF-8"));*/
456 goto err_out;
457 }
458
459 result_length += UTF8_LENGTH (str[i]);
460 }
461
462 result = g_malloc (result_length + 1);
463 p = result;
464
465 i = 0;
466 while (p < result + result_length)
467 p += g_unichar_to_utf8 (str[i++], p);
468
469 *p = '\0';
470
471 if (items_written)
472 *items_written = p - result;
473
474 err_out:
475 if (items_read)
476 *items_read = i;
477
478 return result;
479 }
480
481 /* Code from GLIB gunidecomp.c starts here. */
482
483 #include "gunidecomp.h"
484 #include "gunicomp.h"
485
/* Combining class lookup through a two-level table.  A code point is
 * split into a page (the bits above the low 8) and an offset (the low
 * 8 bits).  A page entry of G_UNICODE_MAX_TABLE_INDEX or above stores
 * the class directly, offset by G_UNICODE_MAX_TABLE_INDEX; a smaller
 * entry selects a row of cclass_data that is indexed by the offset. */
#define CC_LOOKUP(Table, Page, Char) \
  (((Table)[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[(Table)[Page]][Char]))

/* Lookup in the first page table. */
#define CC_PART1(Page, Char) \
  CC_LOOKUP (combining_class_table_part1, Page, Char)

/* Lookup in the second page table. */
#define CC_PART2(Page, Char) \
  CC_LOOKUP (combining_class_table_part2, Page, Char)

/* Combining class of Char.  Part 1 covers code points up to
 * G_UNICODE_LAST_CHAR_PART1, part 2 covers 0xe0000 up to
 * G_UNICODE_LAST_CHAR; everything else has class 0. */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))
502
/* Constants for Hangul syllable [de]composition: the first code point
 * of the syllable block and of each jamo range, and the size of each
 * range.  TBase is one below the first trailing consonant, so that a
 * trailing index of 0 means "no trailing consonant". */
enum
{
  SBase = 0xAC00,
  LBase = 0x1100,
  VBase = 0x1161,
  TBase = 0x11A7,
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = VCount * TCount,
  SCount = LCount * NCount
};
513
514 /**
515 * g_unicode_canonical_ordering:
516 * @string: a UCS-4 encoded string.
517 * @len: the maximum length of @string to use.
518 *
519 * Computes the canonical ordering of a string in-place.
520 * This rearranges decomposed characters in the string
521 * according to their combining classes. See the Unicode
522 * manual for more information.
523 **/
524 static void
g_unicode_canonical_ordering(gunichar * string,gsize len)525 g_unicode_canonical_ordering (gunichar * string, gsize len)
526 {
527 gsize i;
528 int swap = 1;
529
530 while (swap)
531 {
532 int last;
533 swap = 0;
534 last = COMBINING_CLASS (string[0]);
535 for (i = 0; i < len - 1; ++i)
536 {
537 int next = COMBINING_CLASS (string[i + 1]);
538 if (next != 0 && last > next)
539 {
540 gsize j;
541 /* Percolate item leftward through string. */
542 for (j = i + 1; j > 0; --j)
543 {
544 gunichar t;
545 if (COMBINING_CLASS (string[j - 1]) <= next)
546 break;
547 t = string[j];
548 string[j] = string[j - 1];
549 string[j - 1] = t;
550 swap = 1;
551 }
552 /* We're re-entering the loop looking at the old
553 character again. */
554 next = last;
555 }
556 last = next;
557 }
558 }
559 }
560
561 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
562 * r should be null or have sufficient space. Calling with r == NULL will
563 * only calculate the result_len; however, a buffer with space for three
564 * characters will always be big enough. */
565 static void
decompose_hangul(gunichar s,gunichar * r,gsize * result_len)566 decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
567 {
568 gint SIndex = s - SBase;
569
570 /* not a hangul syllable */
571 if (SIndex < 0 || SIndex >= SCount)
572 {
573 if (r)
574 r[0] = s;
575 *result_len = 1;
576 }
577 else
578 {
579 gunichar L = LBase + SIndex / NCount;
580 gunichar V = VBase + (SIndex % NCount) / TCount;
581 gunichar T = TBase + SIndex % TCount;
582
583 if (r)
584 {
585 r[0] = L;
586 r[1] = V;
587 }
588
589 if (T != TBase)
590 {
591 if (r)
592 r[2] = T;
593 *result_len = 3;
594 }
595 else
596 *result_len = 2;
597 }
598 }
599
600 /* returns a pointer to a null-terminated UTF-8 string */
601 static const gchar *
find_decomposition(gunichar ch,gboolean compat)602 find_decomposition (gunichar ch, gboolean compat)
603 {
604 int start = 0;
605 int end = G_N_ELEMENTS (decomp_table);
606
607 if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
608 {
609 while (TRUE)
610 {
611 int half = (start + end) / 2;
612 if (ch == decomp_table[half].ch)
613 {
614 int offset;
615
616 if (compat)
617 {
618 offset = decomp_table[half].compat_offset;
619 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
620 offset = decomp_table[half].canon_offset;
621 }
622 else
623 {
624 offset = decomp_table[half].canon_offset;
625 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
626 return NULL;
627 }
628
629 return &(decomp_expansion_string[offset]);
630 }
631 else if (half == start)
632 break;
633 else if (ch > decomp_table[half].ch)
634 start = half;
635 else
636 end = half;
637 }
638 }
639
640 return NULL;
641 }
642
643 /* L,V => LV and LV,T => LVT */
644 static gboolean
combine_hangul(gunichar a,gunichar b,gunichar * result)645 combine_hangul (gunichar a, gunichar b, gunichar * result)
646 {
647 gint LIndex = a - LBase;
648 gint SIndex = a - SBase;
649
650 gint VIndex = b - VBase;
651 gint TIndex = b - TBase;
652
653 if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
654 {
655 *result = SBase + (LIndex * VCount + VIndex) * TCount;
656 return TRUE;
657 }
658 else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
659 && 0 <= TIndex && TIndex <= TCount)
660 {
661 *result = a + TIndex;
662 return TRUE;
663 }
664
665 return FALSE;
666 }
667
668 #define CI(Page, Char) \
669 ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
670 ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
671 : (compose_data[compose_table[Page]][Char]))
672
673 #define COMPOSE_INDEX(Char) \
674 ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
675
676 static gboolean
combine(gunichar a,gunichar b,gunichar * result)677 combine (gunichar a, gunichar b, gunichar * result)
678 {
679 gushort index_a, index_b;
680
681 if (combine_hangul (a, b, result))
682 return TRUE;
683
684 index_a = COMPOSE_INDEX (a);
685
686 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
687 {
688 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
689 {
690 *result =
691 compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
692 return TRUE;
693 }
694 else
695 return FALSE;
696 }
697
698 index_b = COMPOSE_INDEX (b);
699
700 if (index_b >= COMPOSE_SECOND_SINGLE_START)
701 {
702 if (a ==
703 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
704 {
705 *result =
706 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
707 return TRUE;
708 }
709 else
710 return FALSE;
711 }
712
713 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
714 && index_b >= COMPOSE_SECOND_START
715 && index_b < COMPOSE_SECOND_SINGLE_START)
716 {
717 gunichar res =
718 compose_array[index_a - COMPOSE_FIRST_START][index_b -
719 COMPOSE_SECOND_START];
720
721 if (res)
722 {
723 *result = res;
724 return TRUE;
725 }
726 }
727
728 return FALSE;
729 }
730
731 static gunichar *
_g_utf8_normalize_wc(const gchar * str,gssize max_len,GNormalizeMode mode)732 _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
733 {
734 gsize n_wc;
735 gunichar *wc_buffer;
736 const char *p;
737 gsize last_start;
738 gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
739 gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
740
741 n_wc = 0;
742 p = str;
743 while ((max_len < 0 || p < str + max_len) && *p)
744 {
745 const gchar *decomp;
746 gunichar wc = g_utf8_get_char (p);
747
748 if (wc >= 0xac00 && wc <= 0xd7af)
749 {
750 gsize result_len;
751 decompose_hangul (wc, NULL, &result_len);
752 n_wc += result_len;
753 }
754 else
755 {
756 decomp = find_decomposition (wc, do_compat);
757
758 if (decomp)
759 n_wc += g_utf8_strlen (decomp, -1);
760 else
761 n_wc++;
762 }
763
764 p = g_utf8_next_char (p);
765 }
766
767 wc_buffer = g_new (gunichar, n_wc + 1);
768
769 last_start = 0;
770 n_wc = 0;
771 p = str;
772 while ((max_len < 0 || p < str + max_len) && *p)
773 {
774 gunichar wc = g_utf8_get_char (p);
775 const gchar *decomp;
776 int cc;
777 gsize old_n_wc = n_wc;
778
779 if (wc >= 0xac00 && wc <= 0xd7af)
780 {
781 gsize result_len;
782 decompose_hangul (wc, wc_buffer + n_wc, &result_len);
783 n_wc += result_len;
784 }
785 else
786 {
787 decomp = find_decomposition (wc, do_compat);
788
789 if (decomp)
790 {
791 const char *pd;
792 for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
793 wc_buffer[n_wc++] = g_utf8_get_char (pd);
794 }
795 else
796 wc_buffer[n_wc++] = wc;
797 }
798
799 if (n_wc > 0)
800 {
801 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
802
803 if (cc == 0)
804 {
805 g_unicode_canonical_ordering (wc_buffer + last_start,
806 n_wc - last_start);
807 last_start = old_n_wc;
808 }
809 }
810
811 p = g_utf8_next_char (p);
812 }
813
814 if (n_wc > 0)
815 {
816 g_unicode_canonical_ordering (wc_buffer + last_start,
817 n_wc - last_start);
818 last_start = n_wc;
819 }
820
821 wc_buffer[n_wc] = 0;
822
823 /* All decomposed and reordered */
824
825 if (do_compose && n_wc > 0)
826 {
827 gsize i, j;
828 int last_cc = 0;
829 last_start = 0;
830
831 for (i = 0; i < n_wc; i++)
832 {
833 int cc = COMBINING_CLASS (wc_buffer[i]);
834
835 if (i > 0 &&
836 (last_cc == 0 || last_cc != cc) &&
837 combine (wc_buffer[last_start], wc_buffer[i],
838 &wc_buffer[last_start]))
839 {
840 for (j = i + 1; j < n_wc; j++)
841 wc_buffer[j - 1] = wc_buffer[j];
842 n_wc--;
843 i--;
844
845 if (i == last_start)
846 last_cc = 0;
847 else
848 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
849
850 continue;
851 }
852
853 if (cc == 0)
854 last_start = i;
855
856 last_cc = cc;
857 }
858 }
859
860 wc_buffer[n_wc] = 0;
861
862 return wc_buffer;
863 }
864
865 /**
866 * g_utf8_normalize:
867 * @str: a UTF-8 encoded string.
868 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
869 * @mode: the type of normalization to perform.
870 *
871 * Converts a string into canonical form, standardizing
872 * such issues as whether a character with an accent
873 * is represented as a base character and combining
874 * accent or as a single precomposed character. You
875 * should generally call g_utf8_normalize() before
876 * comparing two Unicode strings.
877 *
878 * The normalization mode %G_NORMALIZE_DEFAULT only
879 * standardizes differences that do not affect the
880 * text content, such as the above-mentioned accent
881 * representation. %G_NORMALIZE_ALL also standardizes
882 * the "compatibility" characters in Unicode, such
883 * as SUPERSCRIPT THREE to the standard forms
884 * (in this case DIGIT THREE). Formatting information
885 * may be lost but for most text operations such
886 * characters should be considered the same.
887 * For example, g_utf8_collate() normalizes
888 * with %G_NORMALIZE_ALL as its first step.
889 *
890 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
891 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
892 * but returned a result with composed forms rather
893 * than a maximally decomposed form. This is often
894 * useful if you intend to convert the string to
895 * a legacy encoding or pass it to a system with
896 * less capable Unicode handling.
897 *
898 * Return value: a newly allocated string, that is the
899 * normalized form of @str.
900 **/
901 static gchar *
g_utf8_normalize(const gchar * str,gssize len,GNormalizeMode mode)902 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
903 {
904 gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
905 gchar *result;
906
907 result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
908 g_free (result_wc);
909
910 return result;
911 }
912
913 /* Public Libidn API starts here. */
914
915 /**
916 * stringprep_utf8_to_unichar:
917 * @p: a pointer to Unicode character encoded as UTF-8
918 *
919 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
920 * If @p does not point to a valid UTF-8 encoded character, results are
921 * undefined. If you are not sure that the bytes are complete
922 * valid Unicode characters, you should use g_utf8_get_char_validated()
923 * instead.
924 *
925 * Return value: the resulting character
926 **/
927 my_uint32_t
stringprep_utf8_to_unichar(const char * p)928 stringprep_utf8_to_unichar (const char *p)
929 {
930 return g_utf8_get_char (p);
931 }
932
933 /**
934 * stringprep_unichar_to_utf8:
935 * @c: a ISO10646 character code
936 * @outbuf: output buffer, must have at least 6 bytes of space.
937 * If %NULL, the length will be computed and returned
938 * and nothing will be written to @outbuf.
939 *
940 * Converts a single character to UTF-8.
941 *
942 * Return value: number of bytes written
943 **/
944 int
stringprep_unichar_to_utf8(my_uint32_t c,char * outbuf)945 stringprep_unichar_to_utf8 (my_uint32_t c, char *outbuf)
946 {
947 return g_unichar_to_utf8 (c, outbuf);
948 }
949
950 /**
951 * stringprep_utf8_to_ucs4:
952 * @str: a UTF-8 encoded string
953 * @len: the maximum length of @str to use. If @len < 0, then
954 * the string is nul-terminated.
955 * @items_written: location to store the number of characters in the
956 * result, or %NULL.
957 *
958 * Convert a string from UTF-8 to a 32-bit fixed width
959 * representation as UCS-4, assuming valid UTF-8 input.
960 * This function does no error checking on the input.
961 *
962 * Return value: a pointer to a newly allocated UCS-4 string.
963 * This value must be freed with free().
964 **/
965 my_uint32_t *
stringprep_utf8_to_ucs4(const char * str,ssize_t len,size_t * items_written)966 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
967 {
968 return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
969 }
970
971 /**
972 * stringprep_ucs4_to_utf8:
973 * @str: a UCS-4 encoded string
974 * @len: the maximum length of @str to use. If @len < 0, then
975 * the string is terminated with a 0 character.
976 * @items_read: location to store number of characters read read, or %NULL.
977 * @items_written: location to store number of bytes written or %NULL.
978 * The value here stored does not include the trailing 0
979 * byte.
980 *
981 * Convert a string from a 32-bit fixed width representation as UCS-4.
982 * to UTF-8. The result will be terminated with a 0 byte.
983 *
984 * Return value: a pointer to a newly allocated UTF-8 string.
985 * This value must be freed with free(). If an
986 * error occurs, %NULL will be returned and
987 * @error set.
988 **/
989 char *
stringprep_ucs4_to_utf8(const my_uint32_t * str,ssize_t len,size_t * items_read,size_t * items_written)990 stringprep_ucs4_to_utf8 (const my_uint32_t * str, ssize_t len,
991 size_t * items_read, size_t * items_written)
992 {
993 return g_ucs4_to_utf8 (str, len, (glong *) items_read,
994 (glong *) items_written, NULL);
995 }
996
997 /**
998 * stringprep_utf8_nfkc_normalize:
999 * @str: a UTF-8 encoded string.
1000 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1001 *
1002 * Converts a string into canonical form, standardizing
1003 * such issues as whether a character with an accent
1004 * is represented as a base character and combining
1005 * accent or as a single precomposed character.
1006 *
1007 * The normalization mode is NFKC (ALL COMPOSE). It standardizes
1008 * differences that do not affect the text content, such as the
1009 * above-mentioned accent representation. It standardizes the
1010 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1011 * the standard forms (in this case DIGIT THREE). Formatting
1012 * information may be lost but for most text operations such
1013 * characters should be considered the same. It returns a result with
1014 * composed forms rather than a maximally decomposed form.
1015 *
1016 * Return value: a newly allocated string, that is the
1017 * NFKC normalized form of @str.
1018 **/
1019 char *
stringprep_utf8_nfkc_normalize(const char * str,ssize_t len)1020 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1021 {
1022 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1023 }
1024
1025 /**
1026 * stringprep_ucs4_nfkc_normalize:
1027 * @str: a Unicode string.
1028 * @len: length of @str array, or -1 if @str is nul-terminated.
1029 *
1030 * Converts UCS4 string into UTF-8 and runs
1031 * stringprep_utf8_nfkc_normalize().
1032 *
1033 * Return value: a newly allocated Unicode string, that is the NFKC
1034 * normalized form of @str.
1035 **/
1036 my_uint32_t *
stringprep_ucs4_nfkc_normalize(my_uint32_t * str,ssize_t len)1037 stringprep_ucs4_nfkc_normalize (my_uint32_t * str, ssize_t len)
1038 {
1039 char *p;
1040 my_uint32_t *result_wc;
1041
1042 p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1043 result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1044 free (p);
1045
1046 return result_wc;
1047 }
1048