1 /* nfkc.c --- Unicode normalization utilities.
2 Copyright (C) 2002-2016 Simon Josefsson
3
4 This file is part of GNU Libidn.
5
6 GNU Libidn is free software: you can redistribute it and/or
7 modify it under the terms of either:
8
9 * the GNU Lesser General Public License as published by the Free
10 Software Foundation; either version 3 of the License, or (at
11 your option) any later version.
12
13 or
14
15 * the GNU General Public License as published by the Free
16 Software Foundation; either version 2 of the License, or (at
17 your option) any later version.
18
19 or both in parallel, as here.
20
21 GNU Libidn is distributed in the hope that it will be useful,
22 but WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 General Public License for more details.
25
26 You should have received copies of the GNU General Public License and
27 the GNU Lesser General Public License along with this program. If
28 not, see <http://www.gnu.org/licenses/>. */
29
30 #ifdef HAVE_CONFIG_H
31 #include "config.h"
32 #endif
33
34 #include <stdlib.h>
35 #include <string.h>
36
37 #include "stringprep.h"
38
/* Compatibility shims: the code borrowed from GLIB below keeps its
   GLIB spellings, which are mapped here onto plain C names.  This
   makes syncing with the GLIB code easier. */

/* Character types. */
#define gchar    char
#define guchar   unsigned char
#define gunichar uint32_t

/* Integer types. */
#define gboolean int
#define gint     int
#define guint    unsigned int
#define glong    long
#define gushort  unsigned short
#define gint16   int16_t
#define guint16  uint16_t

/* Size types. */
#define gsize    size_t
#define gssize   ssize_t

/* Memory management. */
#define g_malloc malloc
#define g_free   free
/* Return VAL from the enclosing function when EXPR is false.  The body
   is wrapped in do/while (0) so that the macro, followed by the usual
   semicolon, forms exactly one statement and can be used as the
   unbraced body of an if/else. */
#define g_return_val_if_fail(expr,val)  \
  do                                    \
    {                                   \
      if (!(expr))                      \
        return (val);                   \
    }                                   \
  while (0)
58
59 /* Code from GLIB gmacros.h starts here. */
60
61 /* GLIB - Library of useful routines for C programming
62 * Copyright (C) 1995-1997 Peter Mattis, Spencer Kimball and Josh MacDonald
63 *
64 * This library is free software; you can redistribute it and/or
65 * modify it under the terms of the GNU Lesser General Public
66 * License as published by the Free Software Foundation; either
67 * version 2 of the License, or (at your option) any later version.
68 *
69 * This library is distributed in the hope that it will be useful,
70 * but WITHOUT ANY WARRANTY; without even the implied warranty of
71 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
72 * Lesser General Public License for more details.
73 *
74 * You should have received a copy of the GNU Lesser General Public
75 * License along with this library; if not, write to the
76 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
77 * Boston, MA 02111-1307, USA.
78 */
79
/* Boolean constants, unless something else already defined them. */
#if !defined (FALSE)
# define FALSE (0)
#endif

#if !defined (TRUE)
# define TRUE (!FALSE)
#endif

/* Number of elements in the array ARR.  ARR must be a real array, not
   a pointer. */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof (*(arr)))

/* Evaluates to EXPR unchanged; no compiler hint is applied here. */
#define G_UNLIKELY(expr) (expr)
91
92 /* Code from GLIB gunicode.h starts here. */
93
94 /* gunicode.h - Unicode manipulation functions
95 *
96 * Copyright (C) 1999, 2000 Tom Tromey
97 * Copyright 2000, 2005 Red Hat, Inc.
98 *
99 * The Gnome Library is free software; you can redistribute it and/or
100 * modify it under the terms of the GNU Lesser General Public License as
101 * published by the Free Software Foundation; either version 2 of the
102 * License, or (at your option) any later version.
103 *
104 * The Gnome Library is distributed in the hope that it will be useful,
105 * but WITHOUT ANY WARRANTY; without even the implied warranty of
106 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
107 * Lesser General Public License for more details.
108 *
109 * You should have received a copy of the GNU Lesser General Public
110 * License along with the Gnome Library; see the file COPYING.LIB. If not,
111 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
112 * Boston, MA 02111-1307, USA.
113 */
114
/* Normalization modes.  Each mode has a descriptive GLIB-style name
   and a Unicode-style alias (NFD, NFC, NFKD, NFKC) sharing the same
   value. */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_ALL_COMPOSE = 3,

  /* Aliases. */
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
} GNormalizeMode;
127
/* Advance PTR past the UTF-8 character it points at: the byte at PTR is
   read as an unsigned value and used to look up the number of bytes to
   skip in g_utf8_skip.  PTR is evaluated twice. */
#define g_utf8_next_char(ptr) \
  ((ptr) + g_utf8_skip[*(const guchar *) (ptr)])
129
130 /* Code from GLIB gutf8.c starts here. */
131
132 /* gutf8.c - Operations on UTF-8 strings.
133 *
134 * Copyright (C) 1999 Tom Tromey
135 * Copyright (C) 2000 Red Hat, Inc.
136 *
137 * This library is free software; you can redistribute it and/or
138 * modify it under the terms of the GNU Lesser General Public
139 * License as published by the Free Software Foundation; either
140 * version 2 of the License, or (at your option) any later version.
141 *
142 * This library is distributed in the hope that it will be useful,
143 * but WITHOUT ANY WARRANTY; without even the implied warranty of
144 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
145 * Lesser General Public License for more details.
146 *
147 * You should have received a copy of the GNU Lesser General Public
148 * License along with this library; if not, write to the
149 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
150 * Boston, MA 02111-1307, USA.
151 */
152
/* Classify the UTF-8 lead byte CHAR: set LEN to the length in bytes of
   the sequence it introduces (1 to 6) and MASK to the mask selecting
   the payload bits of the lead byte.  When CHAR cannot start a
   sequence, LEN is set to -1 and MASK is left untouched.

   The arguments are parenthesized and the body is wrapped in
   do/while (0), so the macro followed by a semicolon is exactly one
   statement and is safe as the unbraced body of an if/else.  CHAR is
   evaluated more than once. */
#define UTF8_COMPUTE(Char, Mask, Len)           \
  do                                            \
    {                                           \
      if ((Char) < 128)                         \
        {                                       \
          (Len) = 1;                            \
          (Mask) = 0x7f;                        \
        }                                       \
      else if (((Char) & 0xe0) == 0xc0)         \
        {                                       \
          (Len) = 2;                            \
          (Mask) = 0x1f;                        \
        }                                       \
      else if (((Char) & 0xf0) == 0xe0)         \
        {                                       \
          (Len) = 3;                            \
          (Mask) = 0x0f;                        \
        }                                       \
      else if (((Char) & 0xf8) == 0xf0)         \
        {                                       \
          (Len) = 4;                            \
          (Mask) = 0x07;                        \
        }                                       \
      else if (((Char) & 0xfc) == 0xf8)         \
        {                                       \
          (Len) = 5;                            \
          (Mask) = 0x03;                        \
        }                                       \
      else if (((Char) & 0xfe) == 0xfc)         \
        {                                       \
          (Len) = 6;                            \
          (Mask) = 0x01;                        \
        }                                       \
      else                                      \
        (Len) = -1;                             \
    }                                           \
  while (0)
186
/* Number of bytes (1 to 6) needed to encode the code point CHAR in
   UTF-8.  CHAR is evaluated more than once. */
#define UTF8_LENGTH(Char)       \
  ((Char) < 0x80 ? 1            \
   : (Char) < 0x800 ? 2         \
   : (Char) < 0x10000 ? 3       \
   : (Char) < 0x200000 ? 4      \
   : (Char) < 0x4000000 ? 5     \
   : 6)
193
/* Decode the LEN-byte UTF-8 sequence at CHARS into RESULT.  MASK (as
   produced by UTF8_COMPUTE) strips the length marker from the lead
   byte; each following byte contributes its low six bits.  COUNT is a
   scratch index variable.  If a following byte is not of the form
   10xxxxxx, RESULT is set to -1 and decoding stops.

   The body is wrapped in do/while (0) so that the macro followed by a
   semicolon is a single statement; the `break' below still leaves only
   the inner for loop. */
#define UTF8_GET(Result, Chars, Count, Mask, Len)               \
  do                                                            \
    {                                                           \
      (Result) = (Chars)[0] & (Mask);                           \
      for ((Count) = 1; (Count) < (Len); ++(Count))             \
        {                                                       \
          if (((Chars)[(Count)] & 0xc0) != 0x80)                \
            {                                                   \
              (Result) = -1;                                    \
              break;                                            \
            }                                                   \
          (Result) <<= 6;                                       \
          (Result) |= ((Chars)[(Count)] & 0x3f);                \
        }                                                       \
    }                                                           \
  while (0)
206
207 static const gchar utf8_skip_data[256] = {
208 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
209 1, 1, 1, 1, 1, 1, 1,
210 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
211 1, 1, 1, 1, 1, 1, 1,
212 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
213 1, 1, 1, 1, 1, 1, 1,
214 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
215 1, 1, 1, 1, 1, 1, 1,
216 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
217 1, 1, 1, 1, 1, 1, 1,
218 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
219 1, 1, 1, 1, 1, 1, 1,
220 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
221 2, 2, 2, 2, 2, 2, 2,
222 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
223 5, 5, 5, 6, 6, 1, 1
224 };
225
226 static const gchar *const g_utf8_skip = utf8_skip_data;
227
228 /*
229 * g_utf8_strlen:
230 * @p: pointer to the start of a UTF-8 encoded string
231 * @max: the maximum number of bytes to examine. If @max
232 * is less than 0, then the string is assumed to be
233 * nul-terminated. If @max is 0, @p will not be examined and
234 * may be %NULL.
235 *
236 * Computes the length of the string in characters, not including
237 * the terminating nul character.
238 *
239 * Return value: the length of the string in characters
240 **/
241 static glong
g_utf8_strlen(const gchar * p)242 g_utf8_strlen (const gchar * p)
243 {
244 glong len = 0;
245
246 g_return_val_if_fail (p != NULL, 0);
247
248 while (*p)
249 {
250 p = g_utf8_next_char (p);
251 ++len;
252 }
253
254 return len;
255 }
256
257 /*
258 * g_utf8_get_char:
259 * @p: a pointer to Unicode character encoded as UTF-8
260 *
261 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
262 * If @p does not point to a valid UTF-8 encoded character, results are
263 * undefined. If you are not sure that the bytes are complete
264 * valid Unicode characters, you should use g_utf8_get_char_validated()
265 * instead.
266 *
267 * Return value: the resulting character
268 **/
269 static gunichar
g_utf8_get_char(const gchar * p)270 g_utf8_get_char (const gchar * p)
271 {
272 int i, mask = 0, len;
273 gunichar result;
274 unsigned char c = (unsigned char) *p;
275
276 UTF8_COMPUTE (c, mask, len);
277 if (len == -1)
278 return (gunichar) - 1;
279 UTF8_GET (result, p, i, mask, len);
280
281 return result;
282 }
283
284 /*
285 * g_unichar_to_utf8:
286 * @c: a Unicode character code
287 * @outbuf: output buffer, must have at least 6 bytes of space.
288 * If %NULL, the length will be computed and returned
289 * and nothing will be written to @outbuf.
290 *
291 * Converts a single character to UTF-8.
292 *
293 * Return value: number of bytes written
294 **/
295 static int
g_unichar_to_utf8(gunichar c,gchar * outbuf)296 g_unichar_to_utf8 (gunichar c, gchar * outbuf)
297 {
298 /* If this gets modified, also update the copy in g_string_insert_unichar() */
299 guint len = 0;
300 int first;
301 int i;
302
303 if (c < 0x80)
304 {
305 first = 0;
306 len = 1;
307 }
308 else if (c < 0x800)
309 {
310 first = 0xc0;
311 len = 2;
312 }
313 else if (c < 0x10000)
314 {
315 first = 0xe0;
316 len = 3;
317 }
318 else if (c < 0x200000)
319 {
320 first = 0xf0;
321 len = 4;
322 }
323 else if (c < 0x4000000)
324 {
325 first = 0xf8;
326 len = 5;
327 }
328 else
329 {
330 first = 0xfc;
331 len = 6;
332 }
333
334 if (outbuf)
335 {
336 for (i = len - 1; i > 0; --i)
337 {
338 outbuf[i] = (c & 0x3f) | 0x80;
339 c >>= 6;
340 }
341 outbuf[0] = c | first;
342 }
343
344 return len;
345 }
346
347 /*
348 * g_utf8_to_ucs4_fast:
349 * @str: a UTF-8 encoded string
350 * @len: the maximum length of @str to use, in bytes. If @len < 0,
351 * then the string is nul-terminated.
352 * @items_written: location to store the number of characters in the
353 * result, or %NULL.
354 *
355 * Convert a string from UTF-8 to a 32-bit fixed width
356 * representation as UCS-4, assuming valid UTF-8 input.
357 * This function is roughly twice as fast as g_utf8_to_ucs4()
358 * but does no error checking on the input. A trailing 0 character
359 * will be added to the string after the converted text.
360 *
361 * Return value: a pointer to a newly allocated UCS-4 string.
362 * This value must be freed with g_free().
363 **/
364 static gunichar *
g_utf8_to_ucs4_fast(const gchar * str,glong len,glong * items_written)365 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
366 {
367 gunichar *result;
368 gsize n_chars, i;
369 const gchar *p;
370
371 g_return_val_if_fail (str != NULL, NULL);
372
373 p = str;
374 n_chars = 0;
375 if (len < 0)
376 {
377 while (*p)
378 {
379 p = g_utf8_next_char (p);
380 ++n_chars;
381 }
382 }
383 else
384 {
385 while (p < str + len && *p)
386 {
387 p = g_utf8_next_char (p);
388 ++n_chars;
389 }
390 }
391
392 result = g_malloc (sizeof (gunichar) * (n_chars + 1));
393 if (!result)
394 return NULL;
395
396 p = str;
397 for (i = 0; i < n_chars; i++)
398 {
399 gunichar wc = (guchar) * p++;
400
401 if (wc < 0x80)
402 {
403 result[i] = wc;
404 }
405 else
406 {
407 gunichar mask = 0x40;
408
409 if (G_UNLIKELY ((wc & mask) == 0))
410 {
411 /* It's an out-of-sequence 10xxxxxxx byte.
412 * Rather than making an ugly hash of this and the next byte
413 * and overrunning the buffer, it's more useful to treat it
414 * with a replacement character */
415 result[i] = 0xfffd;
416 continue;
417 }
418
419 do
420 {
421 wc <<= 6;
422 wc |= (guchar) (*p++) & 0x3f;
423 mask <<= 5;
424 }
425 while ((wc & mask) != 0);
426
427 wc &= mask - 1;
428
429 result[i] = wc;
430 }
431 }
432 result[i] = 0;
433
434 if (items_written)
435 *items_written = i;
436
437 return result;
438 }
439
440 /*
441 * g_ucs4_to_utf8:
442 * @str: a UCS-4 encoded string
443 * @len: the maximum length (number of characters) of @str to use.
444 * If @len < 0, then the string is nul-terminated.
445 * @items_read: location to store number of characters read, or %NULL.
446 * @items_written: location to store number of bytes written or %NULL.
447 * The value here stored does not include the trailing 0
448 * byte.
449 * @error: location to store the error occurring, or %NULL to ignore
450 * errors. Any of the errors in #GConvertError other than
451 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
452 *
453 * Convert a string from a 32-bit fixed width representation as UCS-4.
454 * to UTF-8. The result will be terminated with a 0 byte.
455 *
456 * Return value: a pointer to a newly allocated UTF-8 string.
457 * This value must be freed with g_free(). If an
458 * error occurs, %NULL will be returned and
459 * @error set. In that case, @items_read will be
460 * set to the position of the first invalid input
461 * character.
462 **/
463 static gchar *
g_ucs4_to_utf8(const gunichar * str,glong len,glong * items_read,glong * items_written)464 g_ucs4_to_utf8 (const gunichar * str,
465 glong len,
466 glong * items_read, glong * items_written)
467 {
468 gint result_length;
469 gchar *result = NULL;
470 gchar *p;
471 gint i;
472
473 result_length = 0;
474 for (i = 0; len < 0 || i < len; i++)
475 {
476 if (!str[i])
477 break;
478
479 if (str[i] >= 0x80000000)
480 goto err_out;
481
482 result_length += UTF8_LENGTH (str[i]);
483 }
484
485 result = g_malloc (result_length + 1);
486 if (!result)
487 return NULL;
488 p = result;
489
490 i = 0;
491 while (p < result + result_length)
492 p += g_unichar_to_utf8 (str[i++], p);
493
494 *p = '\0';
495
496 if (items_written)
497 *items_written = p - result;
498
499 err_out:
500 if (items_read)
501 *items_read = i;
502
503 return result;
504 }
505
506 /* Code from GLIB gunidecomp.c starts here. */
507
508 /* decomp.c - Character decomposition.
509 *
510 * Copyright (C) 1999, 2000 Tom Tromey
511 * Copyright 2000 Red Hat, Inc.
512 *
513 * The Gnome Library is free software; you can redistribute it and/or
514 * modify it under the terms of the GNU Lesser General Public License as
515 * published by the Free Software Foundation; either version 2 of the
516 * License, or (at your option) any later version.
517 *
518 * The Gnome Library is distributed in the hope that it will be useful,
519 * but WITHOUT ANY WARRANTY; without even the implied warranty of
520 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
521 * Lesser General Public License for more details.
522 *
523 * You should have received a copy of the GNU Lesser General Public
524 * License along with the Gnome Library; see the file COPYING.LIB. If not,
525 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
526 * Boston, MA 02111-1307, USA.
527 */
528
529 #include "gunidecomp.h"
530 #include "gunicomp.h"
531
/* Look up the combining class of character CHAR of page PAGE through
   the page table TABLE.  A page entry of at least
   G_UNICODE_MAX_TABLE_INDEX encodes one class shared by the whole page
   (the entry minus G_UNICODE_MAX_TABLE_INDEX); any other entry selects
   a row of cclass_data holding one class per character. */
#define CC_LOOKUP(Table, Page, Char) \
  (((Table)[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[(Table)[Page]][Char]))

#define CC_PART1(Page, Char) \
  CC_LOOKUP (combining_class_table_part1, Page, Char)

#define CC_PART2(Page, Char) \
  CC_LOOKUP (combining_class_table_part2, Page, Char)

/* Combining class of the character CHAR.  Characters up to
   G_UNICODE_LAST_CHAR_PART1 are looked up in part 1, characters from
   0xe0000 up to G_UNICODE_LAST_CHAR in part 2; everything else has
   class 0.  CHAR is evaluated more than once. */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))
548
/* Constants for Hangul syllable [de]composition: the first code point
   of the syllable block and of each jamo range used below, and the
   number of leading (L), vowel (V) and trailing (T) positions. */
enum
{
  SBase = 0xAC00,
  LBase = 0x1100,
  VBase = 0x1161,
  TBase = 0x11A7,

  LCount = 19,
  VCount = 21,
  TCount = 28,

  /* Syllables per leading consonant, and syllables in total. */
  NCount = VCount * TCount,
  SCount = LCount * NCount
};
559
560 /*
561 * g_unicode_canonical_ordering:
562 * @string: a UCS-4 encoded string.
563 * @len: the maximum length of @string to use.
564 *
565 * Computes the canonical ordering of a string in-place.
566 * This rearranges decomposed characters in the string
567 * according to their combining classes. See the Unicode
568 * manual for more information.
569 **/
570 static void
g_unicode_canonical_ordering(gunichar * string,gsize len)571 g_unicode_canonical_ordering (gunichar * string, gsize len)
572 {
573 gsize i;
574 int swap = 1;
575
576 while (swap)
577 {
578 int last;
579 swap = 0;
580 last = COMBINING_CLASS (string[0]);
581 for (i = 0; i < len - 1; ++i)
582 {
583 int next = COMBINING_CLASS (string[i + 1]);
584 if (next != 0 && last > next)
585 {
586 gsize j;
587 /* Percolate item leftward through string. */
588 for (j = i + 1; j > 0; --j)
589 {
590 gunichar t;
591 if (COMBINING_CLASS (string[j - 1]) <= next)
592 break;
593 t = string[j];
594 string[j] = string[j - 1];
595 string[j - 1] = t;
596 swap = 1;
597 }
598 /* We're re-entering the loop looking at the old
599 character again. */
600 next = last;
601 }
602 last = next;
603 }
604 }
605 }
606
607 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
608 * r should be null or have sufficient space. Calling with r == NULL will
609 * only calculate the result_len; however, a buffer with space for three
610 * characters will always be big enough. */
611 static void
decompose_hangul(gunichar s,gunichar * r,gsize * result_len)612 decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
613 {
614 gint SIndex = s - SBase;
615 gint TIndex = SIndex % TCount;
616
617 if (r)
618 {
619 r[0] = LBase + SIndex / NCount;
620 r[1] = VBase + (SIndex % NCount) / TCount;
621 }
622
623 if (TIndex)
624 {
625 if (r)
626 r[2] = TBase + TIndex;
627 *result_len = 3;
628 }
629 else
630 *result_len = 2;
631 }
632
633 /* returns a pointer to a null-terminated UTF-8 string */
634 static const gchar *
find_decomposition(gunichar ch,gboolean compat)635 find_decomposition (gunichar ch, gboolean compat)
636 {
637 int start = 0;
638 int end = G_N_ELEMENTS (decomp_table);
639
640 if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
641 {
642 while (TRUE)
643 {
644 int half = (start + end) / 2;
645 if (ch == decomp_table[half].ch)
646 {
647 int offset;
648
649 if (compat)
650 {
651 offset = decomp_table[half].compat_offset;
652 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
653 offset = decomp_table[half].canon_offset;
654 }
655 else
656 {
657 offset = decomp_table[half].canon_offset;
658 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
659 return NULL;
660 }
661
662 return &(decomp_expansion_string[offset]);
663 }
664 else if (half == start)
665 break;
666 else if (ch > decomp_table[half].ch)
667 start = half;
668 else
669 end = half;
670 }
671 }
672
673 return NULL;
674 }
675
676 /* L,V => LV and LV,T => LVT */
677 static gboolean
combine_hangul(gunichar a,gunichar b,gunichar * result)678 combine_hangul (gunichar a, gunichar b, gunichar * result)
679 {
680 if (a >= LBase && a < LCount + LBase && b >= VBase && b < VCount + VBase)
681 {
682 gint LIndex = a - LBase;
683 gint VIndex = b - VBase;
684
685 *result = SBase + (LIndex * VCount + VIndex) * TCount;
686 return TRUE;
687 }
688
689 if (a >= SBase && a < SCount + SBase && b > TBase && b < TCount + TBase)
690 {
691 gint SIndex = a - SBase;
692
693 if ((SIndex % TCount) == 0)
694 {
695 gint TIndex = b - TBase;
696
697 *result = a + TIndex;
698 return TRUE;
699 }
700 }
701
702 return FALSE;
703 }
704
/* Composition index of character CHAR of page PAGE.  A compose_table
   entry of at least G_UNICODE_MAX_TABLE_INDEX encodes one index shared
   by the whole page (the entry minus G_UNICODE_MAX_TABLE_INDEX); any
   other entry selects a row of compose_data holding one index per
   character. */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of the character CHAR, or 0 when its page lies
   beyond COMPOSE_TABLE_LAST.  Every use of CHAR is parenthesized so
   that an argument built with a low-precedence operator (such as `|')
   is handled correctly.  CHAR is evaluated more than once. */
#define COMPOSE_INDEX(Char) \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) \
   ? 0 : CI ((Char) >> 8, (Char) & 0xff))
712
713 static gboolean
combine(gunichar a,gunichar b,gunichar * result)714 combine (gunichar a, gunichar b, gunichar * result)
715 {
716 gushort index_a, index_b;
717
718 if (combine_hangul (a, b, result))
719 return TRUE;
720
721 index_a = COMPOSE_INDEX (a);
722
723 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
724 {
725 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
726 {
727 *result =
728 compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
729 return TRUE;
730 }
731 else
732 return FALSE;
733 }
734
735 index_b = COMPOSE_INDEX (b);
736
737 if (index_b >= COMPOSE_SECOND_SINGLE_START)
738 {
739 if (a ==
740 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
741 {
742 *result =
743 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
744 return TRUE;
745 }
746 else
747 return FALSE;
748 }
749
750 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
751 && index_b >= COMPOSE_SECOND_START
752 && index_b < COMPOSE_SECOND_SINGLE_START)
753 {
754 gunichar res =
755 compose_array[index_a - COMPOSE_FIRST_START][index_b -
756 COMPOSE_SECOND_START];
757
758 if (res)
759 {
760 *result = res;
761 return TRUE;
762 }
763 }
764
765 return FALSE;
766 }
767
768 static gunichar *
_g_utf8_normalize_wc(const gchar * str,gssize max_len,GNormalizeMode mode)769 _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
770 {
771 gsize n_wc;
772 gunichar *wc_buffer;
773 const char *p;
774 gsize last_start;
775 gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
776 gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
777
778 n_wc = 0;
779 p = str;
780 while ((max_len < 0 || p < str + max_len) && *p)
781 {
782 const gchar *decomp;
783 gunichar wc = g_utf8_get_char (p);
784
785 if (wc >= SBase && wc < SBase + SCount)
786 {
787 gsize result_len;
788 decompose_hangul (wc, NULL, &result_len);
789 n_wc += result_len;
790 }
791 else
792 {
793 decomp = find_decomposition (wc, do_compat);
794
795 if (decomp)
796 n_wc += g_utf8_strlen (decomp);
797 else
798 n_wc++;
799 }
800
801 p = g_utf8_next_char (p);
802 }
803
804 wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));
805 if (!wc_buffer)
806 return NULL;
807
808 last_start = 0;
809 n_wc = 0;
810 p = str;
811 while ((max_len < 0 || p < str + max_len) && *p)
812 {
813 gunichar wc = g_utf8_get_char (p);
814 const gchar *decomp;
815 int cc;
816 gsize old_n_wc = n_wc;
817
818 if (wc >= SBase && wc < SBase + SCount)
819 {
820 gsize result_len;
821 decompose_hangul (wc, wc_buffer + n_wc, &result_len);
822 n_wc += result_len;
823 }
824 else
825 {
826 decomp = find_decomposition (wc, do_compat);
827
828 if (decomp)
829 {
830 const char *pd;
831 for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
832 wc_buffer[n_wc++] = g_utf8_get_char (pd);
833 }
834 else
835 wc_buffer[n_wc++] = wc;
836 }
837
838 if (n_wc > 0)
839 {
840 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
841
842 if (cc == 0)
843 {
844 g_unicode_canonical_ordering (wc_buffer + last_start,
845 n_wc - last_start);
846 last_start = old_n_wc;
847 }
848 }
849
850 p = g_utf8_next_char (p);
851 }
852
853 if (n_wc > 0)
854 {
855 g_unicode_canonical_ordering (wc_buffer + last_start,
856 n_wc - last_start);
857 /* dead assignment: last_start = n_wc; */
858 }
859
860 wc_buffer[n_wc] = 0;
861
862 /* All decomposed and reordered */
863
864 if (do_compose && n_wc > 0)
865 {
866 gsize i, j;
867 int last_cc = 0;
868 last_start = 0;
869
870 for (i = 0; i < n_wc; i++)
871 {
872 int cc = COMBINING_CLASS (wc_buffer[i]);
873
874 if (i > 0 &&
875 (last_cc == 0 || last_cc != cc) &&
876 combine (wc_buffer[last_start], wc_buffer[i],
877 &wc_buffer[last_start]))
878 {
879 for (j = i + 1; j < n_wc; j++)
880 wc_buffer[j - 1] = wc_buffer[j];
881 n_wc--;
882 i--;
883
884 if (i == last_start)
885 last_cc = 0;
886 else
887 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
888
889 continue;
890 }
891
892 if (cc == 0)
893 last_start = i;
894
895 last_cc = cc;
896 }
897 }
898
899 wc_buffer[n_wc] = 0;
900
901 return wc_buffer;
902 }
903
904 /*
905 * g_utf8_normalize:
906 * @str: a UTF-8 encoded string.
907 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
908 * @mode: the type of normalization to perform.
909 *
910 * Converts a string into canonical form, standardizing
911 * such issues as whether a character with an accent
912 * is represented as a base character and combining
913 * accent or as a single precomposed character. The
914 * string has to be valid UTF-8, otherwise %NULL is
915 * returned. You should generally call g_utf8_normalize()
916 * before comparing two Unicode strings.
917 *
918 * The normalization mode %G_NORMALIZE_DEFAULT only
919 * standardizes differences that do not affect the
920 * text content, such as the above-mentioned accent
921 * representation. %G_NORMALIZE_ALL also standardizes
922 * the "compatibility" characters in Unicode, such
923 * as SUPERSCRIPT THREE to the standard forms
924 * (in this case DIGIT THREE). Formatting information
925 * may be lost but for most text operations such
926 * characters should be considered the same.
927 *
928 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
929 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
930 * but returned a result with composed forms rather
931 * than a maximally decomposed form. This is often
932 * useful if you intend to convert the string to
933 * a legacy encoding or pass it to a system with
934 * less capable Unicode handling.
935 *
936 * Return value: a newly allocated string, that is the
937 * normalized form of @str, or %NULL if @str is not
938 * valid UTF-8.
939 **/
940 static gchar *
g_utf8_normalize(const gchar * str,gssize len,GNormalizeMode mode)941 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
942 {
943 gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
944 gchar *result = NULL;
945
946 if (result_wc)
947 result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
948
949 g_free (result_wc);
950
951 return result;
952 }
953
954 /* Public Libidn API starts here. */
955
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Decodes the UTF-8 byte sequence starting at @p into a single
 * Unicode character.  If @p does not point to a valid UTF-8 encoded
 * character, results are undefined.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  const uint32_t ch = g_utf8_get_char (p);

  return ch;
}
971
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Encodes the single character @c as UTF-8.
 *
 * Return value: number of bytes written.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  const int n_bytes = g_unichar_to_utf8 (c, outbuf);

  return n_bytes;
}
988
989 #include <unistr.h>
990
991 /**
992 * stringprep_utf8_to_ucs4:
993 * @str: a UTF-8 encoded string
994 * @len: the maximum length of @str to use. If @len < 0, then
995 * the string is nul-terminated.
996 * @items_written: location to store the number of characters in the
997 * result, or %NULL.
998 *
999 * Convert a string from UTF-8 to a 32-bit fixed width representation
1000 * as UCS-4. The function now performs error checking to verify that
1001 * the input is valid UTF-8 (before it was documented to not do error
1002 * checking).
1003 *
1004 * Return value: a pointer to a newly allocated UCS-4 string.
1005 * This value must be deallocated by the caller.
1006 **/
1007 uint32_t *
stringprep_utf8_to_ucs4(const char * str,ssize_t len,size_t * items_written)1008 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
1009 {
1010 size_t n;
1011
1012 if (len < 0)
1013 n = strlen (str);
1014 else
1015 n = len;
1016
1017 if (u8_check ((const uint8_t *) str, n))
1018 return NULL;
1019
1020 return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
1021 }
1022
1023 /**
1024 * stringprep_ucs4_to_utf8:
1025 * @str: a UCS-4 encoded string
1026 * @len: the maximum length of @str to use. If @len < 0, then
1027 * the string is terminated with a 0 character.
1028 * @items_read: location to store number of characters read read, or %NULL.
1029 * @items_written: location to store number of bytes written or %NULL.
1030 * The value here stored does not include the trailing 0
1031 * byte.
1032 *
1033 * Convert a string from a 32-bit fixed width representation as UCS-4.
1034 * to UTF-8. The result will be terminated with a 0 byte.
1035 *
1036 * Return value: a pointer to a newly allocated UTF-8 string.
1037 * This value must be deallocated by the caller.
1038 * If an error occurs, %NULL will be returned.
1039 **/
1040 char *
stringprep_ucs4_to_utf8(const uint32_t * str,ssize_t len,size_t * items_read,size_t * items_written)1041 stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1042 size_t * items_read, size_t * items_written)
1043 {
1044 return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1045 (glong *) items_written);
1046 }
1047
1048 /**
1049 * stringprep_utf8_nfkc_normalize:
1050 * @str: a UTF-8 encoded string.
1051 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1052 *
1053 * Converts a string into canonical form, standardizing
1054 * such issues as whether a character with an accent
1055 * is represented as a base character and combining
1056 * accent or as a single precomposed character.
1057 *
1058 * The normalization mode is NFKC (ALL COMPOSE). It standardizes
1059 * differences that do not affect the text content, such as the
1060 * above-mentioned accent representation. It standardizes the
1061 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1062 * the standard forms (in this case DIGIT THREE). Formatting
1063 * information may be lost but for most text operations such
1064 * characters should be considered the same. It returns a result with
1065 * composed forms rather than a maximally decomposed form.
1066 *
1067 * Return value: a newly allocated string, that is the
1068 * NFKC normalized form of @str.
1069 **/
1070 char *
stringprep_utf8_nfkc_normalize(const char * str,ssize_t len)1071 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1072 {
1073 size_t n;
1074
1075 if (len < 0)
1076 n = strlen (str);
1077 else
1078 n = len;
1079
1080 if (u8_check ((const uint8_t *) str, n))
1081 return NULL;
1082
1083 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1084 }
1085
1086 #include <stdio.h>
1087 /**
1088 * stringprep_ucs4_nfkc_normalize:
1089 * @str: a Unicode string.
1090 * @len: length of @str array, or -1 if @str is nul-terminated.
1091 *
1092 * Converts a UCS4 string into canonical form, see
1093 * stringprep_utf8_nfkc_normalize() for more information.
1094 *
1095 * Return value: a newly allocated Unicode string, that is the NFKC
1096 * normalized form of @str.
1097 **/
1098 uint32_t *
stringprep_ucs4_nfkc_normalize(const uint32_t * str,ssize_t len)1099 stringprep_ucs4_nfkc_normalize (const uint32_t * str, ssize_t len)
1100 {
1101 char *p;
1102 uint32_t *result_wc;
1103
1104 p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1105 if (!p)
1106 return NULL;
1107
1108 result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1109 free (p);
1110
1111 return result_wc;
1112 }
1113