1 /* nfkc.c	Unicode normalization utilities.
2  * Copyright (C) 2002, 2003, 2004, 2005  Simon Josefsson
3  *
4  * This file is part of GNU Libidn.
5  *
6  * GNU Libidn is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * GNU Libidn is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with GNU Libidn; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  */
21 
22 #include <stdlib.h>
23 #include <string.h>
24 
25 #include "stringprep.h"
26 
/* This file contains functions from GLIB, including gutf8.c and
 * gunidecomp.c, all licensed under LGPL and copyright held by:
 *
 *  Copyright (C) 1999, 2000 Tom Tromey
 *  Copyright 2000 Red Hat, Inc.
 */
33 
/* Hacks to make syncing with GLIB code easier: the GLIB type and helper
 * names used by the imported code are mapped onto plain C equivalents. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define glong long
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free
/* Error reporting is not supported here: GError is opaque and
 * g_set_error() discards all of its arguments. */
#define GError void
#define g_set_error(a,b,c,d) ((void) 0)
/* Allocate an uninitialized array of n_structs elements of struct_type. */
#define g_new(struct_type, n_structs)					\
  ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))
/* G_STMT_START / G_STMT_END bracket a braced block so that a macro
 * expands to a single statement. */
#  if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
#    define G_STMT_START	(void)(
#    define G_STMT_END		)
#  else
#    if (defined (sun) || defined (__sun__))
#      define G_STMT_START	if (1)
#      define G_STMT_END	else (void)0
#    else
#      define G_STMT_START	do
#      define G_STMT_END	while (0)
#    endif
#  endif
/* Return val from the enclosing function when the precondition expr
 * does not hold.  (This used to expand to a no-op, so the documented
 * preconditions were never enforced.) */
#define g_return_val_if_fail(expr,val)		\
  G_STMT_START{ if (!(expr)) return (val); }G_STMT_END
/* Number of elements of a true array (not of a pointer). */
#define G_N_ELEMENTS(arr)		(sizeof (arr) / sizeof ((arr)[0]))
#define TRUE 1
#define FALSE 0
69 
70 /* Code from GLIB gunicode.h starts here. */
71 
/* Normalization forms.  Each form has a GLIB-style name and a Unicode
 * alias with the same value.  The "K" forms also apply compatibility
 * decompositions; the "C" forms recompose after decomposing. */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_NFD = 0,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_NFC = 1,
  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_NFKD = 2,
  G_NORMALIZE_ALL_COMPOSE = 3,
  G_NORMALIZE_NFKC = 3
} GNormalizeMode;
84 
85 /* Code from GLIB gutf8.c starts here. */
86 
/* Classify the lead byte Char of a UTF-8 sequence: store the sequence
 * length (1..6) in Len and the mask selecting the payload bits of the
 * lead byte in Mask.  Len is set to -1 (and Mask left untouched) when
 * Char cannot start a sequence.  Expands to a single statement. */
#define UTF8_COMPUTE(Char, Mask, Len)		\
  do						\
    {						\
      if ((Char) < 128)				\
	{					\
	  (Len) = 1;				\
	  (Mask) = 0x7f;			\
	}					\
      else if (((Char) & 0xe0) == 0xc0)		\
	{					\
	  (Len) = 2;				\
	  (Mask) = 0x1f;			\
	}					\
      else if (((Char) & 0xf0) == 0xe0)		\
	{					\
	  (Len) = 3;				\
	  (Mask) = 0x0f;			\
	}					\
      else if (((Char) & 0xf8) == 0xf0)		\
	{					\
	  (Len) = 4;				\
	  (Mask) = 0x07;			\
	}					\
      else if (((Char) & 0xfc) == 0xf8)		\
	{					\
	  (Len) = 5;				\
	  (Mask) = 0x03;			\
	}					\
      else if (((Char) & 0xfe) == 0xfc)		\
	{					\
	  (Len) = 6;				\
	  (Mask) = 0x01;			\
	}					\
      else					\
	(Len) = -1;				\
    }						\
  while (0)

/* Number of bytes needed to encode code point Char in UTF-8 (1..6). */
#define UTF8_LENGTH(Char)			\
  ((Char) < 0x80 ? 1 :				\
   ((Char) < 0x800 ? 2 :			\
    ((Char) < 0x10000 ? 3 :			\
     ((Char) < 0x200000 ? 4 :			\
      ((Char) < 0x4000000 ? 5 : 6)))))


/* Decode the Len-byte sequence at Chars into Result, using Count as the
 * loop counter and Mask (from UTF8_COMPUTE) for the lead byte.  Result
 * becomes -1 as soon as a byte that is not a continuation byte
 * (10xxxxxx) is met.  Expands to a single statement. */
#define UTF8_GET(Result, Chars, Count, Mask, Len)	\
  do							\
    {							\
      (Result) = (Chars)[0] & (Mask);			\
      for ((Count) = 1; (Count) < (Len); ++(Count))	\
	{						\
	  if (((Chars)[(Count)] & 0xc0) != 0x80)	\
	    {						\
	      (Result) = -1;				\
	      break;					\
	    }						\
	  (Result) <<= 6;				\
	  (Result) |= ((Chars)[(Count)] & 0x3f);	\
	}						\
    }							\
  while (0)

/* True when Char is below 0x110000, is not a surrogate
 * (0xD800..0xDFFF), is outside 0xFDD0..0xFDEF and does not end in
 * FFFE/FFFF. */
#define UNICODE_VALID(Char)			\
  ((Char) < 0x110000 &&				\
   (((Char) & 0xFFFFF800) != 0xD800) &&		\
   ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&	\
   ((Char) & 0xFFFE) != 0xFFFE)
147 
148 
149 static const gchar utf8_skip_data[256] = {
150   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
151   1, 1, 1, 1, 1, 1, 1,
152   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
153   1, 1, 1, 1, 1, 1, 1,
154   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
155   1, 1, 1, 1, 1, 1, 1,
156   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
157   1, 1, 1, 1, 1, 1, 1,
158   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
159   1, 1, 1, 1, 1, 1, 1,
160   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
161   1, 1, 1, 1, 1, 1, 1,
162   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
163   2, 2, 2, 2, 2, 2, 2,
164   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
165   5, 5, 5, 6, 6, 1, 1
166 };
167 
168 static const gchar *const g_utf8_skip = utf8_skip_data;
169 
170 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
171 
172 /*
173  * g_utf8_strlen:
174  * @p: pointer to the start of a UTF-8 encoded string.
175  * @max: the maximum number of bytes to examine. If @max
176  *       is less than 0, then the string is assumed to be
177  *       nul-terminated. If @max is 0, @p will not be examined and
178  *       may be %NULL.
179  *
180  * Returns the length of the string in characters.
181  *
182  * Return value: the length of the string in characters
183  **/
184 static glong
g_utf8_strlen(const gchar * p,gssize max)185 g_utf8_strlen (const gchar * p, gssize max)
186 {
187   glong len = 0;
188   const gchar *start = p;
189   g_return_val_if_fail (p != NULL || max == 0, 0);
190 
191   if (max < 0)
192     {
193       while (*p)
194 	{
195 	  p = g_utf8_next_char (p);
196 	  ++len;
197 	}
198     }
199   else
200     {
201       if (max == 0 || !*p)
202 	return 0;
203 
204       p = g_utf8_next_char (p);
205 
206       while (p - start < max && *p)
207 	{
208 	  ++len;
209 	  p = g_utf8_next_char (p);
210 	}
211 
212       /* only do the last len increment if we got a complete
213        * char (don't count partial chars)
214        */
215       if (p - start == max)
216 	++len;
217     }
218 
219   return len;
220 }
221 
222 /*
223  * g_utf8_get_char:
224  * @p: a pointer to Unicode character encoded as UTF-8
225  *
226  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
227  * If @p does not point to a valid UTF-8 encoded character, results are
228  * undefined. If you are not sure that the bytes are complete
229  * valid Unicode characters, you should use g_utf8_get_char_validated()
230  * instead.
231  *
232  * Return value: the resulting character
233  **/
234 static gunichar
g_utf8_get_char(const gchar * p)235 g_utf8_get_char (const gchar * p)
236 {
237   int i, mask = 0, len;
238   gunichar result;
239   unsigned char c = (unsigned char) *p;
240 
241   UTF8_COMPUTE (c, mask, len);
242   if (len == -1)
243     return (gunichar) - 1;
244   UTF8_GET (result, p, i, mask, len);
245 
246   return result;
247 }
248 
249 /*
250  * g_unichar_to_utf8:
251  * @c: a ISO10646 character code
252  * @outbuf: output buffer, must have at least 6 bytes of space.
253  *       If %NULL, the length will be computed and returned
254  *       and nothing will be written to @outbuf.
255  *
256  * Converts a single character to UTF-8.
257  *
258  * Return value: number of bytes written
259  **/
260 static int
g_unichar_to_utf8(gunichar c,gchar * outbuf)261 g_unichar_to_utf8 (gunichar c, gchar * outbuf)
262 {
263   guint len = 0;
264   int first;
265   int i;
266 
267   if (c < 0x80)
268     {
269       first = 0;
270       len = 1;
271     }
272   else if (c < 0x800)
273     {
274       first = 0xc0;
275       len = 2;
276     }
277   else if (c < 0x10000)
278     {
279       first = 0xe0;
280       len = 3;
281     }
282   else if (c < 0x200000)
283     {
284       first = 0xf0;
285       len = 4;
286     }
287   else if (c < 0x4000000)
288     {
289       first = 0xf8;
290       len = 5;
291     }
292   else
293     {
294       first = 0xfc;
295       len = 6;
296     }
297 
298   if (outbuf)
299     {
300       for (i = len - 1; i > 0; --i)
301 	{
302 	  outbuf[i] = (c & 0x3f) | 0x80;
303 	  c >>= 6;
304 	}
305       outbuf[0] = c | first;
306     }
307 
308   return len;
309 }
310 
311 /*
312  * g_utf8_to_ucs4_fast:
313  * @str: a UTF-8 encoded string
314  * @len: the maximum length of @str to use. If @len < 0, then
315  *       the string is nul-terminated.
316  * @items_written: location to store the number of characters in the
317  *                 result, or %NULL.
318  *
319  * Convert a string from UTF-8 to a 32-bit fixed width
320  * representation as UCS-4, assuming valid UTF-8 input.
321  * This function is roughly twice as fast as g_utf8_to_ucs4()
322  * but does no error checking on the input.
323  *
324  * Return value: a pointer to a newly allocated UCS-4 string.
325  *               This value must be freed with g_free().
326  **/
327 static gunichar *
g_utf8_to_ucs4_fast(const gchar * str,glong len,glong * items_written)328 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
329 {
330   gint j, charlen;
331   gunichar *result;
332   gint n_chars, i;
333   const gchar *p;
334 
335   g_return_val_if_fail (str != NULL, NULL);
336 
337   p = str;
338   n_chars = 0;
339   if (len < 0)
340     {
341       while (*p)
342 	{
343 	  p = g_utf8_next_char (p);
344 	  ++n_chars;
345 	}
346     }
347   else
348     {
349       while (p < str + len && *p)
350 	{
351 	  p = g_utf8_next_char (p);
352 	  ++n_chars;
353 	}
354     }
355 
356   result = g_new (gunichar, n_chars + 1);
357   if (!result)
358     return NULL;
359 
360   p = str;
361   for (i = 0; i < n_chars; i++)
362     {
363       gunichar wc = ((unsigned char *) p)[0];
364 
365       if (wc < 0x80)
366 	{
367 	  result[i] = wc;
368 	  p++;
369 	}
370       else
371 	{
372 	  if (wc < 0xe0)
373 	    {
374 	      charlen = 2;
375 	      wc &= 0x1f;
376 	    }
377 	  else if (wc < 0xf0)
378 	    {
379 	      charlen = 3;
380 	      wc &= 0x0f;
381 	    }
382 	  else if (wc < 0xf8)
383 	    {
384 	      charlen = 4;
385 	      wc &= 0x07;
386 	    }
387 	  else if (wc < 0xfc)
388 	    {
389 	      charlen = 5;
390 	      wc &= 0x03;
391 	    }
392 	  else
393 	    {
394 	      charlen = 6;
395 	      wc &= 0x01;
396 	    }
397 
398 	  for (j = 1; j < charlen; j++)
399 	    {
400 	      wc <<= 6;
401 	      wc |= ((unsigned char *) p)[j] & 0x3f;
402 	    }
403 
404 	  result[i] = wc;
405 	  p += charlen;
406 	}
407     }
408   result[i] = 0;
409 
410   if (items_written)
411     *items_written = i;
412 
413   return result;
414 }
415 
416 /*
417  * g_ucs4_to_utf8:
418  * @str: a UCS-4 encoded string
419  * @len: the maximum length of @str to use. If @len < 0, then
420  *       the string is terminated with a 0 character.
421  * @items_read: location to store number of characters read read, or %NULL.
422  * @items_written: location to store number of bytes written or %NULL.
423  *                 The value here stored does not include the trailing 0
424  *                 byte.
425  * @error: location to store the error occuring, or %NULL to ignore
426  *         errors. Any of the errors in #GConvertError other than
427  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
428  *
429  * Convert a string from a 32-bit fixed width representation as UCS-4.
430  * to UTF-8. The result will be terminated with a 0 byte.
431  *
432  * Return value: a pointer to a newly allocated UTF-8 string.
433  *               This value must be freed with g_free(). If an
434  *               error occurs, %NULL will be returned and
435  *               @error set.
436  **/
437 static gchar *
g_ucs4_to_utf8(const gunichar * str,glong len,glong * items_read,glong * items_written,GError ** error)438 g_ucs4_to_utf8 (const gunichar * str,
439 		glong len,
440 		glong * items_read, glong * items_written, GError ** error)
441 {
442   gint result_length;
443   gchar *result = NULL;
444   gchar *p;
445   gint i;
446 
447   result_length = 0;
448   for (i = 0; len < 0 || i < len; i++)
449     {
450       if (!str[i])
451 	break;
452 
453       if (str[i] >= 0x80000000)
454 	{
455 	  if (items_read)
456 	    *items_read = i;
457 
458 	  g_set_error (error, G_CONVERT_ERROR,
459 		       G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
460 		       _("Character out of range for UTF-8"));
461 	  goto err_out;
462 	}
463 
464       result_length += UTF8_LENGTH (str[i]);
465     }
466 
467   result = g_malloc (result_length + 1);
468   if (!result)
469     return NULL;
470   p = result;
471 
472   i = 0;
473   while (p < result + result_length)
474     p += g_unichar_to_utf8 (str[i++], p);
475 
476   *p = '\0';
477 
478   if (items_written)
479     *items_written = p - result;
480 
481 err_out:
482   if (items_read)
483     *items_read = i;
484 
485   return result;
486 }
487 
488 /* Code from GLIB gunidecomp.c starts here. */
489 
490 #include "gunidecomp.h"
491 #include "gunicomp.h"
492 
/* Combining class of the character at offset Char within 256-character
 * page Page of the first table.  A page entry at or above
 * G_UNICODE_MAX_TABLE_INDEX encodes one class shared by the whole page
 * (the entry minus G_UNICODE_MAX_TABLE_INDEX); any other entry selects
 * a row of cclass_data that is indexed by Char. */
#define CC_PART1(Page, Char) \
  ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part1[Page]][Char]))

/* Same lookup as CC_PART1, against the second table. */
#define CC_PART2(Page, Char) \
  ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part2[Page]][Char]))

/* Combining class of code point Char.  Code points up to
 * G_UNICODE_LAST_CHAR_PART1 use the first table, those from 0xe0000 to
 * G_UNICODE_LAST_CHAR use the second (paged relative to 0xe0000), and
 * everything else has class 0.  Char is evaluated more than once, so
 * it must not have side effects. */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))
509 
/* constants for hangul syllable [de]composition:
 * SBase, LBase, VBase and TBase are the base code points used for
 * precomposed syllables and for the L, V and T index arithmetic;
 * LCount, VCount and TCount are the sizes of the L, V and T index
 * ranges.  T index 0 (code point TBase itself) stands for "no T".
 * NCount is the number of syllables per L; SCount is the total number
 * of precomposed syllables. */
#define SBase 0xAC00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7
#define LCount 19
#define VCount 21
#define TCount 28
#define NCount (VCount * TCount)
#define SCount (LCount * NCount)
520 
521 /*
522  * g_unicode_canonical_ordering:
523  * @string: a UCS-4 encoded string.
524  * @len: the maximum length of @string to use.
525  *
526  * Computes the canonical ordering of a string in-place.
527  * This rearranges decomposed characters in the string
528  * according to their combining classes.  See the Unicode
529  * manual for more information.
530  **/
531 static void
g_unicode_canonical_ordering(gunichar * string,gsize len)532 g_unicode_canonical_ordering (gunichar * string, gsize len)
533 {
534   gsize i;
535   int swap = 1;
536 
537   while (swap)
538     {
539       int last;
540       swap = 0;
541       last = COMBINING_CLASS (string[0]);
542       for (i = 0; i < len - 1; ++i)
543 	{
544 	  int next = COMBINING_CLASS (string[i + 1]);
545 	  if (next != 0 && last > next)
546 	    {
547 	      gsize j;
548 	      /* Percolate item leftward through string.  */
549 	      for (j = i + 1; j > 0; --j)
550 		{
551 		  gunichar t;
552 		  if (COMBINING_CLASS (string[j - 1]) <= next)
553 		    break;
554 		  t = string[j];
555 		  string[j] = string[j - 1];
556 		  string[j - 1] = t;
557 		  swap = 1;
558 		}
559 	      /* We're re-entering the loop looking at the old
560 	         character again.  */
561 	      next = last;
562 	    }
563 	  last = next;
564 	}
565     }
566 }
567 
568 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
569  * r should be null or have sufficient space. Calling with r == NULL will
570  * only calculate the result_len; however, a buffer with space for three
571  * characters will always be big enough. */
572 static void
decompose_hangul(gunichar s,gunichar * r,gsize * result_len)573 decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
574 {
575   gint SIndex = s - SBase;
576 
577   /* not a hangul syllable */
578   if (SIndex < 0 || SIndex >= SCount)
579     {
580       if (r)
581 	r[0] = s;
582       *result_len = 1;
583     }
584   else
585     {
586       gunichar L = LBase + SIndex / NCount;
587       gunichar V = VBase + (SIndex % NCount) / TCount;
588       gunichar T = TBase + SIndex % TCount;
589 
590       if (r)
591 	{
592 	  r[0] = L;
593 	  r[1] = V;
594 	}
595 
596       if (T != TBase)
597 	{
598 	  if (r)
599 	    r[2] = T;
600 	  *result_len = 3;
601 	}
602       else
603 	*result_len = 2;
604     }
605 }
606 
607 /* returns a pointer to a null-terminated UTF-8 string */
608 static const gchar *
find_decomposition(gunichar ch,gboolean compat)609 find_decomposition (gunichar ch, gboolean compat)
610 {
611   int start = 0;
612   int end = G_N_ELEMENTS (decomp_table);
613 
614   if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
615     {
616       while (TRUE)
617 	{
618 	  int half = (start + end) / 2;
619 	  if (ch == decomp_table[half].ch)
620 	    {
621 	      int offset;
622 
623 	      if (compat)
624 		{
625 		  offset = decomp_table[half].compat_offset;
626 		  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
627 		    offset = decomp_table[half].canon_offset;
628 		}
629 	      else
630 		{
631 		  offset = decomp_table[half].canon_offset;
632 		  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
633 		    return NULL;
634 		}
635 
636 	      return &(decomp_expansion_string[offset]);
637 	    }
638 	  else if (half == start)
639 	    break;
640 	  else if (ch > decomp_table[half].ch)
641 	    start = half;
642 	  else
643 	    end = half;
644 	}
645     }
646 
647   return NULL;
648 }
649 
650 /* L,V => LV and LV,T => LVT  */
651 static gboolean
combine_hangul(gunichar a,gunichar b,gunichar * result)652 combine_hangul (gunichar a, gunichar b, gunichar * result)
653 {
654   gint LIndex = a - LBase;
655   gint SIndex = a - SBase;
656 
657   gint VIndex = b - VBase;
658   gint TIndex = b - TBase;
659 
660   if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
661     {
662       *result = SBase + (LIndex * VCount + VIndex) * TCount;
663       return TRUE;
664     }
665   else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
666 	   && 0 <= TIndex && TIndex <= TCount)
667     {
668       *result = a + TIndex;
669       return TRUE;
670     }
671 
672   return FALSE;
673 }
674 
/* Composition index of the character at offset Char within
 * 256-character page Page.  A page entry at or above
 * G_UNICODE_MAX_TABLE_INDEX encodes one index shared by the whole page
 * (the entry minus G_UNICODE_MAX_TABLE_INDEX); any other entry selects
 * a row of compose_data that is indexed by Char. */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of code point Char; 0 for every code point whose
 * page lies beyond COMPOSE_TABLE_LAST.  Char is evaluated more than
 * once, so it must not have side effects. */
#define COMPOSE_INDEX(Char) \
     ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
682 
683 static gboolean
combine(gunichar a,gunichar b,gunichar * result)684 combine (gunichar a, gunichar b, gunichar * result)
685 {
686   gushort index_a, index_b;
687 
688   if (combine_hangul (a, b, result))
689     return TRUE;
690 
691   index_a = COMPOSE_INDEX (a);
692 
693   if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
694     {
695       if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
696 	{
697 	  *result =
698 	    compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
699 	  return TRUE;
700 	}
701       else
702 	return FALSE;
703     }
704 
705   index_b = COMPOSE_INDEX (b);
706 
707   if (index_b >= COMPOSE_SECOND_SINGLE_START)
708     {
709       if (a ==
710 	  compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
711 	{
712 	  *result =
713 	    compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
714 	  return TRUE;
715 	}
716       else
717 	return FALSE;
718     }
719 
720   if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
721       && index_b >= COMPOSE_SECOND_START
722       && index_b < COMPOSE_SECOND_SINGLE_START)
723     {
724       gunichar res =
725 	compose_array[index_a - COMPOSE_FIRST_START][index_b -
726 						     COMPOSE_SECOND_START];
727 
728       if (res)
729 	{
730 	  *result = res;
731 	  return TRUE;
732 	}
733     }
734 
735   return FALSE;
736 }
737 
738 static gunichar *
_g_utf8_normalize_wc(const gchar * str,gssize max_len,GNormalizeMode mode)739 _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
740 {
741   gsize n_wc;
742   gunichar *wc_buffer;
743   const char *p;
744   gsize last_start;
745   gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
746   gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
747 
748   if (!str)
749     return NULL;
750 
751   n_wc = 0;
752   p = str;
753   while ((max_len < 0 || p < str + max_len) && *p)
754     {
755       const gchar *decomp;
756       gunichar wc = g_utf8_get_char (p);
757 
758       if (wc >= 0xac00 && wc <= 0xd7af)
759 	{
760 	  gsize result_len;
761 	  decompose_hangul (wc, NULL, &result_len);
762 	  n_wc += result_len;
763 	}
764       else
765 	{
766 	  decomp = find_decomposition (wc, do_compat);
767 
768 	  if (decomp)
769 	    n_wc += g_utf8_strlen (decomp, -1);
770 	  else
771 	    n_wc++;
772 	}
773 
774       p = g_utf8_next_char (p);
775     }
776 
777   wc_buffer = g_new (gunichar, n_wc + 1);
778   if (!wc_buffer)
779     return NULL;
780 
781   last_start = 0;
782   n_wc = 0;
783   p = str;
784   while ((max_len < 0 || p < str + max_len) && *p)
785     {
786       gunichar wc = g_utf8_get_char (p);
787       const gchar *decomp;
788       int cc;
789       gsize old_n_wc = n_wc;
790 
791       if (wc >= 0xac00 && wc <= 0xd7af)
792 	{
793 	  gsize result_len;
794 	  decompose_hangul (wc, wc_buffer + n_wc, &result_len);
795 	  n_wc += result_len;
796 	}
797       else
798 	{
799 	  decomp = find_decomposition (wc, do_compat);
800 
801 	  if (decomp)
802 	    {
803 	      const char *pd;
804 	      for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
805 		wc_buffer[n_wc++] = g_utf8_get_char (pd);
806 	    }
807 	  else
808 	    wc_buffer[n_wc++] = wc;
809 	}
810 
811       if (n_wc > 0)
812 	{
813 	  cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
814 
815 	  if (cc == 0)
816 	    {
817 	      g_unicode_canonical_ordering (wc_buffer + last_start,
818 					    n_wc - last_start);
819 	      last_start = old_n_wc;
820 	    }
821 	}
822 
823       p = g_utf8_next_char (p);
824     }
825 
826   if (n_wc > 0)
827     {
828       g_unicode_canonical_ordering (wc_buffer + last_start,
829 				    n_wc - last_start);
830       last_start = n_wc;
831     }
832 
833   wc_buffer[n_wc] = 0;
834 
835   /* All decomposed and reordered */
836 
837   if (do_compose && n_wc > 0)
838     {
839       gsize i, j;
840       int last_cc = 0;
841       last_start = 0;
842 
843       for (i = 0; i < n_wc; i++)
844 	{
845 	  int cc = COMBINING_CLASS (wc_buffer[i]);
846 
847 	  if (i > 0 &&
848 	      (last_cc == 0 || last_cc != cc) &&
849 	      combine (wc_buffer[last_start], wc_buffer[i],
850 		       &wc_buffer[last_start]))
851 	    {
852 	      for (j = i + 1; j < n_wc; j++)
853 		wc_buffer[j - 1] = wc_buffer[j];
854 	      n_wc--;
855 	      i--;
856 
857 	      if (i == last_start)
858 		last_cc = 0;
859 	      else
860 		last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
861 
862 	      continue;
863 	    }
864 
865 	  if (cc == 0)
866 	    last_start = i;
867 
868 	  last_cc = cc;
869 	}
870     }
871 
872   wc_buffer[n_wc] = 0;
873 
874   return wc_buffer;
875 }
876 
877 /*
878  * g_utf8_normalize:
879  * @str: a UTF-8 encoded string.
880  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
881  * @mode: the type of normalization to perform.
882  *
883  * Converts a string into canonical form, standardizing
884  * such issues as whether a character with an accent
885  * is represented as a base character and combining
886  * accent or as a single precomposed character. You
887  * should generally call g_utf8_normalize() before
888  * comparing two Unicode strings.
889  *
890  * The normalization mode %G_NORMALIZE_DEFAULT only
891  * standardizes differences that do not affect the
892  * text content, such as the above-mentioned accent
893  * representation. %G_NORMALIZE_ALL also standardizes
894  * the "compatibility" characters in Unicode, such
895  * as SUPERSCRIPT THREE to the standard forms
896  * (in this case DIGIT THREE). Formatting information
897  * may be lost but for most text operations such
898  * characters should be considered the same.
899  * For example, g_utf8_collate() normalizes
900  * with %G_NORMALIZE_ALL as its first step.
901  *
902  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
903  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
904  * but returned a result with composed forms rather
905  * than a maximally decomposed form. This is often
906  * useful if you intend to convert the string to
907  * a legacy encoding or pass it to a system with
908  * less capable Unicode handling.
909  *
910  * Return value: a newly allocated string, that is the
911  *   normalized form of @str.
912  **/
913 static gchar *
g_utf8_normalize(const gchar * str,gssize len,GNormalizeMode mode)914 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
915 {
916   gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
917   gchar *result;
918 
919   result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
920   g_free (result_wc);
921 
922   return result;
923 }
924 
925 /* Public Libidn API starts here. */
926 
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Decodes the UTF-8 byte sequence at @p into a Unicode character.
 * @p must point to a valid UTF-8 encoded character; otherwise the
 * result is undefined.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  uint32_t ch = g_utf8_get_char (p);

  return ch;
}
942 
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Encodes a single character as UTF-8.
 *
 * Return value: number of bytes written.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  int n_bytes = g_unichar_to_utf8 (c, outbuf);

  return n_bytes;
}
959 
960 /**
961  * stringprep_utf8_to_ucs4:
962  * @str: a UTF-8 encoded string
963  * @len: the maximum length of @str to use. If @len < 0, then
964  *       the string is nul-terminated.
965  * @items_written: location to store the number of characters in the
966  *                 result, or %NULL.
967  *
968  * Convert a string from UTF-8 to a 32-bit fixed width
969  * representation as UCS-4, assuming valid UTF-8 input.
970  * This function does no error checking on the input.
971  *
972  * Return value: a pointer to a newly allocated UCS-4 string.
973  *               This value must be freed with free().
974  **/
975 uint32_t *
stringprep_utf8_to_ucs4(const char * str,ssize_t len,size_t * items_written)976 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
977 {
978   return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
979 }
980 
981 /**
982  * stringprep_ucs4_to_utf8:
983  * @str: a UCS-4 encoded string
984  * @len: the maximum length of @str to use. If @len < 0, then
985  *       the string is terminated with a 0 character.
986  * @items_read: location to store number of characters read read, or %NULL.
987  * @items_written: location to store number of bytes written or %NULL.
988  *                 The value here stored does not include the trailing 0
989  *                 byte.
990  *
991  * Convert a string from a 32-bit fixed width representation as UCS-4.
992  * to UTF-8. The result will be terminated with a 0 byte.
993  *
994  * Return value: a pointer to a newly allocated UTF-8 string.
995  *               This value must be freed with free(). If an
996  *               error occurs, %NULL will be returned and
997  *               @error set.
998  **/
999 char *
stringprep_ucs4_to_utf8(const uint32_t * str,ssize_t len,size_t * items_read,size_t * items_written)1000 stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1001 			 size_t * items_read, size_t * items_written)
1002 {
1003   return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1004 			 (glong *) items_written, NULL);
1005 }
1006 
1007 /**
1008  * stringprep_utf8_nfkc_normalize:
1009  * @str: a UTF-8 encoded string.
1010  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1011  *
1012  * Converts a string into canonical form, standardizing
1013  * such issues as whether a character with an accent
1014  * is represented as a base character and combining
1015  * accent or as a single precomposed character.
1016  *
1017  * The normalization mode is NFKC (ALL COMPOSE).  It standardizes
1018  * differences that do not affect the text content, such as the
1019  * above-mentioned accent representation. It standardizes the
1020  * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1021  * the standard forms (in this case DIGIT THREE). Formatting
1022  * information may be lost but for most text operations such
1023  * characters should be considered the same. It returns a result with
1024  * composed forms rather than a maximally decomposed form.
1025  *
1026  * Return value: a newly allocated string, that is the
1027  *   NFKC normalized form of @str.
1028  **/
1029 char *
stringprep_utf8_nfkc_normalize(const char * str,ssize_t len)1030 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1031 {
1032   return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1033 }
1034 
1035 /**
1036  * stringprep_ucs4_nfkc_normalize:
1037  * @str: a Unicode string.
1038  * @len: length of @str array, or -1 if @str is nul-terminated.
1039  *
1040  * Converts UCS4 string into UTF-8 and runs
1041  * stringprep_utf8_nfkc_normalize().
1042  *
1043  * Return value: a newly allocated Unicode string, that is the NFKC
1044  *   normalized form of @str.
1045  **/
1046 uint32_t *
stringprep_ucs4_nfkc_normalize(uint32_t * str,ssize_t len)1047 stringprep_ucs4_nfkc_normalize (uint32_t * str, ssize_t len)
1048 {
1049   char *p;
1050   uint32_t *result_wc;
1051 
1052   p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1053   result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1054   free (p);
1055 
1056   return result_wc;
1057 }
1058