1 /* nfkc.c	Unicode normalization utilities.
2  * Copyright (C) 2002, 2003  Simon Josefsson
3  *
4  * This file is part of GNU Libidn.
5  *
6  * GNU Libidn is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * GNU Libidn is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with GNU Libidn; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  */
21 
22 #include "internal.h"
23 
/* This file contains functions from GLIB, including gutf8.c and
 * gunidecomp.c, all licensed under the LGPL and copyright held by:
 *
 *  Copyright (C) 1999, 2000 Tom Tromey
 *  Copyright 2000 Red Hat, Inc.
 */
30 
/* Compatibility shims that map the GLIB names used below onto plain C,
   so the borrowed GLIB code can be kept textually close to upstream. */

/* Basic types. */
#define gchar char
#define guchar unsigned char
#define gboolean int
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define glong long
#define gint16 my_int16_t
#define guint16 my_uint16_t
#define gunichar my_uint32_t
#define gsize size_t
#define gssize ssize_t

/* Boolean constants. */
#define FALSE 0
#define TRUE 1

/* Memory management goes straight to the C library. */
#define g_free free
#define g_malloc malloc
#define g_new(struct_type, n_structs)					\
  ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))

/* Error reporting is compiled out: GError is an opaque void and
   g_set_error() expands to 0 without evaluating its arguments. */
#define GError void
#define g_set_error(a,b,c,d) 0

/* Wrappers that turn a braced group into a single statement. */
#if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
#  define G_STMT_START	(void)(
#  define G_STMT_END		)
#elif defined (sun) || defined (__sun__)
#  define G_STMT_START	if (1)
#  define G_STMT_END	else (void)0
#else
#  define G_STMT_START	do
#  define G_STMT_END	while (0)
#endif

/* Precondition checks are disabled: this expands to an empty statement
   and neither EXPR nor VAL is evaluated. */
#define g_return_val_if_fail(expr,val)		G_STMT_START{ (void)0; }G_STMT_END

/* Element count of a true array (not of a pointer). */
#define G_N_ELEMENTS(arr)		(sizeof (arr) / sizeof ((arr)[0]))
66 
67 /* Code from GLIB gunicode.h starts here. */
68 
/* Unicode normalization forms.  Each form has a GLIB-style name and a
   Unicode-style alias sharing the same value. */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,		/* canonical decomposition */
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,	/* canonical decomposition + composition */
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL = 2,			/* compatibility decomposition */
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE = 3,		/* compatibility decomposition + composition */
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
}
GNormalizeMode;
81 
82 /* Code from GLIB gutf8.c starts here. */
83 
/* UTF8_COMPUTE: classify Char, the lead byte of a UTF-8 sequence.
 * Sets Len to the sequence length in bytes (1..6) and Mask to the bit
 * mask that selects the payload bits of the lead byte.  When Char cannot
 * start a sequence, Len is set to -1 and Mask is left untouched.
 *
 * The expansion is a single statement and every argument is
 * parenthesised, so the macro is safe under an unbraced if/else and with
 * expression arguments.  Char is evaluated several times; do not pass an
 * expression with side effects. */
#define UTF8_COMPUTE(Char, Mask, Len)		\
  do						\
    {						\
      if ((Char) < 128)				\
        {					\
          (Len) = 1;				\
          (Mask) = 0x7f;			\
        }					\
      else if (((Char) & 0xe0) == 0xc0)		\
        {					\
          (Len) = 2;				\
          (Mask) = 0x1f;			\
        }					\
      else if (((Char) & 0xf0) == 0xe0)		\
        {					\
          (Len) = 3;				\
          (Mask) = 0x0f;			\
        }					\
      else if (((Char) & 0xf8) == 0xf0)		\
        {					\
          (Len) = 4;				\
          (Mask) = 0x07;			\
        }					\
      else if (((Char) & 0xfc) == 0xf8)		\
        {					\
          (Len) = 5;				\
          (Mask) = 0x03;			\
        }					\
      else if (((Char) & 0xfe) == 0xfc)		\
        {					\
          (Len) = 6;				\
          (Mask) = 0x01;			\
        }					\
      else					\
        (Len) = -1;				\
    }						\
  while (0)
117 
/* UTF8_LENGTH: number of bytes (1..6) needed to encode code point Char
 * in UTF-8.  Char is evaluated several times. */
#define UTF8_LENGTH(Char)			\
  ((Char) >= 0x4000000 ? 6 :			\
   (Char) >= 0x200000 ? 5 :			\
   (Char) >= 0x10000 ? 4 :			\
   (Char) >= 0x800 ? 3 :			\
   (Char) >= 0x80 ? 2 : 1)
124 
125 
/* UTF8_GET: decode one UTF-8 sequence of Len bytes starting at Chars.
 * Mask selects the payload bits of the lead byte (see UTF8_COMPUTE).
 * Count is a scratch loop variable.  On success Result holds the code
 * point; if any continuation byte is not of the form 10xxxxxx, Result
 * is set to -1 and decoding stops.
 *
 * Wrapped in do/while so that the whole expansion is one statement. */
#define UTF8_GET(Result, Chars, Count, Mask, Len)	\
  do							\
    {							\
      (Result) = (Chars)[0] & (Mask);			\
      for ((Count) = 1; (Count) < (Len); ++(Count))	\
        {						\
          if (((Chars)[(Count)] & 0xc0) != 0x80)	\
            {						\
              (Result) = -1;				\
              break;					\
            }						\
          (Result) <<= 6;				\
          (Result) |= ((Chars)[(Count)] & 0x3f);	\
        }						\
    }							\
  while (0)
138 
/* UNICODE_VALID: non-zero unless Char is at or above 0x110000, lies in
 * the surrogate block 0xD800..0xDFFF, lies in 0xFDD0..0xFDEF, or has
 * its low 16 bits equal to 0xFFFE or 0xFFFF. */
#define UNICODE_VALID(Char)				\
  (!((Char) >= 0x110000				\
     || (((Char) & 0xFFFFF800) == 0xD800)		\
     || ((Char) >= 0xFDD0 && (Char) <= 0xFDEF)	\
     || ((Char) & 0xFFFE) == 0xFFFE))
144 
145 
146 static const gchar utf8_skip_data[256] = {
147   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
148   1, 1, 1, 1, 1, 1, 1,
149   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
150   1, 1, 1, 1, 1, 1, 1,
151   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
152   1, 1, 1, 1, 1, 1, 1,
153   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
154   1, 1, 1, 1, 1, 1, 1,
155   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
156   1, 1, 1, 1, 1, 1, 1,
157   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
158   1, 1, 1, 1, 1, 1, 1,
159   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
160   2, 2, 2, 2, 2, 2, 2,
161   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
162   5, 5, 5, 6, 6, 1, 1
163 };
164 
165 const gchar *const g_utf8_skip = utf8_skip_data;
166 
167 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
168 
169 /**
170  * g_utf8_strlen:
171  * @p: pointer to the start of a UTF-8 encoded string.
172  * @max: the maximum number of bytes to examine. If @max
173  *       is less than 0, then the string is assumed to be
174  *       nul-terminated. If @max is 0, @p will not be examined and
175  *       may be %NULL.
176  *
177  * Returns the length of the string in characters.
178  *
179  * Return value: the length of the string in characters
180  **/
181 static glong
g_utf8_strlen(const gchar * p,gssize max)182 g_utf8_strlen (const gchar * p, gssize max)
183 {
184   glong len = 0;
185   const gchar *start = p;
186   g_return_val_if_fail (p != NULL || max == 0, 0);
187 
188   if (max < 0)
189     {
190       while (*p)
191 	{
192 	  p = g_utf8_next_char (p);
193 	  ++len;
194 	}
195     }
196   else
197     {
198       if (max == 0 || !*p)
199 	return 0;
200 
201       p = g_utf8_next_char (p);
202 
203       while (p - start < max && *p)
204 	{
205 	  ++len;
206 	  p = g_utf8_next_char (p);
207 	}
208 
209       /* only do the last len increment if we got a complete
210        * char (don't count partial chars)
211        */
212       if (p - start == max)
213 	++len;
214     }
215 
216   return len;
217 }
218 
219 /**
220  * g_utf8_get_char:
221  * @p: a pointer to Unicode character encoded as UTF-8
222  *
223  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
224  * If @p does not point to a valid UTF-8 encoded character, results are
225  * undefined. If you are not sure that the bytes are complete
226  * valid Unicode characters, you should use g_utf8_get_char_validated()
227  * instead.
228  *
229  * Return value: the resulting character
230  **/
231 static gunichar
g_utf8_get_char(const gchar * p)232 g_utf8_get_char (const gchar * p)
233 {
234   int i, mask = 0, len;
235   gunichar result;
236   unsigned char c = (unsigned char) *p;
237 
238   UTF8_COMPUTE (c, mask, len);
239   if (len == -1)
240     return (gunichar) - 1;
241   UTF8_GET (result, p, i, mask, len);
242 
243   return result;
244 }
245 
246 /**
247  * g_unichar_to_utf8:
248  * @c: a ISO10646 character code
249  * @outbuf: output buffer, must have at least 6 bytes of space.
250  *       If %NULL, the length will be computed and returned
251  *       and nothing will be written to @outbuf.
252  *
253  * Converts a single character to UTF-8.
254  *
255  * Return value: number of bytes written
256  **/
257 static int
g_unichar_to_utf8(gunichar c,gchar * outbuf)258 g_unichar_to_utf8 (gunichar c, gchar * outbuf)
259 {
260   guint len = 0;
261   int first;
262   int i;
263 
264   if (c < 0x80)
265     {
266       first = 0;
267       len = 1;
268     }
269   else if (c < 0x800)
270     {
271       first = 0xc0;
272       len = 2;
273     }
274   else if (c < 0x10000)
275     {
276       first = 0xe0;
277       len = 3;
278     }
279   else if (c < 0x200000)
280     {
281       first = 0xf0;
282       len = 4;
283     }
284   else if (c < 0x4000000)
285     {
286       first = 0xf8;
287       len = 5;
288     }
289   else
290     {
291       first = 0xfc;
292       len = 6;
293     }
294 
295   if (outbuf)
296     {
297       for (i = len - 1; i > 0; --i)
298 	{
299 	  outbuf[i] = (c & 0x3f) | 0x80;
300 	  c >>= 6;
301 	}
302       outbuf[0] = c | first;
303     }
304 
305   return len;
306 }
307 
308 /**
309  * g_utf8_to_ucs4_fast:
310  * @str: a UTF-8 encoded string
311  * @len: the maximum length of @str to use. If @len < 0, then
312  *       the string is nul-terminated.
313  * @items_written: location to store the number of characters in the
314  *                 result, or %NULL.
315  *
316  * Convert a string from UTF-8 to a 32-bit fixed width
317  * representation as UCS-4, assuming valid UTF-8 input.
318  * This function is roughly twice as fast as g_utf8_to_ucs4()
319  * but does no error checking on the input.
320  *
321  * Return value: a pointer to a newly allocated UCS-4 string.
322  *               This value must be freed with g_free().
323  **/
324 static gunichar *
g_utf8_to_ucs4_fast(const gchar * str,glong len,glong * items_written)325 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
326 {
327   gint j, charlen;
328   gunichar *result;
329   gint n_chars, i;
330   const gchar *p;
331 
332   g_return_val_if_fail (str != NULL, NULL);
333 
334   p = str;
335   n_chars = 0;
336   if (len < 0)
337     {
338       while (*p)
339 	{
340 	  p = g_utf8_next_char (p);
341 	  ++n_chars;
342 	}
343     }
344   else
345     {
346       while (p < str + len && *p)
347 	{
348 	  p = g_utf8_next_char (p);
349 	  ++n_chars;
350 	}
351     }
352 
353   result = g_new (gunichar, n_chars + 1);
354 
355   p = str;
356   for (i = 0; i < n_chars; i++)
357     {
358       gunichar wc = ((unsigned char *) p)[0];
359 
360       if (wc < 0x80)
361 	{
362 	  result[i] = wc;
363 	  p++;
364 	}
365       else
366 	{
367 	  if (wc < 0xe0)
368 	    {
369 	      charlen = 2;
370 	      wc &= 0x1f;
371 	    }
372 	  else if (wc < 0xf0)
373 	    {
374 	      charlen = 3;
375 	      wc &= 0x0f;
376 	    }
377 	  else if (wc < 0xf8)
378 	    {
379 	      charlen = 4;
380 	      wc &= 0x07;
381 	    }
382 	  else if (wc < 0xfc)
383 	    {
384 	      charlen = 5;
385 	      wc &= 0x03;
386 	    }
387 	  else
388 	    {
389 	      charlen = 6;
390 	      wc &= 0x01;
391 	    }
392 
393 	  for (j = 1; j < charlen; j++)
394 	    {
395 	      wc <<= 6;
396 	      wc |= ((unsigned char *) p)[j] & 0x3f;
397 	    }
398 
399 	  result[i] = wc;
400 	  p += charlen;
401 	}
402     }
403   result[i] = 0;
404 
405   if (items_written)
406     *items_written = i;
407 
408   return result;
409 }
410 
411 /**
412  * g_ucs4_to_utf8:
413  * @str: a UCS-4 encoded string
414  * @len: the maximum length of @str to use. If @len < 0, then
415  *       the string is terminated with a 0 character.
416  * @items_read: location to store number of characters read read, or %NULL.
417  * @items_written: location to store number of bytes written or %NULL.
418  *                 The value here stored does not include the trailing 0
419  *                 byte.
420  * @error: location to store the error occuring, or %NULL to ignore
421  *         errors. Any of the errors in #GConvertError other than
422  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
423  *
424  * Convert a string from a 32-bit fixed width representation as UCS-4.
425  * to UTF-8. The result will be terminated with a 0 byte.
426  *
427  * Return value: a pointer to a newly allocated UTF-8 string.
428  *               This value must be freed with g_free(). If an
429  *               error occurs, %NULL will be returned and
430  *               @error set.
431  **/
432 static gchar *
g_ucs4_to_utf8(const gunichar * str,glong len,glong * items_read,glong * items_written,GError ** error)433 g_ucs4_to_utf8 (const gunichar * str,
434 		glong len,
435 		glong * items_read, glong * items_written, GError ** error)
436 {
437   gint result_length;
438   gchar *result = NULL;
439   gchar *p;
440   gint i;
441 
442   result_length = 0;
443   for (i = 0; len < 0 || i < len; i++)
444     {
445       if (!str[i])
446 	break;
447 
448       if (str[i] >= 0x80000000)
449 	{
450 	  if (items_read)
451 	    *items_read = i;
452 
453 	  /*g_set_error (error, G_CONVERT_ERROR,
454 		       G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
455 		       _("Character out of range for UTF-8"));*/
456 	  goto err_out;
457 	}
458 
459       result_length += UTF8_LENGTH (str[i]);
460     }
461 
462   result = g_malloc (result_length + 1);
463   p = result;
464 
465   i = 0;
466   while (p < result + result_length)
467     p += g_unichar_to_utf8 (str[i++], p);
468 
469   *p = '\0';
470 
471   if (items_written)
472     *items_written = p - result;
473 
474 err_out:
475   if (items_read)
476     *items_read = i;
477 
478   return result;
479 }
480 
481 /* Code from GLIB gunidecomp.c starts here. */
482 
483 #include "gunidecomp.h"
484 #include "gunicomp.h"
485 
/* Combining-class lookup.  A page entry at or above
 * G_UNICODE_MAX_TABLE_INDEX encodes one class shared by the whole page
 * (entry minus G_UNICODE_MAX_TABLE_INDEX); any other entry selects a
 * row of cclass_data that is indexed by the low byte of the character.
 * The tables are presumably supplied by gunidecomp.h -- not visible
 * here. */
#define CC_PART1(page_no, low_byte) \
  ((combining_class_table_part1[page_no] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part1[page_no] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part1[page_no]][low_byte]))

#define CC_PART2(page_no, low_byte) \
  ((combining_class_table_part2[page_no] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part2[page_no] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part2[page_no]][low_byte]))

/* Combining class of code point ch: part 1 covers everything up to
 * G_UNICODE_LAST_CHAR_PART1, part 2 covers 0xe0000..G_UNICODE_LAST_CHAR,
 * and every other code point has class 0. */
#define COMBINING_CLASS(ch) \
  (((ch) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((ch) >> 8, (ch) & 0xff) \
   : (((ch) >= 0xe0000 && (ch) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((ch) - 0xe0000) >> 8, (ch) & 0xff) \
      : 0))
502 
/* Constants for Hangul syllable [de]composition. */
enum
{
  SBase = 0xAC00,		/* first precomposed syllable */
  LBase = 0x1100,		/* first leading consonant */
  VBase = 0x1161,		/* first vowel */
  TBase = 0x11A7,		/* one below the first trailing consonant */
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = VCount * TCount,	/* syllables per leading consonant */
  SCount = LCount * NCount	/* total number of syllables */
};
513 
514 /**
515  * g_unicode_canonical_ordering:
516  * @string: a UCS-4 encoded string.
517  * @len: the maximum length of @string to use.
518  *
519  * Computes the canonical ordering of a string in-place.
520  * This rearranges decomposed characters in the string
521  * according to their combining classes.  See the Unicode
522  * manual for more information.
523  **/
524 static void
g_unicode_canonical_ordering(gunichar * string,gsize len)525 g_unicode_canonical_ordering (gunichar * string, gsize len)
526 {
527   gsize i;
528   int swap = 1;
529 
530   while (swap)
531     {
532       int last;
533       swap = 0;
534       last = COMBINING_CLASS (string[0]);
535       for (i = 0; i < len - 1; ++i)
536 	{
537 	  int next = COMBINING_CLASS (string[i + 1]);
538 	  if (next != 0 && last > next)
539 	    {
540 	      gsize j;
541 	      /* Percolate item leftward through string.  */
542 	      for (j = i + 1; j > 0; --j)
543 		{
544 		  gunichar t;
545 		  if (COMBINING_CLASS (string[j - 1]) <= next)
546 		    break;
547 		  t = string[j];
548 		  string[j] = string[j - 1];
549 		  string[j - 1] = t;
550 		  swap = 1;
551 		}
552 	      /* We're re-entering the loop looking at the old
553 	         character again.  */
554 	      next = last;
555 	    }
556 	  last = next;
557 	}
558     }
559 }
560 
561 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
562  * r should be null or have sufficient space. Calling with r == NULL will
563  * only calculate the result_len; however, a buffer with space for three
564  * characters will always be big enough. */
565 static void
decompose_hangul(gunichar s,gunichar * r,gsize * result_len)566 decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
567 {
568   gint SIndex = s - SBase;
569 
570   /* not a hangul syllable */
571   if (SIndex < 0 || SIndex >= SCount)
572     {
573       if (r)
574 	r[0] = s;
575       *result_len = 1;
576     }
577   else
578     {
579       gunichar L = LBase + SIndex / NCount;
580       gunichar V = VBase + (SIndex % NCount) / TCount;
581       gunichar T = TBase + SIndex % TCount;
582 
583       if (r)
584 	{
585 	  r[0] = L;
586 	  r[1] = V;
587 	}
588 
589       if (T != TBase)
590 	{
591 	  if (r)
592 	    r[2] = T;
593 	  *result_len = 3;
594 	}
595       else
596 	*result_len = 2;
597     }
598 }
599 
600 /* returns a pointer to a null-terminated UTF-8 string */
601 static const gchar *
find_decomposition(gunichar ch,gboolean compat)602 find_decomposition (gunichar ch, gboolean compat)
603 {
604   int start = 0;
605   int end = G_N_ELEMENTS (decomp_table);
606 
607   if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
608     {
609       while (TRUE)
610 	{
611 	  int half = (start + end) / 2;
612 	  if (ch == decomp_table[half].ch)
613 	    {
614 	      int offset;
615 
616 	      if (compat)
617 		{
618 		  offset = decomp_table[half].compat_offset;
619 		  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
620 		    offset = decomp_table[half].canon_offset;
621 		}
622 	      else
623 		{
624 		  offset = decomp_table[half].canon_offset;
625 		  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
626 		    return NULL;
627 		}
628 
629 	      return &(decomp_expansion_string[offset]);
630 	    }
631 	  else if (half == start)
632 	    break;
633 	  else if (ch > decomp_table[half].ch)
634 	    start = half;
635 	  else
636 	    end = half;
637 	}
638     }
639 
640   return NULL;
641 }
642 
643 /* L,V => LV and LV,T => LVT  */
644 static gboolean
combine_hangul(gunichar a,gunichar b,gunichar * result)645 combine_hangul (gunichar a, gunichar b, gunichar * result)
646 {
647   gint LIndex = a - LBase;
648   gint SIndex = a - SBase;
649 
650   gint VIndex = b - VBase;
651   gint TIndex = b - TBase;
652 
653   if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
654     {
655       *result = SBase + (LIndex * VCount + VIndex) * TCount;
656       return TRUE;
657     }
658   else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
659 	   && 0 <= TIndex && TIndex <= TCount)
660     {
661       *result = a + TIndex;
662       return TRUE;
663     }
664 
665   return FALSE;
666 }
667 
/* Composition-index lookup.  A page entry at or above
 * G_UNICODE_MAX_TABLE_INDEX encodes one index shared by the whole page;
 * any other entry selects a row of compose_data that is indexed by the
 * low byte of the character. */
#define CI(page_no, low_byte) \
  ((compose_table[page_no] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[page_no] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[page_no]][low_byte]))

/* Composition index of ch; 0 for characters beyond the last table page. */
#define COMPOSE_INDEX(ch) \
     ((((ch) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((ch) >> 8, (ch) & 0xff))
675 
676 static gboolean
combine(gunichar a,gunichar b,gunichar * result)677 combine (gunichar a, gunichar b, gunichar * result)
678 {
679   gushort index_a, index_b;
680 
681   if (combine_hangul (a, b, result))
682     return TRUE;
683 
684   index_a = COMPOSE_INDEX (a);
685 
686   if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
687     {
688       if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
689 	{
690 	  *result =
691 	    compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
692 	  return TRUE;
693 	}
694       else
695 	return FALSE;
696     }
697 
698   index_b = COMPOSE_INDEX (b);
699 
700   if (index_b >= COMPOSE_SECOND_SINGLE_START)
701     {
702       if (a ==
703 	  compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
704 	{
705 	  *result =
706 	    compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
707 	  return TRUE;
708 	}
709       else
710 	return FALSE;
711     }
712 
713   if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
714       && index_b >= COMPOSE_SECOND_START
715       && index_b < COMPOSE_SECOND_SINGLE_START)
716     {
717       gunichar res =
718 	compose_array[index_a - COMPOSE_FIRST_START][index_b -
719 						     COMPOSE_SECOND_START];
720 
721       if (res)
722 	{
723 	  *result = res;
724 	  return TRUE;
725 	}
726     }
727 
728   return FALSE;
729 }
730 
731 static gunichar *
_g_utf8_normalize_wc(const gchar * str,gssize max_len,GNormalizeMode mode)732 _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
733 {
734   gsize n_wc;
735   gunichar *wc_buffer;
736   const char *p;
737   gsize last_start;
738   gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
739   gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
740 
741   n_wc = 0;
742   p = str;
743   while ((max_len < 0 || p < str + max_len) && *p)
744     {
745       const gchar *decomp;
746       gunichar wc = g_utf8_get_char (p);
747 
748       if (wc >= 0xac00 && wc <= 0xd7af)
749 	{
750 	  gsize result_len;
751 	  decompose_hangul (wc, NULL, &result_len);
752 	  n_wc += result_len;
753 	}
754       else
755 	{
756 	  decomp = find_decomposition (wc, do_compat);
757 
758 	  if (decomp)
759 	    n_wc += g_utf8_strlen (decomp, -1);
760 	  else
761 	    n_wc++;
762 	}
763 
764       p = g_utf8_next_char (p);
765     }
766 
767   wc_buffer = g_new (gunichar, n_wc + 1);
768 
769   last_start = 0;
770   n_wc = 0;
771   p = str;
772   while ((max_len < 0 || p < str + max_len) && *p)
773     {
774       gunichar wc = g_utf8_get_char (p);
775       const gchar *decomp;
776       int cc;
777       gsize old_n_wc = n_wc;
778 
779       if (wc >= 0xac00 && wc <= 0xd7af)
780 	{
781 	  gsize result_len;
782 	  decompose_hangul (wc, wc_buffer + n_wc, &result_len);
783 	  n_wc += result_len;
784 	}
785       else
786 	{
787 	  decomp = find_decomposition (wc, do_compat);
788 
789 	  if (decomp)
790 	    {
791 	      const char *pd;
792 	      for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
793 		wc_buffer[n_wc++] = g_utf8_get_char (pd);
794 	    }
795 	  else
796 	    wc_buffer[n_wc++] = wc;
797 	}
798 
799       if (n_wc > 0)
800 	{
801 	  cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
802 
803 	  if (cc == 0)
804 	    {
805 	      g_unicode_canonical_ordering (wc_buffer + last_start,
806 					    n_wc - last_start);
807 	      last_start = old_n_wc;
808 	    }
809 	}
810 
811       p = g_utf8_next_char (p);
812     }
813 
814   if (n_wc > 0)
815     {
816       g_unicode_canonical_ordering (wc_buffer + last_start,
817 				    n_wc - last_start);
818       last_start = n_wc;
819     }
820 
821   wc_buffer[n_wc] = 0;
822 
823   /* All decomposed and reordered */
824 
825   if (do_compose && n_wc > 0)
826     {
827       gsize i, j;
828       int last_cc = 0;
829       last_start = 0;
830 
831       for (i = 0; i < n_wc; i++)
832 	{
833 	  int cc = COMBINING_CLASS (wc_buffer[i]);
834 
835 	  if (i > 0 &&
836 	      (last_cc == 0 || last_cc != cc) &&
837 	      combine (wc_buffer[last_start], wc_buffer[i],
838 		       &wc_buffer[last_start]))
839 	    {
840 	      for (j = i + 1; j < n_wc; j++)
841 		wc_buffer[j - 1] = wc_buffer[j];
842 	      n_wc--;
843 	      i--;
844 
845 	      if (i == last_start)
846 		last_cc = 0;
847 	      else
848 		last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
849 
850 	      continue;
851 	    }
852 
853 	  if (cc == 0)
854 	    last_start = i;
855 
856 	  last_cc = cc;
857 	}
858     }
859 
860   wc_buffer[n_wc] = 0;
861 
862   return wc_buffer;
863 }
864 
865 /**
866  * g_utf8_normalize:
867  * @str: a UTF-8 encoded string.
868  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
869  * @mode: the type of normalization to perform.
870  *
871  * Converts a string into canonical form, standardizing
872  * such issues as whether a character with an accent
873  * is represented as a base character and combining
874  * accent or as a single precomposed character. You
875  * should generally call g_utf8_normalize() before
876  * comparing two Unicode strings.
877  *
878  * The normalization mode %G_NORMALIZE_DEFAULT only
879  * standardizes differences that do not affect the
880  * text content, such as the above-mentioned accent
881  * representation. %G_NORMALIZE_ALL also standardizes
882  * the "compatibility" characters in Unicode, such
883  * as SUPERSCRIPT THREE to the standard forms
884  * (in this case DIGIT THREE). Formatting information
885  * may be lost but for most text operations such
886  * characters should be considered the same.
887  * For example, g_utf8_collate() normalizes
888  * with %G_NORMALIZE_ALL as its first step.
889  *
890  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
891  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
892  * but returned a result with composed forms rather
893  * than a maximally decomposed form. This is often
894  * useful if you intend to convert the string to
895  * a legacy encoding or pass it to a system with
896  * less capable Unicode handling.
897  *
898  * Return value: a newly allocated string, that is the
899  *   normalized form of @str.
900  **/
901 static gchar *
g_utf8_normalize(const gchar * str,gssize len,GNormalizeMode mode)902 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
903 {
904   gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
905   gchar *result;
906 
907   result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
908   g_free (result_wc);
909 
910   return result;
911 }
912 
913 /* Public Libidn API starts here. */
914 
915 /**
916  * stringprep_utf8_to_unichar:
917  * @p: a pointer to Unicode character encoded as UTF-8
918  *
919  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
920  * If @p does not point to a valid UTF-8 encoded character, results are
921  * undefined. If you are not sure that the bytes are complete
922  * valid Unicode characters, you should use g_utf8_get_char_validated()
923  * instead.
924  *
925  * Return value: the resulting character
926  **/
927 my_uint32_t
stringprep_utf8_to_unichar(const char * p)928 stringprep_utf8_to_unichar (const char *p)
929 {
930   return g_utf8_get_char (p);
931 }
932 
933 /**
934  * stringprep_unichar_to_utf8:
935  * @c: a ISO10646 character code
936  * @outbuf: output buffer, must have at least 6 bytes of space.
937  *       If %NULL, the length will be computed and returned
938  *       and nothing will be written to @outbuf.
939  *
940  * Converts a single character to UTF-8.
941  *
942  * Return value: number of bytes written
943  **/
944 int
stringprep_unichar_to_utf8(my_uint32_t c,char * outbuf)945 stringprep_unichar_to_utf8 (my_uint32_t c, char *outbuf)
946 {
947   return g_unichar_to_utf8 (c, outbuf);
948 }
949 
950 /**
951  * stringprep_utf8_to_ucs4:
952  * @str: a UTF-8 encoded string
953  * @len: the maximum length of @str to use. If @len < 0, then
954  *       the string is nul-terminated.
955  * @items_written: location to store the number of characters in the
956  *                 result, or %NULL.
957  *
958  * Convert a string from UTF-8 to a 32-bit fixed width
959  * representation as UCS-4, assuming valid UTF-8 input.
960  * This function does no error checking on the input.
961  *
962  * Return value: a pointer to a newly allocated UCS-4 string.
963  *               This value must be freed with free().
964  **/
965 my_uint32_t *
stringprep_utf8_to_ucs4(const char * str,ssize_t len,size_t * items_written)966 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
967 {
968   return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
969 }
970 
971 /**
972  * stringprep_ucs4_to_utf8:
973  * @str: a UCS-4 encoded string
974  * @len: the maximum length of @str to use. If @len < 0, then
975  *       the string is terminated with a 0 character.
976  * @items_read: location to store number of characters read read, or %NULL.
977  * @items_written: location to store number of bytes written or %NULL.
978  *                 The value here stored does not include the trailing 0
979  *                 byte.
980  *
981  * Convert a string from a 32-bit fixed width representation as UCS-4.
982  * to UTF-8. The result will be terminated with a 0 byte.
983  *
984  * Return value: a pointer to a newly allocated UTF-8 string.
985  *               This value must be freed with free(). If an
986  *               error occurs, %NULL will be returned and
987  *               @error set.
988  **/
989 char *
stringprep_ucs4_to_utf8(const my_uint32_t * str,ssize_t len,size_t * items_read,size_t * items_written)990 stringprep_ucs4_to_utf8 (const my_uint32_t * str, ssize_t len,
991 			 size_t * items_read, size_t * items_written)
992 {
993   return g_ucs4_to_utf8 (str, len, (glong *) items_read,
994 			 (glong *) items_written, NULL);
995 }
996 
997 /**
998  * stringprep_utf8_nfkc_normalize:
999  * @str: a UTF-8 encoded string.
1000  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1001  *
1002  * Converts a string into canonical form, standardizing
1003  * such issues as whether a character with an accent
1004  * is represented as a base character and combining
1005  * accent or as a single precomposed character.
1006  *
1007  * The normalization mode is NFKC (ALL COMPOSE).  It standardizes
1008  * differences that do not affect the text content, such as the
1009  * above-mentioned accent representation. It standardizes the
1010  * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1011  * the standard forms (in this case DIGIT THREE). Formatting
1012  * information may be lost but for most text operations such
1013  * characters should be considered the same. It returns a result with
1014  * composed forms rather than a maximally decomposed form.
1015  *
1016  * Return value: a newly allocated string, that is the
1017  *   NFKC normalized form of @str.
1018  **/
1019 char *
stringprep_utf8_nfkc_normalize(const char * str,ssize_t len)1020 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1021 {
1022   return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1023 }
1024 
1025 /**
1026  * stringprep_ucs4_nfkc_normalize:
1027  * @str: a Unicode string.
1028  * @len: length of @str array, or -1 if @str is nul-terminated.
1029  *
1030  * Converts UCS4 string into UTF-8 and runs
1031  * stringprep_utf8_nfkc_normalize().
1032  *
1033  * Return value: a newly allocated Unicode string, that is the NFKC
1034  *   normalized form of @str.
1035  **/
1036 my_uint32_t *
stringprep_ucs4_nfkc_normalize(my_uint32_t * str,ssize_t len)1037 stringprep_ucs4_nfkc_normalize (my_uint32_t * str, ssize_t len)
1038 {
1039   char *p;
1040   my_uint32_t *result_wc;
1041 
1042   p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1043   result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1044   free (p);
1045 
1046   return result_wc;
1047 }
1048