1 /* nfkc.c --- Unicode normalization utilities.
2    Copyright (C) 2002-2016 Simon Josefsson
3 
4    This file is part of GNU Libidn.
5 
6    GNU Libidn is free software: you can redistribute it and/or
7    modify it under the terms of either:
8 
9      * the GNU Lesser General Public License as published by the Free
10        Software Foundation; either version 3 of the License, or (at
11        your option) any later version.
12 
13    or
14 
15      * the GNU General Public License as published by the Free
16        Software Foundation; either version 2 of the License, or (at
17        your option) any later version.
18 
19    or both in parallel, as here.
20 
21    GNU Libidn is distributed in the hope that it will be useful,
22    but WITHOUT ANY WARRANTY; without even the implied warranty of
23    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
24    General Public License for more details.
25 
26    You should have received copies of the GNU General Public License and
27    the GNU Lesser General Public License along with this program.  If
28    not, see <http://www.gnu.org/licenses/>. */
29 
30 #ifdef HAVE_CONFIG_H
31 #include "config.h"
32 #endif
33 
34 #include <stdlib.h>
35 #include <string.h>
36 
37 #include "stringprep.h"
38 
/* Hacks to make syncing with GLIB code easier.

   The code imported from GLib below is kept close to its upstream form;
   these macros map the GLib type and allocator names it uses onto plain
   C types and the C library allocator, so no GLib header is required.
   The fixed-width and size types (int16_t, uint16_t, uint32_t, size_t,
   ssize_t) are presumably made visible by the headers included above --
   TODO confirm for each supported platform.  */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define glong long
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t	/* one Unicode code point */
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc		/* unlike GLib's, may return NULL */
#define g_free free
/* Return VAL from the enclosing function when EXPR is false.

   The expansion is wrapped in do { } while (0) so that it forms exactly
   one statement together with the semicolon written by the caller.  A
   bare { } block would leave a stray empty statement behind and break
   when the macro is used as the body of an unbraced if/else.  */
#define g_return_val_if_fail(expr,val)	do {		\
    if (!(expr))					\
      return (val);					\
  } while (0)
58 
59 /* Code from GLIB gmacros.h starts here. */
60 
61 /* GLIB - Library of useful routines for C programming
62  * Copyright (C) 1995-1997  Peter Mattis, Spencer Kimball and Josh MacDonald
63  *
64  * This library is free software; you can redistribute it and/or
65  * modify it under the terms of the GNU Lesser General Public
66  * License as published by the Free Software Foundation; either
67  * version 2 of the License, or (at your option) any later version.
68  *
69  * This library is distributed in the hope that it will be useful,
70  * but WITHOUT ANY WARRANTY; without even the implied warranty of
71  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
72  * Lesser General Public License for more details.
73  *
74  * You should have received a copy of the GNU Lesser General Public
75  * License along with this library; if not, write to the
76  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
77  * Boston, MA 02111-1307, USA.
78  */
79 
/* Boolean constants, defined only when no earlier header supplied them. */
#ifndef	FALSE
#define	FALSE	(0)
#endif

#ifndef	TRUE
#define	TRUE	(!FALSE)
#endif

/* Number of elements in ARR.  Only meaningful for a true array; applied
   to a pointer it silently yields a wrong value.  */
#define G_N_ELEMENTS(arr)		(sizeof (arr) / sizeof ((arr)[0]))

/* Branch-prediction hint in GLib; here it simply evaluates EXPR. */
#define G_UNLIKELY(expr) (expr)
91 
92 /* Code from GLIB gunicode.h starts here. */
93 
94 /* gunicode.h - Unicode manipulation functions
95  *
96  *  Copyright (C) 1999, 2000 Tom Tromey
97  *  Copyright 2000, 2005 Red Hat, Inc.
98  *
99  * The Gnome Library is free software; you can redistribute it and/or
100  * modify it under the terms of the GNU Lesser General Public License as
101  * published by the Free Software Foundation; either version 2 of the
102  * License, or (at your option) any later version.
103  *
104  * The Gnome Library is distributed in the hope that it will be useful,
105  * but WITHOUT ANY WARRANTY; without even the implied warranty of
106  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
107  * Lesser General Public License for more details.
108  *
109  * You should have received a copy of the GNU Lesser General Public
110  * License along with the Gnome Library; see the file COPYING.LIB.  If not,
111  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
112  *   Boston, MA 02111-1307, USA.
113  */
114 
/* Normalization forms understood by the normalizer in this file.  Every
   form is reachable under two names: the GLib-style name and the Unicode
   form name (NFD, NFC, NFKD, NFKC), which share the same value.  */
typedef enum
{
  /* canonical decomposition */
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_NFD = 0,
  /* canonical decomposition followed by composition */
  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_NFC = 1,
  /* compatibility decomposition */
  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_NFKD = 2,
  /* compatibility decomposition followed by composition */
  G_NORMALIZE_ALL_COMPOSE = 3,
  G_NORMALIZE_NFKC = 3
}
GNormalizeMode;
127 
/* Step P forward by one UTF-8 character.  The byte at P is used as an
   index into g_utf8_skip, which holds the sequence length implied by
   each possible lead byte.  The bytes being skipped are not inspected,
   so a truncated sequence can carry the result past a terminating nul.  */
#define g_utf8_next_char(p) ((p) + g_utf8_skip[*(const guchar *)(p)])
129 
130 /* Code from GLIB gutf8.c starts here. */
131 
132 /* gutf8.c - Operations on UTF-8 strings.
133  *
134  * Copyright (C) 1999 Tom Tromey
135  * Copyright (C) 2000 Red Hat, Inc.
136  *
137  * This library is free software; you can redistribute it and/or
138  * modify it under the terms of the GNU Lesser General Public
139  * License as published by the Free Software Foundation; either
140  * version 2 of the License, or (at your option) any later version.
141  *
142  * This library is distributed in the hope that it will be useful,
143  * but WITHOUT ANY WARRANTY; without even the implied warranty of
144  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
145  * Lesser General Public License for more details.
146  *
147  * You should have received a copy of the GNU Lesser General Public
148  * License along with this library; if not, write to the
149  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
150  * Boston, MA 02111-1307, USA.
151  */
152 
/* Classify the UTF-8 lead byte CHAR.

   On return LEN holds the total length in bytes of the sequence that
   CHAR introduces (1..6) and MASK holds the bit mask that extracts the
   payload bits from the lead byte.  When CHAR cannot start a sequence
   (a continuation byte, 0xfe or 0xff) LEN is set to -1 and MASK is left
   untouched.

   CHAR is parenthesized at every use so that expression arguments are
   evaluated as a unit, and the whole chain is wrapped in
   do { } while (0) so that the macro is a single statement that can be
   used safely as the body of an unbraced if/else.  CHAR is evaluated
   more than once and must not have side effects.  */
#define UTF8_COMPUTE(Char, Mask, Len)		\
  do						\
    {						\
      if ((Char) < 128)				\
	{					\
	  (Len) = 1;				\
	  (Mask) = 0x7f;			\
	}					\
      else if (((Char) & 0xe0) == 0xc0)		\
	{					\
	  (Len) = 2;				\
	  (Mask) = 0x1f;			\
	}					\
      else if (((Char) & 0xf0) == 0xe0)		\
	{					\
	  (Len) = 3;				\
	  (Mask) = 0x0f;			\
	}					\
      else if (((Char) & 0xf8) == 0xf0)		\
	{					\
	  (Len) = 4;				\
	  (Mask) = 0x07;			\
	}					\
      else if (((Char) & 0xfc) == 0xf8)		\
	{					\
	  (Len) = 5;				\
	  (Mask) = 0x03;			\
	}					\
      else if (((Char) & 0xfe) == 0xfc)		\
	{					\
	  (Len) = 6;				\
	  (Mask) = 0x01;			\
	}					\
      else					\
	(Len) = -1;				\
    }						\
  while (0)
186 
/* Number of bytes (1..6) needed to encode code point CHAR in UTF-8,
   using the original six-byte scheme rather than the four-byte limit of
   current Unicode.  CHAR is evaluated several times.  */
#define UTF8_LENGTH(Char)			\
  ((Char) < 0x80 ? 1				\
   : (Char) < 0x800 ? 2				\
   : (Char) < 0x10000 ? 3			\
   : (Char) < 0x200000 ? 4			\
   : (Char) < 0x4000000 ? 5			\
   : 6)
193 
/* Decode one UTF-8 sequence of LEN bytes starting at CHARS into RESULT.

   MASK selects the payload bits of the lead byte; both MASK and LEN are
   normally produced by UTF8_COMPUTE.  COUNT is a scratch loop variable.
   Each following byte must be a continuation byte (10xxxxxx); at the
   first one that is not, RESULT is set to -1 and decoding stops, so
   bytes after an early nul terminator are never read.

   The two statements are wrapped in do { } while (0) so the macro acts
   as one statement and can be used as the body of an unbraced if.  */
#define UTF8_GET(Result, Chars, Count, Mask, Len)			      \
  do									      \
    {									      \
      (Result) = (Chars)[0] & (Mask);					      \
      for ((Count) = 1; (Count) < (Len); ++(Count))			      \
	{								      \
	  if (((Chars)[(Count)] & 0xc0) != 0x80)			      \
	    {								      \
	      (Result) = -1;						      \
	      break;							      \
	    }								      \
	  (Result) <<= 6;						      \
	  (Result) |= ((Chars)[(Count)] & 0x3f);			      \
	}								      \
    }									      \
  while (0)
206 
207 static const gchar utf8_skip_data[256] = {
208   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
209   1, 1, 1, 1, 1, 1, 1,
210   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
211   1, 1, 1, 1, 1, 1, 1,
212   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
213   1, 1, 1, 1, 1, 1, 1,
214   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
215   1, 1, 1, 1, 1, 1, 1,
216   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
217   1, 1, 1, 1, 1, 1, 1,
218   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
219   1, 1, 1, 1, 1, 1, 1,
220   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
221   2, 2, 2, 2, 2, 2, 2,
222   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
223   5, 5, 5, 6, 6, 1, 1
224 };
225 
226 static const gchar *const g_utf8_skip = utf8_skip_data;
227 
228 /*
229  * g_utf8_strlen:
230  * @p: pointer to the start of a UTF-8 encoded string
231  * @max: the maximum number of bytes to examine. If @max
232  *       is less than 0, then the string is assumed to be
233  *       nul-terminated. If @max is 0, @p will not be examined and
234  *       may be %NULL.
235  *
236  * Computes the length of the string in characters, not including
237  * the terminating nul character.
238  *
239  * Return value: the length of the string in characters
240  **/
241 static glong
g_utf8_strlen(const gchar * p)242 g_utf8_strlen (const gchar * p)
243 {
244   glong len = 0;
245 
246   g_return_val_if_fail (p != NULL, 0);
247 
248   while (*p)
249     {
250       p = g_utf8_next_char (p);
251       ++len;
252     }
253 
254   return len;
255 }
256 
257 /*
258  * g_utf8_get_char:
259  * @p: a pointer to Unicode character encoded as UTF-8
260  *
261  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
262  * If @p does not point to a valid UTF-8 encoded character, results are
263  * undefined. If you are not sure that the bytes are complete
264  * valid Unicode characters, you should use g_utf8_get_char_validated()
265  * instead.
266  *
267  * Return value: the resulting character
268  **/
269 static gunichar
g_utf8_get_char(const gchar * p)270 g_utf8_get_char (const gchar * p)
271 {
272   int i, mask = 0, len;
273   gunichar result;
274   unsigned char c = (unsigned char) *p;
275 
276   UTF8_COMPUTE (c, mask, len);
277   if (len == -1)
278     return (gunichar) - 1;
279   UTF8_GET (result, p, i, mask, len);
280 
281   return result;
282 }
283 
284 /*
285  * g_unichar_to_utf8:
286  * @c: a Unicode character code
287  * @outbuf: output buffer, must have at least 6 bytes of space.
288  *       If %NULL, the length will be computed and returned
289  *       and nothing will be written to @outbuf.
290  *
291  * Converts a single character to UTF-8.
292  *
293  * Return value: number of bytes written
294  **/
295 static int
g_unichar_to_utf8(gunichar c,gchar * outbuf)296 g_unichar_to_utf8 (gunichar c, gchar * outbuf)
297 {
298   /* If this gets modified, also update the copy in g_string_insert_unichar() */
299   guint len = 0;
300   int first;
301   int i;
302 
303   if (c < 0x80)
304     {
305       first = 0;
306       len = 1;
307     }
308   else if (c < 0x800)
309     {
310       first = 0xc0;
311       len = 2;
312     }
313   else if (c < 0x10000)
314     {
315       first = 0xe0;
316       len = 3;
317     }
318   else if (c < 0x200000)
319     {
320       first = 0xf0;
321       len = 4;
322     }
323   else if (c < 0x4000000)
324     {
325       first = 0xf8;
326       len = 5;
327     }
328   else
329     {
330       first = 0xfc;
331       len = 6;
332     }
333 
334   if (outbuf)
335     {
336       for (i = len - 1; i > 0; --i)
337 	{
338 	  outbuf[i] = (c & 0x3f) | 0x80;
339 	  c >>= 6;
340 	}
341       outbuf[0] = c | first;
342     }
343 
344   return len;
345 }
346 
347 /*
348  * g_utf8_to_ucs4_fast:
349  * @str: a UTF-8 encoded string
350  * @len: the maximum length of @str to use, in bytes. If @len < 0,
351  *       then the string is nul-terminated.
352  * @items_written: location to store the number of characters in the
353  *                 result, or %NULL.
354  *
355  * Convert a string from UTF-8 to a 32-bit fixed width
356  * representation as UCS-4, assuming valid UTF-8 input.
357  * This function is roughly twice as fast as g_utf8_to_ucs4()
358  * but does no error checking on the input. A trailing 0 character
359  * will be added to the string after the converted text.
360  *
361  * Return value: a pointer to a newly allocated UCS-4 string.
362  *               This value must be freed with g_free().
363  **/
364 static gunichar *
g_utf8_to_ucs4_fast(const gchar * str,glong len,glong * items_written)365 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
366 {
367   gunichar *result;
368   gsize n_chars, i;
369   const gchar *p;
370 
371   g_return_val_if_fail (str != NULL, NULL);
372 
373   p = str;
374   n_chars = 0;
375   if (len < 0)
376     {
377       while (*p)
378 	{
379 	  p = g_utf8_next_char (p);
380 	  ++n_chars;
381 	}
382     }
383   else
384     {
385       while (p < str + len && *p)
386 	{
387 	  p = g_utf8_next_char (p);
388 	  ++n_chars;
389 	}
390     }
391 
392   result = g_malloc (sizeof (gunichar) * (n_chars + 1));
393   if (!result)
394     return NULL;
395 
396   p = str;
397   for (i = 0; i < n_chars; i++)
398     {
399       gunichar wc = (guchar) * p++;
400 
401       if (wc < 0x80)
402 	{
403 	  result[i] = wc;
404 	}
405       else
406 	{
407 	  gunichar mask = 0x40;
408 
409 	  if (G_UNLIKELY ((wc & mask) == 0))
410 	    {
411 	      /* It's an out-of-sequence 10xxxxxxx byte.
412 	       * Rather than making an ugly hash of this and the next byte
413 	       * and overrunning the buffer, it's more useful to treat it
414 	       * with a replacement character */
415 	      result[i] = 0xfffd;
416 	      continue;
417 	    }
418 
419 	  do
420 	    {
421 	      wc <<= 6;
422 	      wc |= (guchar) (*p++) & 0x3f;
423 	      mask <<= 5;
424 	    }
425 	  while ((wc & mask) != 0);
426 
427 	  wc &= mask - 1;
428 
429 	  result[i] = wc;
430 	}
431     }
432   result[i] = 0;
433 
434   if (items_written)
435     *items_written = i;
436 
437   return result;
438 }
439 
440 /*
441  * g_ucs4_to_utf8:
442  * @str: a UCS-4 encoded string
443  * @len: the maximum length (number of characters) of @str to use.
444  *       If @len < 0, then the string is nul-terminated.
445  * @items_read: location to store number of characters read, or %NULL.
446  * @items_written: location to store number of bytes written or %NULL.
447  *                 The value here stored does not include the trailing 0
448  *                 byte.
449  * @error: location to store the error occurring, or %NULL to ignore
450  *         errors. Any of the errors in #GConvertError other than
451  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
452  *
453  * Convert a string from a 32-bit fixed width representation as UCS-4.
454  * to UTF-8. The result will be terminated with a 0 byte.
455  *
456  * Return value: a pointer to a newly allocated UTF-8 string.
457  *               This value must be freed with g_free(). If an
458  *               error occurs, %NULL will be returned and
459  *               @error set. In that case, @items_read will be
460  *               set to the position of the first invalid input
461  *               character.
462  **/
463 static gchar *
g_ucs4_to_utf8(const gunichar * str,glong len,glong * items_read,glong * items_written)464 g_ucs4_to_utf8 (const gunichar * str,
465 		glong len,
466 		glong * items_read, glong * items_written)
467 {
468   gint result_length;
469   gchar *result = NULL;
470   gchar *p;
471   gint i;
472 
473   result_length = 0;
474   for (i = 0; len < 0 || i < len; i++)
475     {
476       if (!str[i])
477 	break;
478 
479       if (str[i] >= 0x80000000)
480 	goto err_out;
481 
482       result_length += UTF8_LENGTH (str[i]);
483     }
484 
485   result = g_malloc (result_length + 1);
486   if (!result)
487     return NULL;
488   p = result;
489 
490   i = 0;
491   while (p < result + result_length)
492     p += g_unichar_to_utf8 (str[i++], p);
493 
494   *p = '\0';
495 
496   if (items_written)
497     *items_written = p - result;
498 
499 err_out:
500   if (items_read)
501     *items_read = i;
502 
503   return result;
504 }
505 
506 /* Code from GLIB gunidecomp.c starts here. */
507 
508 /* decomp.c - Character decomposition.
509  *
510  *  Copyright (C) 1999, 2000 Tom Tromey
511  *  Copyright 2000 Red Hat, Inc.
512  *
513  * The Gnome Library is free software; you can redistribute it and/or
514  * modify it under the terms of the GNU Lesser General Public License as
515  * published by the Free Software Foundation; either version 2 of the
516  * License, or (at your option) any later version.
517  *
518  * The Gnome Library is distributed in the hope that it will be useful,
519  * but WITHOUT ANY WARRANTY; without even the implied warranty of
520  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
521  * Lesser General Public License for more details.
522  *
523  * You should have received a copy of the GNU Lesser General Public
524  * License along with the Gnome Library; see the file COPYING.LIB.  If not,
525  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
526  *   Boston, MA 02111-1307, USA.
527  */
528 
529 #include "gunidecomp.h"
530 #include "gunicomp.h"
531 
/* Two-level lookup of a character's canonical combining class.

   CC_PART1 and CC_PART2 each index a per-page table with PAGE (the
   character value shifted right by eight).  An entry that is at least
   G_UNICODE_MAX_TABLE_INDEX stands for "every character on this page has
   class entry - G_UNICODE_MAX_TABLE_INDEX"; any smaller entry selects a
   row of cclass_data, which is then indexed with CHAR (the low eight
   bits).  The tables themselves are not defined in this file; presumably
   they come from the gunidecomp.h header included above -- TODO confirm.

   COMBINING_CLASS picks part 1 for characters up to
   G_UNICODE_LAST_CHAR_PART1, part 2 for 0xe0000..G_UNICODE_LAST_CHAR,
   and yields 0 for everything else.  All three macros evaluate their
   arguments more than once.  */
#define CC_PART1(Page, Char)						\
  ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX)	\
   ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX)	\
   : (cclass_data[combining_class_table_part1[Page]][Char]))

#define CC_PART2(Page, Char)						\
  ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX)	\
   ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX)	\
   : (cclass_data[combining_class_table_part2[Page]][Char]))

#define COMBINING_CLASS(Char)					\
  (((Char) <= G_UNICODE_LAST_CHAR_PART1)			\
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff)			\
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR)	\
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff)	\
      : 0))
548 
/* Constants for algorithmic Hangul syllable [de]composition: the first
   code point of the syllable block and of the leading (L), vowel (V)
   and trailing (T) jamo ranges, followed by the size of each range.
   TBase is one below the first trailing jamo, so that trailing index 0
   can mean "no trailing consonant".  */
enum
{
  SBase = 0xAC00,
  LBase = 0x1100,
  VBase = 0x1161,
  TBase = 0x11A7,
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = VCount * TCount,	/* syllables per leading consonant */
  SCount = LCount * NCount	/* precomposed syllables in total */
};
559 
560 /*
561  * g_unicode_canonical_ordering:
562  * @string: a UCS-4 encoded string.
563  * @len: the maximum length of @string to use.
564  *
565  * Computes the canonical ordering of a string in-place.
566  * This rearranges decomposed characters in the string
567  * according to their combining classes.  See the Unicode
568  * manual for more information.
569  **/
570 static void
g_unicode_canonical_ordering(gunichar * string,gsize len)571 g_unicode_canonical_ordering (gunichar * string, gsize len)
572 {
573   gsize i;
574   int swap = 1;
575 
576   while (swap)
577     {
578       int last;
579       swap = 0;
580       last = COMBINING_CLASS (string[0]);
581       for (i = 0; i < len - 1; ++i)
582 	{
583 	  int next = COMBINING_CLASS (string[i + 1]);
584 	  if (next != 0 && last > next)
585 	    {
586 	      gsize j;
587 	      /* Percolate item leftward through string.  */
588 	      for (j = i + 1; j > 0; --j)
589 		{
590 		  gunichar t;
591 		  if (COMBINING_CLASS (string[j - 1]) <= next)
592 		    break;
593 		  t = string[j];
594 		  string[j] = string[j - 1];
595 		  string[j - 1] = t;
596 		  swap = 1;
597 		}
598 	      /* We're re-entering the loop looking at the old
599 	         character again.  */
600 	      next = last;
601 	    }
602 	  last = next;
603 	}
604     }
605 }
606 
607 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
608  * r should be null or have sufficient space. Calling with r == NULL will
609  * only calculate the result_len; however, a buffer with space for three
610  * characters will always be big enough. */
611 static void
decompose_hangul(gunichar s,gunichar * r,gsize * result_len)612 decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
613 {
614   gint SIndex = s - SBase;
615   gint TIndex = SIndex % TCount;
616 
617   if (r)
618     {
619       r[0] = LBase + SIndex / NCount;
620       r[1] = VBase + (SIndex % NCount) / TCount;
621     }
622 
623   if (TIndex)
624     {
625       if (r)
626 	r[2] = TBase + TIndex;
627       *result_len = 3;
628     }
629   else
630     *result_len = 2;
631 }
632 
633 /* returns a pointer to a null-terminated UTF-8 string */
634 static const gchar *
find_decomposition(gunichar ch,gboolean compat)635 find_decomposition (gunichar ch, gboolean compat)
636 {
637   int start = 0;
638   int end = G_N_ELEMENTS (decomp_table);
639 
640   if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
641     {
642       while (TRUE)
643 	{
644 	  int half = (start + end) / 2;
645 	  if (ch == decomp_table[half].ch)
646 	    {
647 	      int offset;
648 
649 	      if (compat)
650 		{
651 		  offset = decomp_table[half].compat_offset;
652 		  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
653 		    offset = decomp_table[half].canon_offset;
654 		}
655 	      else
656 		{
657 		  offset = decomp_table[half].canon_offset;
658 		  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
659 		    return NULL;
660 		}
661 
662 	      return &(decomp_expansion_string[offset]);
663 	    }
664 	  else if (half == start)
665 	    break;
666 	  else if (ch > decomp_table[half].ch)
667 	    start = half;
668 	  else
669 	    end = half;
670 	}
671     }
672 
673   return NULL;
674 }
675 
676 /* L,V => LV and LV,T => LVT  */
677 static gboolean
combine_hangul(gunichar a,gunichar b,gunichar * result)678 combine_hangul (gunichar a, gunichar b, gunichar * result)
679 {
680   if (a >= LBase && a < LCount + LBase && b >= VBase && b < VCount + VBase)
681     {
682       gint LIndex = a - LBase;
683       gint VIndex = b - VBase;
684 
685       *result = SBase + (LIndex * VCount + VIndex) * TCount;
686       return TRUE;
687     }
688 
689   if (a >= SBase && a < SCount + SBase && b > TBase && b < TCount + TBase)
690     {
691       gint SIndex = a - SBase;
692 
693 		if ((SIndex % TCount) == 0)
694         {
695           gint TIndex = b - TBase;
696 
697           *result = a + TIndex;
698           return TRUE;
699         }
700     }
701 
702   return FALSE;
703 }
704 
/* Two-level lookup of a character's composition index.

   CI indexes compose_table with PAGE.  An entry that is at least
   G_UNICODE_MAX_TABLE_INDEX encodes the index shared by the whole page
   (entry - G_UNICODE_MAX_TABLE_INDEX); a smaller entry selects the row
   of compose_data that is indexed with CHAR.

   COMPOSE_INDEX splits a character into page (upper bits) and offset
   (low eight bits) and yields 0 for characters beyond the last page,
   COMPOSE_TABLE_LAST.  Every use of CHAR is parenthesized so that an
   expression argument is shifted as a whole.  CHAR is evaluated more
   than once.  */
#define CI(Page, Char)					\
  ((compose_table[(Page)] >= G_UNICODE_MAX_TABLE_INDEX)	\
   ? (compose_table[(Page)] - G_UNICODE_MAX_TABLE_INDEX)	\
   : (compose_data[compose_table[(Page)]][(Char)]))

#define COMPOSE_INDEX(Char)						\
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST))				\
   ? 0 : CI ((Char) >> 8, (Char) & 0xff))
712 
713 static gboolean
combine(gunichar a,gunichar b,gunichar * result)714 combine (gunichar a, gunichar b, gunichar * result)
715 {
716   gushort index_a, index_b;
717 
718   if (combine_hangul (a, b, result))
719     return TRUE;
720 
721   index_a = COMPOSE_INDEX (a);
722 
723   if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
724     {
725       if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
726 	{
727 	  *result =
728 	    compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
729 	  return TRUE;
730 	}
731       else
732 	return FALSE;
733     }
734 
735   index_b = COMPOSE_INDEX (b);
736 
737   if (index_b >= COMPOSE_SECOND_SINGLE_START)
738     {
739       if (a ==
740 	  compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
741 	{
742 	  *result =
743 	    compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
744 	  return TRUE;
745 	}
746       else
747 	return FALSE;
748     }
749 
750   if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
751       && index_b >= COMPOSE_SECOND_START
752       && index_b < COMPOSE_SECOND_SINGLE_START)
753     {
754       gunichar res =
755 	compose_array[index_a - COMPOSE_FIRST_START][index_b -
756 						     COMPOSE_SECOND_START];
757 
758       if (res)
759 	{
760 	  *result = res;
761 	  return TRUE;
762 	}
763     }
764 
765   return FALSE;
766 }
767 
/* Normalize the UTF-8 string STR according to MODE and return the result
   as a newly allocated, 0-terminated UCS-4 string, or NULL if the buffer
   cannot be allocated.

   At most MAX_LEN bytes of STR are read; a negative MAX_LEN means the
   string is nul-terminated.  STR is not validated here: the public entry
   points in this file run u8_check() or build the UTF-8 themselves
   before calling this function.

   The work is done in up to three steps: decompose every character
   (compatibility mappings only for NFKC/NFKD), put combining marks into
   canonical order, and, for NFC/NFKC, recompose in place.  */
static gunichar *
_g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
{
  gsize n_wc;
  gunichar *wc_buffer;
  const char *p;
  gsize last_start;
  gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
  gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);

  /* Pass 1: count how many characters the decomposed string will have,
     so that the buffer can be allocated in one go.  */
  n_wc = 0;
  p = str;
  while ((max_len < 0 || p < str + max_len) && *p)
    {
      const gchar *decomp;
      gunichar wc = g_utf8_get_char (p);

      if (wc >= SBase && wc < SBase + SCount)
	{
	  /* Precomposed Hangul syllable: only ask for the length. */
	  gsize result_len;
	  decompose_hangul (wc, NULL, &result_len);
	  n_wc += result_len;
	}
      else
	{
	  decomp = find_decomposition (wc, do_compat);

	  if (decomp)
	    n_wc += g_utf8_strlen (decomp);
	  else
	    n_wc++;
	}

      p = g_utf8_next_char (p);
    }

  /* One extra element for the terminating 0. */
  wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));
  if (!wc_buffer)
    return NULL;

  /* Pass 2: same walk as above, this time storing the decomposed
     characters.  last_start is the index where the run that still needs
     canonical ordering begins.  */
  last_start = 0;
  n_wc = 0;
  p = str;
  while ((max_len < 0 || p < str + max_len) && *p)
    {
      gunichar wc = g_utf8_get_char (p);
      const gchar *decomp;
      int cc;
      gsize old_n_wc = n_wc;

      if (wc >= SBase && wc < SBase + SCount)
	{
	  gsize result_len;
	  decompose_hangul (wc, wc_buffer + n_wc, &result_len);
	  n_wc += result_len;
	}
      else
	{
	  decomp = find_decomposition (wc, do_compat);

	  if (decomp)
	    {
	      const char *pd;
	      for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
		wc_buffer[n_wc++] = g_utf8_get_char (pd);
	    }
	  else
	    wc_buffer[n_wc++] = wc;
	}

      if (n_wc > 0)
	{
	  /* Combining class of the first character just appended. */
	  cc = COMBINING_CLASS (wc_buffer[old_n_wc]);

	  if (cc == 0)
	    {
	      /* A character of class 0 was appended: order everything
	         from last_start onwards, then begin the next run at the
	         newly appended character.  */
	      g_unicode_canonical_ordering (wc_buffer + last_start,
					    n_wc - last_start);
	      last_start = old_n_wc;
	    }
	}

      p = g_utf8_next_char (p);
    }

  /* Order the final run, which no later class-0 character closed. */
  if (n_wc > 0)
    {
      g_unicode_canonical_ordering (wc_buffer + last_start,
				    n_wc - last_start);
      /* dead assignment: last_start = n_wc; */
    }

  wc_buffer[n_wc] = 0;

  /* All decomposed and reordered */

  if (do_compose && n_wc > 0)
    {
      /* Pass 3: recompose in place.  last_start is the index of the
         character that following characters are combined into, and
         last_cc the combining class of the previously kept character.  */
      gsize i, j;
      int last_cc = 0;
      last_start = 0;

      for (i = 0; i < n_wc; i++)
	{
	  int cc = COMBINING_CLASS (wc_buffer[i]);

	  /* A combination is attempted unless the previous kept character
	     has the same non-zero combining class as this one.  */
	  if (i > 0 &&
	      (last_cc == 0 || last_cc != cc) &&
	      combine (wc_buffer[last_start], wc_buffer[i],
		       &wc_buffer[last_start]))
	    {
	      /* wc_buffer[i] was merged into the starter: close the gap
	         and look at the same index again on the next round.  */
	      for (j = i + 1; j < n_wc; j++)
		wc_buffer[j - 1] = wc_buffer[j];
	      n_wc--;
	      i--;

	      if (i == last_start)
		last_cc = 0;
	      else
		last_cc = COMBINING_CLASS (wc_buffer[i - 1]);

	      continue;
	    }

	  /* A class-0 character becomes the new combination target. */
	  if (cc == 0)
	    last_start = i;

	  last_cc = cc;
	}
    }

  /* Terminate again: composition may have shortened the string. */
  wc_buffer[n_wc] = 0;

  return wc_buffer;
}
903 
904 /*
905  * g_utf8_normalize:
906  * @str: a UTF-8 encoded string.
907  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
908  * @mode: the type of normalization to perform.
909  *
910  * Converts a string into canonical form, standardizing
911  * such issues as whether a character with an accent
912  * is represented as a base character and combining
913  * accent or as a single precomposed character. The
914  * string has to be valid UTF-8, otherwise %NULL is
915  * returned. You should generally call g_utf8_normalize()
916  * before comparing two Unicode strings.
917  *
918  * The normalization mode %G_NORMALIZE_DEFAULT only
919  * standardizes differences that do not affect the
920  * text content, such as the above-mentioned accent
921  * representation. %G_NORMALIZE_ALL also standardizes
922  * the "compatibility" characters in Unicode, such
923  * as SUPERSCRIPT THREE to the standard forms
924  * (in this case DIGIT THREE). Formatting information
925  * may be lost but for most text operations such
926  * characters should be considered the same.
927  *
928  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
929  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
930  * but returned a result with composed forms rather
931  * than a maximally decomposed form. This is often
932  * useful if you intend to convert the string to
933  * a legacy encoding or pass it to a system with
934  * less capable Unicode handling.
935  *
936  * Return value: a newly allocated string, that is the
937  *   normalized form of @str, or %NULL if @str is not
938  *   valid UTF-8.
939  **/
940 static gchar *
g_utf8_normalize(const gchar * str,gssize len,GNormalizeMode mode)941 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
942 {
943   gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
944   gchar *result = NULL;
945 
946   if (result_wc)
947     result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
948 
949   g_free (result_wc);
950 
951   return result;
952 }
953 
954 /* Public Libidn API starts here. */
955 
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Decodes the UTF-8 byte sequence starting at @p into the Unicode
 * character it represents.  The input is expected to be valid; if @p
 * does not point to a valid UTF-8 encoded character, results are
 * undefined.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  const uint32_t code_point = g_utf8_get_char (p);

  return code_point;
}
971 
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Encodes the single character @c as UTF-8.  No terminating zero
 * byte is added.
 *
 * Return value: number of bytes written.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  const int n_bytes = g_unichar_to_utf8 (c, outbuf);

  return n_bytes;
}
988 
989 #include <unistr.h>
990 
991 /**
992  * stringprep_utf8_to_ucs4:
993  * @str: a UTF-8 encoded string
994  * @len: the maximum length of @str to use. If @len < 0, then
995  *       the string is nul-terminated.
996  * @items_written: location to store the number of characters in the
997  *                 result, or %NULL.
998  *
999  * Convert a string from UTF-8 to a 32-bit fixed width representation
1000  * as UCS-4.  The function now performs error checking to verify that
1001  * the input is valid UTF-8 (before it was documented to not do error
1002  * checking).
1003  *
1004  * Return value: a pointer to a newly allocated UCS-4 string.
1005  *               This value must be deallocated by the caller.
1006  **/
1007 uint32_t *
stringprep_utf8_to_ucs4(const char * str,ssize_t len,size_t * items_written)1008 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
1009 {
1010   size_t n;
1011 
1012   if (len < 0)
1013     n = strlen (str);
1014   else
1015     n = len;
1016 
1017   if (u8_check ((const uint8_t *) str, n))
1018     return NULL;
1019 
1020   return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
1021 }
1022 
1023 /**
1024  * stringprep_ucs4_to_utf8:
1025  * @str: a UCS-4 encoded string
1026  * @len: the maximum length of @str to use. If @len < 0, then
1027  *       the string is terminated with a 0 character.
1028  * @items_read: location to store number of characters read read, or %NULL.
1029  * @items_written: location to store number of bytes written or %NULL.
1030  *                 The value here stored does not include the trailing 0
1031  *                 byte.
1032  *
1033  * Convert a string from a 32-bit fixed width representation as UCS-4.
1034  * to UTF-8. The result will be terminated with a 0 byte.
1035  *
1036  * Return value: a pointer to a newly allocated UTF-8 string.
1037  *               This value must be deallocated by the caller.
1038  *               If an error occurs, %NULL will be returned.
1039  **/
1040 char *
stringprep_ucs4_to_utf8(const uint32_t * str,ssize_t len,size_t * items_read,size_t * items_written)1041 stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1042 			 size_t * items_read, size_t * items_written)
1043 {
1044   return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1045 			 (glong *) items_written);
1046 }
1047 
1048 /**
1049  * stringprep_utf8_nfkc_normalize:
1050  * @str: a UTF-8 encoded string.
1051  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1052  *
1053  * Converts a string into canonical form, standardizing
1054  * such issues as whether a character with an accent
1055  * is represented as a base character and combining
1056  * accent or as a single precomposed character.
1057  *
1058  * The normalization mode is NFKC (ALL COMPOSE).  It standardizes
1059  * differences that do not affect the text content, such as the
1060  * above-mentioned accent representation. It standardizes the
1061  * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1062  * the standard forms (in this case DIGIT THREE). Formatting
1063  * information may be lost but for most text operations such
1064  * characters should be considered the same. It returns a result with
1065  * composed forms rather than a maximally decomposed form.
1066  *
1067  * Return value: a newly allocated string, that is the
1068  *   NFKC normalized form of @str.
1069  **/
1070 char *
stringprep_utf8_nfkc_normalize(const char * str,ssize_t len)1071 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1072 {
1073   size_t n;
1074 
1075   if (len < 0)
1076     n = strlen (str);
1077   else
1078     n = len;
1079 
1080   if (u8_check ((const uint8_t *) str, n))
1081     return NULL;
1082 
1083   return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1084 }
1085 
1086 #include <stdio.h>
1087 /**
1088  * stringprep_ucs4_nfkc_normalize:
1089  * @str: a Unicode string.
1090  * @len: length of @str array, or -1 if @str is nul-terminated.
1091  *
1092  * Converts a UCS4 string into canonical form, see
1093  * stringprep_utf8_nfkc_normalize() for more information.
1094  *
1095  * Return value: a newly allocated Unicode string, that is the NFKC
1096  *   normalized form of @str.
1097  **/
1098 uint32_t *
stringprep_ucs4_nfkc_normalize(const uint32_t * str,ssize_t len)1099 stringprep_ucs4_nfkc_normalize (const uint32_t * str, ssize_t len)
1100 {
1101   char *p;
1102   uint32_t *result_wc;
1103 
1104   p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1105   if (!p)
1106     return NULL;
1107 
1108   result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1109   free (p);
1110 
1111   return result_wc;
1112 }
1113