1 /*  an - Anagram generator
2     Copyright (C) 2012  Paul Martin <pm@debian.org>
3 
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8 
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13 
14     You should have received a copy of the GNU General Public License along
15     with this program; if not, write to the Free Software Foundation, Inc.,
16     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 */
18 
19 #include <stdlib.h>
20 #include <stdio.h>
21 #include <unicode/uchar.h>
22 #include <unicode/ucnv.h>
23 #include <unicode/unorm2.h>
24 
25 #include "unicode.h"
26 #include "malloc.h"
27 
/* Shared ICU normalizer instance; stays NULL until initializeunicode()
   assigns it on first use. */
const UNormalizer2 *norm_decompose = NULL;
/* Shared ICU converter opened for "utf8"; stays NULL until
   initializeunicode() assigns it on first use. */
UConverter *conv_from_utf8 = NULL;
30 
31 static void
initializeunicode(void)32 initializeunicode(void)
33 {
34     UErrorCode uerr = U_ZERO_ERROR;
35 
36     /* We're going to be doing a lot of conversions.
37        Make sure we're only initializing these once. */
38     if (conv_from_utf8 == NULL)
39         conv_from_utf8 = ucnv_open("utf8", &uerr);
40     if (norm_decompose == NULL)
41         /* change UNORM2_DECOMPOSE to UNORM2_COMPOSE if you want accents
42            to be significant */
43         norm_decompose =
44             unorm2_getInstance(NULL, "nfkc_cf", UNORM2_DECOMPOSE, &uerr);
45 }
46 
47 UChar *
utf8tointernal(const char * utf8word)48 utf8tointernal(const char *utf8word)
49 {
50     UChar *intermediate=NULL, *decomposed=NULL;
51     int len, destlen;
52     UErrorCode uerr = U_ZERO_ERROR;
53     UChar *p, *q;
54 
55     initializeunicode();
56 
57     /* Convert from UTF-8 to internal representation */
58     destlen = 0;
59     do {
60         uerr = U_ZERO_ERROR;
61         len = ucnv_toUChars(conv_from_utf8,
62                             intermediate, destlen,
63                             utf8word, -1, &uerr);
64         if (uerr == U_BUFFER_OVERFLOW_ERROR) {
65             destlen = len + 1;
66             intermediate = safe_calloc(destlen,sizeof(UChar));
67         }
68     } while (uerr == U_BUFFER_OVERFLOW_ERROR);
69 
70     if (U_FAILURE(uerr)) {
71         fprintf(stderr, "toUChars: %s\n", u_errorName(uerr));
72         exit(99);
73     }
74 
75     /* decompose and case fold */
76     destlen = 0;
77     do {
78         uerr = U_ZERO_ERROR;
79         len = unorm2_normalize(norm_decompose,
80                                intermediate, -1,
81                                decomposed, destlen,
82                                &uerr);
83         if (uerr == U_BUFFER_OVERFLOW_ERROR) {
84             destlen = len + 1;
85             decomposed = safe_calloc(destlen,sizeof(UChar));
86         }
87     } while (uerr == U_BUFFER_OVERFLOW_ERROR);
88 
89     if (U_FAILURE(uerr)) {
90         fprintf(stderr, "normalize: %s\n", u_errorName(uerr));
91         exit(99);
92     }
93 
94     free(intermediate);
95 
96     /* Copy down only alphabetic characters */
97     p = q = decomposed;
98     while (*p != 0) {
99         if (u_isalpha(*p))
100             *q++ = *p;
101         p++;
102     }
103     *q = *p;
104     return decomposed;
105 }
106