1 /* an - Anagram generator
2 Copyright (C) 2012 Paul Martin <pm@debian.org>
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License along
15 with this program; if not, write to the Free Software Foundation, Inc.,
16 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17 */
18
19 #include <stdlib.h>
20 #include <stdio.h>
21 #include <unicode/uchar.h>
22 #include <unicode/ucnv.h>
23 #include <unicode/unorm2.h>
24
25 #include "unicode.h"
26 #include "malloc.h"
27
28 const UNormalizer2 *norm_decompose = NULL;
29 UConverter *conv_from_utf8 = NULL;
30
31 static void
initializeunicode(void)32 initializeunicode(void)
33 {
34 UErrorCode uerr = U_ZERO_ERROR;
35
36 /* We're going to be doing a lot of conversions.
37 Make sure we're only initializing these once. */
38 if (conv_from_utf8 == NULL)
39 conv_from_utf8 = ucnv_open("utf8", &uerr);
40 if (norm_decompose == NULL)
41 /* change UNORM2_DECOMPOSE to UNORM2_COMPOSE if you want accents
42 to be significant */
43 norm_decompose =
44 unorm2_getInstance(NULL, "nfkc_cf", UNORM2_DECOMPOSE, &uerr);
45 }
46
47 UChar *
utf8tointernal(const char * utf8word)48 utf8tointernal(const char *utf8word)
49 {
50 UChar *intermediate=NULL, *decomposed=NULL;
51 int len, destlen;
52 UErrorCode uerr = U_ZERO_ERROR;
53 UChar *p, *q;
54
55 initializeunicode();
56
57 /* Convert from UTF-8 to internal representation */
58 destlen = 0;
59 do {
60 uerr = U_ZERO_ERROR;
61 len = ucnv_toUChars(conv_from_utf8,
62 intermediate, destlen,
63 utf8word, -1, &uerr);
64 if (uerr == U_BUFFER_OVERFLOW_ERROR) {
65 destlen = len + 1;
66 intermediate = safe_calloc(destlen,sizeof(UChar));
67 }
68 } while (uerr == U_BUFFER_OVERFLOW_ERROR);
69
70 if (U_FAILURE(uerr)) {
71 fprintf(stderr, "toUChars: %s\n", u_errorName(uerr));
72 exit(99);
73 }
74
75 /* decompose and case fold */
76 destlen = 0;
77 do {
78 uerr = U_ZERO_ERROR;
79 len = unorm2_normalize(norm_decompose,
80 intermediate, -1,
81 decomposed, destlen,
82 &uerr);
83 if (uerr == U_BUFFER_OVERFLOW_ERROR) {
84 destlen = len + 1;
85 decomposed = safe_calloc(destlen,sizeof(UChar));
86 }
87 } while (uerr == U_BUFFER_OVERFLOW_ERROR);
88
89 if (U_FAILURE(uerr)) {
90 fprintf(stderr, "normalize: %s\n", u_errorName(uerr));
91 exit(99);
92 }
93
94 free(intermediate);
95
96 /* Copy down only alphabetic characters */
97 p = q = decomposed;
98 while (*p != 0) {
99 if (u_isalpha(*p))
100 *q++ = *p;
101 p++;
102 }
103 *q = *p;
104 return decomposed;
105 }
106