1 /* vim: set ts=8 sts=4 sw=4 tw=80 noet: */
2 /*======================================================================
3 Copyright (C) 2004,2005,2009 Walter Doekes <walter+tthsum@wjd.nu>
4 This file is part of tthsum.
5
6 tthsum is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 tthsum is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with tthsum. If not, see <http://www.gnu.org/licenses/>.
18 ======================================================================*/
19 #include "utf8.h"
20
21 #include "test.h"
22 #include <stdlib.h>
23 #include <string.h>
24 #include <wchar.h>
25
26
/*
 * Checks that a UTF-8 string and a wide-character string convert into each
 * other, in both directions.
 *
 * The conversions are done twice: once into caller-supplied buffers
 * (utf8towcs, wcstoutf8) and once through the allocating variants
 * (utf8toawcs, wcstoautf8).  For every conversion the returned length is
 * compared with the length of the expected string, and the converted data
 * is compared with the expected data including the terminator.
 *
 * name    - label used in the failure messages
 * utf8    - NUL-terminated UTF-8 form of the test string
 * unicode - NUL-terminated wide-character form of the same string
 *
 * Returns 0 when all checks pass.  Failures are reported through the FAIL
 * and TEST_PASS macros (from test.h, presumably returning non-zero from this
 * function -- the callers add the results up as a failure count).
 */
static int symcmpbin(const char* name, const char* utf8,
        const wchar_t* unicode) {
    enum { BUFLEN = 2048 }; /* capacity, in elements, of both buffers */
    char utf8buf[BUFLEN];
    char* utf8alloc;
    wchar_t unicodebuf[BUFLEN];
    wchar_t* unicodealloc;
    unsigned utf8len = (unsigned)strlen(utf8);
    unsigned unicodelen = (unsigned)wcslen(unicode);
    size_t ret;
    int allocsuccess;

    /* Poison both buffers completely (sizeof, not a byte count of 2048, so
     * the whole wchar_t array is covered): a conversion that forgets to
     * write the terminator then fails the comparisons below. */
    memset(utf8buf, 85, sizeof(utf8buf));
    memset(unicodebuf, 85, sizeof(unicodebuf));

    /* Both expected strings must fit, terminator included. */
    if (utf8len >= BUFLEN || unicodelen >= BUFLEN) {
        FAIL3("Test values too large: \"%s\" is %u and %u bytes", name,
            utf8len, unicodelen);
    }

    /* Conversions into the fixed-size buffers. */
    ret = utf8towcs(unicodebuf, utf8, BUFLEN);
    if ((unsigned)ret != unicodelen) {
        FAIL3("utf8towcs returned wrong length for \"%s\": %u instead of %u",
            name, (unsigned)ret, unicodelen);
    }
    ret = wcstoutf8(utf8buf, unicode, BUFLEN);
    if ((unsigned)ret != utf8len) {
        FAIL3("wcstoutf8 returned wrong length for \"%s\": %u instead of %u",
            name, (unsigned)ret, utf8len);
    }

    /* Conversions through the allocating variants.  A result other than
     * (size_t)-1 is assumed to mean that a buffer was allocated, so it is
     * released before bailing out on a wrong length. */
    ret = utf8toawcs(&unicodealloc, utf8);
    if ((unsigned)ret != unicodelen) {
        if (ret != (size_t)-1)
            free(unicodealloc);
        FAIL3("utf8toawcs returned wrong length for \"%s\": %u instead of %u",
            name, (unsigned)ret, unicodelen);
    }
    ret = wcstoautf8(&utf8alloc, unicode);
    if ((unsigned)ret != utf8len) {
        if (ret != (size_t)-1)
            free(utf8alloc);
        free(unicodealloc);
        FAIL3("wcstoautf8 returned wrong length for \"%s\": %u instead of %u",
            name, (unsigned)ret, utf8len);
    }

    /* Compare and release the allocated results before running any check
     * that may leave this function early. */
    allocsuccess = memcmp(utf8, utf8alloc, utf8len + 1) == 0
        && memcmp(unicode, unicodealloc,
            (unicodelen + 1) * sizeof(wchar_t)) == 0;
    free(utf8alloc);
    free(unicodealloc);

    /* Go through unsigned char so bytes >= 0x80 are not sign-extended in
     * the diagnostic output. */
    TEST_PASS3(memcmp(utf8, utf8buf, utf8len + 1) == 0,
        "utf8 strings mismatch on \"%s\", first chars 0x%02hx 0x%02hx",
        name, (unsigned short)(unsigned char)utf8buf[0],
        (unsigned short)(unsigned char)utf8buf[1]); /* %hhx is not C90 */
    TEST_PASS2(memcmp(unicode, unicodebuf, (unicodelen + 1) * sizeof(wchar_t))
        == 0, "unicode strings mismatch on \"%s\", first char 0x%x",
        name, (unsigned)unicodebuf[0]);
    TEST_PASS(allocsuccess, "utf8 and/or unicode strings allocated by the "
        "*toa* variants failed");
    return 0;
}
73
/*
 * Convenience form of symcmpbin() for strings that are printable as they
 * are: the UTF-8 text itself is used as the label in failure messages.
 */
static int symcmp(const char* utf8, const wchar_t* unicode) {
    const char* label = utf8;

    return symcmpbin(label, utf8, unicode);
}
77
/*
 * Checks utf8towcs() with a destination that is too small for the input.
 *
 * utf8 is converted into a buffer of five wide characters; the call must
 * report exactly five converted characters, and those five must be equal
 * to the first five characters of the full conversion obtained from
 * utf8toawcs().
 *
 * utf8 - NUL-terminated UTF-8 string; the callers in this file pass strings
 *        of more than five characters.
 *
 * Returns 0 on success.  Failures are reported through the FAIL macros
 * (from test.h, presumably returning non-zero from this function).
 */
static int shortutf8towcs(const char* utf8) {
    wchar_t buf[5];
    const size_t buflen = sizeof(buf) / sizeof(buf[0]);
    wchar_t* abuf;
    size_t ret;
    int same;

    ret = utf8towcs(buf, utf8, buflen);
    if (ret == (size_t)-1) {
        FAIL1("utf8towcs returned -1 on \"%s\" with a 5 char buffer", utf8);
    }
    if (ret != buflen) {
        FAIL2("utf8towcs returned %u on \"%s\" with a 5 char buffer",
            (unsigned)ret, utf8);
    }
    if (utf8toawcs(&abuf, utf8) == (size_t)-1) {
        FAIL1("failed to get an auto-alloc'd comparison sample for \"%s\"",
            utf8);
    }

    /* Compare all five wide characters: memcmp() counts bytes, so the size
     * of the whole buffer is needed here, not the element count.  Release
     * the sample before a failure can leave the function. */
    same = memcmp(abuf, buf, sizeof(buf)) == 0;
    free(abuf);
    if (!same) {
        FAIL1("short utf8towcs didn't compare with full sample for \"%s\"",
            utf8);
    }
    return 0;
}
99
/*
 * Verifies the width of wchar_t that the other tests rely on: two bytes
 * when _WIN32 is defined, four bytes otherwise.
 */
static int test_wchar_size() {
    const unsigned width = (unsigned)sizeof(wchar_t);

#ifdef _WIN32
    TEST_PASS1(width == 2,
        "expected wchar_t to be 2 bytes on windows, got %u instead",
        width);
#else /* !_WIN32 */
    TEST_PASS1(width == 4,
        "expected wchar_t to be 4 bytes on this OS, got %u instead",
        width);
#endif /* !_WIN32 */
    return 0;
}
112
/*
 * Round-trips plain ASCII strings: the empty string, alphanumerics and the
 * characters at both ends of the 7-bit range.  Returns the sum of the
 * results of the individual comparisons (0 when all of them pass).
 */
static int test_bidirectional_ascii() {
    int failures = 0;

    failures += symcmp("", L"");
    failures += symcmp("0123ABCDabcd", L"0123ABCDabcd");
    failures += symcmpbin("(low ascii)", "\x01\x02\x03\x7d\x7e\x7f",
        L"\x01\x02\x03\x7d\x7e\x7f");
    return failures;
}
119
/*
 * Round-trips three Hiragana characters (U+306A, U+308B, U+3068), each of
 * which takes three bytes in UTF-8.
 */
static int test_bidirectional_asian() {
    const char* naruto_utf8 = "\xe3\x81\xaa\xe3\x82\x8b\xe3\x81\xa8";
    wchar_t naruto_wide[] = {0x306a, 0x308b, 0x3068, 0x0};

    return symcmpbin("Hiragana na-ru-to", naruto_utf8, naruto_wide);
}
125
/*
 * Round-trips accented Latin characters, all of which take two bytes in
 * UTF-8.  Returns the sum of the results of the individual comparisons
 * (0 when all of them pass).
 */
static int test_bidirectional_european() {
    wchar_t dutch_wide[] = {
        0xe4, 0xeb, 0xef, 0xf6, 0xfc, 0xff,
        0xc4, 0xcb, 0xcf, 0xd6, 0xdc, 0x178,
        0x0
    };
    wchar_t french_wide[] = {0xe1, 0xe2, 0xe0, 0xc1, 0xc2, 0xc0, 0x0};
    wchar_t swedish_wide[] = {0xe5, 0xe4, 0xf6, 0xc5, 0xc4, 0xd6, 0x0};
    int failures = 0;

    failures += symcmpbin("Dutch aeiouy with trema",
        "\xc3\xa4\xc3\xab\xc3\xaf\xc3\xb6\xc3\xbc\xc3\xbf"
        "\xc3\x84\xc3\x8b\xc3\x8f\xc3\x96\xc3\x9c\xc5\xb8", dutch_wide);
    failures += symcmpbin("French several a's",
        "\xc3\xa1\xc3\xa2\xc3\xa0\xc3\x81\xc3\x82\xc3\x80", french_wide);
    failures += symcmpbin("Swedish ao/ae/oe",
        "\xc3\xa5\xc3\xa4\xc3\xb6\xc3\x85\xc3\x84\xc3\x96", swedish_wide);
    return failures;
}
139
/*
 * Verifies that the converters reject malformed input by returning
 * (size_t)-1.
 *
 * utf8towcs() is fed byte sequences that are not valid UTF-8: a lone
 * Latin-1 byte, lead bytes without a proper continuation byte, and bare
 * continuation bytes.  wcstoutf8() is fed surrogate code units that are
 * unpaired or in the wrong order and, where wchar_t is wider than 16 bits,
 * a value that would need an encoding of five or more bytes.
 *
 * Returns 0 when all checks pass; failures are reported through the
 * TEST_PASS macro from test.h.
 */
static int test_invalid_character_handling() {
    wchar_t unicode[2048];
    char utf8[2048];
    wchar_t illegal_surrogates1[] = {0x1,0xdc00,0xd800,0x1,0x0};
    wchar_t illegal_surrogates2[] = {0x1,0xd800,0xd800,0x1,0x0};
    wchar_t illegal_surrogates3[] = {0x1,0xd800,0x1,0xdc00,0x0};
#ifndef _WIN32
    wchar_t high_unicode[] = {0x1,0xffff,0x200000,0xffff,0x1,0x0};
#endif /* !_WIN32 */

    /* Malformed UTF-8 input. */
    TEST_PASS(utf8towcs(unicode, "abc" "\xe4" "def", 2048) == (size_t)-1,
        "utf8towcs should fail on latin1 swedish ae (0xE4)");
    TEST_PASS(utf8towcs(unicode, "\xc1\xff", 2048) == (size_t)-1,
        "utf8towcs should fail on \"\\xc1\\xff\"");
    TEST_PASS(utf8towcs(unicode, "\xc2" "abc", 2048) == (size_t)-1,
        "utf8towcs should fail on \"\\xc2abc\"");
    TEST_PASS(utf8towcs(unicode, "abc" "\x80" "def", 2048) == (size_t)-1,
        "utf8towcs should fail on bare \\x80");
    TEST_PASS(utf8towcs(unicode, "abc" "\xbf" "def", 2048) == (size_t)-1,
        "utf8towcs should fail on bare \\xbf");

    /* Wide strings that have no valid UTF-8 representation. */
    TEST_PASS(wcstoutf8(utf8, illegal_surrogates1, 2048) == (size_t)-1,
        "wcstoutf8 should fail on surrogate characters");
    TEST_PASS(wcstoutf8(utf8, illegal_surrogates2, 2048) == (size_t)-1,
        "wcstoutf8 should fail on surrogate characters");
    TEST_PASS(wcstoutf8(utf8, illegal_surrogates3, 2048) == (size_t)-1,
        "wcstoutf8 should fail on surrogate characters");
#ifndef _WIN32
    /* Windows has 16bits wchars, so nothing above 0xffff can be found */
    TEST_PASS(wcstoutf8(utf8, high_unicode, 2048) == (size_t)-1,
        "wcstoutf8 should fail on characters above 0x200000"); /*5+ bytes*/
#endif /* !_WIN32 */
    return 0;
}
173
/*
 * Verifies that utf8towcs() rejects overlong encodings: 2-, 3- and 4-byte
 * sequences that encode a value which fits in a single byte.
 *
 * Returns 0 when every sequence is rejected with (size_t)-1, and 1 as soon
 * as one of them is accepted.
 */
static int test_overlong_encoding() {
    static const char* const overlong[] = {
        "\xc0\x80",         /* 0x0 */
        "\xc0\xaf",         /* 0x2f */
        "\xc1\xbf",         /* 0x7f */
        "\xe0\x80\x80",     /* 0x0 */
        "\xe0\x80\xaf",     /* 0x2f */
        "\xe0\x81\xbf",     /* 0x7f */
        "\xf0\x80\x80\x80", /* 0x0 */
        "\xf0\x80\x80\xaf", /* 0x2f */
        "\xf0\x80\x81\xbf"  /* 0x7f */
    };
    wchar_t decoded[2];
    size_t i;

    for (i = 0; i < sizeof(overlong) / sizeof(overlong[0]); ++i) {
        if (utf8towcs(decoded, overlong[i], 2) != (size_t)-1)
            return 1;
    }
    return 0;
}
186
/*
 * Runs the short-destination check on an ASCII string, on a string of
 * two-byte UTF-8 characters and on a longer ASCII string.  Returns the sum
 * of the individual results (0 when all of them pass).
 */
static int test_short_destination() {
    int failures = 0;

    failures += shortutf8towcs("abcdef");
    failures += shortutf8towcs(
        "\xc3\xa1\xc3\xa2\xc3\xa3\xc3\xa4\xc3\xa5\xc3\xa6");
    failures += shortutf8towcs("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
    return failures;
}
192
/*
 * Round-trips four Osmanya characters (U+10480..U+10483), which lie above
 * 16 bits and take four bytes each in UTF-8.  With _WIN32 defined the wide
 * string holds them as surrogate pairs, otherwise as single wchar_t values.
 */
static int test_wide_wchars() {
    const char* osmanya_utf8 =
        "\xf0\x90\x92\x80\xf0\x90\x92\x81\xf0\x90\x92\x82\xf0\x90\x92\x83";
#ifdef _WIN32
    wchar_t osmanya_wide[] = {
        0xd801, 0xdc80, 0xd801, 0xdc81, 0xd801, 0xdc82, 0xd801, 0xdc83, 0x0
    };
#else /* !_WIN32 */
    wchar_t osmanya_wide[] = {0x10480, 0x10481, 0x10482, 0x10483, 0x0};
#endif /* !_WIN32 */

    return symcmpbin("Osmanya alef ba ta ja", osmanya_utf8, osmanya_wide);
}
204
205
/* Test suite "utf8_test": lists every test function defined in this file.
 * TESTS, TEST and ENDTESTS are macros that are not defined here (presumably
 * they come from test.h -- see there for how the listed tests are run). */
TESTS(utf8_test)
    TEST(test_wchar_size);
    TEST(test_bidirectional_ascii);
    TEST(test_bidirectional_asian);
    TEST(test_bidirectional_european);
    TEST(test_invalid_character_handling);
    TEST(test_overlong_encoding);
    TEST(test_short_destination);
    TEST(test_wide_wchars);
ENDTESTS
216