1 /* vim: set ts=8 sts=4 sw=4 tw=80 noet: */
2 /*======================================================================
3 Copyright (C) 2004,2005,2009 Walter Doekes <walter+tthsum@wjd.nu>
4 This file is part of tthsum.
5 
6 tthsum is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10 
11 tthsum is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with tthsum.  If not, see <http://www.gnu.org/licenses/>.
18 ======================================================================*/
19 #include "utf8.h"
20 
21 #include "test.h"
22 #include <stdlib.h>
23 #include <string.h>
24 #include <wchar.h>
25 
26 
/*
 * Round-trip check of one UTF-8 / wide character string pair.
 *
 * utf8 is converted to wide characters and unicode is converted to UTF-8,
 * once with the caller-supplied-buffer functions (utf8towcs, wcstoutf8) and
 * once with the allocating ones (utf8toawcs, wcstoautf8). The returned
 * lengths must equal the length of the expected counterpart and the produced
 * strings must match it byte for byte, terminator included.
 *
 * name only identifies the test values in failure messages.
 * Returns 0 when this point is reached; mismatches are reported through the
 * FAIL and TEST_PASS macros.
 */
static int symcmpbin(const char* name, const char* utf8,
	const wchar_t* unicode) {
    char utf8buf[2048];
    char* utf8alloc;
    wchar_t unicodebuf[2048];
    wchar_t* unicodealloc;
    unsigned utf8len = strlen(utf8);
    unsigned unicodelen = wcslen(unicode);
    unsigned ret;
    unsigned allocsuccess;
    /* Poison the buffers over their full size (not just the first 2048
     * bytes), so leftover contents are never mistaken for a result. */
    memset(utf8buf, 85, sizeof(utf8buf));
    memset(unicodebuf, 85, sizeof(unicodebuf));

    if (utf8len >= 2048 || unicodelen >= 512) {
	FAIL3("Test values too large: \"%s\" is %u and %u bytes", name,
		utf8len, unicodelen);
    }
    if ((ret = (unsigned)utf8towcs(unicodebuf, utf8, 2048)) != unicodelen) {
	FAIL3("utf8towcs returned wrong length for \"%s\": %u instead of %u",
		name, ret, unicodelen);
    }
    if ((ret = (unsigned)wcstoutf8(utf8buf, unicode, 2048)) != utf8len) {
	FAIL3("wcstoutf8 returned wrong length for \"%s\": %u instead of %u",
		name, ret, utf8len);
    }
    if ((ret = (unsigned)utf8toawcs(&unicodealloc, utf8)) != unicodelen) {
	/* A conversion that succeeded with the wrong length still handed us
	 * a buffer; release it before bailing out.
	 * NOTE(review): assumes nothing is left allocated when (size_t)-1
	 * is returned -- confirm against utf8.h. */
	if (ret != (unsigned)-1)
	    free(unicodealloc);
	FAIL3("utf8toawcs returned wrong length for \"%s\": %u instead of %u",
		name, ret, unicodelen);
    }
    if ((ret = (unsigned)wcstoautf8(&utf8alloc, unicode)) != utf8len) {
	/* Same as above for the UTF-8 buffer; the wide buffer from the
	 * previous step is valid at this point and must go too. */
	if (ret != (unsigned)-1)
	    free(utf8alloc);
	free(unicodealloc);
	FAIL3("wcstoautf8 returned wrong length for \"%s\": %u instead of %u",
		name, ret, utf8len);
    }

    /* Compare the allocated results now, so both buffers can be freed
     * before any of the TEST_PASS macros below gets to report a failure. */
    allocsuccess = memcmp(utf8, utf8alloc, utf8len + 1) == 0 && memcmp(unicode,
	    unicodealloc, (unicodelen + 1) * sizeof(wchar_t)) == 0;
    free(utf8alloc);
    free(unicodealloc);

    TEST_PASS3(memcmp(utf8, utf8buf, utf8len + 1) == 0,
	    "utf8 strings mismatch on \"%s\", first chars 0x%02hx 0x%02hx",
	    name, (short)utf8buf[0], (short)utf8buf[1]); /* %hhx is not C90 */
    TEST_PASS2(memcmp(unicode, unicodebuf, (unicodelen + 1) * sizeof(wchar_t))
	    == 0, "unicode strings mismatch on \"%s\", first char 0x%x",
	    name, (unsigned)unicodebuf[0]);
    TEST_PASS(allocsuccess, "utf8 and/or unicode strings allocated by the "
	    "*toa* variants failed");
    return 0;
}
73 
/* Shorthand for symcmpbin() for values that are printable as they are: the
 * UTF-8 string itself is used as the name shown in failure messages. */
static int symcmp(const char* utf8, const wchar_t* unicode) {
    const char* label = utf8;
    return symcmpbin(label, utf8, unicode);
}
77 
/*
 * Converts utf8 with utf8towcs into a destination of only 5 wide characters
 * and checks that exactly 5 characters are reported and that those 5
 * characters equal the start of the full conversion done by utf8toawcs.
 *
 * Returns 0 when this point is reached; failures are reported through the
 * FAIL macros.
 */
static int shortutf8towcs(const char* utf8) {
    size_t ret;
    wchar_t buf[5];
    wchar_t* abuf;
    ret = utf8towcs(buf, utf8, sizeof(buf) / sizeof(buf[0]));
    if (ret == (size_t)-1) {
	FAIL1("utf8towcs returned -1 on \"%s\" with a 5 char buffer", utf8);
    }
    if (ret != 5) {
	FAIL2("utf8towcs returned %u on \"%s\" with a 5 char buffer",
		(unsigned)ret, utf8);
    }
    if (utf8toawcs(&abuf, utf8) == (size_t)-1) {
	FAIL1("failed to get an auto-alloc'd comparison sample for \"%s\"",
		utf8);
    }
    /* memcmp counts bytes: compare all 5 wide characters, not just the
     * first 5 bytes of the buffers. */
    if (memcmp(abuf, buf, sizeof(buf)) != 0) {
	free(abuf);
	FAIL1("short utf8towcs didn't compare with full sample for \"%s\"",
		utf8);
    }
    free(abuf);
    return 0;
}
99 
/* Checks the size of wchar_t: 2 bytes are expected when _WIN32 is defined,
 * 4 bytes otherwise. */
static int test_wchar_size() {
    const unsigned width = (unsigned)sizeof(wchar_t);
#ifndef _WIN32
    TEST_PASS1(sizeof(wchar_t) == 4,
	    "expected wchar_t to be 4 bytes on this OS, got %u instead",
	    width);
#else /* _WIN32 */
    TEST_PASS1(sizeof(wchar_t) == 2,
	    "expected wchar_t to be 2 bytes on windows, got %u instead",
	    width);
#endif /* _WIN32 */
    return 0;
}
112 
/* Round-trips plain ASCII in both directions: the empty string, a few
 * alphanumerics and the lowest/highest 7-bit values. Returns the summed
 * results of the individual comparisons. */
static int test_bidirectional_ascii() {
    int failures = 0;
    failures += symcmp("", L"");
    failures += symcmp("0123ABCDabcd", L"0123ABCDabcd");
    failures += symcmpbin("(low ascii)", "\x01\x02\x03\x7d\x7e\x7f",
	    L"\x01\x02\x03\x7d\x7e\x7f");
    return failures;
}
119 
/* Round-trips three hiragana characters (three UTF-8 bytes each) in both
 * directions. */
static int test_bidirectional_asian() {
    wchar_t naruto_wide[] = {0x306a,0x308b,0x3068,0x0};
    const char* naruto_utf8 = "\xe3\x81\xaa\xe3\x82\x8b\xe3\x81\xa8";
    int result = symcmpbin("Hiragana na-ru-to", naruto_utf8, naruto_wide);
    return result;
}
125 
/* Round-trips accented Latin characters (two UTF-8 bytes each) in both
 * directions, one symcmpbin() run per sample. Returns the summed results. */
static int test_bidirectional_european() {
    wchar_t trema_wide[] = {0xe4,0xeb,0xef,0xf6,0xfc,0xff,0xc4,0xcb,0xcf,0xd6,
			    0xdc,0x178,0x0};
    wchar_t a_wide[] = {0xe1,0xe2,0xe0,0xc1,0xc2,0xc0,0x0};
    wchar_t nordic_wide[] = {0xe5,0xe4,0xf6,0xc5,0xc4,0xd6,0x0};
    int failures = 0;

    failures += symcmpbin("Dutch aeiouy with trema",
	    "\xc3\xa4\xc3\xab\xc3\xaf\xc3\xb6\xc3\xbc\xc3\xbf"
	    "\xc3\x84\xc3\x8b\xc3\x8f\xc3\x96\xc3\x9c\xc5\xb8", trema_wide);
    failures += symcmpbin("French several a's",
	    "\xc3\xa1\xc3\xa2\xc3\xa0\xc3\x81\xc3\x82\xc3\x80", a_wide);
    failures += symcmpbin("Swedish ao/ae/oe",
	    "\xc3\xa5\xc3\xa4\xc3\xb6\xc3\x85\xc3\x84\xc3\x96", nordic_wide);
    return failures;
}
139 
/*
 * Checks that the conversions refuse invalid input by returning (size_t)-1:
 * utf8towcs on byte sequences that are not valid UTF-8, and wcstoutf8 on
 * misplaced surrogates and (where wchar_t is wider than 16 bits) on a value
 * of 0x200000.
 */
static int test_invalid_character_handling() {
    wchar_t unicode[2048];
    char utf8[2048];
    /* Low surrogate first, two high surrogates, and a split pair. */
    wchar_t illegal_surrogates1[] = {0x1,0xdc00,0xd800,0x1,0x0};
    wchar_t illegal_surrogates2[] = {0x1,0xd800,0xd800,0x1,0x0};
    wchar_t illegal_surrogates3[] = {0x1,0xd800,0x1,0xdc00,0x0};
#ifndef _WIN32
    wchar_t high_unicode[] = {0x1,0xffff,0x200000,0xffff,0x1,0x0};
#endif /* !_WIN32 */

    TEST_PASS(utf8towcs(unicode, "abc" "\xe4" "def", 2048) == (size_t)-1,
	    "utf8towcs should fail on latin1 swedish ae (0xE4)");
    TEST_PASS(utf8towcs(unicode, "\xc1\xff", 2048) == (size_t)-1,
	    "utf8towcs should fail on \"\\xc1\\xff\"");
    TEST_PASS(utf8towcs(unicode, "\xc2" "abc", 2048) == (size_t)-1,
	    "utf8towcs should fail on \"\\xc2abc\"");
    TEST_PASS(utf8towcs(unicode, "abc" "\x80" "def", 2048) == (size_t)-1,
	    "utf8towcs should fail on bare \\x80");
    /* The message used to name \x80 here too, hiding which case failed. */
    TEST_PASS(utf8towcs(unicode, "abc" "\xbf" "def", 2048) == (size_t)-1,
	    "utf8towcs should fail on bare \\xbf");
    TEST_PASS(wcstoutf8(utf8, illegal_surrogates1, 2048) == (size_t)-1,
	    "wcstoutf8 should fail on surrogate characters");
    TEST_PASS(wcstoutf8(utf8, illegal_surrogates2, 2048) == (size_t)-1,
	    "wcstoutf8 should fail on surrogate characters");
    TEST_PASS(wcstoutf8(utf8, illegal_surrogates3, 2048) == (size_t)-1,
	    "wcstoutf8 should fail on surrogate characters");
#ifndef _WIN32
    /* Windows has 16bits wchars, so nothing above 0xffff can be found */
    TEST_PASS(wcstoutf8(utf8, high_unicode, 2048) == (size_t)-1,
	    "wcstoutf8 should fail on characters above 0x200000"); /*5+ bytes*/
#endif /* !_WIN32 */
    return 0;
}
173 
/* Feeds utf8towcs a series of overlong encodings (2, 3 and 4 byte forms of
 * 7-bit values). Returns 1 as soon as one of them is not rejected with
 * (size_t)-1, and 0 when all of them are rejected. */
static int test_overlong_encoding() {
    static const char* const overlong[] = {
	"\xc0\x80", /* 0x0 */
	"\xc0\xaf", /* 0x2f */
	"\xc1\xbf", /* 0x7f */
	"\xe0\x80\x80", /* 0x0 */
	"\xe0\x80\xaf", /* 0x2f */
	"\xe0\x81\xbf", /* 0x7f */
	"\xf0\x80\x80\x80", /* 0x0 */
	"\xf0\x80\x80\xaf", /* 0x2f */
	"\xf0\x80\x81\xbf" /* 0x7f */
    };
    wchar_t buf[2];
    size_t i;

    for (i = 0; i < sizeof(overlong) / sizeof(overlong[0]); ++i) {
	/* Stop at the first sequence that gets accepted; later ones are
	 * not tried. */
	if (utf8towcs(buf, overlong[i], 2) != (size_t)-1)
	    return 1;
    }
    return 0;
}
186 
/* Runs shortutf8towcs() on inputs that are longer than its destination
 * buffer: plain ASCII, two-byte UTF-8 sequences and a long ASCII string.
 * Returns the summed results. */
static int test_short_destination() {
    int failures = 0;
    failures += shortutf8towcs("abcdef");
    failures += shortutf8towcs(
	    "\xc3\xa1\xc3\xa2\xc3\xa3\xc3\xa4\xc3\xa5\xc3\xa6");
    failures += shortutf8towcs("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
    return failures;
}
192 
/* Round-trips four characters above 0xffff (four UTF-8 bytes each). With
 * _WIN32 defined the expected wide string is spelled as surrogate pairs,
 * otherwise as one wchar_t per character. */
static int test_wide_wchars() { /* larger than 16 bits */
#ifndef _WIN32
    wchar_t wide[] = {0x10480,0x10481,0x10482,0x10483,0x0};
#else /* _WIN32 */
    wchar_t wide[] = {0xd801,0xdc80,0xd801,0xdc81,0xd801,0xdc82,0xd801,0xdc83,
		      0x0};
#endif /* _WIN32 */
    const char* encoded =
	    "\xf0\x90\x92\x80\xf0\x90\x92\x81\xf0\x90\x92\x82\xf0\x90\x92\x83";
    return symcmpbin("Osmanya alef ba ta ja", encoded, wide);
}
204 
205 
/* Test list for the utf8 conversion functions: names every test function of
 * this file, in the order given, through the TESTS/TEST/ENDTESTS macros
 * (presumably provided by test.h -- their expansion is not visible here). */
TESTS(utf8_test)
    TEST(test_wchar_size);
    TEST(test_bidirectional_ascii);
    TEST(test_bidirectional_asian);
    TEST(test_bidirectional_european);
    TEST(test_invalid_character_handling);
    TEST(test_overlong_encoding);
    TEST(test_short_destination);
    TEST(test_wide_wchars);
ENDTESTS
216