1 /* Copyright (c) 2007-2018 Dovecot authors, see the included COPYING file */
2 
3 #include "test-lib.h"
4 #include "str.h"
5 #include "buffer.h"
6 #include "unichar.h"
7 
/* Exercises uni_utf8_strlen() and uni_utf8_strlen_n() on a buffer made of
   two 2-byte sequences (0xC3 0xA4, i.e. U+00E4, twice), an embedded NUL
   and a trailing 'a'. */
static void test_unichar_uni_utf8_strlen(void)
{
	static const char sample[] = "\xC3\xA4\xC3\xA4\0a";

	/* No size limit: the test expects only the two characters in front
	   of the embedded NUL to be counted. */
	test_begin("uni_utf8_strlen()");
	test_assert(uni_utf8_strlen(sample) == 2);
	test_end();

	/* Size-limited variant: the expected count only grows once both
	   bytes of a 2-byte sequence fit inside the given size. */
	test_begin("uni_utf8_strlen_n()");
	/* sizes that cut the first sequence, then exactly contain it */
	test_assert(uni_utf8_strlen_n(sample, 1) == 0);
	test_assert(uni_utf8_strlen_n(sample, 2) == 1);
	/* sizes that cut the second sequence, then exactly contain it */
	test_assert(uni_utf8_strlen_n(sample, 3) == 1);
	test_assert(uni_utf8_strlen_n(sample, 4) == 2);
	test_end();
}
23 
/* Exercises uni_utf8_partial_strlen_n() on the same layout as the strlen
   test: two 2-byte sequences, an embedded NUL and a trailing 'a'. Each
   assertion checks both the returned character count and the byte
   position written through the third argument. */
static void test_unichar_uni_utf8_partial_strlen_n(void)
{
	static const char sample[] = "\xC3\xA4\xC3\xA4\0a";
	size_t consumed;

	test_begin("uni_utf8_partial_strlen_n()");
	/* A size that ends inside a 2-byte sequence is expected to report
	   the position just after the last complete character. */
	test_assert(uni_utf8_partial_strlen_n(sample, 1, &consumed) == 0 && consumed == 0);
	test_assert(uni_utf8_partial_strlen_n(sample, 2, &consumed) == 1 && consumed == 2);
	test_assert(uni_utf8_partial_strlen_n(sample, 3, &consumed) == 1 && consumed == 2);
	test_assert(uni_utf8_partial_strlen_n(sample, 4, &consumed) == 2 && consumed == 4);
	/* The embedded NUL and the trailing 'a' are each expected to count
	   as one more character. */
	test_assert(uni_utf8_partial_strlen_n(sample, 5, &consumed) == 3 && consumed == 5);
	test_assert(uni_utf8_partial_strlen_n(sample, 6, &consumed) == 4 && consumed == 6);
	test_end();
}
38 
test_unichar_valid_unicode(void)39 static void test_unichar_valid_unicode(void)
40 {
41 	struct {
42 		const char *input;
43 		bool valid;
44 		unichar_t expected;
45 	} test_cases[] = {
46 		{ "a", TRUE, 'a' },
47 		{ "\xc3\xb1", TRUE, 0x00F1 }, /* U+00F1 */
48 		{ "\xc3\x28", FALSE, 0x0 }, /* has invalid 2nd octet */
49 		{ "\xa0\xa1", FALSE, 0x0 }, /* invalid sequence identifier */
50 		{ "\xed\xb2\x80", FALSE, 0x0 }, /* UTF-8B */
51 		{ "\xed\xa0\x80", FALSE, 0x0 }, /* surrogate halves, U+D800 .. */
52 		{ "\xed\xa0\x80", FALSE, 0x0 },
53 		{ "\xed\xa1\x80", FALSE, 0x0 },
54 		{ "\xed\xa2\x80", FALSE, 0x0 },
55 		{ "\xed\xa3\x80", FALSE, 0x0 },
56 		{ "\xed\xa4\x80", FALSE, 0x0 },
57 		{ "\xed\xa5\x80", FALSE, 0x0 },
58 		{ "\xed\xa6\x80", FALSE, 0x0 },
59 		{ "\xed\xa7\x80", FALSE, 0x0 },
60 		{ "\xed\xa8\x80", FALSE, 0x0 },
61 		{ "\xed\xa9\x80", FALSE, 0x0 },
62 		{ "\xed\xaa\x80", FALSE, 0x0 },
63 		{ "\xed\xab\x80", FALSE, 0x0 },
64 		{ "\xed\xac\x80", FALSE, 0x0 },
65 		{ "\xed\xad\x80", FALSE, 0x0 },
66 		{ "\xed\xaf\x80", FALSE, 0x0 },
67 		{ "\xed\xb0\x80", FALSE, 0x0 },
68 		{ "\xed\xb1\x80", FALSE, 0x0 },
69 		{ "\xed\xb2\x80", FALSE, 0x0 },
70 		{ "\xed\xb3\x80", FALSE, 0x0 },
71 		{ "\xed\xb4\x80", FALSE, 0x0 },
72 		{ "\xed\xb5\x80", FALSE, 0x0 },
73 		{ "\xed\xb6\x80", FALSE, 0x0 },
74 		{ "\xed\xb7\x80", FALSE, 0x0 },
75 		{ "\xed\xb8\x80", FALSE, 0x0 },
76 		{ "\xed\xb9\x80", FALSE, 0x0 },
77 		{ "\xed\xba\x80", FALSE, 0x0 },
78 		{ "\xed\xbb\x80", FALSE, 0x0 },
79 		{ "\xed\xbc\x80", FALSE, 0x0 },
80 		{ "\xed\xbd\x80", FALSE, 0x0 },
81 		{ "\xed\xbf\x80", FALSE, 0x0 }, /* .. U+DFFF */
82 		{ "\xe2\x82\xa1", TRUE, 0x20A1 },  /* U+20A1 */
83 		{ "\xe2\x28\xa1", FALSE, 0x0 }, /* invalid 2nd octet */
84 		{ "\xe2\x82\x28", FALSE, 0x0 }, /* invalid 3rd octet */
85 		{ "\xf0\x90\x8c\xbc", TRUE, 0x1033C },  /* U+1033C */
86 		{ "\xf0\x28\x8c\xbc", FALSE, 0x0 }, /*invalid 2nd octet*/
87 		{ "\xf0\x90\x28\xbc", FALSE, 0x0 }, /* invalid 3rd octet */
88 		{ "\xf0\x28\x8c\x28", FALSE, 0x0 }, /* invalid 4th octet */
89 		{ "\xf4\x80\x80\x80", TRUE, 0x100000 }, /* U+100000, supplementary plane start */
90 		{ "\xf4\x8f\xbf\xbf", TRUE, 0x10FFFF }, /* U+10FFFF, maximum value */
91 		{ "\xf8\xa1\xa1\xa1\xa1", FALSE, 0x0 }, /* invalid unicode */
92 		{ "\xfc\xa1\xa1\xa1\xa1\xa1", FALSE, 0x0 }, /* invalid unicode */
93 	};
94 
95 	test_begin("unichar valid unicode");
96 
97 	for(size_t i = 0; i < N_ELEMENTS(test_cases); i++) {
98 		unichar_t chr;
99 		if (test_cases[i].valid) {
100 			test_assert_idx(uni_utf8_get_char(test_cases[i].input, &chr) > 0, i);
101 			test_assert_idx(test_cases[i].expected == chr, i);
102 		} else {
103 			test_assert_idx(uni_utf8_get_char(test_cases[i].input, &chr) < 1, i);
104 		}
105 	}
106 
107 	test_end();
108 }
109 
test_unichar_surrogates(void)110 static void test_unichar_surrogates(void)
111 {
112 	unichar_t orig, high, low;
113 	test_begin("unichar surrogates");
114 
115 	orig = 0x10437;
116 	uni_split_surrogate(orig, &high, &low);
117 	test_assert(high == 0xD801);
118 	test_assert(low == 0xDC37);
119 	test_assert(uni_join_surrogate(high, low) == orig);
120 
121 	test_end();
122 }
123 
test_unichar(void)124 void test_unichar(void)
125 {
126 	static const char overlong_utf8[] = "\xf8\x80\x95\x81\xa1";
127 	static const char collate_in[] = "\xc3\xbc \xc2\xb3";
128 	static const char collate_exp[] = "U\xcc\x88 3";
129 	buffer_t *collate_out;
130 	unichar_t chr, chr2;
131 	string_t *str = t_str_new(16);
132 
133 	test_begin("unichars encode/decode");
134 	for (chr = 0; chr <= 0x10ffff; chr++) {
135 		/* skip surrogates */
136 		if ((chr & 0xfff800) == 0xd800)
137 			continue;
138 		/* The bottom 6 bits should be irrelevant to code coverage,
139 		   only test 000000, 111111, and something in between. */
140 		if ((chr & 63) == 1)
141 			chr += i_rand_limit(62); /* After 0, somewhere between 1 and 62 */
142 		else if ((chr & 63) > 0 && (chr & 63) < 63)
143 			chr |= 63; /* After random, straight to 63 */
144 
145 		str_truncate(str, 0);
146 		uni_ucs4_to_utf8_c(chr, str);
147 		test_assert(uni_utf8_str_is_valid(str_c(str)));
148 		test_assert(uni_utf8_get_char(str_c(str), &chr2) == (int)uni_utf8_char_bytes(*str_data(str)));
149 		test_assert(chr2 == chr);
150 
151 		if ((chr & 0x63) == 0) {
152 			unsigned int utf8len = uni_utf8_char_bytes((unsigned char)*str_c(str));
153 
154 			/* virtually truncate the byte string */
155 			while (--utf8len > 0)
156 				test_assert(uni_utf8_get_char_n(str_c(str), utf8len, &chr2) == 0);
157 
158 			utf8len = uni_utf8_char_bytes((unsigned char)*str_c(str));
159 
160 			/* actually truncate the byte stream */
161 			while (--utf8len > 0) {
162 				str_truncate(str, utf8len);
163 				test_assert(!uni_utf8_str_is_valid(str_c(str)));
164 				test_assert(uni_utf8_get_char(str_c(str), &chr2) == 0);
165 			}
166 		}
167 	}
168 	test_end();
169 
170 	test_begin("unichar collation");
171 	collate_out = buffer_create_dynamic(default_pool, 32);
172 	uni_utf8_to_decomposed_titlecase(collate_in, sizeof(collate_in),
173 					 collate_out);
174 	test_assert(strcmp(collate_out->data, collate_exp) == 0);
175 	buffer_free(&collate_out);
176 
177 	test_assert(!uni_utf8_str_is_valid(overlong_utf8));
178 	test_assert(uni_utf8_get_char(overlong_utf8, &chr2) < 0);
179 	test_end();
180 
181 	test_unichar_uni_utf8_strlen();
182 	test_unichar_uni_utf8_partial_strlen_n();
183 	test_unichar_valid_unicode();
184 	test_unichar_surrogates();
185 }
186