1 /* Test of compatibility decomposition of UTF-8 strings.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
16
17 /* Written by Bruno Haible <bruno@clisp.org>, 2009. */
18
19 #include <config.h>
20
21 #if GNULIB_TEST_UNINORM_U8_NORMALIZE
22
23 #include "uninorm.h"
24
25 #include <signal.h>
26 #include <stdlib.h>
27 #include <unistd.h>
28
29 #include "unistr.h"
30 #include "macros.h"
31
32 static int
check(const uint8_t * input,size_t input_length,const uint8_t * expected,size_t expected_length)33 check (const uint8_t *input, size_t input_length,
34 const uint8_t *expected, size_t expected_length)
35 {
36 size_t length;
37 uint8_t *result;
38
39 /* Test return conventions with resultbuf == NULL. */
40 result = u8_normalize (UNINORM_NFKD, input, input_length, NULL, &length);
41 if (!(result != NULL))
42 return 1;
43 if (!(length == expected_length))
44 return 2;
45 if (!(u8_cmp (result, expected, expected_length) == 0))
46 return 3;
47 free (result);
48
49 /* Test return conventions with resultbuf too small. */
50 if (expected_length > 0)
51 {
52 uint8_t *preallocated;
53
54 length = expected_length - 1;
55 preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
56 result = u8_normalize (UNINORM_NFKD, input, input_length, preallocated, &length);
57 if (!(result != NULL))
58 return 4;
59 if (!(result != preallocated))
60 return 5;
61 if (!(length == expected_length))
62 return 6;
63 if (!(u8_cmp (result, expected, expected_length) == 0))
64 return 7;
65 free (result);
66 free (preallocated);
67 }
68
69 /* Test return conventions with resultbuf large enough. */
70 {
71 uint8_t *preallocated;
72
73 length = expected_length;
74 preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
75 result = u8_normalize (UNINORM_NFKD, input, input_length, preallocated, &length);
76 if (!(result != NULL))
77 return 8;
78 if (!(preallocated == NULL || result == preallocated))
79 return 9;
80 if (!(length == expected_length))
81 return 10;
82 if (!(u8_cmp (result, expected, expected_length) == 0))
83 return 11;
84 free (preallocated);
85 }
86
87 return 0;
88 }
89
/* Store COUNT copies of the two-byte sequence { 0xCC, TRAIL } starting at P.
   Return the address just past the last byte stored.  */
static uint8_t *
put_cc_pairs (uint8_t *p, size_t count, uint8_t trail)
{
  size_t i;

  for (i = 0; i < count; i++)
    {
      *p++ = 0xCC;
      *p++ = trail;
    }
  return p;
}

/* Run one pass (PASS must be 0, 1 or 2; anything else aborts) of the
   long-sequence test.

   The input is 'A' followed by m - 1 two-byte sequences: m1 copies of
   0xCC 0x99 (U+0319) and m2 copies of 0xCC 0x80 (U+0300), arranged
   according to PASS:
     0: all U+0319 first, then all U+0300;
     1: all U+0300 first, then all U+0319;
     2: U+0319 and U+0300 alternating, followed by the left-over U+0319.
   In every pass the expected output is 'A', then all U+0319, then all
   U+0300.  Input and expected output share a single allocation of
   2 * (2 * m - 1) bytes; if that allocation fails the pass is silently
   skipped.  */
static void
check_long_mark_sequence (int pass)
{
  size_t repeat = 1;
  size_t m = 100000;
  uint8_t *input = (uint8_t *) malloc (2 * (2 * m - 1) * sizeof (uint8_t));

  if (input != NULL)
    {
      /* The expected output lives in the second half of the buffer.  */
      uint8_t *expected = input + (2 * m - 1);
      size_t m1 = m / 2;
      size_t m2 = (m - 1) / 2;
      /* NB: m1 + m2 == m - 1.  */
      uint8_t *p;
      size_t i;

      input[0] = 0x41;
      p = input + 1;
      switch (pass)
        {
        case 0:
          p = put_cc_pairs (p, m1, 0x99);
          p = put_cc_pairs (p, m2, 0x80);
          break;

        case 1:
          p = put_cc_pairs (p, m2, 0x80);
          p = put_cc_pairs (p, m1, 0x99);
          break;

        case 2:
          for (i = 0; i < m2; i++)
            {
              p = put_cc_pairs (p, 1, 0x99);
              p = put_cc_pairs (p, 1, 0x80);
            }
          /* i continues from m2, so this adds the U+0319 that have no
             U+0300 partner.  */
          for (; i < m1; i++)
            p = put_cc_pairs (p, 1, 0x99);
          break;

        default:
          abort ();
        }

      expected[0] = 0x41;
      p = expected + 1;
      p = put_cc_pairs (p, m1, 0x99);
      p = put_cc_pairs (p, m2, 0x80);

      for (; repeat > 0; repeat--)
        ASSERT (check (input, 2 * m - 1, expected, 2 * m - 1) == 0);

      free (input);
    }
}

/* Exercise u8_normalize with UNINORM_NFKD through check(): first a series
   of fixed input/expected byte vectors, then three long sequences of
   two-byte marks to check that sorting them does not take quadratic time.
   Any failing check terminates the test through ASSERT.  */
void
test_u8_nfkd (void)
{
  { /* Empty string.  */
    ASSERT (check (NULL, 0, NULL, 0) == 0);
  }
  { /* U+0020 SPACE (expected to stay unchanged) */
    static const uint8_t input[]    = { 0x20 };
    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
  }

  { /* U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS */
    static const uint8_t input[]    = { 0xC3, 0x84 };
    static const uint8_t expected[] = { 0x41, 0xCC, 0x88 };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+01DE LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON */
    static const uint8_t input[]    = { 0xC7, 0x9E };
    static const uint8_t expected[] = { 0x41, 0xCC, 0x88, 0xCC, 0x84 };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+1FC1 GREEK DIALYTIKA AND PERISPOMENI */
    static const uint8_t input[]    = { 0xE1, 0xBF, 0x81 };
    static const uint8_t expected[] = { 0x20, 0xCC, 0x88, 0xCD, 0x82 };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+2113 SCRIPT SMALL L */
    static const uint8_t input[]    = { 0xE2, 0x84, 0x93 };
    static const uint8_t expected[] = { 0x6C };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+00A0 NO-BREAK SPACE */
    static const uint8_t input[]    = { 0xC2, 0xA0 };
    static const uint8_t expected[] = { 0x20 };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+FB6C ARABIC LETTER VEH INITIAL FORM */
    static const uint8_t input[]    = { 0xEF, 0xAD, 0xAC };
    static const uint8_t expected[] = { 0xDA, 0xA4 };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+FB6D ARABIC LETTER VEH MEDIAL FORM */
    static const uint8_t input[]    = { 0xEF, 0xAD, 0xAD };
    static const uint8_t expected[] = { 0xDA, 0xA4 };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+FB6B ARABIC LETTER VEH FINAL FORM */
    static const uint8_t input[]    = { 0xEF, 0xAD, 0xAB };
    static const uint8_t expected[] = { 0xDA, 0xA4 };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+FB6A ARABIC LETTER VEH ISOLATED FORM */
    static const uint8_t input[]    = { 0xEF, 0xAD, 0xAA };
    static const uint8_t expected[] = { 0xDA, 0xA4 };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+246E CIRCLED NUMBER FIFTEEN */
    static const uint8_t input[]    = { 0xE2, 0x91, 0xAE };
    static const uint8_t expected[] = { 0x31, 0x35 };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+2122 TRADE MARK SIGN */
    static const uint8_t input[]    = { 0xE2, 0x84, 0xA2 };
    static const uint8_t expected[] = { 0x54, 0x4D };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+1D62 LATIN SUBSCRIPT SMALL LETTER I */
    static const uint8_t input[]    = { 0xE1, 0xB5, 0xA2 };
    static const uint8_t expected[] = { 0x69 };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+FE35 PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS */
    static const uint8_t input[]    = { 0xEF, 0xB8, 0xB5 };
    static const uint8_t expected[] = { 0x28 };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+FF21 FULLWIDTH LATIN CAPITAL LETTER A */
    static const uint8_t input[]    = { 0xEF, 0xBC, 0xA1 };
    static const uint8_t expected[] = { 0x41 };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+FF64 HALFWIDTH IDEOGRAPHIC COMMA */
    static const uint8_t input[]    = { 0xEF, 0xBD, 0xA4 };
    static const uint8_t expected[] = { 0xE3, 0x80, 0x81 };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+FE51 SMALL IDEOGRAPHIC COMMA */
    static const uint8_t input[]    = { 0xEF, 0xB9, 0x91 };
    static const uint8_t expected[] = { 0xE3, 0x80, 0x81 };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+3392 SQUARE MHZ */
    static const uint8_t input[]    = { 0xE3, 0x8E, 0x92 };
    static const uint8_t expected[] = { 0x4D, 0x48, 0x7A };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+215C VULGAR FRACTION THREE EIGHTHS */
    static const uint8_t input[]    = { 0xE2, 0x85, 0x9C };
    static const uint8_t expected[] = { 0x33, 0xE2, 0x81, 0x84, 0x38 };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+00B5 MICRO SIGN */
    static const uint8_t input[]    = { 0xC2, 0xB5 };
    static const uint8_t expected[] = { 0xCE, 0xBC };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+FDFA ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM */
    static const uint8_t input[]    = { 0xEF, 0xB7, 0xBA };
    static const uint8_t expected[] =
      { 0xD8, 0xB5, 0xD9, 0x84, 0xD9, 0x89, 0x20, 0xD8, 0xA7, 0xD9, 0x84, 0xD9,
        0x84, 0xD9, 0x87, 0x20, 0xD8, 0xB9, 0xD9, 0x84, 0xD9, 0x8A, 0xD9, 0x87,
        0x20, 0xD9, 0x88, 0xD8, 0xB3, 0xD9, 0x84, 0xD9, 0x85
      };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+AE00 HANGUL SYLLABLE GEUL */
    static const uint8_t input[]    = { 0xEA, 0xB8, 0x80 };
    static const uint8_t expected[] =
      { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* U+ADF8 HANGUL SYLLABLE GEU */
    static const uint8_t input[]    = { 0xEA, 0xB7, 0xB8 };
    static const uint8_t expected[] = { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3 };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

  { /* "Grüß Gott. Здравствуйте! x=(-b±sqrt(b²-4ac))/(2a)  日本語,中文,한글"
       followed by a newline.  Compared with the input, the expected bytes
       differ at 0xC3 0xBC, at 0xD0 0xB9, at 0xC2 0xB2 and at the two
       trailing three-byte Hangul sequences; everything else is copied
       through unchanged.  */
    static const uint8_t input[] =
      { 'G', 'r', 0xC3, 0xBC, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
        ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
        0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9,
        0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
        's', 'q', 'r', 't', '(', 'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')',
        '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
        0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
        0xED, 0x95, 0x9C,
        0xEA, 0xB8, 0x80, '\n'
      };
    static const uint8_t expected[] =
      { 'G', 'r', 0x75, 0xCC, 0x88, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
        ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
        0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB8, 0xCC, 0x86,
        0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
        's', 'q', 'r', 't', '(', 'b', 0x32, '-', '4', 'a', 'c', ')', ')',
        '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
        0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
        0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x86, 0xAB,
        0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF, '\n'
      };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
  }

#if HAVE_DECL_ALARM
  /* Declare failure if test takes too long, by using default abort
     caused by SIGALRM.  */
  signal (SIGALRM, SIG_DFL);
  alarm (50);
#endif

  /* Check that the sorting is not O(n²) but O(n log n).  */
  {
    int pass;

    for (pass = 0; pass < 3; pass++)
      check_long_mark_sequence (pass);
  }
}
358
359 #else
360
/* Stub compiled when GNULIB_TEST_UNINORM_U8_NORMALIZE is not set: there is
   nothing to test, but the symbol is still defined with the same
   signature.  */
void
test_u8_nfkd (void)
{
}
365
366 #endif
367