/* Test of compatibility normalization of UTF-8 strings.
   Copyright (C) 2009-2021 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
16
/* Written by Bruno Haible <bruno@clisp.org>, 2009.  */
18
19 #include <config.h>
20
21 #if GNULIB_TEST_UNINORM_U8_NORMALIZE
22
23 #include "uninorm.h"
24
25 #include <signal.h>
26 #include <stdlib.h>
27 #include <unistd.h>
28
29 #include "unistr.h"
30 #include "macros.h"
31
32 static int
check(const uint8_t * input,size_t input_length,const uint8_t * expected,size_t expected_length)33 check (const uint8_t *input, size_t input_length,
34 const uint8_t *expected, size_t expected_length)
35 {
36 size_t length;
37 uint8_t *result;
38
39 /* Test return conventions with resultbuf == NULL. */
40 result = u8_normalize (UNINORM_NFKC, input, input_length, NULL, &length);
41 if (!(result != NULL))
42 return 1;
43 if (!(length == expected_length))
44 return 2;
45 if (!(u8_cmp (result, expected, expected_length) == 0))
46 return 3;
47 free (result);
48
49 /* Test return conventions with resultbuf too small. */
50 if (expected_length > 0)
51 {
52 uint8_t *preallocated;
53
54 length = expected_length - 1;
55 preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
56 result = u8_normalize (UNINORM_NFKC, input, input_length, preallocated, &length);
57 if (!(result != NULL))
58 return 4;
59 if (!(result != preallocated))
60 return 5;
61 if (!(length == expected_length))
62 return 6;
63 if (!(u8_cmp (result, expected, expected_length) == 0))
64 return 7;
65 free (result);
66 free (preallocated);
67 }
68
69 /* Test return conventions with resultbuf large enough. */
70 {
71 uint8_t *preallocated;
72
73 length = expected_length;
74 preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
75 result = u8_normalize (UNINORM_NFKC, input, input_length, preallocated, &length);
76 if (!(result != NULL))
77 return 8;
78 if (!(preallocated == NULL || result == preallocated))
79 return 9;
80 if (!(length == expected_length))
81 return 10;
82 if (!(u8_cmp (result, expected, expected_length) == 0))
83 return 11;
84 free (preallocated);
85 }
86
87 return 0;
88 }
89
/* Run the NFKC test cases for UTF-8 strings.

   Each case passes an input byte sequence and the expected normalized byte
   sequence to check () and ASSERTs that it returns 0.  Most cases also
   verify that the expected output is left unchanged when normalized again.
   The final part builds large inputs consisting of a base letter followed
   by many combining marks, guarded by an alarm when alarm () is declared,
   so that a quadratic-time reordering would show up as a timeout.  */
void
test_u8_nfkc (void)
{
  { /* Empty string.  */
    ASSERT (check (NULL, 0, NULL, 0) == 0);
  }
  { /* SPACE */
    static const uint8_t input[]    = { 0x20 };
    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
  }

  { /* LATIN CAPITAL LETTER A WITH DIAERESIS */
    static const uint8_t input[]      = { 0xC3, 0x84 };
    static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88 };
    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
  }

  { /* LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON */
    static const uint8_t input[]      = { 0xC7, 0x9E };
    static const uint8_t decomposed[] = { 0x41, 0xCC, 0x88, 0xCC, 0x84 };
    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
  }

  { /* ANGSTROM SIGN */
    static const uint8_t input[]      = { 0xE2, 0x84, 0xAB };
    static const uint8_t decomposed[] = { 0x41, 0xCC, 0x8A };
    static const uint8_t expected[]   = { 0xC3, 0x85 };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), expected, SIZEOF (expected)) == 0);
    ASSERT (check (expected, SIZEOF (expected), expected, SIZEOF (expected)) == 0);
  }

  { /* GREEK DIALYTIKA AND PERISPOMENI */
    static const uint8_t input[]      = { 0xE1, 0xBF, 0x81 };
    static const uint8_t decomposed[] = { 0x20, 0xCC, 0x88, 0xCD, 0x82 };
    ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* SCRIPT SMALL L */
    static const uint8_t input[]      = { 0xE2, 0x84, 0x93 };
    static const uint8_t decomposed[] = { 0x6C };
    ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* NO-BREAK SPACE */
    static const uint8_t input[]      = { 0xC2, 0xA0 };
    static const uint8_t decomposed[] = { 0x20 };
    ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* ARABIC LETTER VEH INITIAL FORM */
    static const uint8_t input[]      = { 0xEF, 0xAD, 0xAC };
    static const uint8_t decomposed[] = { 0xDA, 0xA4 };
    ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* ARABIC LETTER VEH MEDIAL FORM */
    static const uint8_t input[]      = { 0xEF, 0xAD, 0xAD };
    static const uint8_t decomposed[] = { 0xDA, 0xA4 };
    ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* ARABIC LETTER VEH FINAL FORM */
    static const uint8_t input[]      = { 0xEF, 0xAD, 0xAB };
    static const uint8_t decomposed[] = { 0xDA, 0xA4 };
    ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* ARABIC LETTER VEH ISOLATED FORM */
    static const uint8_t input[]      = { 0xEF, 0xAD, 0xAA };
    static const uint8_t decomposed[] = { 0xDA, 0xA4 };
    ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* CIRCLED NUMBER FIFTEEN */
    static const uint8_t input[]      = { 0xE2, 0x91, 0xAE };
    static const uint8_t decomposed[] = { 0x31, 0x35 };
    ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* TRADE MARK SIGN */
    static const uint8_t input[]      = { 0xE2, 0x84, 0xA2 };
    static const uint8_t decomposed[] = { 0x54, 0x4D };
    ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* LATIN SUBSCRIPT SMALL LETTER I */
    static const uint8_t input[]      = { 0xE1, 0xB5, 0xA2 };
    static const uint8_t decomposed[] = { 0x69 };
    ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS */
    static const uint8_t input[]      = { 0xEF, 0xB8, 0xB5 };
    static const uint8_t decomposed[] = { 0x28 };
    ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* FULLWIDTH LATIN CAPITAL LETTER A */
    static const uint8_t input[]      = { 0xEF, 0xBC, 0xA1 };
    static const uint8_t decomposed[] = { 0x41 };
    ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* HALFWIDTH IDEOGRAPHIC COMMA */
    static const uint8_t input[]      = { 0xEF, 0xBD, 0xA4 };
    static const uint8_t decomposed[] = { 0xE3, 0x80, 0x81 };
    ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* SMALL IDEOGRAPHIC COMMA */
    static const uint8_t input[]      = { 0xEF, 0xB9, 0x91 };
    static const uint8_t decomposed[] = { 0xE3, 0x80, 0x81 };
    ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* SQUARE MHZ */
    static const uint8_t input[]      = { 0xE3, 0x8E, 0x92 };
    static const uint8_t decomposed[] = { 0x4D, 0x48, 0x7A };
    ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* VULGAR FRACTION THREE EIGHTHS */
    static const uint8_t input[]      = { 0xE2, 0x85, 0x9C };
    static const uint8_t decomposed[] = { 0x33, 0xE2, 0x81, 0x84, 0x38 };
    ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* MICRO SIGN */
    static const uint8_t input[]      = { 0xC2, 0xB5 };
    static const uint8_t decomposed[] = { 0xCE, 0xBC };
    ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM */
    static const uint8_t input[]      = { 0xEF, 0xB7, 0xBA };
    static const uint8_t decomposed[] =
      { 0xD8, 0xB5, 0xD9, 0x84, 0xD9, 0x89, 0x20, 0xD8, 0xA7, 0xD9, 0x84, 0xD9,
        0x84, 0xD9, 0x87, 0x20, 0xD8, 0xB9, 0xD9, 0x84, 0xD9, 0x8A, 0xD9, 0x87,
        0x20, 0xD9, 0x88, 0xD8, 0xB3, 0xD9, 0x84, 0xD9, 0x85
      };
    ASSERT (check (input, SIZEOF (input), decomposed, SIZEOF (decomposed)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), decomposed, SIZEOF (decomposed)) == 0);
  }

  { /* HANGUL SYLLABLE GEUL */
    static const uint8_t input[]      = { 0xEA, 0xB8, 0x80 };
    static const uint8_t decomposed[] =
      { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF };
    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
  }

  { /* HANGUL SYLLABLE GEU */
    static const uint8_t input[]      = { 0xEA, 0xB7, 0xB8 };
    static const uint8_t decomposed[] = { 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3 };
    ASSERT (check (input, SIZEOF (input), input, SIZEOF (input)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), input, SIZEOF (input)) == 0);
  }

  { /* "Grüß Gott. Здравствуйте! x=(-b±sqrt(b²-4ac))/(2a)  日本語,中文,한글" */
    static const uint8_t input[] =
      { 'G', 'r', 0xC3, 0xBC, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
        ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
        0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9,
        0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
        's', 'q', 'r', 't', '(', 'b', 0xC2, 0xB2, '-', '4', 'a', 'c', ')', ')',
        '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
        0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
        0xED, 0x95, 0x9C,
        0xEA, 0xB8, 0x80, '\n'
      };
    static const uint8_t decomposed[] =
      { 'G', 'r', 0x75, 0xCC, 0x88, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
        ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
        0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB8, 0xCC, 0x86,
        0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
        's', 'q', 'r', 't', '(', 'b', 0x32, '-', '4', 'a', 'c', ')', ')',
        '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
        0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
        0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x86, 0xAB,
        0xE1, 0x84, 0x80, 0xE1, 0x85, 0xB3, 0xE1, 0x86, 0xAF, '\n'
      };
    static const uint8_t expected[] =
      { 'G', 'r', 0xC3, 0xBC, 0xC3, 0x9F, ' ', 'G', 'o', 't', 't', '.',
        ' ', 0xD0, 0x97, 0xD0, 0xB4, 0xD1, 0x80, 0xD0, 0xB0, 0xD0, 0xB2, 0xD1,
        0x81, 0xD1, 0x82, 0xD0, 0xB2, 0xD1, 0x83, 0xD0, 0xB9,
        0xD1, 0x82, 0xD0, 0xB5, '!', ' ', 'x', '=', '(', '-', 'b', 0xC2, 0xB1,
        's', 'q', 'r', 't', '(', 'b', 0x32, '-', '4', 'a', 'c', ')', ')',
        '/', '(', '2', 'a', ')', ' ', ' ', 0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC,
        0xE8, 0xAA, 0x9E, ',', 0xE4, 0xB8, 0xAD, 0xE6, 0x96, 0x87, ',',
        0xED, 0x95, 0x9C,
        0xEA, 0xB8, 0x80, '\n'
      };
    ASSERT (check (input, SIZEOF (input), expected, SIZEOF (expected)) == 0);
    ASSERT (check (decomposed, SIZEOF (decomposed), expected, SIZEOF (expected)) == 0);
    ASSERT (check (expected, SIZEOF (expected), expected, SIZEOF (expected)) == 0);
  }

#if HAVE_DECL_ALARM
  /* Declare failure if test takes too long, by using default abort
     caused by SIGALRM.  */
  signal (SIGALRM, SIG_DFL);
  alarm (50);
#endif

  /* Check that the sorting is not O(n²) but O(n log n).
     The input is the letter 'A' (0x41) followed by m1 copies of the bytes
     CC 99 (U+0319) and m2 copies of the bytes CC 80 (U+0300); the three
     passes differ only in the order in which these marks are laid out.
     The expected output is C3 80 (U+00C0) followed by all m1 copies of
     U+0319 and then the remaining m2 - 1 copies of U+0300.  */
  {
    int pass;
    for (pass = 0; pass < 3; pass++)
      {
        size_t repeat = 1;
        size_t m = 100000;
        /* One allocation holds both strings: 2 * m - 1 bytes of input,
           immediately followed by the expected output.  */
        uint8_t *input = (uint8_t *) malloc (2 * (2 * m - 1) * sizeof (uint8_t));
        /* If the allocation fails, this pass is silently skipped.  */
        if (input != NULL)
          {
            uint8_t *expected = input + (2 * m - 1);
            size_t m1 = m / 2;
            size_t m2 = (m - 1) / 2;
            /* NB: m1 + m2 == m - 1.  */
            uint8_t *p;
            size_t i;

            input[0] = 0x41;
            p = input + 1;
            switch (pass)
              {
              case 0:
                /* All U+0319 first, then all U+0300.  */
                for (i = 0; i < m1; i++)
                  {
                    *p++ = 0xCC;
                    *p++ = 0x99;
                  }
                for (i = 0; i < m2; i++)
                  {
                    *p++ = 0xCC;
                    *p++ = 0x80;
                  }
                break;

              case 1:
                /* All U+0300 first, then all U+0319.  */
                for (i = 0; i < m2; i++)
                  {
                    *p++ = 0xCC;
                    *p++ = 0x80;
                  }
                for (i = 0; i < m1; i++)
                  {
                    *p++ = 0xCC;
                    *p++ = 0x99;
                  }
                break;

              case 2:
                /* Alternating U+0319, U+0300 pairs ...  */
                for (i = 0; i < m2; i++)
                  {
                    *p++ = 0xCC;
                    *p++ = 0x99;
                    *p++ = 0xCC;
                    *p++ = 0x80;
                  }
                /* ... then the leftover U+0319; i deliberately continues
                   from m2 here.  */
                for (; i < m1; i++)
                  {
                    *p++ = 0xCC;
                    *p++ = 0x99;
                  }
                break;

              default:
                abort ();
              }

            expected[0] = 0xC3;
            expected[1] = 0x80;
            p = expected + 2;
            for (i = 0; i < m1; i++)
              {
                *p++ = 0xCC;
                *p++ = 0x99;
              }
            for (i = 0; i < m2 - 1; i++)
              {
                *p++ = 0xCC;
                *p++ = 0x80;
              }

            for (; repeat > 0; repeat--)
              {
                ASSERT (check (input, 2 * m - 1, expected, 2 * m - 2) == 0);
                ASSERT (check (expected, 2 * m - 2, expected, 2 * m - 2) == 0);
              }

            /* Releases the expected output too: it lives in the same
               allocation.  */
            free (input);
          }
      }
  }
}
406
407 #else
408
/* Stub used when GNULIB_TEST_UNINORM_U8_NORMALIZE is not set: there is
   nothing to test, so this does nothing.  */
void
test_u8_nfkc (void)
{
}
413
414 #endif
415