1 /*-------------------------------------------------------------------------
2  * unicode_norm.c
3  *		Normalize a Unicode string
4  *
5  * This implements Unicode normalization, per the documentation at
6  * https://www.unicode.org/reports/tr15/.
7  *
8  * Portions Copyright (c) 2017-2020, PostgreSQL Global Development Group
9  *
10  * IDENTIFICATION
11  *	  src/common/unicode_norm.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #ifndef FRONTEND
16 #include "postgres.h"
17 #else
18 #include "postgres_fe.h"
19 #endif
20 
21 #include "common/unicode_norm.h"
22 #include "common/unicode_norm_table.h"
23 #ifndef FRONTEND
24 #include "common/unicode_normprops_table.h"
25 #endif
26 
27 #ifndef FRONTEND
28 #define ALLOC(size) palloc(size)
29 #define FREE(size) pfree(size)
30 #else
31 #define ALLOC(size) malloc(size)
32 #define FREE(size) free(size)
33 #endif
34 
35 /* Constants for calculations with Hangul characters */
36 #define SBASE		0xAC00		/* U+AC00 */
37 #define LBASE		0x1100		/* U+1100 */
38 #define VBASE		0x1161		/* U+1161 */
39 #define TBASE		0x11A7		/* U+11A7 */
40 #define LCOUNT		19
41 #define VCOUNT		21
42 #define TCOUNT		28
43 #define NCOUNT		VCOUNT * TCOUNT
44 #define SCOUNT		LCOUNT * NCOUNT
45 
46 /* comparison routine for bsearch() of decomposition lookup table. */
47 static int
conv_compare(const void * p1,const void * p2)48 conv_compare(const void *p1, const void *p2)
49 {
50 	uint32		v1,
51 				v2;
52 
53 	v1 = *(const uint32 *) p1;
54 	v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
55 	return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
56 }
57 
58 /*
59  * Get the entry corresponding to code in the decomposition lookup table.
60  */
61 static pg_unicode_decomposition *
get_code_entry(pg_wchar code)62 get_code_entry(pg_wchar code)
63 {
64 	return bsearch(&(code),
65 				   UnicodeDecompMain,
66 				   lengthof(UnicodeDecompMain),
67 				   sizeof(pg_unicode_decomposition),
68 				   conv_compare);
69 }
70 
71 /*
72  * Given a decomposition entry looked up earlier, get the decomposed
73  * characters.
74  *
75  * Note: the returned pointer can point to statically allocated buffer, and
76  * is only valid until next call to this function!
77  */
78 static const pg_wchar *
get_code_decomposition(pg_unicode_decomposition * entry,int * dec_size)79 get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
80 {
81 	static pg_wchar x;
82 
83 	if (DECOMPOSITION_IS_INLINE(entry))
84 	{
85 		Assert(DECOMPOSITION_SIZE(entry) == 1);
86 		x = (pg_wchar) entry->dec_index;
87 		*dec_size = 1;
88 		return &x;
89 	}
90 	else
91 	{
92 		*dec_size = DECOMPOSITION_SIZE(entry);
93 		return &UnicodeDecomp_codepoints[entry->dec_index];
94 	}
95 }
96 
97 /*
98  * Calculate how many characters a given character will decompose to.
99  *
100  * This needs to recurse, if the character decomposes into characters that
101  * are, in turn, decomposable.
102  */
103 static int
get_decomposed_size(pg_wchar code,bool compat)104 get_decomposed_size(pg_wchar code, bool compat)
105 {
106 	pg_unicode_decomposition *entry;
107 	int			size = 0;
108 	int			i;
109 	const uint32 *decomp;
110 	int			dec_size;
111 
112 	/*
113 	 * Fast path for Hangul characters not stored in tables to save memory as
114 	 * decomposition is algorithmic. See
115 	 * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
116 	 * on the matter.
117 	 */
118 	if (code >= SBASE && code < SBASE + SCOUNT)
119 	{
120 		uint32		tindex,
121 					sindex;
122 
123 		sindex = code - SBASE;
124 		tindex = sindex % TCOUNT;
125 
126 		if (tindex != 0)
127 			return 3;
128 		return 2;
129 	}
130 
131 	entry = get_code_entry(code);
132 
133 	/*
134 	 * Just count current code if no other decompositions.  A NULL entry is
135 	 * equivalent to a character with class 0 and no decompositions.
136 	 */
137 	if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
138 		(!compat && DECOMPOSITION_IS_COMPAT(entry)))
139 		return 1;
140 
141 	/*
142 	 * If this entry has other decomposition codes look at them as well. First
143 	 * get its decomposition in the list of tables available.
144 	 */
145 	decomp = get_code_decomposition(entry, &dec_size);
146 	for (i = 0; i < dec_size; i++)
147 	{
148 		uint32		lcode = decomp[i];
149 
150 		size += get_decomposed_size(lcode, compat);
151 	}
152 
153 	return size;
154 }
155 
156 /*
157  * Recompose a set of characters. For hangul characters, the calculation
158  * is algorithmic. For others, an inverse lookup at the decomposition
159  * table is necessary. Returns true if a recomposition can be done, and
160  * false otherwise.
161  */
162 static bool
recompose_code(uint32 start,uint32 code,uint32 * result)163 recompose_code(uint32 start, uint32 code, uint32 *result)
164 {
165 	/*
166 	 * Handle Hangul characters algorithmically, per the Unicode spec.
167 	 *
168 	 * Check if two current characters are L and V.
169 	 */
170 	if (start >= LBASE && start < LBASE + LCOUNT &&
171 		code >= VBASE && code < VBASE + VCOUNT)
172 	{
173 		/* make syllable of form LV */
174 		uint32		lindex = start - LBASE;
175 		uint32		vindex = code - VBASE;
176 
177 		*result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
178 		return true;
179 	}
180 	/* Check if two current characters are LV and T */
181 	else if (start >= SBASE && start < (SBASE + SCOUNT) &&
182 			 ((start - SBASE) % TCOUNT) == 0 &&
183 			 code >= TBASE && code < (TBASE + TCOUNT))
184 	{
185 		/* make syllable of form LVT */
186 		uint32		tindex = code - TBASE;
187 
188 		*result = start + tindex;
189 		return true;
190 	}
191 	else
192 	{
193 		int			i;
194 
195 		/*
196 		 * Do an inverse lookup of the decomposition tables to see if anything
197 		 * matches. The comparison just needs to be a perfect match on the
198 		 * sub-table of size two, because the start character has already been
199 		 * recomposed partially.
200 		 */
201 		for (i = 0; i < lengthof(UnicodeDecompMain); i++)
202 		{
203 			const pg_unicode_decomposition *entry = &UnicodeDecompMain[i];
204 
205 			if (DECOMPOSITION_SIZE(entry) != 2)
206 				continue;
207 
208 			if (DECOMPOSITION_NO_COMPOSE(entry))
209 				continue;
210 
211 			if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
212 				code == UnicodeDecomp_codepoints[entry->dec_index + 1])
213 			{
214 				*result = entry->codepoint;
215 				return true;
216 			}
217 		}
218 	}
219 
220 	return false;
221 }
222 
223 /*
224  * Decompose the given code into the array given by caller. The
225  * decomposition begins at the position given by caller, saving one
226  * lookup on the decomposition table. The current position needs to be
227  * updated here to let the caller know from where to continue filling
228  * in the array result.
229  */
230 static void
decompose_code(pg_wchar code,bool compat,pg_wchar ** result,int * current)231 decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
232 {
233 	pg_unicode_decomposition *entry;
234 	int			i;
235 	const uint32 *decomp;
236 	int			dec_size;
237 
238 	/*
239 	 * Fast path for Hangul characters not stored in tables to save memory as
240 	 * decomposition is algorithmic. See
241 	 * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
242 	 * on the matter.
243 	 */
244 	if (code >= SBASE && code < SBASE + SCOUNT)
245 	{
246 		uint32		l,
247 					v,
248 					tindex,
249 					sindex;
250 		pg_wchar   *res = *result;
251 
252 		sindex = code - SBASE;
253 		l = LBASE + sindex / (VCOUNT * TCOUNT);
254 		v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
255 		tindex = sindex % TCOUNT;
256 
257 		res[*current] = l;
258 		(*current)++;
259 		res[*current] = v;
260 		(*current)++;
261 
262 		if (tindex != 0)
263 		{
264 			res[*current] = TBASE + tindex;
265 			(*current)++;
266 		}
267 
268 		return;
269 	}
270 
271 	entry = get_code_entry(code);
272 
273 	/*
274 	 * Just fill in with the current decomposition if there are no
275 	 * decomposition codes to recurse to.  A NULL entry is equivalent to a
276 	 * character with class 0 and no decompositions, so just leave also in
277 	 * this case.
278 	 */
279 	if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
280 		(!compat && DECOMPOSITION_IS_COMPAT(entry)))
281 	{
282 		pg_wchar   *res = *result;
283 
284 		res[*current] = code;
285 		(*current)++;
286 		return;
287 	}
288 
289 	/*
290 	 * If this entry has other decomposition codes look at them as well.
291 	 */
292 	decomp = get_code_decomposition(entry, &dec_size);
293 	for (i = 0; i < dec_size; i++)
294 	{
295 		pg_wchar	lcode = (pg_wchar) decomp[i];
296 
297 		/* Leave if no more decompositions */
298 		decompose_code(lcode, compat, result, current);
299 	}
300 }
301 
302 /*
303  * unicode_normalize - Normalize a Unicode string to the specified form.
304  *
305  * The input is a 0-terminated array of codepoints.
306  *
307  * In frontend, returns a 0-terminated array of codepoints, allocated with
308  * malloc. Or NULL if we run out of memory. In backend, the returned
309  * string is palloc'd instead, and OOM is reported with ereport().
310  */
311 pg_wchar *
unicode_normalize(UnicodeNormalizationForm form,const pg_wchar * input)312 unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
313 {
314 	bool		compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
315 	bool		recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
316 	pg_wchar   *decomp_chars;
317 	pg_wchar   *recomp_chars;
318 	int			decomp_size,
319 				current_size;
320 	int			count;
321 	const pg_wchar *p;
322 
323 	/* variables for recomposition */
324 	int			last_class;
325 	int			starter_pos;
326 	int			target_pos;
327 	uint32		starter_ch;
328 
329 	/* First, do character decomposition */
330 
331 	/*
332 	 * Calculate how many characters long the decomposed version will be.
333 	 */
334 	decomp_size = 0;
335 	for (p = input; *p; p++)
336 		decomp_size += get_decomposed_size(*p, compat);
337 
338 	decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
339 	if (decomp_chars == NULL)
340 		return NULL;
341 
342 	/*
343 	 * Now fill in each entry recursively. This needs a second pass on the
344 	 * decomposition table.
345 	 */
346 	current_size = 0;
347 	for (p = input; *p; p++)
348 		decompose_code(*p, compat, &decomp_chars, &current_size);
349 	decomp_chars[decomp_size] = '\0';
350 	Assert(decomp_size == current_size);
351 
352 	/*
353 	 * Now apply canonical ordering.
354 	 */
355 	for (count = 1; count < decomp_size; count++)
356 	{
357 		pg_wchar	prev = decomp_chars[count - 1];
358 		pg_wchar	next = decomp_chars[count];
359 		pg_wchar	tmp;
360 		pg_unicode_decomposition *prevEntry = get_code_entry(prev);
361 		pg_unicode_decomposition *nextEntry = get_code_entry(next);
362 
363 		/*
364 		 * If no entries are found, the character used is either an Hangul
365 		 * character or a character with a class of 0 and no decompositions,
366 		 * so move to next result.
367 		 */
368 		if (prevEntry == NULL || nextEntry == NULL)
369 			continue;
370 
371 		/*
372 		 * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
373 		 * annex 4, a sequence of two adjacent characters in a string is an
374 		 * exchangeable pair if the combining class (from the Unicode
375 		 * Character Database) for the first character is greater than the
376 		 * combining class for the second, and the second is not a starter.  A
377 		 * character is a starter if its combining class is 0.
378 		 */
379 		if (nextEntry->comb_class == 0x0 || prevEntry->comb_class == 0x0)
380 			continue;
381 
382 		if (prevEntry->comb_class <= nextEntry->comb_class)
383 			continue;
384 
385 		/* exchange can happen */
386 		tmp = decomp_chars[count - 1];
387 		decomp_chars[count - 1] = decomp_chars[count];
388 		decomp_chars[count] = tmp;
389 
390 		/* backtrack to check again */
391 		if (count > 1)
392 			count -= 2;
393 	}
394 
395 	if (!recompose)
396 		return decomp_chars;
397 
398 	/*
399 	 * The last phase of NFC and NFKC is the recomposition of the reordered
400 	 * Unicode string using combining classes. The recomposed string cannot be
401 	 * longer than the decomposed one, so make the allocation of the output
402 	 * string based on that assumption.
403 	 */
404 	recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
405 	if (!recomp_chars)
406 	{
407 		FREE(decomp_chars);
408 		return NULL;
409 	}
410 
411 	last_class = -1;			/* this eliminates a special check */
412 	starter_pos = 0;
413 	target_pos = 1;
414 	starter_ch = recomp_chars[0] = decomp_chars[0];
415 
416 	for (count = 1; count < decomp_size; count++)
417 	{
418 		pg_wchar	ch = decomp_chars[count];
419 		pg_unicode_decomposition *ch_entry = get_code_entry(ch);
420 		int			ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class;
421 		pg_wchar	composite;
422 
423 		if (last_class < ch_class &&
424 			recompose_code(starter_ch, ch, &composite))
425 		{
426 			recomp_chars[starter_pos] = composite;
427 			starter_ch = composite;
428 		}
429 		else if (ch_class == 0)
430 		{
431 			starter_pos = target_pos;
432 			starter_ch = ch;
433 			last_class = -1;
434 			recomp_chars[target_pos++] = ch;
435 		}
436 		else
437 		{
438 			last_class = ch_class;
439 			recomp_chars[target_pos++] = ch;
440 		}
441 	}
442 	recomp_chars[target_pos] = (pg_wchar) '\0';
443 
444 	FREE(decomp_chars);
445 
446 	return recomp_chars;
447 }
448 
449 /*
450  * Normalization "quick check" algorithm; see
451  * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
452  */
453 
454 /* We only need this in the backend. */
455 #ifndef FRONTEND
456 
457 static uint8
get_canonical_class(pg_wchar ch)458 get_canonical_class(pg_wchar ch)
459 {
460 	pg_unicode_decomposition *entry = get_code_entry(ch);
461 
462 	if (!entry)
463 		return 0;
464 	else
465 		return entry->comb_class;
466 }
467 
468 static int
qc_compare(const void * p1,const void * p2)469 qc_compare(const void *p1, const void *p2)
470 {
471 	uint32		v1,
472 				v2;
473 
474 	v1 = ((const pg_unicode_normprops *) p1)->codepoint;
475 	v2 = ((const pg_unicode_normprops *) p2)->codepoint;
476 	return (v1 - v2);
477 }
478 
479 /*
480  * Look up the normalization quick check character property
481  */
482 static UnicodeNormalizationQC
qc_is_allowed(UnicodeNormalizationForm form,pg_wchar ch)483 qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
484 {
485 	pg_unicode_normprops key;
486 	pg_unicode_normprops *found = NULL;
487 
488 	key.codepoint = ch;
489 
490 	switch (form)
491 	{
492 		case UNICODE_NFC:
493 			found = bsearch(&key,
494 							UnicodeNormProps_NFC_QC,
495 							lengthof(UnicodeNormProps_NFC_QC),
496 							sizeof(pg_unicode_normprops),
497 							qc_compare);
498 			break;
499 		case UNICODE_NFKC:
500 			found = bsearch(&key,
501 							UnicodeNormProps_NFKC_QC,
502 							lengthof(UnicodeNormProps_NFKC_QC),
503 							sizeof(pg_unicode_normprops),
504 							qc_compare);
505 			break;
506 		default:
507 			Assert(false);
508 			break;
509 	}
510 
511 	if (found)
512 		return found->quickcheck;
513 	else
514 		return UNICODE_NORM_QC_YES;
515 }
516 
517 UnicodeNormalizationQC
unicode_is_normalized_quickcheck(UnicodeNormalizationForm form,const pg_wchar * input)518 unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
519 {
520 	uint8		lastCanonicalClass = 0;
521 	UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
522 
523 	/*
524 	 * For the "D" forms, we don't run the quickcheck.  We don't include the
525 	 * lookup tables for those because they are huge, checking for these
526 	 * particular forms is less common, and running the slow path is faster
527 	 * for the "D" forms than the "C" forms because you don't need to
528 	 * recompose, which is slow.
529 	 */
530 	if (form == UNICODE_NFD || form == UNICODE_NFKD)
531 		return UNICODE_NORM_QC_MAYBE;
532 
533 	for (const pg_wchar *p = input; *p; p++)
534 	{
535 		pg_wchar	ch = *p;
536 		uint8		canonicalClass;
537 		UnicodeNormalizationQC check;
538 
539 		canonicalClass = get_canonical_class(ch);
540 		if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
541 			return UNICODE_NORM_QC_NO;
542 
543 		check = qc_is_allowed(form, ch);
544 		if (check == UNICODE_NORM_QC_NO)
545 			return UNICODE_NORM_QC_NO;
546 		else if (check == UNICODE_NORM_QC_MAYBE)
547 			result = UNICODE_NORM_QC_MAYBE;
548 
549 		lastCanonicalClass = canonicalClass;
550 	}
551 	return result;
552 }
553 
554 #endif							/* !FRONTEND */
555