1 /*-------------------------------------------------------------------------
2 * unicode_norm.c
3 * Normalize a Unicode string
4 *
5 * This implements Unicode normalization, per the documentation at
6 * https://www.unicode.org/reports/tr15/.
7 *
8 * Portions Copyright (c) 2017-2020, PostgreSQL Global Development Group
9 *
10 * IDENTIFICATION
11 * src/common/unicode_norm.c
12 *
13 *-------------------------------------------------------------------------
14 */
15 #ifndef FRONTEND
16 #include "postgres.h"
17 #else
18 #include "postgres_fe.h"
19 #endif
20
21 #include "common/unicode_norm.h"
22 #include "common/unicode_norm_table.h"
23 #ifndef FRONTEND
24 #include "common/unicode_normprops_table.h"
25 #endif
26
27 #ifndef FRONTEND
28 #define ALLOC(size) palloc(size)
29 #define FREE(size) pfree(size)
30 #else
31 #define ALLOC(size) malloc(size)
32 #define FREE(size) free(size)
33 #endif
34
35 /* Constants for calculations with Hangul characters */
36 #define SBASE 0xAC00 /* U+AC00 */
37 #define LBASE 0x1100 /* U+1100 */
38 #define VBASE 0x1161 /* U+1161 */
39 #define TBASE 0x11A7 /* U+11A7 */
40 #define LCOUNT 19
41 #define VCOUNT 21
42 #define TCOUNT 28
43 #define NCOUNT VCOUNT * TCOUNT
44 #define SCOUNT LCOUNT * NCOUNT
45
46 /* comparison routine for bsearch() of decomposition lookup table. */
47 static int
conv_compare(const void * p1,const void * p2)48 conv_compare(const void *p1, const void *p2)
49 {
50 uint32 v1,
51 v2;
52
53 v1 = *(const uint32 *) p1;
54 v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
55 return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
56 }
57
58 /*
59 * Get the entry corresponding to code in the decomposition lookup table.
60 */
61 static pg_unicode_decomposition *
get_code_entry(pg_wchar code)62 get_code_entry(pg_wchar code)
63 {
64 return bsearch(&(code),
65 UnicodeDecompMain,
66 lengthof(UnicodeDecompMain),
67 sizeof(pg_unicode_decomposition),
68 conv_compare);
69 }
70
71 /*
72 * Given a decomposition entry looked up earlier, get the decomposed
73 * characters.
74 *
75 * Note: the returned pointer can point to statically allocated buffer, and
76 * is only valid until next call to this function!
77 */
78 static const pg_wchar *
get_code_decomposition(pg_unicode_decomposition * entry,int * dec_size)79 get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
80 {
81 static pg_wchar x;
82
83 if (DECOMPOSITION_IS_INLINE(entry))
84 {
85 Assert(DECOMPOSITION_SIZE(entry) == 1);
86 x = (pg_wchar) entry->dec_index;
87 *dec_size = 1;
88 return &x;
89 }
90 else
91 {
92 *dec_size = DECOMPOSITION_SIZE(entry);
93 return &UnicodeDecomp_codepoints[entry->dec_index];
94 }
95 }
96
97 /*
98 * Calculate how many characters a given character will decompose to.
99 *
100 * This needs to recurse, if the character decomposes into characters that
101 * are, in turn, decomposable.
102 */
103 static int
get_decomposed_size(pg_wchar code,bool compat)104 get_decomposed_size(pg_wchar code, bool compat)
105 {
106 pg_unicode_decomposition *entry;
107 int size = 0;
108 int i;
109 const uint32 *decomp;
110 int dec_size;
111
112 /*
113 * Fast path for Hangul characters not stored in tables to save memory as
114 * decomposition is algorithmic. See
115 * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
116 * on the matter.
117 */
118 if (code >= SBASE && code < SBASE + SCOUNT)
119 {
120 uint32 tindex,
121 sindex;
122
123 sindex = code - SBASE;
124 tindex = sindex % TCOUNT;
125
126 if (tindex != 0)
127 return 3;
128 return 2;
129 }
130
131 entry = get_code_entry(code);
132
133 /*
134 * Just count current code if no other decompositions. A NULL entry is
135 * equivalent to a character with class 0 and no decompositions.
136 */
137 if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
138 (!compat && DECOMPOSITION_IS_COMPAT(entry)))
139 return 1;
140
141 /*
142 * If this entry has other decomposition codes look at them as well. First
143 * get its decomposition in the list of tables available.
144 */
145 decomp = get_code_decomposition(entry, &dec_size);
146 for (i = 0; i < dec_size; i++)
147 {
148 uint32 lcode = decomp[i];
149
150 size += get_decomposed_size(lcode, compat);
151 }
152
153 return size;
154 }
155
156 /*
157 * Recompose a set of characters. For hangul characters, the calculation
158 * is algorithmic. For others, an inverse lookup at the decomposition
159 * table is necessary. Returns true if a recomposition can be done, and
160 * false otherwise.
161 */
162 static bool
recompose_code(uint32 start,uint32 code,uint32 * result)163 recompose_code(uint32 start, uint32 code, uint32 *result)
164 {
165 /*
166 * Handle Hangul characters algorithmically, per the Unicode spec.
167 *
168 * Check if two current characters are L and V.
169 */
170 if (start >= LBASE && start < LBASE + LCOUNT &&
171 code >= VBASE && code < VBASE + VCOUNT)
172 {
173 /* make syllable of form LV */
174 uint32 lindex = start - LBASE;
175 uint32 vindex = code - VBASE;
176
177 *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
178 return true;
179 }
180 /* Check if two current characters are LV and T */
181 else if (start >= SBASE && start < (SBASE + SCOUNT) &&
182 ((start - SBASE) % TCOUNT) == 0 &&
183 code >= TBASE && code < (TBASE + TCOUNT))
184 {
185 /* make syllable of form LVT */
186 uint32 tindex = code - TBASE;
187
188 *result = start + tindex;
189 return true;
190 }
191 else
192 {
193 int i;
194
195 /*
196 * Do an inverse lookup of the decomposition tables to see if anything
197 * matches. The comparison just needs to be a perfect match on the
198 * sub-table of size two, because the start character has already been
199 * recomposed partially.
200 */
201 for (i = 0; i < lengthof(UnicodeDecompMain); i++)
202 {
203 const pg_unicode_decomposition *entry = &UnicodeDecompMain[i];
204
205 if (DECOMPOSITION_SIZE(entry) != 2)
206 continue;
207
208 if (DECOMPOSITION_NO_COMPOSE(entry))
209 continue;
210
211 if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
212 code == UnicodeDecomp_codepoints[entry->dec_index + 1])
213 {
214 *result = entry->codepoint;
215 return true;
216 }
217 }
218 }
219
220 return false;
221 }
222
223 /*
224 * Decompose the given code into the array given by caller. The
225 * decomposition begins at the position given by caller, saving one
226 * lookup on the decomposition table. The current position needs to be
227 * updated here to let the caller know from where to continue filling
228 * in the array result.
229 */
230 static void
decompose_code(pg_wchar code,bool compat,pg_wchar ** result,int * current)231 decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
232 {
233 pg_unicode_decomposition *entry;
234 int i;
235 const uint32 *decomp;
236 int dec_size;
237
238 /*
239 * Fast path for Hangul characters not stored in tables to save memory as
240 * decomposition is algorithmic. See
241 * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
242 * on the matter.
243 */
244 if (code >= SBASE && code < SBASE + SCOUNT)
245 {
246 uint32 l,
247 v,
248 tindex,
249 sindex;
250 pg_wchar *res = *result;
251
252 sindex = code - SBASE;
253 l = LBASE + sindex / (VCOUNT * TCOUNT);
254 v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
255 tindex = sindex % TCOUNT;
256
257 res[*current] = l;
258 (*current)++;
259 res[*current] = v;
260 (*current)++;
261
262 if (tindex != 0)
263 {
264 res[*current] = TBASE + tindex;
265 (*current)++;
266 }
267
268 return;
269 }
270
271 entry = get_code_entry(code);
272
273 /*
274 * Just fill in with the current decomposition if there are no
275 * decomposition codes to recurse to. A NULL entry is equivalent to a
276 * character with class 0 and no decompositions, so just leave also in
277 * this case.
278 */
279 if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
280 (!compat && DECOMPOSITION_IS_COMPAT(entry)))
281 {
282 pg_wchar *res = *result;
283
284 res[*current] = code;
285 (*current)++;
286 return;
287 }
288
289 /*
290 * If this entry has other decomposition codes look at them as well.
291 */
292 decomp = get_code_decomposition(entry, &dec_size);
293 for (i = 0; i < dec_size; i++)
294 {
295 pg_wchar lcode = (pg_wchar) decomp[i];
296
297 /* Leave if no more decompositions */
298 decompose_code(lcode, compat, result, current);
299 }
300 }
301
302 /*
303 * unicode_normalize - Normalize a Unicode string to the specified form.
304 *
305 * The input is a 0-terminated array of codepoints.
306 *
307 * In frontend, returns a 0-terminated array of codepoints, allocated with
308 * malloc. Or NULL if we run out of memory. In backend, the returned
309 * string is palloc'd instead, and OOM is reported with ereport().
310 */
311 pg_wchar *
unicode_normalize(UnicodeNormalizationForm form,const pg_wchar * input)312 unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
313 {
314 bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
315 bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
316 pg_wchar *decomp_chars;
317 pg_wchar *recomp_chars;
318 int decomp_size,
319 current_size;
320 int count;
321 const pg_wchar *p;
322
323 /* variables for recomposition */
324 int last_class;
325 int starter_pos;
326 int target_pos;
327 uint32 starter_ch;
328
329 /* First, do character decomposition */
330
331 /*
332 * Calculate how many characters long the decomposed version will be.
333 */
334 decomp_size = 0;
335 for (p = input; *p; p++)
336 decomp_size += get_decomposed_size(*p, compat);
337
338 decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
339 if (decomp_chars == NULL)
340 return NULL;
341
342 /*
343 * Now fill in each entry recursively. This needs a second pass on the
344 * decomposition table.
345 */
346 current_size = 0;
347 for (p = input; *p; p++)
348 decompose_code(*p, compat, &decomp_chars, ¤t_size);
349 decomp_chars[decomp_size] = '\0';
350 Assert(decomp_size == current_size);
351
352 /*
353 * Now apply canonical ordering.
354 */
355 for (count = 1; count < decomp_size; count++)
356 {
357 pg_wchar prev = decomp_chars[count - 1];
358 pg_wchar next = decomp_chars[count];
359 pg_wchar tmp;
360 pg_unicode_decomposition *prevEntry = get_code_entry(prev);
361 pg_unicode_decomposition *nextEntry = get_code_entry(next);
362
363 /*
364 * If no entries are found, the character used is either an Hangul
365 * character or a character with a class of 0 and no decompositions,
366 * so move to next result.
367 */
368 if (prevEntry == NULL || nextEntry == NULL)
369 continue;
370
371 /*
372 * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
373 * annex 4, a sequence of two adjacent characters in a string is an
374 * exchangeable pair if the combining class (from the Unicode
375 * Character Database) for the first character is greater than the
376 * combining class for the second, and the second is not a starter. A
377 * character is a starter if its combining class is 0.
378 */
379 if (nextEntry->comb_class == 0x0 || prevEntry->comb_class == 0x0)
380 continue;
381
382 if (prevEntry->comb_class <= nextEntry->comb_class)
383 continue;
384
385 /* exchange can happen */
386 tmp = decomp_chars[count - 1];
387 decomp_chars[count - 1] = decomp_chars[count];
388 decomp_chars[count] = tmp;
389
390 /* backtrack to check again */
391 if (count > 1)
392 count -= 2;
393 }
394
395 if (!recompose)
396 return decomp_chars;
397
398 /*
399 * The last phase of NFC and NFKC is the recomposition of the reordered
400 * Unicode string using combining classes. The recomposed string cannot be
401 * longer than the decomposed one, so make the allocation of the output
402 * string based on that assumption.
403 */
404 recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
405 if (!recomp_chars)
406 {
407 FREE(decomp_chars);
408 return NULL;
409 }
410
411 last_class = -1; /* this eliminates a special check */
412 starter_pos = 0;
413 target_pos = 1;
414 starter_ch = recomp_chars[0] = decomp_chars[0];
415
416 for (count = 1; count < decomp_size; count++)
417 {
418 pg_wchar ch = decomp_chars[count];
419 pg_unicode_decomposition *ch_entry = get_code_entry(ch);
420 int ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class;
421 pg_wchar composite;
422
423 if (last_class < ch_class &&
424 recompose_code(starter_ch, ch, &composite))
425 {
426 recomp_chars[starter_pos] = composite;
427 starter_ch = composite;
428 }
429 else if (ch_class == 0)
430 {
431 starter_pos = target_pos;
432 starter_ch = ch;
433 last_class = -1;
434 recomp_chars[target_pos++] = ch;
435 }
436 else
437 {
438 last_class = ch_class;
439 recomp_chars[target_pos++] = ch;
440 }
441 }
442 recomp_chars[target_pos] = (pg_wchar) '\0';
443
444 FREE(decomp_chars);
445
446 return recomp_chars;
447 }
448
449 /*
450 * Normalization "quick check" algorithm; see
451 * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
452 */
453
454 /* We only need this in the backend. */
455 #ifndef FRONTEND
456
457 static uint8
get_canonical_class(pg_wchar ch)458 get_canonical_class(pg_wchar ch)
459 {
460 pg_unicode_decomposition *entry = get_code_entry(ch);
461
462 if (!entry)
463 return 0;
464 else
465 return entry->comb_class;
466 }
467
468 static int
qc_compare(const void * p1,const void * p2)469 qc_compare(const void *p1, const void *p2)
470 {
471 uint32 v1,
472 v2;
473
474 v1 = ((const pg_unicode_normprops *) p1)->codepoint;
475 v2 = ((const pg_unicode_normprops *) p2)->codepoint;
476 return (v1 - v2);
477 }
478
479 /*
480 * Look up the normalization quick check character property
481 */
482 static UnicodeNormalizationQC
qc_is_allowed(UnicodeNormalizationForm form,pg_wchar ch)483 qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
484 {
485 pg_unicode_normprops key;
486 pg_unicode_normprops *found = NULL;
487
488 key.codepoint = ch;
489
490 switch (form)
491 {
492 case UNICODE_NFC:
493 found = bsearch(&key,
494 UnicodeNormProps_NFC_QC,
495 lengthof(UnicodeNormProps_NFC_QC),
496 sizeof(pg_unicode_normprops),
497 qc_compare);
498 break;
499 case UNICODE_NFKC:
500 found = bsearch(&key,
501 UnicodeNormProps_NFKC_QC,
502 lengthof(UnicodeNormProps_NFKC_QC),
503 sizeof(pg_unicode_normprops),
504 qc_compare);
505 break;
506 default:
507 Assert(false);
508 break;
509 }
510
511 if (found)
512 return found->quickcheck;
513 else
514 return UNICODE_NORM_QC_YES;
515 }
516
517 UnicodeNormalizationQC
unicode_is_normalized_quickcheck(UnicodeNormalizationForm form,const pg_wchar * input)518 unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
519 {
520 uint8 lastCanonicalClass = 0;
521 UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
522
523 /*
524 * For the "D" forms, we don't run the quickcheck. We don't include the
525 * lookup tables for those because they are huge, checking for these
526 * particular forms is less common, and running the slow path is faster
527 * for the "D" forms than the "C" forms because you don't need to
528 * recompose, which is slow.
529 */
530 if (form == UNICODE_NFD || form == UNICODE_NFKD)
531 return UNICODE_NORM_QC_MAYBE;
532
533 for (const pg_wchar *p = input; *p; p++)
534 {
535 pg_wchar ch = *p;
536 uint8 canonicalClass;
537 UnicodeNormalizationQC check;
538
539 canonicalClass = get_canonical_class(ch);
540 if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
541 return UNICODE_NORM_QC_NO;
542
543 check = qc_is_allowed(form, ch);
544 if (check == UNICODE_NORM_QC_NO)
545 return UNICODE_NORM_QC_NO;
546 else if (check == UNICODE_NORM_QC_MAYBE)
547 result = UNICODE_NORM_QC_MAYBE;
548
549 lastCanonicalClass = canonicalClass;
550 }
551 return result;
552 }
553
554 #endif /* !FRONTEND */
555