1 /*-------------------------------------------------------------------------
2  * unicode_norm.c
3  *		Normalize a Unicode string to NFKC form
4  *
5  * This implements Unicode normalization, per the documentation at
6  * http://www.unicode.org/reports/tr15/.
7  *
8  * Portions Copyright (c) 2017, PostgreSQL Global Development Group
9  *
10  * IDENTIFICATION
11  *	  src/common/unicode_norm.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #ifndef FRONTEND
16 #include "postgres.h"
17 #else
18 #include "postgres_fe.h"
19 #endif
20 
21 #include "common/unicode_norm.h"
22 #include "common/unicode_norm_table.h"
23 
24 #ifndef FRONTEND
25 #define ALLOC(size) palloc(size)
26 #define FREE(size) pfree(size)
27 #else
28 #define ALLOC(size) malloc(size)
29 #define FREE(size) free(size)
30 #endif
31 
32 /* Constants for calculations with Hangul characters */
33 #define SBASE		0xAC00		/* U+AC00 */
34 #define LBASE		0x1100		/* U+1100 */
35 #define VBASE		0x1161		/* U+1161 */
36 #define TBASE		0x11A7		/* U+11A7 */
37 #define LCOUNT		19
38 #define VCOUNT		21
39 #define TCOUNT		28
40 #define NCOUNT		VCOUNT * TCOUNT
41 #define SCOUNT		LCOUNT * NCOUNT
42 
43 /* comparison routine for bsearch() of decomposition lookup table. */
44 static int
conv_compare(const void * p1,const void * p2)45 conv_compare(const void *p1, const void *p2)
46 {
47 	uint32		v1,
48 				v2;
49 
50 	v1 = *(const uint32 *) p1;
51 	v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
52 	return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
53 }
54 
55 /*
56  * Get the entry corresponding to code in the decomposition lookup table.
57  */
58 static pg_unicode_decomposition *
get_code_entry(pg_wchar code)59 get_code_entry(pg_wchar code)
60 {
61 	return bsearch(&(code),
62 				   (void *) UnicodeDecompMain,
63 				   lengthof(UnicodeDecompMain),
64 				   sizeof(pg_unicode_decomposition),
65 				   conv_compare);
66 }
67 
68 /*
69  * Given a decomposition entry looked up earlier, get the decomposed
70  * characters.
71  *
72  * Note: the returned pointer can point to statically allocated buffer, and
73  * is only valid until next call to this function!
74  */
75 static const pg_wchar *
get_code_decomposition(pg_unicode_decomposition * entry,int * dec_size)76 get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
77 {
78 	static pg_wchar x;
79 
80 	if (DECOMPOSITION_IS_INLINE(entry))
81 	{
82 		Assert(DECOMPOSITION_SIZE(entry) == 1);
83 		x = (pg_wchar) entry->dec_index;
84 		*dec_size = 1;
85 		return &x;
86 	}
87 	else
88 	{
89 		*dec_size = DECOMPOSITION_SIZE(entry);
90 		return &UnicodeDecomp_codepoints[entry->dec_index];
91 	}
92 }
93 
94 /*
95  * Calculate how many characters a given character will decompose to.
96  *
97  * This needs to recurse, if the character decomposes into characters that
98  * are, in turn, decomposable.
99  */
100 static int
get_decomposed_size(pg_wchar code)101 get_decomposed_size(pg_wchar code)
102 {
103 	pg_unicode_decomposition *entry;
104 	int			size = 0;
105 	int			i;
106 	const uint32 *decomp;
107 	int			dec_size;
108 
109 	/*
110 	 * Fast path for Hangul characters not stored in tables to save memory as
111 	 * decomposition is algorithmic. See
112 	 * http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
113 	 * the matter.
114 	 */
115 	if (code >= SBASE && code < SBASE + SCOUNT)
116 	{
117 		uint32		tindex,
118 					sindex;
119 
120 		sindex = code - SBASE;
121 		tindex = sindex % TCOUNT;
122 
123 		if (tindex != 0)
124 			return 3;
125 		return 2;
126 	}
127 
128 	entry = get_code_entry(code);
129 
130 	/*
131 	 * Just count current code if no other decompositions.  A NULL entry is
132 	 * equivalent to a character with class 0 and no decompositions.
133 	 */
134 	if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
135 		return 1;
136 
137 	/*
138 	 * If this entry has other decomposition codes look at them as well. First
139 	 * get its decomposition in the list of tables available.
140 	 */
141 	decomp = get_code_decomposition(entry, &dec_size);
142 	for (i = 0; i < dec_size; i++)
143 	{
144 		uint32		lcode = decomp[i];
145 
146 		size += get_decomposed_size(lcode);
147 	}
148 
149 	return size;
150 }
151 
152 /*
153  * Recompose a set of characters. For hangul characters, the calculation
154  * is algorithmic. For others, an inverse lookup at the decomposition
155  * table is necessary. Returns true if a recomposition can be done, and
156  * false otherwise.
157  */
158 static bool
recompose_code(uint32 start,uint32 code,uint32 * result)159 recompose_code(uint32 start, uint32 code, uint32 *result)
160 {
161 	/*
162 	 * Handle Hangul characters algorithmically, per the Unicode spec.
163 	 *
164 	 * Check if two current characters are L and V.
165 	 */
166 	if (start >= LBASE && start < LBASE + LCOUNT &&
167 		code >= VBASE && code < VBASE + VCOUNT)
168 	{
169 		/* make syllable of form LV */
170 		uint32		lindex = start - LBASE;
171 		uint32		vindex = code - VBASE;
172 
173 		*result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
174 		return true;
175 	}
176 	/* Check if two current characters are LV and T */
177 	else if (start >= SBASE && start < (SBASE + SCOUNT) &&
178 			 ((start - SBASE) % TCOUNT) == 0 &&
179 			 code >= TBASE && code < (TBASE + TCOUNT))
180 	{
181 		/* make syllable of from LVT */
182 		uint32		tindex = code - TBASE;
183 
184 		*result = start + tindex;
185 		return true;
186 	}
187 	else
188 	{
189 		int			i;
190 
191 		/*
192 		 * Do an inverse lookup of the decomposition tables to see if anything
193 		 * matches. The comparison just needs to be a perfect match on the
194 		 * sub-table of size two, because the start character has already been
195 		 * recomposed partially.
196 		 */
197 		for (i = 0; i < lengthof(UnicodeDecompMain); i++)
198 		{
199 			const pg_unicode_decomposition *entry = &UnicodeDecompMain[i];
200 
201 			if (DECOMPOSITION_SIZE(entry) != 2)
202 				continue;
203 
204 			if (DECOMPOSITION_NO_COMPOSE(entry))
205 				continue;
206 
207 			if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
208 				code == UnicodeDecomp_codepoints[entry->dec_index + 1])
209 			{
210 				*result = entry->codepoint;
211 				return true;
212 			}
213 		}
214 	}
215 
216 	return false;
217 }
218 
219 /*
220  * Decompose the given code into the array given by caller. The
221  * decomposition begins at the position given by caller, saving one
222  * lookup on the decomposition table. The current position needs to be
223  * updated here to let the caller know from where to continue filling
224  * in the array result.
225  */
226 static void
decompose_code(pg_wchar code,pg_wchar ** result,int * current)227 decompose_code(pg_wchar code, pg_wchar **result, int *current)
228 {
229 	pg_unicode_decomposition *entry;
230 	int			i;
231 	const uint32 *decomp;
232 	int			dec_size;
233 
234 	/*
235 	 * Fast path for Hangul characters not stored in tables to save memory as
236 	 * decomposition is algorithmic. See
237 	 * http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
238 	 * the matter.
239 	 */
240 	if (code >= SBASE && code < SBASE + SCOUNT)
241 	{
242 		uint32		l,
243 					v,
244 					tindex,
245 					sindex;
246 		pg_wchar   *res = *result;
247 
248 		sindex = code - SBASE;
249 		l = LBASE + sindex / (VCOUNT * TCOUNT);
250 		v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
251 		tindex = sindex % TCOUNT;
252 
253 		res[*current] = l;
254 		(*current)++;
255 		res[*current] = v;
256 		(*current)++;
257 
258 		if (tindex != 0)
259 		{
260 			res[*current] = TBASE + tindex;
261 			(*current)++;
262 		}
263 
264 		return;
265 	}
266 
267 	entry = get_code_entry(code);
268 
269 	/*
270 	 * Just fill in with the current decomposition if there are no
271 	 * decomposition codes to recurse to.  A NULL entry is equivalent to a
272 	 * character with class 0 and no decompositions, so just leave also in
273 	 * this case.
274 	 */
275 	if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
276 	{
277 		pg_wchar   *res = *result;
278 
279 		res[*current] = code;
280 		(*current)++;
281 		return;
282 	}
283 
284 	/*
285 	 * If this entry has other decomposition codes look at them as well.
286 	 */
287 	decomp = get_code_decomposition(entry, &dec_size);
288 	for (i = 0; i < dec_size; i++)
289 	{
290 		pg_wchar	lcode = (pg_wchar) decomp[i];
291 
292 		/* Leave if no more decompositions */
293 		decompose_code(lcode, result, current);
294 	}
295 }
296 
297 /*
298  * unicode_normalize_kc - Normalize a Unicode string to NFKC form.
299  *
300  * The input is a 0-terminated array of codepoints.
301  *
302  * In frontend, returns a 0-terminated array of codepoints, allocated with
303  * malloc. Or NULL if we run out of memory. In backend, the returned
304  * string is palloc'd instead, and OOM is reported with ereport().
305  */
306 pg_wchar *
unicode_normalize_kc(const pg_wchar * input)307 unicode_normalize_kc(const pg_wchar *input)
308 {
309 	pg_wchar   *decomp_chars;
310 	pg_wchar   *recomp_chars;
311 	int			decomp_size,
312 				current_size;
313 	int			count;
314 	const pg_wchar *p;
315 
316 	/* variables for recomposition */
317 	int			last_class;
318 	int			starter_pos;
319 	int			target_pos;
320 	uint32		starter_ch;
321 
322 	/* First, do character decomposition */
323 
324 	/*
325 	 * Calculate how many characters long the decomposed version will be.
326 	 */
327 	decomp_size = 0;
328 	for (p = input; *p; p++)
329 		decomp_size += get_decomposed_size(*p);
330 
331 	decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
332 	if (decomp_chars == NULL)
333 		return NULL;
334 
335 	/*
336 	 * Now fill in each entry recursively. This needs a second pass on the
337 	 * decomposition table.
338 	 */
339 	current_size = 0;
340 	for (p = input; *p; p++)
341 		decompose_code(*p, &decomp_chars, &current_size);
342 	decomp_chars[decomp_size] = '\0';
343 	Assert(decomp_size == current_size);
344 
345 	/*
346 	 * Now apply canonical ordering.
347 	 */
348 	for (count = 1; count < decomp_size; count++)
349 	{
350 		pg_wchar	prev = decomp_chars[count - 1];
351 		pg_wchar	next = decomp_chars[count];
352 		pg_wchar	tmp;
353 		pg_unicode_decomposition *prevEntry = get_code_entry(prev);
354 		pg_unicode_decomposition *nextEntry = get_code_entry(next);
355 
356 		/*
357 		 * If no entries are found, the character used is either an Hangul
358 		 * character or a character with a class of 0 and no decompositions,
359 		 * so move to next result.
360 		 */
361 		if (prevEntry == NULL || nextEntry == NULL)
362 			continue;
363 
364 		/*
365 		 * Per Unicode (http://unicode.org/reports/tr15/tr15-18.html) annex 4,
366 		 * a sequence of two adjacent characters in a string is an
367 		 * exchangeable pair if the combining class (from the Unicode
368 		 * Character Database) for the first character is greater than the
369 		 * combining class for the second, and the second is not a starter.  A
370 		 * character is a starter if its combining class is 0.
371 		 */
372 		if (nextEntry->comb_class == 0x0 || prevEntry->comb_class == 0x0)
373 			continue;
374 
375 		if (prevEntry->comb_class <= nextEntry->comb_class)
376 			continue;
377 
378 		/* exchange can happen */
379 		tmp = decomp_chars[count - 1];
380 		decomp_chars[count - 1] = decomp_chars[count];
381 		decomp_chars[count] = tmp;
382 
383 		/* backtrack to check again */
384 		if (count > 1)
385 			count -= 2;
386 	}
387 
388 	/*
389 	 * The last phase of NFKC is the recomposition of the reordered Unicode
390 	 * string using combining classes. The recomposed string cannot be longer
391 	 * than the decomposed one, so make the allocation of the output string
392 	 * based on that assumption.
393 	 */
394 	recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
395 	if (!recomp_chars)
396 	{
397 		FREE(decomp_chars);
398 		return NULL;
399 	}
400 
401 	last_class = -1;			/* this eliminates a special check */
402 	starter_pos = 0;
403 	target_pos = 1;
404 	starter_ch = recomp_chars[0] = decomp_chars[0];
405 
406 	for (count = 1; count < decomp_size; count++)
407 	{
408 		pg_wchar	ch = decomp_chars[count];
409 		pg_unicode_decomposition *ch_entry = get_code_entry(ch);
410 		int			ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class;
411 		pg_wchar	composite;
412 
413 		if (last_class < ch_class &&
414 			recompose_code(starter_ch, ch, &composite))
415 		{
416 			recomp_chars[starter_pos] = composite;
417 			starter_ch = composite;
418 		}
419 		else if (ch_class == 0)
420 		{
421 			starter_pos = target_pos;
422 			starter_ch = ch;
423 			last_class = -1;
424 			recomp_chars[target_pos++] = ch;
425 		}
426 		else
427 		{
428 			last_class = ch_class;
429 			recomp_chars[target_pos++] = ch;
430 		}
431 	}
432 	recomp_chars[target_pos] = (pg_wchar) '\0';
433 
434 	FREE(decomp_chars);
435 
436 	return recomp_chars;
437 }
438