1 /*
2  * Copyright (c) 2014 SGI.
3  * All rights reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 
19 /* Generator for a compact trie for unicode normalization */
20 
21 #include <sys/types.h>
22 #include <stddef.h>
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <assert.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <errno.h>
29 
30 /* Default names of the in- and output files. */
31 
32 #define AGE_NAME	"DerivedAge.txt"
33 #define CCC_NAME	"DerivedCombiningClass.txt"
34 #define PROP_NAME	"DerivedCoreProperties.txt"
35 #define DATA_NAME	"UnicodeData.txt"
36 #define FOLD_NAME	"CaseFolding.txt"
37 #define NORM_NAME	"NormalizationCorrections.txt"
38 #define TEST_NAME	"NormalizationTest.txt"
39 #define UTF8_NAME	"utf8data.h"
40 
41 const char	*age_name  = AGE_NAME;
42 const char	*ccc_name  = CCC_NAME;
43 const char	*prop_name = PROP_NAME;
44 const char	*data_name = DATA_NAME;
45 const char	*fold_name = FOLD_NAME;
46 const char	*norm_name = NORM_NAME;
47 const char	*test_name = TEST_NAME;
48 const char	*utf8_name = UTF8_NAME;
49 
50 int verbose = 0;
51 
52 /* An arbitrary line size limit on input lines. */
53 
54 #define LINESIZE	1024
55 char line[LINESIZE];
56 char buf0[LINESIZE];
57 char buf1[LINESIZE];
58 char buf2[LINESIZE];
59 char buf3[LINESIZE];
60 
61 const char *argv0;
62 
63 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
64 
65 /* ------------------------------------------------------------------ */
66 
67 /*
68  * Unicode version numbers consist of three parts: major, minor, and a
69  * revision.  These numbers are packed into an unsigned int to obtain
70  * a single version number.
71  *
72  * To save space in the generated trie, the unicode version is not
73  * stored directly, instead we calculate a generation number from the
74  * unicode versions seen in the DerivedAge file, and use that as an
75  * index into a table of unicode versions.
76  */
77 #define UNICODE_MAJ_SHIFT		(16)
78 #define UNICODE_MIN_SHIFT		(8)
79 
80 #define UNICODE_MAJ_MAX			((unsigned short)-1)
81 #define UNICODE_MIN_MAX			((unsigned char)-1)
82 #define UNICODE_REV_MAX			((unsigned char)-1)
83 
84 #define UNICODE_AGE(MAJ,MIN,REV)			\
85 	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
86 	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
87 	 ((unsigned int)(REV)))
88 
89 unsigned int *ages;
90 int ages_count;
91 
92 unsigned int unicode_maxage;
93 
age_valid(unsigned int major,unsigned int minor,unsigned int revision)94 static int age_valid(unsigned int major, unsigned int minor,
95 		     unsigned int revision)
96 {
97 	if (major > UNICODE_MAJ_MAX)
98 		return 0;
99 	if (minor > UNICODE_MIN_MAX)
100 		return 0;
101 	if (revision > UNICODE_REV_MAX)
102 		return 0;
103 	return 1;
104 }
105 
106 /* ------------------------------------------------------------------ */
107 
108 /*
109  * utf8trie_t
110  *
111  * A compact binary tree, used to decode UTF-8 characters.
112  *
113  * Internal nodes are one byte for the node itself, and up to three
114  * bytes for an offset into the tree.  The first byte contains the
115  * following information:
116  *  NEXTBYTE  - flag        - advance to next byte if set
117  *  BITNUM    - 3 bit field - the bit number to be tested
118  *  OFFLEN    - 2 bit field - number of bytes in the offset
119  * if offlen == 0 (non-branching node)
120  *  RIGHTPATH - 1 bit field - set if the following node is for the
121  *                            right-hand path (tested bit is set)
122  *  TRIENODE  - 1 bit field - set if the following node is an internal
123  *                            node, otherwise it is a leaf node
124  * if offlen != 0 (branching node)
125  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
126  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
127  *
128  * Due to the way utf8 works, there cannot be branching nodes with
129  * NEXTBYTE set, and moreover those nodes always have a righthand
130  * descendant.
131  */
132 typedef unsigned char utf8trie_t;
133 #define BITNUM		0x07
134 #define NEXTBYTE	0x08
135 #define OFFLEN		0x30
136 #define OFFLEN_SHIFT	4
137 #define RIGHTPATH	0x40
138 #define TRIENODE	0x80
139 #define RIGHTNODE	0x40
140 #define LEFTNODE	0x80
141 
142 /*
143  * utf8leaf_t
144  *
145  * The leaves of the trie are embedded in the trie, and so the same
146  * underlying datatype, unsigned char.
147  *
148  * leaf[0]: The unicode version, stored as a generation number that is
149  *          an index into utf8agetab[].  With this we can filter code
150  *          points based on the unicode version in which they were
151  *          defined.  The CCC of a non-defined code point is 0.
152  * leaf[1]: Canonical Combining Class. During normalization, we need
153  *          to do a stable sort into ascending order of all characters
154  *          with a non-zero CCC that occur between two characters with
155  *          a CCC of 0, or at the beginning or end of a string.
156  *          The unicode standard guarantees that all CCC values are
157  *          between 0 and 254 inclusive, which leaves 255 available as
158  *          a special value.
159  *          Code points with CCC 0 are known as stoppers.
160  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
161  *          start of a NUL-terminated string that is the decomposition
162  *          of the character.
163  *          The CCC of a decomposable character is the same as the CCC
164  *          of the first character of its decomposition.
165  *          Some characters decompose as the empty string: these are
166  *          characters with the Default_Ignorable_Code_Point property.
167  *          These do affect normalization, as they all have CCC 0.
168  *
169  * The decompositions in the trie have been fully expanded.
170  *
171  * Casefolding, if applicable, is also done using decompositions.
172  */
173 typedef unsigned char utf8leaf_t;
174 
175 #define LEAF_GEN(LEAF)	((LEAF)[0])
176 #define LEAF_CCC(LEAF)	((LEAF)[1])
177 #define LEAF_STR(LEAF)	((const char*)((LEAF) + 2))
178 
179 #define MAXGEN		(255)
180 
181 #define MINCCC		(0)
182 #define MAXCCC		(254)
183 #define STOPPER		(0)
184 #define DECOMPOSE	(255)
185 #define HANGUL		((char)(255))
186 
187 #define UTF8HANGULLEAF	(12)
188 
189 struct tree;
190 static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
191 			       const char *, size_t);
192 static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
193 
194 unsigned char *utf8data;
195 size_t utf8data_size;
196 
197 utf8trie_t *nfdi;
198 utf8trie_t *nfdicf;
199 
200 /* ------------------------------------------------------------------ */
201 
202 /*
203  * UTF8 valid ranges.
204  *
205  * The UTF-8 encoding spreads the bits of a 32bit word over several
206  * bytes. This table gives the ranges that can be held and how they'd
207  * be represented.
208  *
209  * 0x00000000 0x0000007F: 0xxxxxxx
210  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
211  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
212  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
213  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
214  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
215  *
216  * There is an additional requirement on UTF-8, in that only the
217  * shortest representation of a 32bit value is to be used.  A decoder
218  * must not decode sequences that do not satisfy this requirement.
219  * Thus the allowed ranges have a lower bound.
220  *
221  * 0x00000000 0x0000007F: 0xxxxxxx
222  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
223  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
224  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
225  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
226  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
227  *
228  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
229  * 17 planes of 65536 values.  This limits the sequences actually seen
230  * even more, to just the following.
231  *
232  *          0 -     0x7f: 0                     0x7f
233  *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
234  *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
235  *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
236  *
237  * Even within those ranges not all values are allowed: the surrogates
238  * 0xd800 - 0xdfff should never be seen.
239  *
240  * Note that the longest sequence seen with valid usage is 4 bytes,
241  * the same as a single UTF-32 character.  This makes the UTF-8
242  * representation of Unicode never larger than UTF-32.
243  *
244  * The shortest sequence requirement was introduced by:
245  *    Corrigendum #1: UTF-8 Shortest Form
246  * It can be found here:
247  *    http://www.unicode.org/versions/corrigendum1.html
248  *
249  */
250 
/*
 * Tag bits that utf8encode() ORs into the lead byte of a 2-, 3- or
 * 4-byte sequence (UTF8_n_BITS) and into every continuation byte
 * (UTF8_N_BITS).
 */
251 #define UTF8_2_BITS     0xC0
252 #define UTF8_3_BITS     0xE0
253 #define UTF8_4_BITS     0xF0
254 #define UTF8_N_BITS     0x80
/*
 * NOTE(review): the *_MASK values below are not used in the visible
 * part of the file; presumably they select the tag bits of a byte so
 * it can be compared against the matching *_BITS value - confirm.
 */
255 #define UTF8_2_MASK     0xE0
256 #define UTF8_3_MASK     0xF0
257 #define UTF8_4_MASK     0xF8
258 #define UTF8_N_MASK     0xC0
/* Payload of a continuation byte: the low 6 bits, consumed 6 at a time. */
259 #define UTF8_V_MASK     0x3F
260 #define UTF8_V_SHIFT    6
261 
utf8encode(char * str,unsigned int val)262 static int utf8encode(char *str, unsigned int val)
263 {
264 	int len;
265 
266 	if (val < 0x80) {
267 		str[0] = val;
268 		len = 1;
269 	} else if (val < 0x800) {
270 		str[1] = val & UTF8_V_MASK;
271 		str[1] |= UTF8_N_BITS;
272 		val >>= UTF8_V_SHIFT;
273 		str[0] = val;
274 		str[0] |= UTF8_2_BITS;
275 		len = 2;
276 	} else if (val < 0x10000) {
277 		str[2] = val & UTF8_V_MASK;
278 		str[2] |= UTF8_N_BITS;
279 		val >>= UTF8_V_SHIFT;
280 		str[1] = val & UTF8_V_MASK;
281 		str[1] |= UTF8_N_BITS;
282 		val >>= UTF8_V_SHIFT;
283 		str[0] = val;
284 		str[0] |= UTF8_3_BITS;
285 		len = 3;
286 	} else if (val < 0x110000) {
287 		str[3] = val & UTF8_V_MASK;
288 		str[3] |= UTF8_N_BITS;
289 		val >>= UTF8_V_SHIFT;
290 		str[2] = val & UTF8_V_MASK;
291 		str[2] |= UTF8_N_BITS;
292 		val >>= UTF8_V_SHIFT;
293 		str[1] = val & UTF8_V_MASK;
294 		str[1] |= UTF8_N_BITS;
295 		val >>= UTF8_V_SHIFT;
296 		str[0] = val;
297 		str[0] |= UTF8_4_BITS;
298 		len = 4;
299 	} else {
300 		printf("%#x: illegal val\n", val);
301 		len = 0;
302 	}
303 	return len;
304 }
305 
utf8decode(const char * str)306 static unsigned int utf8decode(const char *str)
307 {
308 	const unsigned char *s = (const unsigned char*)str;
309 	unsigned int unichar = 0;
310 
311 	if (*s < 0x80) {
312 		unichar = *s;
313 	} else if (*s < UTF8_3_BITS) {
314 		unichar = *s++ & 0x1F;
315 		unichar <<= UTF8_V_SHIFT;
316 		unichar |= *s & 0x3F;
317 	} else if (*s < UTF8_4_BITS) {
318 		unichar = *s++ & 0x0F;
319 		unichar <<= UTF8_V_SHIFT;
320 		unichar |= *s++ & 0x3F;
321 		unichar <<= UTF8_V_SHIFT;
322 		unichar |= *s & 0x3F;
323 	} else {
324 		unichar = *s++ & 0x0F;
325 		unichar <<= UTF8_V_SHIFT;
326 		unichar |= *s++ & 0x3F;
327 		unichar <<= UTF8_V_SHIFT;
328 		unichar |= *s++ & 0x3F;
329 		unichar <<= UTF8_V_SHIFT;
330 		unichar |= *s & 0x3F;
331 	}
332 	return unichar;
333 }
334 
/*
 * Return 1 if unichar lies within the unicode code space
 * (0 - 0x10FFFF), 0 otherwise.
 */
static int utf32valid(unsigned int unichar)
{
	return unichar <= 0x10FFFF;
}
339 
/* True if the code point U lies in the range 0xAC00 - 0xD7A3. */
340 #define HANGUL_SYLLABLE(U)	((U) >= 0xAC00 && (U) <= 0xD7A3)
341 
/*
 * Values stored in tree->childnode and in node->leftnode /
 * node->rightnode to tell whether the corresponding pointer refers
 * to an internal struct node or to a leaf.
 */
342 #define NODE 1
343 #define LEAF 0
344 
/*
 * struct tree - one in-memory trie under construction.
 *
 * root:       root of the trie; a struct node when childnode is NODE,
 *             a leaf when childnode is LEAF (insert() sets this when
 *             the whole tree collapses into a single leaf).
 * childnode:  NODE or LEAF, see root.
 * type:       name of the tree, printed together with maxage as
 *             "<type>_<maxage>" in diagnostics.
 * maxage:     printed in hex in diagnostics.
 *             NOTE(review): presumably the highest unicode age this
 *             tree covers - confirm against the code filling it in.
 * next:       next tree; size_nodes() searches it for a counterpart
 *             of a right child that is not marked in this tree.
 * leaf_equal: returns nonzero if two leaves are identical, which
 *             allows insert() and prune() to merge them.
 * leaf_print: print a leaf at the given indent (tree_walk()).
 * leaf_mark:  returns nonzero if the path to the leaf must be marked
 *             (mark_nodes()).
 * leaf_size:  number of index units the leaf occupies (index_nodes()).
 * leaf_index: returns a pointer to the int holding the leaf's index.
 * leaf_emit:  NOTE(review): not used in the visible part of the
 *             file; presumably writes a leaf to the output - confirm.
 * leafindex:  0x110000 ints.
 *             NOTE(review): presumably one index slot per code
 *             point, handed out through leaf_index - confirm.
 * index:      start index of this tree, set by index_nodes() after
 *             aligning to a multiple of 64.
 */
345 struct tree {
346 	void *root;
347 	int childnode;
348 	const char *type;
349 	unsigned int maxage;
350 	struct tree *next;
351 	int (*leaf_equal)(void *, void *);
352 	void (*leaf_print)(void *, int);
353 	int (*leaf_mark)(void *);
354 	int (*leaf_size)(void *);
355 	int *(*leaf_index)(struct tree *, void *);
356 	unsigned char *(*leaf_emit)(void *, unsigned char *);
357 	int leafindex[0x110000];
358 	int index;
359 };
360 
/*
 * struct node - internal node of an in-memory trie.
 *
 * index:     index assigned to the node by index_nodes().
 * offset:    starts out as -1; size_nodes() stores the distance from
 *            this node's index to that of its right-hand child.
 * mark:      nonzero once mark_nodes() or mark_subtree() has selected
 *            the node; unmarked nodes are skipped when indexing.
 * size:      starts out as 4; recomputed by size_nodes().
 * parent:    parent node, NULL for the root.
 * left:      child followed when the tested bit is clear.
 * right:     child followed when the tested bit is set.
 * bitnum:    number of the bit tested at this node; the low three
 *            bits select the bit within the current key byte.
 * nextbyte:  nonzero if the key pointer must advance to the next
 *            byte before the bit is tested.
 * leftnode:  NODE or LEAF, the kind of object left points to.
 * rightnode: NODE or LEAF, the kind of object right points to.
 * keybits:   together with keymask describes the choices made along
 * keymask:   a chain of single-child nodes below this node; both are
 *            0 when no such description applies.
 */
361 struct node {
362 	int index;
363 	int offset;
364 	int mark;
365 	int size;
366 	struct node *parent;
367 	void *left;
368 	void *right;
369 	unsigned char bitnum;
370 	unsigned char nextbyte;
371 	unsigned char leftnode;
372 	unsigned char rightnode;
373 	unsigned int keybits;
374 	unsigned int keymask;
375 };
376 
377 /*
378  * Example lookup function for a tree.
379  */
lookup(struct tree * tree,const char * key)380 static void *lookup(struct tree *tree, const char *key)
381 {
382 	struct node *node;
383 	void *leaf = NULL;
384 
385 	node = tree->root;
386 	while (!leaf && node) {
387 		if (node->nextbyte)
388 			key++;
389 		if (*key & (1 << (node->bitnum & 7))) {
390 			/* Right leg */
391 			if (node->rightnode == NODE) {
392 				node = node->right;
393 			} else if (node->rightnode == LEAF) {
394 				leaf = node->right;
395 			} else {
396 				node = NULL;
397 			}
398 		} else {
399 			/* Left leg */
400 			if (node->leftnode == NODE) {
401 				node = node->left;
402 			} else if (node->leftnode == LEAF) {
403 				leaf = node->left;
404 			} else {
405 				node = NULL;
406 			}
407 		}
408 	}
409 
410 	return leaf;
411 }
412 
413 /*
414  * A simple non-recursive tree walker: keep track of visits to the
415  * left and right branches in the leftmask and rightmask.
416  */
tree_walk(struct tree * tree)417 static void tree_walk(struct tree *tree)
418 {
419 	struct node *node;
420 	unsigned int leftmask;
421 	unsigned int rightmask;
422 	unsigned int bitmask;
423 	int indent = 1;
424 	int nodes, singletons, leaves;
425 
426 	nodes = singletons = leaves = 0;
427 
428 	printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
429 	if (tree->childnode == LEAF) {
430 		assert(tree->root);
431 		tree->leaf_print(tree->root, indent);
432 		leaves = 1;
433 	} else {
434 		assert(tree->childnode == NODE);
435 		node = tree->root;
436 		leftmask = rightmask = 0;
437 		while (node) {
438 			printf("%*snode @ %p bitnum %d nextbyte %d"
439 			       " left %p right %p mask %x bits %x\n",
440 				indent, "", node,
441 				node->bitnum, node->nextbyte,
442 				node->left, node->right,
443 				node->keymask, node->keybits);
444 			nodes += 1;
445 			if (!(node->left && node->right))
446 				singletons += 1;
447 
448 			while (node) {
449 				bitmask = 1 << node->bitnum;
450 				if ((leftmask & bitmask) == 0) {
451 					leftmask |= bitmask;
452 					if (node->leftnode == LEAF) {
453 						assert(node->left);
454 						tree->leaf_print(node->left,
455 								 indent+1);
456 						leaves += 1;
457 					} else if (node->left) {
458 						assert(node->leftnode == NODE);
459 						indent += 1;
460 						node = node->left;
461 						break;
462 					}
463 				}
464 				if ((rightmask & bitmask) == 0) {
465 					rightmask |= bitmask;
466 					if (node->rightnode == LEAF) {
467 						assert(node->right);
468 						tree->leaf_print(node->right,
469 								 indent+1);
470 						leaves += 1;
471 					} else if (node->right) {
472 						assert(node->rightnode == NODE);
473 						indent += 1;
474 						node = node->right;
475 						break;
476 					}
477 				}
478 				leftmask &= ~bitmask;
479 				rightmask &= ~bitmask;
480 				node = node->parent;
481 				indent -= 1;
482 			}
483 		}
484 	}
485 	printf("nodes %d leaves %d singletons %d\n",
486 	       nodes, leaves, singletons);
487 }
488 
489 /*
490  * Allocate an initialize a new internal node.
491  */
alloc_node(struct node * parent)492 static struct node *alloc_node(struct node *parent)
493 {
494 	struct node *node;
495 	int bitnum;
496 
497 	node = malloc(sizeof(*node));
498 	node->left = node->right = NULL;
499 	node->parent = parent;
500 	node->leftnode = NODE;
501 	node->rightnode = NODE;
502 	node->keybits = 0;
503 	node->keymask = 0;
504 	node->mark = 0;
505 	node->index = 0;
506 	node->offset = -1;
507 	node->size = 4;
508 
509 	if (node->parent) {
510 		bitnum = parent->bitnum;
511 		if ((bitnum & 7) == 0) {
512 			node->bitnum = bitnum + 7 + 8;
513 			node->nextbyte = 1;
514 		} else {
515 			node->bitnum = bitnum - 1;
516 			node->nextbyte = 0;
517 		}
518 	} else {
519 		node->bitnum = 7;
520 		node->nextbyte = 0;
521 	}
522 
523 	return node;
524 }
525 
526 /*
527  * Insert a new leaf into the tree, and collapse any subtrees that are
528  * fully populated and end in identical leaves. A nextbyte tagged
529  * internal node will not be removed to preserve the tree's integrity.
530  * Note that due to the structure of utf8, no nextbyte tagged node
531  * will be a candidate for removal.
532  */
insert(struct tree * tree,char * key,int keylen,void * leaf)533 static int insert(struct tree *tree, char *key, int keylen, void *leaf)
534 {
535 	struct node *node;
536 	struct node *parent;
537 	void **cursor;
538 	int keybits;
539 
540 	assert(keylen >= 1 && keylen <= 4);
541 
542 	node = NULL;
543 	cursor = &tree->root;
544 	keybits = 8 * keylen;
545 
546 	/* Insert, creating path along the way. */
547 	while (keybits) {
548 		if (!*cursor)
549 			*cursor = alloc_node(node);
550 		node = *cursor;
551 		if (node->nextbyte)
552 			key++;
553 		if (*key & (1 << (node->bitnum & 7)))
554 			cursor = &node->right;
555 		else
556 			cursor = &node->left;
557 		keybits--;
558 	}
559 	*cursor = leaf;
560 
561 	/* Merge subtrees if possible. */
562 	while (node) {
563 		if (*key & (1 << (node->bitnum & 7)))
564 			node->rightnode = LEAF;
565 		else
566 			node->leftnode = LEAF;
567 		if (node->nextbyte)
568 			break;
569 		if (node->leftnode == NODE || node->rightnode == NODE)
570 			break;
571 		assert(node->left);
572 		assert(node->right);
573 		/* Compare */
574 		if (! tree->leaf_equal(node->left, node->right))
575 			break;
576 		/* Keep left, drop right leaf. */
577 		leaf = node->left;
578 		/* Check in parent */
579 		parent = node->parent;
580 		if (!parent) {
581 			/* root of tree! */
582 			tree->root = leaf;
583 			tree->childnode = LEAF;
584 		} else if (parent->left == node) {
585 			parent->left = leaf;
586 			parent->leftnode = LEAF;
587 			if (parent->right) {
588 				parent->keymask = 0;
589 				parent->keybits = 0;
590 			} else {
591 				parent->keymask |= (1 << node->bitnum);
592 			}
593 		} else if (parent->right == node) {
594 			parent->right = leaf;
595 			parent->rightnode = LEAF;
596 			if (parent->left) {
597 				parent->keymask = 0;
598 				parent->keybits = 0;
599 			} else {
600 				parent->keymask |= (1 << node->bitnum);
601 				parent->keybits |= (1 << node->bitnum);
602 			}
603 		} else {
604 			/* internal tree error */
605 			assert(0);
606 		}
607 		free(node);
608 		node = parent;
609 	}
610 
611 	/* Propagate keymasks up along singleton chains. */
612 	while (node) {
613 		parent = node->parent;
614 		if (!parent)
615 			break;
616 		/* Nix the mask for parents with two children. */
617 		if (node->keymask == 0) {
618 			parent->keymask = 0;
619 			parent->keybits = 0;
620 		} else if (parent->left && parent->right) {
621 			parent->keymask = 0;
622 			parent->keybits = 0;
623 		} else {
624 			assert((parent->keymask & node->keymask) == 0);
625 			parent->keymask |= node->keymask;
626 			parent->keymask |= (1 << parent->bitnum);
627 			parent->keybits |= node->keybits;
628 			if (parent->right)
629 				parent->keybits |= (1 << parent->bitnum);
630 		}
631 		node = parent;
632 	}
633 
634 	return 0;
635 }
636 
637 /*
638  * Prune internal nodes.
639  *
640  * Fully populated subtrees that end at the same leaf have already
641  * been collapsed.  There are still internal nodes that have for both
642  * their left and right branches a sequence of singletons that make
643  * identical choices and end in identical leaves.  The keymask and
644  * keybits collected in the nodes describe the choices made in these
645  * singleton chains.  When they are identical for the left and right
646  * branch of a node, and the two leaves comare identical, the node in
647  * question can be removed.
648  *
649  * Note that nodes with the nextbyte tag set will not be removed by
650  * this to ensure tree integrity.  Note as well that the structure of
651  * utf8 ensures that these nodes would not have been candidates for
652  * removal in any case.
653  */
prune(struct tree * tree)654 static void prune(struct tree *tree)
655 {
656 	struct node *node;
657 	struct node *left;
658 	struct node *right;
659 	struct node *parent;
660 	void *leftleaf;
661 	void *rightleaf;
662 	unsigned int leftmask;
663 	unsigned int rightmask;
664 	unsigned int bitmask;
665 	int count;
666 
667 	if (verbose > 0)
668 		printf("Pruning %s_%x\n", tree->type, tree->maxage);
669 
670 	count = 0;
671 	if (tree->childnode == LEAF)
672 		return;
673 	if (!tree->root)
674 		return;
675 
676 	leftmask = rightmask = 0;
677 	node = tree->root;
678 	while (node) {
679 		if (node->nextbyte)
680 			goto advance;
681 		if (node->leftnode == LEAF)
682 			goto advance;
683 		if (node->rightnode == LEAF)
684 			goto advance;
685 		if (!node->left)
686 			goto advance;
687 		if (!node->right)
688 			goto advance;
689 		left = node->left;
690 		right = node->right;
691 		if (left->keymask == 0)
692 			goto advance;
693 		if (right->keymask == 0)
694 			goto advance;
695 		if (left->keymask != right->keymask)
696 			goto advance;
697 		if (left->keybits != right->keybits)
698 			goto advance;
699 		leftleaf = NULL;
700 		while (!leftleaf) {
701 			assert(left->left || left->right);
702 			if (left->leftnode == LEAF)
703 				leftleaf = left->left;
704 			else if (left->rightnode == LEAF)
705 				leftleaf = left->right;
706 			else if (left->left)
707 				left = left->left;
708 			else if (left->right)
709 				left = left->right;
710 			else
711 				assert(0);
712 		}
713 		rightleaf = NULL;
714 		while (!rightleaf) {
715 			assert(right->left || right->right);
716 			if (right->leftnode == LEAF)
717 				rightleaf = right->left;
718 			else if (right->rightnode == LEAF)
719 				rightleaf = right->right;
720 			else if (right->left)
721 				right = right->left;
722 			else if (right->right)
723 				right = right->right;
724 			else
725 				assert(0);
726 		}
727 		if (! tree->leaf_equal(leftleaf, rightleaf))
728 			goto advance;
729 		/*
730 		 * This node has identical singleton-only subtrees.
731 		 * Remove it.
732 		 */
733 		parent = node->parent;
734 		left = node->left;
735 		right = node->right;
736 		if (parent->left == node)
737 			parent->left = left;
738 		else if (parent->right == node)
739 			parent->right = left;
740 		else
741 			assert(0);
742 		left->parent = parent;
743 		left->keymask |= (1 << node->bitnum);
744 		node->left = NULL;
745 		while (node) {
746 			bitmask = 1 << node->bitnum;
747 			leftmask &= ~bitmask;
748 			rightmask &= ~bitmask;
749 			if (node->leftnode == NODE && node->left) {
750 				left = node->left;
751 				free(node);
752 				count++;
753 				node = left;
754 			} else if (node->rightnode == NODE && node->right) {
755 				right = node->right;
756 				free(node);
757 				count++;
758 				node = right;
759 			} else {
760 				node = NULL;
761 			}
762 		}
763 		/* Propagate keymasks up along singleton chains. */
764 		node = parent;
765 		/* Force re-check */
766 		bitmask = 1 << node->bitnum;
767 		leftmask &= ~bitmask;
768 		rightmask &= ~bitmask;
769 		for (;;) {
770 			if (node->left && node->right)
771 				break;
772 			if (node->left) {
773 				left = node->left;
774 				node->keymask |= left->keymask;
775 				node->keybits |= left->keybits;
776 			}
777 			if (node->right) {
778 				right = node->right;
779 				node->keymask |= right->keymask;
780 				node->keybits |= right->keybits;
781 			}
782 			node->keymask |= (1 << node->bitnum);
783 			node = node->parent;
784 			/* Force re-check */
785 			bitmask = 1 << node->bitnum;
786 			leftmask &= ~bitmask;
787 			rightmask &= ~bitmask;
788 		}
789 	advance:
790 		bitmask = 1 << node->bitnum;
791 		if ((leftmask & bitmask) == 0 &&
792 		    node->leftnode == NODE &&
793 		    node->left) {
794 			leftmask |= bitmask;
795 			node = node->left;
796 		} else if ((rightmask & bitmask) == 0 &&
797 			   node->rightnode == NODE &&
798 			   node->right) {
799 			rightmask |= bitmask;
800 			node = node->right;
801 		} else {
802 			leftmask &= ~bitmask;
803 			rightmask &= ~bitmask;
804 			node = node->parent;
805 		}
806 	}
807 	if (verbose > 0)
808 		printf("Pruned %d nodes\n", count);
809 }
810 
811 /*
812  * Mark the nodes in the tree that lead to leaves that must be
813  * emitted.
814  */
mark_nodes(struct tree * tree)815 static void mark_nodes(struct tree *tree)
816 {
817 	struct node *node;
818 	struct node *n;
819 	unsigned int leftmask;
820 	unsigned int rightmask;
821 	unsigned int bitmask;
822 	int marked;
823 
824 	marked = 0;
825 	if (verbose > 0)
826 		printf("Marking %s_%x\n", tree->type, tree->maxage);
827 	if (tree->childnode == LEAF)
828 		goto done;
829 
830 	assert(tree->childnode == NODE);
831 	node = tree->root;
832 	leftmask = rightmask = 0;
833 	while (node) {
834 		bitmask = 1 << node->bitnum;
835 		if ((leftmask & bitmask) == 0) {
836 			leftmask |= bitmask;
837 			if (node->leftnode == LEAF) {
838 				assert(node->left);
839 				if (tree->leaf_mark(node->left)) {
840 					n = node;
841 					while (n && !n->mark) {
842 						marked++;
843 						n->mark = 1;
844 						n = n->parent;
845 					}
846 				}
847 			} else if (node->left) {
848 				assert(node->leftnode == NODE);
849 				node = node->left;
850 				continue;
851 			}
852 		}
853 		if ((rightmask & bitmask) == 0) {
854 			rightmask |= bitmask;
855 			if (node->rightnode == LEAF) {
856 				assert(node->right);
857 				if (tree->leaf_mark(node->right)) {
858 					n = node;
859 					while (n && !n->mark) {
860 						marked++;
861 						n->mark = 1;
862 						n = n->parent;
863 					}
864 				}
865 			} else if (node->right) {
866 				assert(node->rightnode == NODE);
867 				node = node->right;
868 				continue;
869 			}
870 		}
871 		leftmask &= ~bitmask;
872 		rightmask &= ~bitmask;
873 		node = node->parent;
874 	}
875 
876 	/* second pass: left siblings and singletons */
877 
878 	assert(tree->childnode == NODE);
879 	node = tree->root;
880 	leftmask = rightmask = 0;
881 	while (node) {
882 		bitmask = 1 << node->bitnum;
883 		if ((leftmask & bitmask) == 0) {
884 			leftmask |= bitmask;
885 			if (node->leftnode == LEAF) {
886 				assert(node->left);
887 				if (tree->leaf_mark(node->left)) {
888 					n = node;
889 					while (n && !n->mark) {
890 						marked++;
891 						n->mark = 1;
892 						n = n->parent;
893 					}
894 				}
895 			} else if (node->left) {
896 				assert(node->leftnode == NODE);
897 				node = node->left;
898 				if (!node->mark && node->parent->mark) {
899 					marked++;
900 					node->mark = 1;
901 				}
902 				continue;
903 			}
904 		}
905 		if ((rightmask & bitmask) == 0) {
906 			rightmask |= bitmask;
907 			if (node->rightnode == LEAF) {
908 				assert(node->right);
909 				if (tree->leaf_mark(node->right)) {
910 					n = node;
911 					while (n && !n->mark) {
912 						marked++;
913 						n->mark = 1;
914 						n = n->parent;
915 					}
916 				}
917 			} else if (node->right) {
918 				assert(node->rightnode == NODE);
919 				node = node->right;
920 				if (!node->mark && node->parent->mark &&
921 				    !node->parent->left) {
922 					marked++;
923 					node->mark = 1;
924 				}
925 				continue;
926 			}
927 		}
928 		leftmask &= ~bitmask;
929 		rightmask &= ~bitmask;
930 		node = node->parent;
931 	}
932 done:
933 	if (verbose > 0)
934 		printf("Marked %d nodes\n", marked);
935 }
936 
937 /*
938  * Compute the index of each node and leaf, which is the offset in the
939  * emitted trie.  These values must be pre-computed because relative
940  * offsets between nodes are used to navigate the tree.
941  */
index_nodes(struct tree * tree,int index)942 static int index_nodes(struct tree *tree, int index)
943 {
944 	struct node *node;
945 	unsigned int leftmask;
946 	unsigned int rightmask;
947 	unsigned int bitmask;
948 	int count;
949 	int indent;
950 
951 	/* Align to a cache line (or half a cache line?). */
952 	while (index % 64)
953 		index++;
954 	tree->index = index;
955 	indent = 1;
956 	count = 0;
957 
958 	if (verbose > 0)
959 		printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
960 	if (tree->childnode == LEAF) {
961 		index += tree->leaf_size(tree->root);
962 		goto done;
963 	}
964 
965 	assert(tree->childnode == NODE);
966 	node = tree->root;
967 	leftmask = rightmask = 0;
968 	while (node) {
969 		if (!node->mark)
970 			goto skip;
971 		count++;
972 		if (node->index != index)
973 			node->index = index;
974 		index += node->size;
975 skip:
976 		while (node) {
977 			bitmask = 1 << node->bitnum;
978 			if (node->mark && (leftmask & bitmask) == 0) {
979 				leftmask |= bitmask;
980 				if (node->leftnode == LEAF) {
981 					assert(node->left);
982 					*tree->leaf_index(tree, node->left) =
983 									index;
984 					index += tree->leaf_size(node->left);
985 					count++;
986 				} else if (node->left) {
987 					assert(node->leftnode == NODE);
988 					indent += 1;
989 					node = node->left;
990 					break;
991 				}
992 			}
993 			if (node->mark && (rightmask & bitmask) == 0) {
994 				rightmask |= bitmask;
995 				if (node->rightnode == LEAF) {
996 					assert(node->right);
997 					*tree->leaf_index(tree, node->right) = index;
998 					index += tree->leaf_size(node->right);
999 					count++;
1000 				} else if (node->right) {
1001 					assert(node->rightnode == NODE);
1002 					indent += 1;
1003 					node = node->right;
1004 					break;
1005 				}
1006 			}
1007 			leftmask &= ~bitmask;
1008 			rightmask &= ~bitmask;
1009 			node = node->parent;
1010 			indent -= 1;
1011 		}
1012 	}
1013 done:
1014 	/* Round up to a multiple of 16 */
1015 	while (index % 16)
1016 		index++;
1017 	if (verbose > 0)
1018 		printf("Final index %d\n", index);
1019 	return index;
1020 }
1021 
1022 /*
1023  * Mark the nodes in a subtree, helper for size_nodes().
1024  */
mark_subtree(struct node * node)1025 static int mark_subtree(struct node *node)
1026 {
1027 	int changed;
1028 
1029 	if (!node || node->mark)
1030 		return 0;
1031 	node->mark = 1;
1032 	node->index = node->parent->index;
1033 	changed = 1;
1034 	if (node->leftnode == NODE)
1035 		changed += mark_subtree(node->left);
1036 	if (node->rightnode == NODE)
1037 		changed += mark_subtree(node->right);
1038 	return changed;
1039 }
1040 
1041 /*
1042  * Compute the size of nodes and leaves. We start by assuming that
1043  * each node needs to store a three-byte offset. The indexes of the
1044  * nodes are calculated based on that, and then this function is
1045  * called to see if the sizes of some nodes can be reduced.  This is
1046  * repeated until no more changes are seen.
1047  */
size_nodes(struct tree * tree)1048 static int size_nodes(struct tree *tree)
1049 {
1050 	struct tree *next;
1051 	struct node *node;
1052 	struct node *right;
1053 	struct node *n;
1054 	unsigned int leftmask;
1055 	unsigned int rightmask;
1056 	unsigned int bitmask;
1057 	unsigned int pathbits;
1058 	unsigned int pathmask;
1059 	unsigned int nbit;
1060 	int changed;
1061 	int offset;
1062 	int size;
1063 	int indent;
1064 
1065 	indent = 1;
1066 	changed = 0;
1067 	size = 0;
1068 
1069 	if (verbose > 0)
1070 		printf("Sizing %s_%x\n", tree->type, tree->maxage);
1071 	if (tree->childnode == LEAF)
1072 		goto done;
1073 
1074 	assert(tree->childnode == NODE);
1075 	pathbits = 0;
1076 	pathmask = 0;
1077 	node = tree->root;
1078 	leftmask = rightmask = 0;
1079 	while (node) {
1080 		if (!node->mark)
1081 			goto skip;
1082 		offset = 0;
1083 		if (!node->left || !node->right) {
1084 			size = 1;
1085 		} else {
1086 			if (node->rightnode == NODE) {
1087 				/*
1088 				 * If the right node is not marked,
1089 				 * look for a corresponding node in
1090 				 * the next tree.  Such a node need
1091 				 * not exist.
1092 				 */
1093 				right = node->right;
1094 				next = tree->next;
1095 				while (!right->mark) {
1096 					assert(next);
1097 					n = next->root;
1098 					while (n->bitnum != node->bitnum) {
1099 						nbit = 1 << n->bitnum;
1100 						if (!(pathmask & nbit))
1101 							break;
1102 						if (pathbits & nbit) {
1103 							if (n->rightnode == LEAF)
1104 								break;
1105 							n = n->right;
1106 						} else {
1107 							if (n->leftnode == LEAF)
1108 								break;
1109 							n = n->left;
1110 						}
1111 					}
1112 					if (n->bitnum != node->bitnum)
1113 						break;
1114 					n = n->right;
1115 					right = n;
1116 					next = next->next;
1117 				}
1118 				/* Make sure the right node is marked. */
1119 				if (!right->mark)
1120 					changed += mark_subtree(right);
1121 				offset = right->index - node->index;
1122 			} else {
1123 				offset = *tree->leaf_index(tree, node->right);
1124 				offset -= node->index;
1125 			}
1126 			assert(offset >= 0);
1127 			assert(offset <= 0xffffff);
1128 			if (offset <= 0xff) {
1129 				size = 2;
1130 			} else if (offset <= 0xffff) {
1131 				size = 3;
1132 			} else { /* offset <= 0xffffff */
1133 				size = 4;
1134 			}
1135 		}
1136 		if (node->size != size || node->offset != offset) {
1137 			node->size = size;
1138 			node->offset = offset;
1139 			changed++;
1140 		}
1141 skip:
1142 		while (node) {
1143 			bitmask = 1 << node->bitnum;
1144 			pathmask |= bitmask;
1145 			if (node->mark && (leftmask & bitmask) == 0) {
1146 				leftmask |= bitmask;
1147 				if (node->leftnode == LEAF) {
1148 					assert(node->left);
1149 				} else if (node->left) {
1150 					assert(node->leftnode == NODE);
1151 					indent += 1;
1152 					node = node->left;
1153 					break;
1154 				}
1155 			}
1156 			if (node->mark && (rightmask & bitmask) == 0) {
1157 				rightmask |= bitmask;
1158 				pathbits |= bitmask;
1159 				if (node->rightnode == LEAF) {
1160 					assert(node->right);
1161 				} else if (node->right) {
1162 					assert(node->rightnode == NODE);
1163 					indent += 1;
1164 					node = node->right;
1165 					break;
1166 				}
1167 			}
1168 			leftmask &= ~bitmask;
1169 			rightmask &= ~bitmask;
1170 			pathmask &= ~bitmask;
1171 			pathbits &= ~bitmask;
1172 			node = node->parent;
1173 			indent -= 1;
1174 		}
1175 	}
1176 done:
1177 	if (verbose > 0)
1178 		printf("Found %d changes\n", changed);
1179 	return changed;
1180 }
1181 
1182 /*
1183  * Emit a trie for the given tree into the data array.
1184  */
emit(struct tree * tree,unsigned char * data)1185 static void emit(struct tree *tree, unsigned char *data)
1186 {
1187 	struct node *node;
1188 	unsigned int leftmask;
1189 	unsigned int rightmask;
1190 	unsigned int bitmask;
1191 	int offlen;
1192 	int offset;
1193 	int index;
1194 	int indent;
1195 	int size;
1196 	int bytes;
1197 	int leaves;
1198 	int nodes[4];
1199 	unsigned char byte;
1200 
1201 	nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
1202 	leaves = 0;
1203 	bytes = 0;
1204 	index = tree->index;
1205 	data += index;
1206 	indent = 1;
1207 	if (verbose > 0)
1208 		printf("Emitting %s_%x\n", tree->type, tree->maxage);
1209 	if (tree->childnode == LEAF) {
1210 		assert(tree->root);
1211 		tree->leaf_emit(tree->root, data);
1212 		size = tree->leaf_size(tree->root);
1213 		index += size;
1214 		leaves++;
1215 		goto done;
1216 	}
1217 
1218 	assert(tree->childnode == NODE);
1219 	node = tree->root;
1220 	leftmask = rightmask = 0;
1221 	while (node) {
1222 		if (!node->mark)
1223 			goto skip;
1224 		assert(node->offset != -1);
1225 		assert(node->index == index);
1226 
1227 		byte = 0;
1228 		if (node->nextbyte)
1229 			byte |= NEXTBYTE;
1230 		byte |= (node->bitnum & BITNUM);
1231 		if (node->left && node->right) {
1232 			if (node->leftnode == NODE)
1233 				byte |= LEFTNODE;
1234 			if (node->rightnode == NODE)
1235 				byte |= RIGHTNODE;
1236 			if (node->offset <= 0xff)
1237 				offlen = 1;
1238 			else if (node->offset <= 0xffff)
1239 				offlen = 2;
1240 			else
1241 				offlen = 3;
1242 			nodes[offlen]++;
1243 			offset = node->offset;
1244 			byte |= offlen << OFFLEN_SHIFT;
1245 			*data++ = byte;
1246 			index++;
1247 			while (offlen--) {
1248 				*data++ = offset & 0xff;
1249 				index++;
1250 				offset >>= 8;
1251 			}
1252 		} else if (node->left) {
1253 			if (node->leftnode == NODE)
1254 				byte |= TRIENODE;
1255 			nodes[0]++;
1256 			*data++ = byte;
1257 			index++;
1258 		} else if (node->right) {
1259 			byte |= RIGHTNODE;
1260 			if (node->rightnode == NODE)
1261 				byte |= TRIENODE;
1262 			nodes[0]++;
1263 			*data++ = byte;
1264 			index++;
1265 		} else {
1266 			assert(0);
1267 		}
1268 skip:
1269 		while (node) {
1270 			bitmask = 1 << node->bitnum;
1271 			if (node->mark && (leftmask & bitmask) == 0) {
1272 				leftmask |= bitmask;
1273 				if (node->leftnode == LEAF) {
1274 					assert(node->left);
1275 					data = tree->leaf_emit(node->left,
1276 							       data);
1277 					size = tree->leaf_size(node->left);
1278 					index += size;
1279 					bytes += size;
1280 					leaves++;
1281 				} else if (node->left) {
1282 					assert(node->leftnode == NODE);
1283 					indent += 1;
1284 					node = node->left;
1285 					break;
1286 				}
1287 			}
1288 			if (node->mark && (rightmask & bitmask) == 0) {
1289 				rightmask |= bitmask;
1290 				if (node->rightnode == LEAF) {
1291 					assert(node->right);
1292 					data = tree->leaf_emit(node->right,
1293 							       data);
1294 					size = tree->leaf_size(node->right);
1295 					index += size;
1296 					bytes += size;
1297 					leaves++;
1298 				} else if (node->right) {
1299 					assert(node->rightnode == NODE);
1300 					indent += 1;
1301 					node = node->right;
1302 					break;
1303 				}
1304 			}
1305 			leftmask &= ~bitmask;
1306 			rightmask &= ~bitmask;
1307 			node = node->parent;
1308 			indent -= 1;
1309 		}
1310 	}
1311 done:
1312 	if (verbose > 0) {
1313 		printf("Emitted %d (%d) leaves",
1314 			leaves, bytes);
1315 		printf(" %d (%d+%d+%d+%d) nodes",
1316 			nodes[0] + nodes[1] + nodes[2] + nodes[3],
1317 			nodes[0], nodes[1], nodes[2], nodes[3]);
1318 		printf(" %d total\n", index - tree->index);
1319 	}
1320 }
1321 
1322 /* ------------------------------------------------------------------ */
1323 
/*
 * Unicode data.
 *
 * We need to keep track of the Canonical Combining Class, the Age,
 * and decompositions for a code point.
 *
 * For the Age, we store the index into the ages table.  Effectively
 * this is a generation number that the table maps to a unicode
 * version.
 *
 * The correction field is used to indicate that this entry is in the
 * corrections array, which contains decompositions that were
 * corrected in later revisions.  The value of the correction field is
 * the Unicode version in which the mapping was corrected.
 */
struct unicode_data {
	/* Code point this entry describes. */
	unsigned int code;
	/* Canonical Combining Class. */
	int ccc;
	/* Index into the ages table; negative entries are not inserted
	 * into the trees. */
	int gen;
	/* Unicode version of the correction, see above. */
	int correction;
	/* Zero-terminated UTF-32 decomposition, or NULL. */
	unsigned int *utf32nfdi;
	/* Zero-terminated UTF-32 casefolded mapping, or NULL. */
	unsigned int *utf32nfdicf;
	/* UTF-8 form of utf32nfdi, built by utf8_create(). */
	char *utf8nfdi;
	/* UTF-8 form of utf32nfdicf; only set by utf8_create() when it
	 * differs from utf8nfdi. */
	char *utf8nfdicf;
};

/* One entry per code point, indexed by the code point value. */
struct unicode_data unicode_data[0x110000];
/* Array of corrections_count corrected entries. */
struct unicode_data *corrections;
int    corrections_count;

/* Set by trees_reduce() to the last nfdi and nfdicf trees. */
struct tree *nfdi_tree;
struct tree *nfdicf_tree;

/* Array of trees_count trees, allocated by trees_init(). */
struct tree *trees;
int          trees_count;
1359 
1360 /*
1361  * Check the corrections array to see if this entry was corrected at
1362  * some point.
1363  */
corrections_lookup(struct unicode_data * u)1364 static struct unicode_data *corrections_lookup(struct unicode_data *u)
1365 {
1366 	int i;
1367 
1368 	for (i = 0; i != corrections_count; i++)
1369 		if (u->code == corrections[i].code)
1370 			return &corrections[i];
1371 	return u;
1372 }
1373 
nfdi_equal(void * l,void * r)1374 static int nfdi_equal(void *l, void *r)
1375 {
1376 	struct unicode_data *left = l;
1377 	struct unicode_data *right = r;
1378 
1379 	if (left->gen != right->gen)
1380 		return 0;
1381 	if (left->ccc != right->ccc)
1382 		return 0;
1383 	if (left->utf8nfdi && right->utf8nfdi &&
1384 	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1385 		return 1;
1386 	if (left->utf8nfdi || right->utf8nfdi)
1387 		return 0;
1388 	return 1;
1389 }
1390 
nfdicf_equal(void * l,void * r)1391 static int nfdicf_equal(void *l, void *r)
1392 {
1393 	struct unicode_data *left = l;
1394 	struct unicode_data *right = r;
1395 
1396 	if (left->gen != right->gen)
1397 		return 0;
1398 	if (left->ccc != right->ccc)
1399 		return 0;
1400 	if (left->utf8nfdicf && right->utf8nfdicf &&
1401 	    strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
1402 		return 1;
1403 	if (left->utf8nfdicf && right->utf8nfdicf)
1404 		return 0;
1405 	if (left->utf8nfdicf || right->utf8nfdicf)
1406 		return 0;
1407 	if (left->utf8nfdi && right->utf8nfdi &&
1408 	    strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1409 		return 1;
1410 	if (left->utf8nfdi || right->utf8nfdi)
1411 		return 0;
1412 	return 1;
1413 }
1414 
nfdi_print(void * l,int indent)1415 static void nfdi_print(void *l, int indent)
1416 {
1417 	struct unicode_data *leaf = l;
1418 
1419 	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1420 		leaf->code, leaf->ccc, leaf->gen);
1421 
1422 	if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1423 		printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1424 	else if (leaf->utf8nfdi)
1425 		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1426 
1427 	printf("\n");
1428 }
1429 
nfdicf_print(void * l,int indent)1430 static void nfdicf_print(void *l, int indent)
1431 {
1432 	struct unicode_data *leaf = l;
1433 
1434 	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1435 		leaf->code, leaf->ccc, leaf->gen);
1436 
1437 	if (leaf->utf8nfdicf)
1438 		printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
1439 	else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1440 		printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1441 	else if (leaf->utf8nfdi)
1442 		printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1443 	printf("\n");
1444 }
1445 
/*
 * Leaf mark callback for the nfdi tree: every leaf is marked,
 * regardless of its contents.  Always returns 1.
 */
static int nfdi_mark(void *l)
{
	(void)l;	/* the leaf itself is not inspected */

	return 1;
}
1450 
nfdicf_mark(void * l)1451 static int nfdicf_mark(void *l)
1452 {
1453 	struct unicode_data *leaf = l;
1454 
1455 	if (leaf->utf8nfdicf)
1456 		return 1;
1457 	return 0;
1458 }
1459 
correction_mark(void * l)1460 static int correction_mark(void *l)
1461 {
1462 	struct unicode_data *leaf = l;
1463 
1464 	return leaf->correction;
1465 }
1466 
nfdi_size(void * l)1467 static int nfdi_size(void *l)
1468 {
1469 	struct unicode_data *leaf = l;
1470 	int size = 2;
1471 
1472 	if (HANGUL_SYLLABLE(leaf->code))
1473 		size += 1;
1474 	else if (leaf->utf8nfdi)
1475 		size += strlen(leaf->utf8nfdi) + 1;
1476 	return size;
1477 }
1478 
nfdicf_size(void * l)1479 static int nfdicf_size(void *l)
1480 {
1481 	struct unicode_data *leaf = l;
1482 	int size = 2;
1483 
1484 	if (HANGUL_SYLLABLE(leaf->code))
1485 		size += 1;
1486 	else if (leaf->utf8nfdicf)
1487 		size += strlen(leaf->utf8nfdicf) + 1;
1488 	else if (leaf->utf8nfdi)
1489 		size += strlen(leaf->utf8nfdi) + 1;
1490 	return size;
1491 }
1492 
nfdi_index(struct tree * tree,void * l)1493 static int *nfdi_index(struct tree *tree, void *l)
1494 {
1495 	struct unicode_data *leaf = l;
1496 
1497 	return &tree->leafindex[leaf->code];
1498 }
1499 
nfdicf_index(struct tree * tree,void * l)1500 static int *nfdicf_index(struct tree *tree, void *l)
1501 {
1502 	struct unicode_data *leaf = l;
1503 
1504 	return &tree->leafindex[leaf->code];
1505 }
1506 
nfdi_emit(void * l,unsigned char * data)1507 static unsigned char *nfdi_emit(void *l, unsigned char *data)
1508 {
1509 	struct unicode_data *leaf = l;
1510 	unsigned char *s;
1511 
1512 	*data++ = leaf->gen;
1513 
1514 	if (HANGUL_SYLLABLE(leaf->code)) {
1515 		*data++ = DECOMPOSE;
1516 		*data++ = HANGUL;
1517 	} else if (leaf->utf8nfdi) {
1518 		*data++ = DECOMPOSE;
1519 		s = (unsigned char*)leaf->utf8nfdi;
1520 		while ((*data++ = *s++) != 0)
1521 			;
1522 	} else {
1523 		*data++ = leaf->ccc;
1524 	}
1525 	return data;
1526 }
1527 
nfdicf_emit(void * l,unsigned char * data)1528 static unsigned char *nfdicf_emit(void *l, unsigned char *data)
1529 {
1530 	struct unicode_data *leaf = l;
1531 	unsigned char *s;
1532 
1533 	*data++ = leaf->gen;
1534 
1535 	if (HANGUL_SYLLABLE(leaf->code)) {
1536 		*data++ = DECOMPOSE;
1537 		*data++ = HANGUL;
1538 	} else if (leaf->utf8nfdicf) {
1539 		*data++ = DECOMPOSE;
1540 		s = (unsigned char*)leaf->utf8nfdicf;
1541 		while ((*data++ = *s++) != 0)
1542 			;
1543 	} else if (leaf->utf8nfdi) {
1544 		*data++ = DECOMPOSE;
1545 		s = (unsigned char*)leaf->utf8nfdi;
1546 		while ((*data++ = *s++) != 0)
1547 			;
1548 	} else {
1549 		*data++ = leaf->ccc;
1550 	}
1551 	return data;
1552 }
1553 
utf8_create(struct unicode_data * data)1554 static void utf8_create(struct unicode_data *data)
1555 {
1556 	char utf[18*4+1];
1557 	char *u;
1558 	unsigned int *um;
1559 	int i;
1560 
1561 	if (data->utf8nfdi) {
1562 		assert(data->utf8nfdi[0] == HANGUL);
1563 		return;
1564 	}
1565 
1566 	u = utf;
1567 	um = data->utf32nfdi;
1568 	if (um) {
1569 		for (i = 0; um[i]; i++)
1570 			u += utf8encode(u, um[i]);
1571 		*u = '\0';
1572 		data->utf8nfdi = strdup(utf);
1573 	}
1574 	u = utf;
1575 	um = data->utf32nfdicf;
1576 	if (um) {
1577 		for (i = 0; um[i]; i++)
1578 			u += utf8encode(u, um[i]);
1579 		*u = '\0';
1580 		if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
1581 			data->utf8nfdicf = strdup(utf);
1582 	}
1583 }
1584 
utf8_init(void)1585 static void utf8_init(void)
1586 {
1587 	unsigned int unichar;
1588 	int i;
1589 
1590 	for (unichar = 0; unichar != 0x110000; unichar++)
1591 		utf8_create(&unicode_data[unichar]);
1592 
1593 	for (i = 0; i != corrections_count; i++)
1594 		utf8_create(&corrections[i]);
1595 }
1596 
trees_init(void)1597 static void trees_init(void)
1598 {
1599 	struct unicode_data *data;
1600 	unsigned int maxage;
1601 	unsigned int nextage;
1602 	int count;
1603 	int i;
1604 	int j;
1605 
1606 	/* Count the number of different ages. */
1607 	count = 0;
1608 	nextage = (unsigned int)-1;
1609 	do {
1610 		maxage = nextage;
1611 		nextage = 0;
1612 		for (i = 0; i <= corrections_count; i++) {
1613 			data = &corrections[i];
1614 			if (nextage < data->correction &&
1615 			    data->correction < maxage)
1616 				nextage = data->correction;
1617 		}
1618 		count++;
1619 	} while (nextage);
1620 
1621 	/* Two trees per age: nfdi and nfdicf */
1622 	trees_count = count * 2;
1623 	trees = calloc(trees_count, sizeof(struct tree));
1624 
1625 	/* Assign ages to the trees. */
1626 	count = trees_count;
1627 	nextage = (unsigned int)-1;
1628 	do {
1629 		maxage = nextage;
1630 		trees[--count].maxage = maxage;
1631 		trees[--count].maxage = maxage;
1632 		nextage = 0;
1633 		for (i = 0; i <= corrections_count; i++) {
1634 			data = &corrections[i];
1635 			if (nextage < data->correction &&
1636 			    data->correction < maxage)
1637 				nextage = data->correction;
1638 		}
1639 	} while (nextage);
1640 
1641 	/* The ages assigned above are off by one. */
1642 	for (i = 0; i != trees_count; i++) {
1643 		j = 0;
1644 		while (ages[j] < trees[i].maxage)
1645 			j++;
1646 		trees[i].maxage = ages[j-1];
1647 	}
1648 
1649 	/* Set up the forwarding between trees. */
1650 	trees[trees_count-2].next = &trees[trees_count-1];
1651 	trees[trees_count-1].leaf_mark = nfdi_mark;
1652 	trees[trees_count-2].leaf_mark = nfdicf_mark;
1653 	for (i = 0; i != trees_count-2; i += 2) {
1654 		trees[i].next = &trees[trees_count-2];
1655 		trees[i].leaf_mark = correction_mark;
1656 		trees[i+1].next = &trees[trees_count-1];
1657 		trees[i+1].leaf_mark = correction_mark;
1658 	}
1659 
1660 	/* Assign the callouts. */
1661 	for (i = 0; i != trees_count; i += 2) {
1662 		trees[i].type = "nfdicf";
1663 		trees[i].leaf_equal = nfdicf_equal;
1664 		trees[i].leaf_print = nfdicf_print;
1665 		trees[i].leaf_size = nfdicf_size;
1666 		trees[i].leaf_index = nfdicf_index;
1667 		trees[i].leaf_emit = nfdicf_emit;
1668 
1669 		trees[i+1].type = "nfdi";
1670 		trees[i+1].leaf_equal = nfdi_equal;
1671 		trees[i+1].leaf_print = nfdi_print;
1672 		trees[i+1].leaf_size = nfdi_size;
1673 		trees[i+1].leaf_index = nfdi_index;
1674 		trees[i+1].leaf_emit = nfdi_emit;
1675 	}
1676 
1677 	/* Finish init. */
1678 	for (i = 0; i != trees_count; i++)
1679 		trees[i].childnode = NODE;
1680 }
1681 
trees_populate(void)1682 static void trees_populate(void)
1683 {
1684 	struct unicode_data *data;
1685 	unsigned int unichar;
1686 	char keyval[4];
1687 	int keylen;
1688 	int i;
1689 
1690 	for (i = 0; i != trees_count; i++) {
1691 		if (verbose > 0) {
1692 			printf("Populating %s_%x\n",
1693 				trees[i].type, trees[i].maxage);
1694 		}
1695 		for (unichar = 0; unichar != 0x110000; unichar++) {
1696 			if (unicode_data[unichar].gen < 0)
1697 				continue;
1698 			keylen = utf8encode(keyval, unichar);
1699 			data = corrections_lookup(&unicode_data[unichar]);
1700 			if (data->correction <= trees[i].maxage)
1701 				data = &unicode_data[unichar];
1702 			insert(&trees[i], keyval, keylen, data);
1703 		}
1704 	}
1705 }
1706 
trees_reduce(void)1707 static void trees_reduce(void)
1708 {
1709 	int i;
1710 	int size;
1711 	int changed;
1712 
1713 	for (i = 0; i != trees_count; i++)
1714 		prune(&trees[i]);
1715 	for (i = 0; i != trees_count; i++)
1716 		mark_nodes(&trees[i]);
1717 	do {
1718 		size = 0;
1719 		for (i = 0; i != trees_count; i++)
1720 			size = index_nodes(&trees[i], size);
1721 		changed = 0;
1722 		for (i = 0; i != trees_count; i++)
1723 			changed += size_nodes(&trees[i]);
1724 	} while (changed);
1725 
1726 	utf8data = calloc(size, 1);
1727 	utf8data_size = size;
1728 	for (i = 0; i != trees_count; i++)
1729 		emit(&trees[i], utf8data);
1730 
1731 	if (verbose > 0) {
1732 		for (i = 0; i != trees_count; i++) {
1733 			printf("%s_%x idx %d\n",
1734 				trees[i].type, trees[i].maxage, trees[i].index);
1735 		}
1736 	}
1737 
1738 	nfdi = utf8data + trees[trees_count-1].index;
1739 	nfdicf = utf8data + trees[trees_count-2].index;
1740 
1741 	nfdi_tree = &trees[trees_count-1];
1742 	nfdicf_tree = &trees[trees_count-2];
1743 }
1744 
verify(struct tree * tree)1745 static void verify(struct tree *tree)
1746 {
1747 	struct unicode_data *data;
1748 	utf8leaf_t	*leaf;
1749 	unsigned int	unichar;
1750 	char		key[4];
1751 	unsigned char	hangul[UTF8HANGULLEAF];
1752 	int		report;
1753 	int		nocf;
1754 
1755 	if (verbose > 0)
1756 		printf("Verifying %s_%x\n", tree->type, tree->maxage);
1757 	nocf = strcmp(tree->type, "nfdicf");
1758 
1759 	for (unichar = 0; unichar != 0x110000; unichar++) {
1760 		report = 0;
1761 		data = corrections_lookup(&unicode_data[unichar]);
1762 		if (data->correction <= tree->maxage)
1763 			data = &unicode_data[unichar];
1764 		utf8encode(key,unichar);
1765 		leaf = utf8lookup(tree, hangul, key);
1766 
1767 		if (!leaf) {
1768 			if (data->gen != -1)
1769 				report++;
1770 			if (unichar < 0xd800 || unichar > 0xdfff)
1771 				report++;
1772 		} else {
1773 			if (unichar >= 0xd800 && unichar <= 0xdfff)
1774 				report++;
1775 			if (data->gen == -1)
1776 				report++;
1777 			if (data->gen != LEAF_GEN(leaf))
1778 				report++;
1779 			if (LEAF_CCC(leaf) == DECOMPOSE) {
1780 				if (HANGUL_SYLLABLE(data->code)) {
1781 					if (data->utf8nfdi[0] != HANGUL)
1782 						report++;
1783 				} else if (nocf) {
1784 					if (!data->utf8nfdi) {
1785 						report++;
1786 					} else if (strcmp(data->utf8nfdi,
1787 							  LEAF_STR(leaf))) {
1788 						report++;
1789 					}
1790 				} else {
1791 					if (!data->utf8nfdicf &&
1792 					    !data->utf8nfdi) {
1793 						report++;
1794 					} else if (data->utf8nfdicf) {
1795 						if (strcmp(data->utf8nfdicf,
1796 							   LEAF_STR(leaf)))
1797 							report++;
1798 					} else if (strcmp(data->utf8nfdi,
1799 							  LEAF_STR(leaf))) {
1800 						report++;
1801 					}
1802 				}
1803 			} else if (data->ccc != LEAF_CCC(leaf)) {
1804 				report++;
1805 			}
1806 		}
1807 		if (report) {
1808 			printf("%X code %X gen %d ccc %d"
1809 				" nfdi -> \"%s\"",
1810 				unichar, data->code, data->gen,
1811 				data->ccc,
1812 				data->utf8nfdi);
1813 			if (leaf) {
1814 				printf(" gen %d ccc %d"
1815 					" nfdi -> \"%s\"",
1816 					LEAF_GEN(leaf),
1817 					LEAF_CCC(leaf),
1818 					LEAF_CCC(leaf) == DECOMPOSE ?
1819 						LEAF_STR(leaf) : "");
1820 			}
1821 			printf("\n");
1822 		}
1823 	}
1824 }
1825 
trees_verify(void)1826 static void trees_verify(void)
1827 {
1828 	int i;
1829 
1830 	for (i = 0; i != trees_count; i++)
1831 		verify(&trees[i]);
1832 }
1833 
1834 /* ------------------------------------------------------------------ */
1835 
help(void)1836 static void help(void)
1837 {
1838 	printf("Usage: %s [options]\n", argv0);
1839 	printf("\n");
1840 	printf("This program creates an a data trie used for parsing and\n");
1841 	printf("normalization of UTF-8 strings. The trie is derived from\n");
1842 	printf("a set of input files from the Unicode character database\n");
1843 	printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1844 	printf("\n");
1845 	printf("The generated tree supports two normalization forms:\n");
1846 	printf("\n");
1847 	printf("\tnfdi:\n");
1848 	printf("\t- Apply unicode normalization form NFD.\n");
1849 	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1850 	printf("\n");
1851 	printf("\tnfdicf:\n");
1852 	printf("\t- Apply unicode normalization form NFD.\n");
1853 	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1854 	printf("\t- Apply a full casefold (C + F).\n");
1855 	printf("\n");
1856 	printf("These forms were chosen as being most useful when dealing\n");
1857 	printf("with file names: NFD catches most cases where characters\n");
1858 	printf("should be considered equivalent. The ignorables are mostly\n");
1859 	printf("invisible, making names hard to type.\n");
1860 	printf("\n");
1861 	printf("The options to specify the files to be used are listed\n");
1862 	printf("below with their default values, which are the names used\n");
1863 	printf("by version 11.0.0 of the Unicode Character Database.\n");
1864 	printf("\n");
1865 	printf("The input files:\n");
1866 	printf("\t-a %s\n", AGE_NAME);
1867 	printf("\t-c %s\n", CCC_NAME);
1868 	printf("\t-p %s\n", PROP_NAME);
1869 	printf("\t-d %s\n", DATA_NAME);
1870 	printf("\t-f %s\n", FOLD_NAME);
1871 	printf("\t-n %s\n", NORM_NAME);
1872 	printf("\n");
1873 	printf("Additionally, the generated tables are tested using:\n");
1874 	printf("\t-t %s\n", TEST_NAME);
1875 	printf("\n");
1876 	printf("Finally, the output file:\n");
1877 	printf("\t-o %s\n", UTF8_NAME);
1878 	printf("\n");
1879 }
1880 
/*
 * Print the help text and terminate with exit status 1.
 */
static void usage(void)
{
	help();

	exit(1);
}
1886 
/*
 * Report that @name could not be opened, showing the errno value
 * @error and its description, then terminate with exit status 1.
 */
static void open_fail(const char *name, int error)
{
	const char *reason = strerror(error);

	printf("Error %d opening %s: %s\n", error, name, reason);
	exit(1);
}
1892 
/*
 * Report that @filename as a whole could not be parsed, then
 * terminate with exit status 1.
 */
static void file_fail(const char *filename)
{
	printf("Error parsing %s\n", filename);

	exit(1);
}
1898 
/*
 * Report that the text @line from @filename could not be parsed,
 * then terminate with exit status 1.
 */
static void line_fail(const char *filename, const char *line)
{
	printf("Error parsing %s:%s\n", filename, line);

	exit(1);
}
1904 
1905 /* ------------------------------------------------------------------ */
1906 
/*
 * Print the zero-terminated UTF-32 string @utf32str as a sequence of
 * upper-case hex values, each preceded by a space.
 */
static void print_utf32(unsigned int *utf32str)
{
	const unsigned int *cp;

	for (cp = utf32str; *cp != 0; cp++)
		printf(" %X", *cp);
}
1914 
print_utf32nfdi(unsigned int unichar)1915 static void print_utf32nfdi(unsigned int unichar)
1916 {
1917 	printf(" %X ->", unichar);
1918 	print_utf32(unicode_data[unichar].utf32nfdi);
1919 	printf("\n");
1920 }
1921 
print_utf32nfdicf(unsigned int unichar)1922 static void print_utf32nfdicf(unsigned int unichar)
1923 {
1924 	printf(" %X ->", unichar);
1925 	print_utf32(unicode_data[unichar].utf32nfdicf);
1926 	printf("\n");
1927 }
1928 
1929 /* ------------------------------------------------------------------ */
1930 
age_init(void)1931 static void age_init(void)
1932 {
1933 	FILE *file;
1934 	unsigned int first;
1935 	unsigned int last;
1936 	unsigned int unichar;
1937 	unsigned int major;
1938 	unsigned int minor;
1939 	unsigned int revision;
1940 	int gen;
1941 	int count;
1942 	int ret;
1943 
1944 	if (verbose > 0)
1945 		printf("Parsing %s\n", age_name);
1946 
1947 	file = fopen(age_name, "r");
1948 	if (!file)
1949 		open_fail(age_name, errno);
1950 	count = 0;
1951 
1952 	gen = 0;
1953 	while (fgets(line, LINESIZE, file)) {
1954 		ret = sscanf(line, "# Age=V%d_%d_%d",
1955 				&major, &minor, &revision);
1956 		if (ret == 3) {
1957 			ages_count++;
1958 			if (verbose > 1)
1959 				printf(" Age V%d_%d_%d\n",
1960 					major, minor, revision);
1961 			if (!age_valid(major, minor, revision))
1962 				line_fail(age_name, line);
1963 			continue;
1964 		}
1965 		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1966 		if (ret == 2) {
1967 			ages_count++;
1968 			if (verbose > 1)
1969 				printf(" Age V%d_%d\n", major, minor);
1970 			if (!age_valid(major, minor, 0))
1971 				line_fail(age_name, line);
1972 			continue;
1973 		}
1974 	}
1975 
1976 	/* We must have found something above. */
1977 	if (verbose > 1)
1978 		printf("%d age entries\n", ages_count);
1979 	if (ages_count == 0 || ages_count > MAXGEN)
1980 		file_fail(age_name);
1981 
1982 	/* There is a 0 entry. */
1983 	ages_count++;
1984 	ages = calloc(ages_count + 1, sizeof(*ages));
1985 	/* And a guard entry. */
1986 	ages[ages_count] = (unsigned int)-1;
1987 
1988 	rewind(file);
1989 	count = 0;
1990 	gen = 0;
1991 	while (fgets(line, LINESIZE, file)) {
1992 		ret = sscanf(line, "# Age=V%d_%d_%d",
1993 				&major, &minor, &revision);
1994 		if (ret == 3) {
1995 			ages[++gen] =
1996 				UNICODE_AGE(major, minor, revision);
1997 			if (verbose > 1)
1998 				printf(" Age V%d_%d_%d = gen %d\n",
1999 					major, minor, revision, gen);
2000 			if (!age_valid(major, minor, revision))
2001 				line_fail(age_name, line);
2002 			continue;
2003 		}
2004 		ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
2005 		if (ret == 2) {
2006 			ages[++gen] = UNICODE_AGE(major, minor, 0);
2007 			if (verbose > 1)
2008 				printf(" Age V%d_%d = %d\n",
2009 					major, minor, gen);
2010 			if (!age_valid(major, minor, 0))
2011 				line_fail(age_name, line);
2012 			continue;
2013 		}
2014 		ret = sscanf(line, "%X..%X ; %d.%d #",
2015 			     &first, &last, &major, &minor);
2016 		if (ret == 4) {
2017 			for (unichar = first; unichar <= last; unichar++)
2018 				unicode_data[unichar].gen = gen;
2019 			count += 1 + last - first;
2020 			if (verbose > 1)
2021 				printf("  %X..%X gen %d\n", first, last, gen);
2022 			if (!utf32valid(first) || !utf32valid(last))
2023 				line_fail(age_name, line);
2024 			continue;
2025 		}
2026 		ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
2027 		if (ret == 3) {
2028 			unicode_data[unichar].gen = gen;
2029 			count++;
2030 			if (verbose > 1)
2031 				printf("  %X gen %d\n", unichar, gen);
2032 			if (!utf32valid(unichar))
2033 				line_fail(age_name, line);
2034 			continue;
2035 		}
2036 	}
2037 	unicode_maxage = ages[gen];
2038 	fclose(file);
2039 
2040 	/* Nix surrogate block */
2041 	if (verbose > 1)
2042 		printf(" Removing surrogate block D800..DFFF\n");
2043 	for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
2044 		unicode_data[unichar].gen = -1;
2045 
2046 	if (verbose > 0)
2047 	        printf("Found %d entries\n", count);
2048 	if (count == 0)
2049 		file_fail(age_name);
2050 }
2051 
ccc_init(void)2052 static void ccc_init(void)
2053 {
2054 	FILE *file;
2055 	unsigned int first;
2056 	unsigned int last;
2057 	unsigned int unichar;
2058 	unsigned int value;
2059 	int count;
2060 	int ret;
2061 
2062 	if (verbose > 0)
2063 		printf("Parsing %s\n", ccc_name);
2064 
2065 	file = fopen(ccc_name, "r");
2066 	if (!file)
2067 		open_fail(ccc_name, errno);
2068 
2069 	count = 0;
2070 	while (fgets(line, LINESIZE, file)) {
2071 		ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
2072 		if (ret == 3) {
2073 			for (unichar = first; unichar <= last; unichar++) {
2074 				unicode_data[unichar].ccc = value;
2075                                 count++;
2076 			}
2077 			if (verbose > 1)
2078 				printf(" %X..%X ccc %d\n", first, last, value);
2079 			if (!utf32valid(first) || !utf32valid(last))
2080 				line_fail(ccc_name, line);
2081 			continue;
2082 		}
2083 		ret = sscanf(line, "%X ; %d #", &unichar, &value);
2084 		if (ret == 2) {
2085 			unicode_data[unichar].ccc = value;
2086                         count++;
2087 			if (verbose > 1)
2088 				printf(" %X ccc %d\n", unichar, value);
2089 			if (!utf32valid(unichar))
2090 				line_fail(ccc_name, line);
2091 			continue;
2092 		}
2093 	}
2094 	fclose(file);
2095 
2096 	if (verbose > 0)
2097 		printf("Found %d entries\n", count);
2098 	if (count == 0)
2099 		file_fail(ccc_name);
2100 }
2101 
/*
 * Tell whether a decomposition tag names a compatibility form that
 * is to be ignored.
 *
 * @type is the tag text without the surrounding angle brackets.
 * Returns 1 when it exactly matches one of the listed tags, 0
 * otherwise.  The comparison is case sensitive.
 */
static int ignore_compatibility_form(char *type)
{
	/* Built once; the table is never modified. */
	static const char *const ignored_types[] = {
		"font", "noBreak", "initial", "medial",
		"final", "isolated", "circle", "super",
		"sub", "vertical", "wide", "narrow",
		"small", "square", "fraction", "compat"
	};
	size_t i;

	for (i = 0; i < sizeof(ignored_types) / sizeof(ignored_types[0]); i++)
		if (strcmp(type, ignored_types[i]) == 0)
			return 1;
	return 0;
}
2115 
nfdi_init(void)2116 static void nfdi_init(void)
2117 {
2118 	FILE *file;
2119 	unsigned int unichar;
2120 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2121 	char *s;
2122 	char *type;
2123 	unsigned int *um;
2124 	int count;
2125 	int i;
2126 	int ret;
2127 
2128 	if (verbose > 0)
2129 		printf("Parsing %s\n", data_name);
2130 	file = fopen(data_name, "r");
2131 	if (!file)
2132 		open_fail(data_name, errno);
2133 
2134 	count = 0;
2135 	while (fgets(line, LINESIZE, file)) {
2136 		ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2137 			     &unichar, buf0);
2138 		if (ret != 2)
2139 			continue;
2140 		if (!utf32valid(unichar))
2141 			line_fail(data_name, line);
2142 
2143 		s = buf0;
2144 		/* skip over <tag> */
2145 		if (*s == '<') {
2146 			type = ++s;
2147 			while (*++s != '>');
2148 			*s++ = '\0';
2149 			if(ignore_compatibility_form(type))
2150 				continue;
2151 		}
2152 		/* decode the decomposition into UTF-32 */
2153 		i = 0;
2154 		while (*s) {
2155 			mapping[i] = strtoul(s, &s, 16);
2156 			if (!utf32valid(mapping[i]))
2157 				line_fail(data_name, line);
2158 			i++;
2159 		}
2160 		mapping[i++] = 0;
2161 
2162 		um = malloc(i * sizeof(unsigned int));
2163 		memcpy(um, mapping, i * sizeof(unsigned int));
2164 		unicode_data[unichar].utf32nfdi = um;
2165 
2166 		if (verbose > 1)
2167 			print_utf32nfdi(unichar);
2168 		count++;
2169 	}
2170 	fclose(file);
2171 	if (verbose > 0)
2172 		printf("Found %d entries\n", count);
2173 	if (count == 0)
2174 		file_fail(data_name);
2175 }
2176 
nfdicf_init(void)2177 static void nfdicf_init(void)
2178 {
2179 	FILE *file;
2180 	unsigned int unichar;
2181 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2182 	char status;
2183 	char *s;
2184 	unsigned int *um;
2185 	int i;
2186 	int count;
2187 	int ret;
2188 
2189 	if (verbose > 0)
2190 		printf("Parsing %s\n", fold_name);
2191 	file = fopen(fold_name, "r");
2192 	if (!file)
2193 		open_fail(fold_name, errno);
2194 
2195 	count = 0;
2196 	while (fgets(line, LINESIZE, file)) {
2197 		ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
2198 		if (ret != 3)
2199 			continue;
2200 		if (!utf32valid(unichar))
2201 			line_fail(fold_name, line);
2202 		/* Use the C+F casefold. */
2203 		if (status != 'C' && status != 'F')
2204 			continue;
2205 		s = buf0;
2206 		if (*s == '<')
2207 			while (*s++ != ' ')
2208 				;
2209 		i = 0;
2210 		while (*s) {
2211 			mapping[i] = strtoul(s, &s, 16);
2212 			if (!utf32valid(mapping[i]))
2213 				line_fail(fold_name, line);
2214 			i++;
2215 		}
2216 		mapping[i++] = 0;
2217 
2218 		um = malloc(i * sizeof(unsigned int));
2219 		memcpy(um, mapping, i * sizeof(unsigned int));
2220 		unicode_data[unichar].utf32nfdicf = um;
2221 
2222 		if (verbose > 1)
2223 			print_utf32nfdicf(unichar);
2224 		count++;
2225 	}
2226 	fclose(file);
2227 	if (verbose > 0)
2228 		printf("Found %d entries\n", count);
2229 	if (count == 0)
2230 		file_fail(fold_name);
2231 }
2232 
ignore_init(void)2233 static void ignore_init(void)
2234 {
2235 	FILE *file;
2236 	unsigned int unichar;
2237 	unsigned int first;
2238 	unsigned int last;
2239 	unsigned int *um;
2240 	int count;
2241 	int ret;
2242 
2243 	if (verbose > 0)
2244 		printf("Parsing %s\n", prop_name);
2245 	file = fopen(prop_name, "r");
2246 	if (!file)
2247 		open_fail(prop_name, errno);
2248 	assert(file);
2249 	count = 0;
2250 	while (fgets(line, LINESIZE, file)) {
2251 		ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
2252 		if (ret == 3) {
2253 			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2254 				continue;
2255 			if (!utf32valid(first) || !utf32valid(last))
2256 				line_fail(prop_name, line);
2257 			for (unichar = first; unichar <= last; unichar++) {
2258 				free(unicode_data[unichar].utf32nfdi);
2259 				um = malloc(sizeof(unsigned int));
2260 				*um = 0;
2261 				unicode_data[unichar].utf32nfdi = um;
2262 				free(unicode_data[unichar].utf32nfdicf);
2263 				um = malloc(sizeof(unsigned int));
2264 				*um = 0;
2265 				unicode_data[unichar].utf32nfdicf = um;
2266 				count++;
2267 			}
2268 			if (verbose > 1)
2269 				printf(" %X..%X Default_Ignorable_Code_Point\n",
2270 					first, last);
2271 			continue;
2272 		}
2273 		ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
2274 		if (ret == 2) {
2275 			if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2276 				continue;
2277 			if (!utf32valid(unichar))
2278 				line_fail(prop_name, line);
2279 			free(unicode_data[unichar].utf32nfdi);
2280 			um = malloc(sizeof(unsigned int));
2281 			*um = 0;
2282 			unicode_data[unichar].utf32nfdi = um;
2283 			free(unicode_data[unichar].utf32nfdicf);
2284 			um = malloc(sizeof(unsigned int));
2285 			*um = 0;
2286 			unicode_data[unichar].utf32nfdicf = um;
2287 			if (verbose > 1)
2288 				printf(" %X Default_Ignorable_Code_Point\n",
2289 					unichar);
2290 			count++;
2291 			continue;
2292 		}
2293 	}
2294 	fclose(file);
2295 
2296 	if (verbose > 0)
2297 		printf("Found %d entries\n", count);
2298 	if (count == 0)
2299 		file_fail(prop_name);
2300 }
2301 
corrections_init(void)2302 static void corrections_init(void)
2303 {
2304 	FILE *file;
2305 	unsigned int unichar;
2306 	unsigned int major;
2307 	unsigned int minor;
2308 	unsigned int revision;
2309 	unsigned int age;
2310 	unsigned int *um;
2311 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2312 	char *s;
2313 	int i;
2314 	int count;
2315 	int ret;
2316 
2317 	if (verbose > 0)
2318 		printf("Parsing %s\n", norm_name);
2319 	file = fopen(norm_name, "r");
2320 	if (!file)
2321 		open_fail(norm_name, errno);
2322 
2323 	count = 0;
2324 	while (fgets(line, LINESIZE, file)) {
2325 		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2326 				&unichar, buf0, buf1,
2327 				&major, &minor, &revision);
2328 		if (ret != 6)
2329 			continue;
2330 		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2331 			line_fail(norm_name, line);
2332 		count++;
2333 	}
2334 	corrections = calloc(count, sizeof(struct unicode_data));
2335 	corrections_count = count;
2336 	rewind(file);
2337 
2338 	count = 0;
2339 	while (fgets(line, LINESIZE, file)) {
2340 		ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2341 				&unichar, buf0, buf1,
2342 				&major, &minor, &revision);
2343 		if (ret != 6)
2344 			continue;
2345 		if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2346 			line_fail(norm_name, line);
2347 		corrections[count] = unicode_data[unichar];
2348 		assert(corrections[count].code == unichar);
2349 		age = UNICODE_AGE(major, minor, revision);
2350 		corrections[count].correction = age;
2351 
2352 		i = 0;
2353 		s = buf0;
2354 		while (*s) {
2355 			mapping[i] = strtoul(s, &s, 16);
2356 			if (!utf32valid(mapping[i]))
2357 				line_fail(norm_name, line);
2358 			i++;
2359 		}
2360 		mapping[i++] = 0;
2361 
2362 		um = malloc(i * sizeof(unsigned int));
2363 		memcpy(um, mapping, i * sizeof(unsigned int));
2364 		corrections[count].utf32nfdi = um;
2365 
2366 		if (verbose > 1)
2367 			printf(" %X -> %s -> %s V%d_%d_%d\n",
2368 				unichar, buf0, buf1, major, minor, revision);
2369 		count++;
2370 	}
2371 	fclose(file);
2372 
2373 	if (verbose > 0)
2374 	        printf("Found %d entries\n", count);
2375 	if (count == 0)
2376 		file_fail(norm_name);
2377 }
2378 
2379 /* ------------------------------------------------------------------ */
2380 
2381 /*
2382  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2383  *
2384  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2385  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2386  *
2387  * SBase = 0xAC00
2388  * LBase = 0x1100
2389  * VBase = 0x1161
2390  * TBase = 0x11A7
2391  * LCount = 19
2392  * VCount = 21
2393  * TCount = 28
2394  * NCount = 588 (VCount * TCount)
2395  * SCount = 11172 (LCount * NCount)
2396  *
2397  * Decomposition:
2398  *   SIndex = s - SBase
2399  *
2400  * LV (Canonical/Full)
2401  *   LIndex = SIndex / NCount
 *   VIndex = (SIndex % NCount) / TCount
2403  *   LPart = LBase + LIndex
2404  *   VPart = VBase + VIndex
2405  *
2406  * LVT (Canonical)
2407  *   LVIndex = (SIndex / TCount) * TCount
 *   TIndex = (SIndex % TCount)
2409  *   LVPart = SBase + LVIndex
2410  *   TPart = TBase + TIndex
2411  *
2412  * LVT (Full)
2413  *   LIndex = SIndex / NCount
 *   VIndex = (SIndex % NCount) / TCount
 *   TIndex = (SIndex % TCount)
2416  *   LPart = LBase + LIndex
2417  *   VPart = VBase + VIndex
2418  *   if (TIndex == 0) {
2419  *          d = <LPart, VPart>
2420  *   } else {
2421  *          TPart = TBase + TIndex
2422  *          d = <LPart, VPart, TPart>
2423  *   }
2424  *
2425  */
2426 
hangul_decompose(void)2427 static void hangul_decompose(void)
2428 {
2429 	unsigned int sb = 0xAC00;
2430 	unsigned int lb = 0x1100;
2431 	unsigned int vb = 0x1161;
2432 	unsigned int tb = 0x11a7;
2433 	/* unsigned int lc = 19; */
2434 	unsigned int vc = 21;
2435 	unsigned int tc = 28;
2436 	unsigned int nc = (vc * tc);
2437 	/* unsigned int sc = (lc * nc); */
2438 	unsigned int unichar;
2439 	unsigned int mapping[4];
2440 	unsigned int *um;
2441         int count;
2442 	int i;
2443 
2444 	if (verbose > 0)
2445 		printf("Decomposing hangul\n");
2446 	/* Hangul */
2447 	count = 0;
2448 	for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
2449 		unsigned int si = unichar - sb;
2450 		unsigned int li = si / nc;
2451 		unsigned int vi = (si % nc) / tc;
2452 		unsigned int ti = si % tc;
2453 
2454 		i = 0;
2455 		mapping[i++] = lb + li;
2456 		mapping[i++] = vb + vi;
2457 		if (ti)
2458 			mapping[i++] = tb + ti;
2459 		mapping[i++] = 0;
2460 
2461 		assert(!unicode_data[unichar].utf32nfdi);
2462 		um = malloc(i * sizeof(unsigned int));
2463 		memcpy(um, mapping, i * sizeof(unsigned int));
2464 		unicode_data[unichar].utf32nfdi = um;
2465 
2466 		assert(!unicode_data[unichar].utf32nfdicf);
2467 		um = malloc(i * sizeof(unsigned int));
2468 		memcpy(um, mapping, i * sizeof(unsigned int));
2469 		unicode_data[unichar].utf32nfdicf = um;
2470 
2471 		/*
2472 		 * Add a cookie as a reminder that the hangul syllable
2473 		 * decompositions must not be stored in the generated
2474 		 * trie.
2475 		 */
2476 		unicode_data[unichar].utf8nfdi = malloc(2);
2477 		unicode_data[unichar].utf8nfdi[0] = HANGUL;
2478 		unicode_data[unichar].utf8nfdi[1] = '\0';
2479 
2480 		if (verbose > 1)
2481 			print_utf32nfdi(unichar);
2482 
2483 		count++;
2484 	}
2485 	if (verbose > 0)
2486 		printf("Created %d entries\n", count);
2487 }
2488 
nfdi_decompose(void)2489 static void nfdi_decompose(void)
2490 {
2491 	unsigned int unichar;
2492 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2493 	unsigned int *um;
2494 	unsigned int *dc;
2495 	int count;
2496 	int i;
2497 	int j;
2498 	int ret;
2499 
2500 	if (verbose > 0)
2501 		printf("Decomposing nfdi\n");
2502 
2503 	count = 0;
2504 	for (unichar = 0; unichar != 0x110000; unichar++) {
2505 		if (!unicode_data[unichar].utf32nfdi)
2506 			continue;
2507 		for (;;) {
2508 			ret = 1;
2509 			i = 0;
2510 			um = unicode_data[unichar].utf32nfdi;
2511 			while (*um) {
2512 				dc = unicode_data[*um].utf32nfdi;
2513 				if (dc) {
2514 					for (j = 0; dc[j]; j++)
2515 						mapping[i++] = dc[j];
2516 					ret = 0;
2517 				} else {
2518 					mapping[i++] = *um;
2519 				}
2520 				um++;
2521 			}
2522 			mapping[i++] = 0;
2523 			if (ret)
2524 				break;
2525 			free(unicode_data[unichar].utf32nfdi);
2526 			um = malloc(i * sizeof(unsigned int));
2527 			memcpy(um, mapping, i * sizeof(unsigned int));
2528 			unicode_data[unichar].utf32nfdi = um;
2529 		}
2530 		/* Add this decomposition to nfdicf if there is no entry. */
2531 		if (!unicode_data[unichar].utf32nfdicf) {
2532 			um = malloc(i * sizeof(unsigned int));
2533 			memcpy(um, mapping, i * sizeof(unsigned int));
2534 			unicode_data[unichar].utf32nfdicf = um;
2535 		}
2536 		if (verbose > 1)
2537 			print_utf32nfdi(unichar);
2538 		count++;
2539 	}
2540 	if (verbose > 0)
2541 		printf("Processed %d entries\n", count);
2542 }
2543 
nfdicf_decompose(void)2544 static void nfdicf_decompose(void)
2545 {
2546 	unsigned int unichar;
2547 	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2548 	unsigned int *um;
2549 	unsigned int *dc;
2550 	int count;
2551 	int i;
2552 	int j;
2553 	int ret;
2554 
2555 	if (verbose > 0)
2556 		printf("Decomposing nfdicf\n");
2557 	count = 0;
2558 	for (unichar = 0; unichar != 0x110000; unichar++) {
2559 		if (!unicode_data[unichar].utf32nfdicf)
2560 			continue;
2561 		for (;;) {
2562 			ret = 1;
2563 			i = 0;
2564 			um = unicode_data[unichar].utf32nfdicf;
2565 			while (*um) {
2566 				dc = unicode_data[*um].utf32nfdicf;
2567 				if (dc) {
2568 					for (j = 0; dc[j]; j++)
2569 						mapping[i++] = dc[j];
2570 					ret = 0;
2571 				} else {
2572 					mapping[i++] = *um;
2573 				}
2574 				um++;
2575 			}
2576 			mapping[i++] = 0;
2577 			if (ret)
2578 				break;
2579 			free(unicode_data[unichar].utf32nfdicf);
2580 			um = malloc(i * sizeof(unsigned int));
2581 			memcpy(um, mapping, i * sizeof(unsigned int));
2582 			unicode_data[unichar].utf32nfdicf = um;
2583 		}
2584 		if (verbose > 1)
2585 			print_utf32nfdicf(unichar);
2586 		count++;
2587 	}
2588 	if (verbose > 0)
2589 		printf("Processed %d entries\n", count);
2590 }
2591 
2592 /* ------------------------------------------------------------------ */
2593 
/* Prototypes for the normalizer functions defined further down. */
int utf8agemax(struct tree *tree, const char *s);
int utf8nagemax(struct tree *tree, const char *s, size_t len);
int utf8agemin(struct tree *tree, const char *s);
int utf8nagemin(struct tree *tree, const char *s, size_t len);
ssize_t utf8len(struct tree *tree, const char *s);
ssize_t utf8nlen(struct tree *tree, const char *s, size_t len);
struct utf8cursor;
int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s);
int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
		size_t len);
int utf8byte(struct utf8cursor *u8c);
2604 
2605 /*
2606  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2607  *
2608  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2609  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2610  *
2611  * SBase = 0xAC00
2612  * LBase = 0x1100
2613  * VBase = 0x1161
2614  * TBase = 0x11A7
2615  * LCount = 19
2616  * VCount = 21
2617  * TCount = 28
2618  * NCount = 588 (VCount * TCount)
2619  * SCount = 11172 (LCount * NCount)
2620  *
2621  * Decomposition:
2622  *   SIndex = s - SBase
2623  *
2624  * LV (Canonical/Full)
2625  *   LIndex = SIndex / NCount
 *   VIndex = (SIndex % NCount) / TCount
2627  *   LPart = LBase + LIndex
2628  *   VPart = VBase + VIndex
2629  *
2630  * LVT (Canonical)
2631  *   LVIndex = (SIndex / TCount) * TCount
 *   TIndex = (SIndex % TCount)
2633  *   LVPart = SBase + LVIndex
2634  *   TPart = TBase + TIndex
2635  *
2636  * LVT (Full)
2637  *   LIndex = SIndex / NCount
 *   VIndex = (SIndex % NCount) / TCount
 *   TIndex = (SIndex % TCount)
2640  *   LPart = LBase + LIndex
2641  *   VPart = VBase + VIndex
2642  *   if (TIndex == 0) {
2643  *          d = <LPart, VPart>
2644  *   } else {
2645  *          TPart = TBase + TIndex
2646  *          d = <LPart, VPart, TPart>
2647  *   }
2648  */
2649 
/* Constants of the hangul decomposition algorithm. */
#define SB	(0xAC00)	/* SBase: first precomposed syllable */
#define LB	(0x1100)	/* LBase */
#define VB	(0x1161)	/* VBase */
#define TB	(0x11A7)	/* TBase */
#define LC	(19)		/* LCount */
#define VC	(21)		/* VCount */
#define TC	(28)		/* TCount */
#define NC	(VC * TC)	/* NCount */
#define SC	(LC * NC)	/* SCount */
2660 
2661 /* Algorithmic decomposition of hangul syllable. */
utf8hangul(const char * str,unsigned char * hangul)2662 static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
2663 {
2664 	unsigned int	si;
2665 	unsigned int	li;
2666 	unsigned int	vi;
2667 	unsigned int	ti;
2668 	unsigned char	*h;
2669 
2670 	/* Calculate the SI, LI, VI, and TI values. */
2671 	si = utf8decode(str) - SB;
2672 	li = si / NC;
2673 	vi = (si % NC) / TC;
2674 	ti = si % TC;
2675 
2676 	/* Fill in base of leaf. */
2677 	h = hangul;
2678 	LEAF_GEN(h) = 2;
2679 	LEAF_CCC(h) = DECOMPOSE;
2680 	h += 2;
2681 
2682 	/* Add LPart, a 3-byte UTF-8 sequence. */
2683 	h += utf8encode((char *)h, li + LB);
2684 
2685 	/* Add VPart, a 3-byte UTF-8 sequence. */
2686 	h += utf8encode((char *)h, vi + VB);
2687 
2688 	/* Add TPart if required, also a 3-byte UTF-8 sequence. */
2689 	if (ti)
2690 		h += utf8encode((char *)h, ti + TB);
2691 
2692 	/* Terminate string. */
2693 	h[0] = '\0';
2694 
2695 	return hangul;
2696 }
2697 
/*
 * Use trie to scan s, touching at most len bytes.
 * Returns the leaf if one exists, NULL otherwise.
 *
 *   tree   : tree whose index selects the trie inside utf8data.
 *   hangul : buffer that receives the leaf when it is built by
 *            utf8hangul(); the returned pointer then points into it.
 *   s      : start of the UTF-8 sequence to look up.
 *   len    : number of bytes of s that may be touched.
 *
 * A NULL tree or a len of 0 yields NULL.
 *
 * A non-NULL return guarantees that the UTF-8 sequence starting at s
 * is well-formed and corresponds to a known unicode code point.  The
 * shorthand for this will be "is valid UTF-8 unicode".
 */
static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
			       const char *s, size_t len)
{
	utf8trie_t	*trie;
	int		offlen;
	int		offset;
	int		mask;
	int		node;

	if (!tree)
		return NULL;
	if (len == 0)
		return NULL;
	node = 1;
	trie = utf8data + tree->index;
	/* Walk internal nodes; node becomes 0 once trie points at a leaf. */
	while (node) {
		/* Number of offset bytes that follow this node byte. */
		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
		if (*trie & NEXTBYTE) {
			/* The node tests the next input byte; give up if none is left. */
			if (--len == 0)
				return NULL;
			s++;
		}
		/* Select the bit of the current input byte this node tests. */
		mask = 1 << (*trie & BITNUM);
		if (*s & mask) {
			/* Right leg */
			if (offlen) {
				/* Right node at offset of trie */
				node = (*trie & RIGHTNODE);
				/*
				 * The offset is stored in trie[1..offlen] with
				 * the most significant byte last.
				 */
				offset = trie[offlen];
				while (--offlen) {
					offset <<= 8;
					offset |= trie[offlen];
				}
				trie += offset;
			} else if (*trie & RIGHTPATH) {
				/* Right node after this node */
				node = (*trie & TRIENODE);
				trie++;
			} else {
				/* No right node. */
				return NULL;
			}
		} else {
			/* Left leg */
			if (offlen) {
				/* Left node after this node. */
				node = (*trie & LEFTNODE);
				/* Skip the node byte and its offset bytes. */
				trie += offlen + 1;
			} else if (*trie & RIGHTPATH) {
				/* No left node. */
				return NULL;
			} else {
				/* Left node after this node */
				node = (*trie & TRIENODE);
				trie++;
			}
		}
	}
	/*
	 * Hangul decomposition is done algorithmically. These are the
	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
	 * always 3 bytes long, so s has been advanced twice, and the
	 * start of the sequence is at s-2.
	 */
	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
		trie = utf8hangul(s - 2, hangul);
	return trie;
}
2774 
2775 /*
2776  * Use trie to scan s.
2777  * Returns the leaf if one exists, NULL otherwise.
2778  *
2779  * Forwards to trie_nlookup().
2780  */
utf8lookup(struct tree * tree,unsigned char * hangul,const char * s)2781 static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
2782 			      const char *s)
2783 {
2784 	return utf8nlookup(tree, hangul, s, (size_t)-1);
2785 }
2786 
/*
 * Return the number of bytes used by the current UTF-8 sequence,
 * derived from its first byte alone: 1 below 0xC0, 2 below 0xE0,
 * 3 below 0xF0 and 4 otherwise.
 * Assumes the input points to the first byte of a valid UTF-8
 * sequence.
 */
static inline int utf8clen(const char *s)
{
	const unsigned char lead = (unsigned char)*s;

	if (lead < 0xC0)
		return 1;
	if (lead < 0xE0)
		return 2;
	if (lead < 0xF0)
		return 3;
	return 4;
}
2797 
2798 /*
2799  * Maximum age of any character in s.
2800  * Return -1 if s is not valid UTF-8 unicode.
2801  * Return 0 if only non-assigned code points are used.
2802  */
utf8agemax(struct tree * tree,const char * s)2803 int utf8agemax(struct tree *tree, const char *s)
2804 {
2805 	utf8leaf_t	*leaf;
2806 	int		age = 0;
2807 	int		leaf_age;
2808 	unsigned char	hangul[UTF8HANGULLEAF];
2809 
2810 	if (!tree)
2811 		return -1;
2812 
2813 	while (*s) {
2814 		leaf = utf8lookup(tree, hangul, s);
2815 		if (!leaf)
2816 			return -1;
2817 		leaf_age = ages[LEAF_GEN(leaf)];
2818 		if (leaf_age <= tree->maxage && leaf_age > age)
2819 			age = leaf_age;
2820 		s += utf8clen(s);
2821 	}
2822 	return age;
2823 }
2824 
2825 /*
2826  * Minimum age of any character in s.
2827  * Return -1 if s is not valid UTF-8 unicode.
2828  * Return 0 if non-assigned code points are used.
2829  */
utf8agemin(struct tree * tree,const char * s)2830 int utf8agemin(struct tree *tree, const char *s)
2831 {
2832 	utf8leaf_t	*leaf;
2833 	int		age;
2834 	int		leaf_age;
2835 	unsigned char	hangul[UTF8HANGULLEAF];
2836 
2837 	if (!tree)
2838 		return -1;
2839 	age = tree->maxage;
2840 	while (*s) {
2841 		leaf = utf8lookup(tree, hangul, s);
2842 		if (!leaf)
2843 			return -1;
2844 		leaf_age = ages[LEAF_GEN(leaf)];
2845 		if (leaf_age <= tree->maxage && leaf_age < age)
2846 			age = leaf_age;
2847 		s += utf8clen(s);
2848 	}
2849 	return age;
2850 }
2851 
2852 /*
2853  * Maximum age of any character in s, touch at most len bytes.
2854  * Return -1 if s is not valid UTF-8 unicode.
2855  */
utf8nagemax(struct tree * tree,const char * s,size_t len)2856 int utf8nagemax(struct tree *tree, const char *s, size_t len)
2857 {
2858 	utf8leaf_t	*leaf;
2859 	int		age = 0;
2860 	int		leaf_age;
2861 	unsigned char	hangul[UTF8HANGULLEAF];
2862 
2863 	if (!tree)
2864 		return -1;
2865 
2866         while (len && *s) {
2867 		leaf = utf8nlookup(tree, hangul, s, len);
2868 		if (!leaf)
2869 			return -1;
2870 		leaf_age = ages[LEAF_GEN(leaf)];
2871 		if (leaf_age <= tree->maxage && leaf_age > age)
2872 			age = leaf_age;
2873 		len -= utf8clen(s);
2874 		s += utf8clen(s);
2875 	}
2876 	return age;
2877 }
2878 
2879 /*
2880  * Maximum age of any character in s, touch at most len bytes.
2881  * Return -1 if s is not valid UTF-8 unicode.
2882  */
utf8nagemin(struct tree * tree,const char * s,size_t len)2883 int utf8nagemin(struct tree *tree, const char *s, size_t len)
2884 {
2885 	utf8leaf_t	*leaf;
2886 	int		leaf_age;
2887 	int		age;
2888 	unsigned char	hangul[UTF8HANGULLEAF];
2889 
2890 	if (!tree)
2891 		return -1;
2892 	age = tree->maxage;
2893         while (len && *s) {
2894 		leaf = utf8nlookup(tree, hangul, s, len);
2895 		if (!leaf)
2896 			return -1;
2897 		leaf_age = ages[LEAF_GEN(leaf)];
2898 		if (leaf_age <= tree->maxage && leaf_age < age)
2899 			age = leaf_age;
2900 		len -= utf8clen(s);
2901 		s += utf8clen(s);
2902 	}
2903 	return age;
2904 }
2905 
2906 /*
2907  * Length of the normalization of s.
2908  * Return -1 if s is not valid UTF-8 unicode.
2909  *
2910  * A string of Default_Ignorable_Code_Point has length 0.
2911  */
utf8len(struct tree * tree,const char * s)2912 ssize_t utf8len(struct tree *tree, const char *s)
2913 {
2914 	utf8leaf_t	*leaf;
2915 	size_t		ret = 0;
2916 	unsigned char	hangul[UTF8HANGULLEAF];
2917 
2918 	if (!tree)
2919 		return -1;
2920 	while (*s) {
2921 		leaf = utf8lookup(tree, hangul, s);
2922 		if (!leaf)
2923 			return -1;
2924 		if (ages[LEAF_GEN(leaf)] > tree->maxage)
2925 			ret += utf8clen(s);
2926 		else if (LEAF_CCC(leaf) == DECOMPOSE)
2927 			ret += strlen(LEAF_STR(leaf));
2928 		else
2929 			ret += utf8clen(s);
2930 		s += utf8clen(s);
2931 	}
2932 	return ret;
2933 }
2934 
2935 /*
2936  * Length of the normalization of s, touch at most len bytes.
2937  * Return -1 if s is not valid UTF-8 unicode.
2938  */
utf8nlen(struct tree * tree,const char * s,size_t len)2939 ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
2940 {
2941 	utf8leaf_t	*leaf;
2942 	size_t		ret = 0;
2943 	unsigned char	hangul[UTF8HANGULLEAF];
2944 
2945 	if (!tree)
2946 		return -1;
2947 	while (len && *s) {
2948 		leaf = utf8nlookup(tree, hangul, s, len);
2949 		if (!leaf)
2950 			return -1;
2951 		if (ages[LEAF_GEN(leaf)] > tree->maxage)
2952 			ret += utf8clen(s);
2953 		else if (LEAF_CCC(leaf) == DECOMPOSE)
2954 			ret += strlen(LEAF_STR(leaf));
2955 		else
2956 			ret += utf8clen(s);
2957 		len -= utf8clen(s);
2958 		s += utf8clen(s);
2959 	}
2960 	return ret;
2961 }
2962 
/*
 * Cursor structure used by the normalizer.
 *
 * Initialized by utf8ncursor() or utf8cursor() and advanced by
 * utf8byte().
 */
struct utf8cursor {
	struct tree	*tree;		/* tree used for the lookups */
	const char	*s;		/* current position: input or decomposition */
	const char	*p;		/* input position to resume at while a
					 * decomposition is scanned, else NULL */
	const char	*ss;		/* s saved at the start of a repeating scan */
	const char	*sp;		/* p saved at the start of a repeating scan */
	unsigned int	len;		/* input bytes that may still be touched */
	unsigned int	slen;		/* len saved at the start of a repeating scan */
	short int	ccc;		/* combining class currently being emitted */
	short int	nccc;		/* next combining class to be emitted */
	unsigned int	unichar;	/* utf8decode() of the last character looked at */
	unsigned char	hangul[UTF8HANGULLEAF];	/* buffer for a leaf built by utf8hangul() */
};
2979 
2980 /*
2981  * Set up an utf8cursor for use by utf8byte().
2982  *
2983  *   s      : string.
2984  *   len    : length of s.
2985  *   u8c    : pointer to cursor.
2986  *   trie   : utf8trie_t to use for normalization.
2987  *
2988  * Returns -1 on error, 0 on success.
2989  */
utf8ncursor(struct utf8cursor * u8c,struct tree * tree,const char * s,size_t len)2990 int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
2991 		size_t len)
2992 {
2993 	if (!tree)
2994 		return -1;
2995 	if (!s)
2996 		return -1;
2997 	u8c->tree = tree;
2998 	u8c->s = s;
2999 	u8c->p = NULL;
3000 	u8c->ss = NULL;
3001 	u8c->sp = NULL;
3002 	u8c->len = len;
3003 	u8c->slen = 0;
3004 	u8c->ccc = STOPPER;
3005 	u8c->nccc = STOPPER;
3006 	u8c->unichar = 0;
3007 	/* Check we didn't clobber the maximum length. */
3008 	if (u8c->len != len)
3009 		return -1;
3010 	/* The first byte of s may not be an utf8 continuation. */
3011 	if (len > 0 && (*s & 0xC0) == 0x80)
3012 		return -1;
3013 	return 0;
3014 }
3015 
/*
 * Set up an utf8cursor for use by utf8byte().
 *
 *   u8c    : pointer to cursor.
 *   tree   : tree to use for normalization.
 *   s      : NUL-terminated string.
 *
 * Forwards to utf8ncursor() with the largest length the cursor can
 * hold.
 *
 * Returns -1 on error, 0 on success.
 */
int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
{
	const size_t unbounded = (unsigned int)-1;

	return utf8ncursor(u8c, tree, s, unbounded);
}
3029 
/*
 * Get one byte from the normalized form of the string described by u8c.
 *
 * Returns the byte cast to an unsigned char on success, and -1 on failure.
 *
 * The cursor keeps track of the location in the string in u8c->s.
 * When a character is decomposed, the current location is stored in
 * u8c->p, and u8c->s is set to the start of the decomposition. Note
 * that bytes from a decomposition do not count against u8c->len.
 *
 * Characters are emitted if they match the current CCC in u8c->ccc.
 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
 * and the function returns 0 in that case.
 *
 * Sorting by CCC is done by repeatedly scanning the string.  The
 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
 * the start of the scan.  The first pass finds the lowest CCC to be
 * emitted and stores it in u8c->nccc, the second pass emits the
 * characters with this CCC and finds the next lowest CCC. This limits
 * the number of passes to 1 + the number of different CCCs in the
 * sequence being scanned.
 *
 * Therefore:
 *  u8c->p  != NULL -> a decomposition is being scanned.
 *  u8c->ss != NULL -> this is a repeating scan.
 *  u8c->ccc == -1  -> this is the first scan of a repeating scan.
 *                     (the code below stores MINCCC - 1 for this)
 */
int utf8byte(struct utf8cursor *u8c)
{
	utf8leaf_t *leaf;
	int ccc;

	for (;;) {
		/* Check for the end of a decomposed character. */
		if (u8c->p && *u8c->s == '\0') {
			/* Resume in the input right after the decomposed character. */
			u8c->s = u8c->p;
			u8c->p = NULL;
		}

		/* Check for end-of-string. */
		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
			/* There is no next byte. */
			if (u8c->ccc == STOPPER)
				return 0;
			/* End-of-string during a scan counts as a stopper. */
			ccc = STOPPER;
			goto ccc_mismatch;
		} else if ((*u8c->s & 0xC0) == 0x80) {
			/* This is a continuation of the current character. */
			if (!u8c->p)
				u8c->len--;
			return (unsigned char)*u8c->s++;
		}

		/* Look up the data for the current character. */
		if (u8c->p) {
			/* Decomposition strings are NUL-terminated: no length limit. */
			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
		} else {
			leaf = utf8nlookup(u8c->tree, u8c->hangul,
					   u8c->s, u8c->len);
		}

		/* No leaf found implies that the input is a binary blob. */
		if (!leaf)
			return -1;

		/* Characters that are too new have CCC 0. */
		if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
			ccc = STOPPER;
		} else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
			/* Account for the input character, then enter its decomposition. */
			u8c->len -= utf8clen(u8c->s);
			u8c->p = u8c->s + utf8clen(u8c->s);
			u8c->s = LEAF_STR(leaf);
			/* Empty decomposition implies CCC 0. */
			if (*u8c->s == '\0') {
				if (u8c->ccc == STOPPER)
					continue;
				ccc = STOPPER;
				goto ccc_mismatch;
			}
			/*
			 * NOTE(review): leaf is not checked for NULL here;
			 * presumably every decomposition string looks up
			 * successfully - confirm.
			 */
			leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
			ccc = LEAF_CCC(leaf);
		}
		u8c->unichar = utf8decode(u8c->s);

		/*
		 * If this is not a stopper, then see if it updates
		 * the next canonical class to be emitted.
		 */
		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
			u8c->nccc = ccc;

		/*
		 * Return the current byte if this is the current
		 * combining class.
		 */
		if (ccc == u8c->ccc) {
			if (!u8c->p)
				u8c->len--;
			return (unsigned char)*u8c->s++;
		}

		/* Current combining class mismatch. */
	ccc_mismatch:
		if (u8c->nccc == STOPPER) {
			/*
			 * Scan forward for the first canonical class
			 * to be emitted.  Save the position from
			 * which to restart.
			 */
			assert(u8c->ccc == STOPPER);
			u8c->ccc = MINCCC - 1;
			u8c->nccc = ccc;
			u8c->sp = u8c->p;
			u8c->ss = u8c->s;
			u8c->slen = u8c->len;
			/* Step over the mismatching character. */
			if (!u8c->p)
				u8c->len -= utf8clen(u8c->s);
			u8c->s += utf8clen(u8c->s);
		} else if (ccc != STOPPER) {
			/* Not a stopper, and not the ccc we're emitting. */
			if (!u8c->p)
				u8c->len -= utf8clen(u8c->s);
			u8c->s += utf8clen(u8c->s);
		} else if (u8c->nccc != MAXCCC + 1) {
			/* At a stopper, restart for next ccc. */
			u8c->ccc = u8c->nccc;
			u8c->nccc = MAXCCC + 1;
			u8c->s = u8c->ss;
			u8c->p = u8c->sp;
			u8c->len = u8c->slen;
		} else {
			/* All done, proceed from here. */
			u8c->ccc = STOPPER;
			u8c->nccc = STOPPER;
			u8c->sp = NULL;
			u8c->ss = NULL;
			u8c->slen = 0;
		}
	}
}
3171 
3172 /* ------------------------------------------------------------------ */
3173 
normalize_line(struct tree * tree)3174 static int normalize_line(struct tree *tree)
3175 {
3176 	char *s;
3177 	char *t;
3178 	int c;
3179 	struct utf8cursor u8c;
3180 
3181 	/* First test: null-terminated string. */
3182 	s = buf2;
3183 	t = buf3;
3184 	if (utf8cursor(&u8c, tree, s))
3185 		return -1;
3186 	while ((c = utf8byte(&u8c)) > 0)
3187 		if (c != (unsigned char)*t++)
3188 			return -1;
3189 	if (c < 0)
3190 		return -1;
3191 	if (*t != 0)
3192 		return -1;
3193 
3194 	/* Second test: length-limited string. */
3195 	s = buf2;
3196 	/* Replace NUL with a value that will cause an error if seen. */
3197 	s[strlen(s) + 1] = -1;
3198 	t = buf3;
3199 	if (utf8cursor(&u8c, tree, s))
3200 		return -1;
3201 	while ((c = utf8byte(&u8c)) > 0)
3202 		if (c != (unsigned char)*t++)
3203 			return -1;
3204 	if (c < 0)
3205 		return -1;
3206 	if (*t != 0)
3207 		return -1;
3208 
3209 	return 0;
3210 }
3211 
normalization_test(void)3212 static void normalization_test(void)
3213 {
3214 	FILE *file;
3215 	unsigned int unichar;
3216 	struct unicode_data *data;
3217 	char *s;
3218 	char *t;
3219 	int ret;
3220 	int ignorables;
3221 	int tests = 0;
3222 	int failures = 0;
3223 
3224 	if (verbose > 0)
3225 		printf("Parsing %s\n", test_name);
3226 	/* Step one, read data from file. */
3227 	file = fopen(test_name, "r");
3228 	if (!file)
3229 		open_fail(test_name, errno);
3230 
3231 	while (fgets(line, LINESIZE, file)) {
3232 		ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
3233 			     buf0, buf1);
3234 		if (ret != 2 || *line == '#')
3235 			continue;
3236 		s = buf0;
3237 		t = buf2;
3238 		while (*s) {
3239 			unichar = strtoul(s, &s, 16);
3240 			t += utf8encode(t, unichar);
3241 		}
3242 		*t = '\0';
3243 
3244 		ignorables = 0;
3245 		s = buf1;
3246 		t = buf3;
3247 		while (*s) {
3248 			unichar = strtoul(s, &s, 16);
3249 			data = &unicode_data[unichar];
3250 			if (data->utf8nfdi && !*data->utf8nfdi)
3251 				ignorables = 1;
3252 			else
3253 				t += utf8encode(t, unichar);
3254 		}
3255 		*t = '\0';
3256 
3257 		tests++;
3258 		if (normalize_line(nfdi_tree) < 0) {
3259 			printf("Line %s -> %s", buf0, buf1);
3260 			if (ignorables)
3261 				printf(" (ignorables removed)");
3262 			printf(" failure\n");
3263 			failures++;
3264 		}
3265 	}
3266 	fclose(file);
3267 	if (verbose > 0)
3268 		printf("Ran %d tests with %d failures\n", tests, failures);
3269 	if (failures)
3270 		file_fail(test_name);
3271 }
3272 
3273 /* ------------------------------------------------------------------ */
3274 
write_file(void)3275 static void write_file(void)
3276 {
3277 	FILE *file;
3278 	int i;
3279 	int j;
3280 	int t;
3281 	int gen;
3282 
3283 	if (verbose > 0)
3284 		printf("Writing %s\n", utf8_name);
3285 	file = fopen(utf8_name, "w");
3286 	if (!file)
3287 		open_fail(utf8_name, errno);
3288 
3289 	fprintf(file, "/* This file is generated code, do not edit. */\n");
3290 	fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
3291 	fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
3292 	fprintf(file, "#endif\n");
3293 	fprintf(file, "\n");
3294 	fprintf(file, "static const unsigned int utf8vers = %#x;\n",
3295 		unicode_maxage);
3296 	fprintf(file, "\n");
3297 	fprintf(file, "static const unsigned int utf8agetab[] = {\n");
3298 	for (i = 0; i != ages_count; i++)
3299 		fprintf(file, "\t%#x%s\n", ages[i],
3300 			ages[i] == unicode_maxage ? "" : ",");
3301 	fprintf(file, "};\n");
3302 	fprintf(file, "\n");
3303 	fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
3304 	t = 0;
3305 	for (gen = 0; gen < ages_count; gen++) {
3306 		fprintf(file, "\t{ %#x, %d }%s\n",
3307 			ages[gen], trees[t].index,
3308 			ages[gen] == unicode_maxage ? "" : ",");
3309 		if (trees[t].maxage == ages[gen])
3310 			t += 2;
3311 	}
3312 	fprintf(file, "};\n");
3313 	fprintf(file, "\n");
3314 	fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
3315 	t = 1;
3316 	for (gen = 0; gen < ages_count; gen++) {
3317 		fprintf(file, "\t{ %#x, %d }%s\n",
3318 			ages[gen], trees[t].index,
3319 			ages[gen] == unicode_maxage ? "" : ",");
3320 		if (trees[t].maxage == ages[gen])
3321 			t += 2;
3322 	}
3323 	fprintf(file, "};\n");
3324 	fprintf(file, "\n");
3325 	fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
3326 		utf8data_size);
3327 	t = 0;
3328 	for (i = 0; i != utf8data_size; i += 16) {
3329 		if (i == trees[t].index) {
3330 			fprintf(file, "\t/* %s_%x */\n",
3331 				trees[t].type, trees[t].maxage);
3332 			if (t < trees_count-1)
3333 				t++;
3334 		}
3335 		fprintf(file, "\t");
3336 		for (j = i; j != i + 16; j++)
3337 			fprintf(file, "0x%.2x%s", utf8data[j],
3338 				(j < utf8data_size -1 ? "," : ""));
3339 		fprintf(file, "\n");
3340 	}
3341 	fprintf(file, "};\n");
3342 	fclose(file);
3343 }
3344 
3345 /* ------------------------------------------------------------------ */
3346 
main(int argc,char * argv[])3347 int main(int argc, char *argv[])
3348 {
3349 	unsigned int unichar;
3350 	int opt;
3351 
3352 	argv0 = argv[0];
3353 
3354 	while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
3355 		switch (opt) {
3356 		case 'a':
3357 			age_name = optarg;
3358 			break;
3359 		case 'c':
3360 			ccc_name = optarg;
3361 			break;
3362 		case 'd':
3363 			data_name = optarg;
3364 			break;
3365 		case 'f':
3366 			fold_name = optarg;
3367 			break;
3368 		case 'n':
3369 			norm_name = optarg;
3370 			break;
3371 		case 'o':
3372 			utf8_name = optarg;
3373 			break;
3374 		case 'p':
3375 			prop_name = optarg;
3376 			break;
3377 		case 't':
3378 			test_name = optarg;
3379 			break;
3380 		case 'v':
3381 			verbose++;
3382 			break;
3383 		case 'h':
3384 			help();
3385 			exit(0);
3386 		default:
3387 			usage();
3388 		}
3389 	}
3390 
3391 	if (verbose > 1)
3392 		help();
3393 	for (unichar = 0; unichar != 0x110000; unichar++)
3394 		unicode_data[unichar].code = unichar;
3395 	age_init();
3396 	ccc_init();
3397 	nfdi_init();
3398 	nfdicf_init();
3399 	ignore_init();
3400 	corrections_init();
3401 	hangul_decompose();
3402 	nfdi_decompose();
3403 	nfdicf_decompose();
3404 	utf8_init();
3405 	trees_init();
3406 	trees_populate();
3407 	trees_reduce();
3408 	trees_verify();
3409 	/* Prevent "unused function" warning. */
3410 	(void)lookup(nfdi_tree, " ");
3411 	if (verbose > 2)
3412 		tree_walk(nfdi_tree);
3413 	if (verbose > 2)
3414 		tree_walk(nfdicf_tree);
3415 	normalization_test();
3416 	write_file();
3417 
3418 	return 0;
3419 }
3420