xref: /freebsd/usr.bin/localedef/collate.c (revision 00c61a3b)
1 /*-
2  * Copyright 2018 Nexenta Systems, Inc.
3  * Copyright 2015 John Marino <draco@marino.st>
4  *
5  * This source code is derived from the illumos localedef command, and
6  * provided under BSD-style license terms by Nexenta Systems, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 /*
32  * LC_COLLATE database generation routines for localedef.
33  */
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/types.h>
38 #include <sys/tree.h>
39 
40 #include <stdio.h>
41 #include <stddef.h>
42 #include <stdlib.h>
43 #include <errno.h>
44 #include <string.h>
45 #include <unistd.h>
46 #include <wchar.h>
47 #include <limits.h>
48 #include "localedef.h"
49 #include "parser.h"
50 #include "collate.h"
51 
52 _Static_assert(COLL_WEIGHTS_MAX == 10, "This code assumes a value of 10");
53 
54 /*
55  * Design notes.
56  *
57  * It will be extremely helpful to the reader if they have access to
58  * the localedef and locale file format specifications available.
59  * Latest versions of these are available from www.opengroup.org.
60  *
61  * The design for the collation code is a bit complex.  The goal is a
62  * single collation database as described in collate.h (in
63  * libc/port/locale).  However, there are some other tidbits:
64  *
65  * a) The substitution entries are now a directly indexable array.  A
66  * priority elsewhere in the table is taken as an index into the
67  * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY)
68  * set.  (The bit is cleared and the result is the index into the
69  * table.
70  *
71  * b) We eliminate duplicate entries into the substitution table.
72  * This saves a lot of space.
73  *
74  * c) The priorities for each level are "compressed", so that each
75  * sorting level has consecutively numbered priorities starting at 1.
76  * (O is reserved for the ignore priority.)  This means sort levels
77  * which only have a few distinct priorities can represent the
78  * priority level in fewer bits, which makes the strxfrm output
79  * smaller.
80  *
81  * d) We record the total number of priorities so that strxfrm can
82  * figure out how many bytes to expand a numeric priority into.
83  *
84  * e) For the UNDEFINED pass (the last pass), we record the maximum
85  * number of bits needed to uniquely prioritize these entries, so that
86  * the last pass can also use smaller strxfrm output when possible.
87  *
88  * f) Priorities with the sign bit set are verboten.  This works out
89  * because no active character set needs that bit to carry significant
90  * information once the character is in wide form.
91  *
92  * To process the entire data to make the database, we actually run
93  * multiple passes over the data.
94  *
95  * The first pass, which is done at parse time, identifies elements,
96  * substitutions, and such, and records them in priority order.  As
97  * some priorities can refer to other priorities, using forward
98  * references, we use a table of references indicating whether the
99  * priority's value has been resolved, or whether it is still a
100  * reference.
101  *
102  * The second pass walks over all the items in priority order, noting
103  * that they are used directly, and not just an indirect reference.
104  * This is done by creating a "weight" structure for the item.  The
105  * weights are stashed in an RB tree sorted by relative "priority".
106  *
107  * The third pass walks over all the weight structures, in priority
108  * order, and assigns a new monotonically increasing (per sort level)
109  * weight value to them.  These are the values that will actually be
110  * written to the file.
111  *
112  * The fourth pass just writes the data out.
113  */
114 
115 /*
116  * In order to resolve the priorities, we create a table of priorities.
117  * Entries in the table can be in one of three states.
118  *
119  * UNKNOWN is for newly allocated entries, and indicates that nothing
120  * is known about the priority.  (For example, when new entries are created
121  * for collating-symbols, this is the value assigned for them until the
122  * collating symbol's order has been determined.
123  *
124  * RESOLVED is used for an entry where the priority indicates the final
125  * numeric weight.
126  *
127  * REFER is used for entries that reference other entries.  Typically
128  * this is used for forward references.  A collating-symbol can never
129  * have this value.
130  *
131  * The "pass" field is used during final resolution to aid in detection
132  * of referencing loops.  (For example <A> depends on <B>, but <B> has its
133  * priority dependent on <A>.)
134  */
135 typedef enum {
136 	UNKNOWN,	/* priority is totally unknown */
137 	RESOLVED,	/* priority value fully resolved */
138 	REFER		/* priority is a reference (index) */
139 } res_t;
140 
141 typedef struct weight {
142 	int32_t		pri;
143 	int		opt;
144 	RB_ENTRY(weight) entry;
145 } weight_t;
146 
147 typedef struct priority {
148 	res_t		res;
149 	int32_t		pri;
150 	int		pass;
151 	int		lineno;
152 } collpri_t;
153 
154 #define	NUM_WT	collinfo.directive_count
155 
156 /*
157  * These are the abstract collating symbols, which are just a symbolic
158  * way to reference a priority.
159  */
160 struct collsym {
161 	char		*name;
162 	int32_t		ref;
163 	RB_ENTRY(collsym) entry;
164 };
165 
166 /*
167  * These are also abstract collating symbols, but we allow them to have
168  * different priorities at different levels.
169  */
170 typedef struct collundef {
171 	char		*name;
172 	int32_t		ref[COLL_WEIGHTS_MAX];
173 	RB_ENTRY(collundef) entry;
174 } collundef_t;
175 
176 /*
177  * These are called "chains" in libc.  This records the fact that two
178  * more characters should be treated as a single collating entity when
179  * they appear together.  For example, in Spanish <C><h> gets collated
180  * as a character between <C> and <D>.
181  */
182 struct collelem {
183 	char		*symbol;
184 	wchar_t		*expand;
185 	int32_t		ref[COLL_WEIGHTS_MAX];
186 	RB_ENTRY(collelem) rb_bysymbol;
187 	RB_ENTRY(collelem) rb_byexpand;
188 };
189 
190 /*
191  * Individual characters have a sequence of weights as well.
192  */
193 typedef struct collchar {
194 	wchar_t		wc;
195 	int32_t		ref[COLL_WEIGHTS_MAX];
196 	RB_ENTRY(collchar) entry;
197 } collchar_t;
198 
199 /*
200  * Substitution entries.  The key is itself a priority.  Note that
201  * when we create one of these, we *automatically* wind up with a
202  * fully resolved priority for the key, because creation of
203  * substitutions creates a resolved priority at the same time.
204  */
205 typedef struct subst{
206 	int32_t		key;
207 	int32_t		ref[COLLATE_STR_LEN];
208 	RB_ENTRY(subst)	entry;
209 	RB_ENTRY(subst)	entry_ref;
210 } subst_t;
211 
212 static RB_HEAD(collsyms, collsym) collsyms;
213 static RB_HEAD(collundefs, collundef) collundefs;
214 static RB_HEAD(elem_by_symbol, collelem) elem_by_symbol;
215 static RB_HEAD(elem_by_expand, collelem) elem_by_expand;
216 static RB_HEAD(collchars, collchar) collchars;
217 static RB_HEAD(substs, subst) substs[COLL_WEIGHTS_MAX];
218 static RB_HEAD(substs_ref, subst) substs_ref[COLL_WEIGHTS_MAX];
219 static RB_HEAD(weights, weight) weights[COLL_WEIGHTS_MAX];
220 static int32_t		nweight[COLL_WEIGHTS_MAX];
221 
222 /*
223  * This is state tracking for the ellipsis token.  Note that we start
224  * the initial values so that the ellipsis logic will think we got a
225  * magic starting value of NUL.  It starts at minus one because the
226  * starting point is exclusive -- i.e. the starting point is not
227  * itself handled by the ellipsis code.
228  */
229 static int currorder = EOF;
230 static int lastorder = EOF;
231 static collelem_t *currelem;
232 static collchar_t *currchar;
233 static collundef_t *currundef;
234 static wchar_t ellipsis_start = 0;
235 static int32_t ellipsis_weights[COLL_WEIGHTS_MAX];
236 
237 /*
238  * We keep a running tally of weights.
239  */
240 static int nextpri = 1;
241 static int nextsubst[COLL_WEIGHTS_MAX] = { 0 };
242 
243 /*
244  * This array collects up the weights for each level.
245  */
246 static int32_t order_weights[COLL_WEIGHTS_MAX];
247 static int curr_weight = 0;
248 static int32_t subst_weights[COLLATE_STR_LEN];
249 static int curr_subst = 0;
250 
251 /*
252  * Some initial priority values.
253  */
254 static int32_t pri_undefined[COLL_WEIGHTS_MAX];
255 static int32_t pri_ignore;
256 
257 static collate_info_t collinfo;
258 static int32_t subst_count[COLL_WEIGHTS_MAX];
259 static int32_t chain_count;
260 static int32_t large_count;
261 
262 static collpri_t	*prilist = NULL;
263 static int		numpri = 0;
264 static int		maxpri = 0;
265 
266 static void start_order(int);
267 
268 static int32_t
269 new_pri(void)
270 {
271 	int i;
272 
273 	if (numpri >= maxpri) {
274 		maxpri = maxpri ? maxpri * 2 : 1024;
275 		prilist = realloc(prilist, sizeof (collpri_t) * maxpri);
276 		if (prilist == NULL) {
277 			fprintf(stderr,"out of memory");
278 			return (-1);
279 		}
280 		for (i = numpri; i < maxpri; i++) {
281 			prilist[i].res = UNKNOWN;
282 			prilist[i].pri = 0;
283 			prilist[i].pass = 0;
284 		}
285 	}
286 	return (numpri++);
287 }
288 
289 static collpri_t *
290 get_pri(int32_t ref)
291 {
292 	if ((ref < 0) || (ref > numpri)) {
293 		INTERR;
294 		return (NULL);
295 	}
296 	return (&prilist[ref]);
297 }
298 
299 static void
300 set_pri(int32_t ref, int32_t v, res_t res)
301 {
302 	collpri_t	*pri;
303 
304 	pri = get_pri(ref);
305 
306 	if ((res == REFER) && ((v < 0) || (v >= numpri))) {
307 		INTERR;
308 	}
309 
310 	/* Resolve self references */
311 	if ((res == REFER) && (ref == v)) {
312 		v = nextpri;
313 		res = RESOLVED;
314 	}
315 
316 	if (pri->res != UNKNOWN) {
317 		warn("repeated item in order list (first on %d)",
318 		    pri->lineno);
319 		return;
320 	}
321 	pri->lineno = lineno;
322 	pri->pri = v;
323 	pri->res = res;
324 }
325 
326 static int32_t
327 resolve_pri(int32_t ref)
328 {
329 	collpri_t	*pri;
330 	static int32_t	pass = 0;
331 
332 	pri = get_pri(ref);
333 	pass++;
334 	while (pri->res == REFER) {
335 		if (pri->pass == pass) {
336 			/* report a line with the circular symbol */
337 			lineno = pri->lineno;
338 			fprintf(stderr,"circular reference in order list");
339 			return (-1);
340 		}
341 		if ((pri->pri < 0) || (pri->pri >= numpri)) {
342 			INTERR;
343 			return (-1);
344 		}
345 		pri->pass = pass;
346 		pri = &prilist[pri->pri];
347 	}
348 
349 	if (pri->res == UNKNOWN) {
350 		return (-1);
351 	}
352 	if (pri->res != RESOLVED)
353 		INTERR;
354 
355 	return (pri->pri);
356 }
357 
358 static int
359 weight_compare(const void *n1, const void *n2)
360 {
361 	int32_t	k1 = ((const weight_t *)n1)->pri;
362 	int32_t	k2 = ((const weight_t *)n2)->pri;
363 
364 	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
365 }
366 
367 RB_GENERATE_STATIC(weights, weight, entry, weight_compare);
368 
369 static int
370 collsym_compare(const void *n1, const void *n2)
371 {
372 	const collsym_t *c1 = n1;
373 	const collsym_t *c2 = n2;
374 	int rv;
375 
376 	rv = strcmp(c1->name, c2->name);
377 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
378 }
379 
380 RB_GENERATE_STATIC(collsyms, collsym, entry, collsym_compare);
381 
382 static int
383 collundef_compare(const void *n1, const void *n2)
384 {
385 	const collundef_t *c1 = n1;
386 	const collundef_t *c2 = n2;
387 	int rv;
388 
389 	rv = strcmp(c1->name, c2->name);
390 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
391 }
392 
393 RB_GENERATE_STATIC(collundefs, collundef, entry, collundef_compare);
394 
395 static int
396 element_compare_symbol(const void *n1, const void *n2)
397 {
398 	const collelem_t *c1 = n1;
399 	const collelem_t *c2 = n2;
400 	int rv;
401 
402 	rv = strcmp(c1->symbol, c2->symbol);
403 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
404 }
405 
406 RB_GENERATE_STATIC(elem_by_symbol, collelem, rb_bysymbol, element_compare_symbol);
407 
408 static int
409 element_compare_expand(const void *n1, const void *n2)
410 {
411 	const collelem_t *c1 = n1;
412 	const collelem_t *c2 = n2;
413 	int rv;
414 
415 	rv = wcscmp(c1->expand, c2->expand);
416 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
417 }
418 
419 RB_GENERATE_STATIC(elem_by_expand, collelem, rb_byexpand, element_compare_expand);
420 
421 static int
422 collchar_compare(const void *n1, const void *n2)
423 {
424 	wchar_t	k1 = ((const collchar_t *)n1)->wc;
425 	wchar_t	k2 = ((const collchar_t *)n2)->wc;
426 
427 	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
428 }
429 
430 RB_GENERATE_STATIC(collchars, collchar, entry, collchar_compare);
431 
432 static int
433 subst_compare(const void *n1, const void *n2)
434 {
435 	int32_t	k1 = ((const subst_t *)n1)->key;
436 	int32_t	k2 = ((const subst_t *)n2)->key;
437 
438 	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
439 }
440 
441 RB_GENERATE_STATIC(substs, subst, entry, subst_compare);
442 
443 static int
444 subst_compare_ref(const void *n1, const void *n2)
445 {
446 	const wchar_t *c1 = ((const subst_t *)n1)->ref;
447 	const wchar_t *c2 = ((const subst_t *)n2)->ref;
448 	int rv;
449 
450 	rv = wcscmp(c1, c2);
451 	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
452 }
453 
454 RB_GENERATE_STATIC(substs_ref, subst, entry_ref, subst_compare_ref);
455 
456 void
457 init_collate(void)
458 {
459 	int i;
460 
461 	RB_INIT(&collsyms);
462 
463 	RB_INIT(&collundefs);
464 
465 	RB_INIT(&elem_by_symbol);
466 
467 	RB_INIT(&elem_by_expand);
468 
469 	RB_INIT(&collchars);
470 
471 	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
472 		RB_INIT(&substs[i]);
473 		RB_INIT(&substs_ref[i]);
474 		RB_INIT(&weights[i]);
475 		nweight[i] = 1;
476 	}
477 
478 	(void) memset(&collinfo, 0, sizeof (collinfo));
479 
480 	/* allocate some initial priorities */
481 	pri_ignore = new_pri();
482 
483 	set_pri(pri_ignore, 0, RESOLVED);
484 
485 	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
486 		pri_undefined[i] = new_pri();
487 
488 		/* we will override this later */
489 		set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN);
490 	}
491 }
492 
493 void
494 define_collsym(char *name)
495 {
496 	collsym_t	*sym;
497 
498 	if ((sym = calloc(1, sizeof(*sym))) == NULL) {
499 		fprintf(stderr,"out of memory");
500 		return;
501 	}
502 	sym->name = name;
503 	sym->ref = new_pri();
504 
505 	if (RB_FIND(collsyms, &collsyms, sym) != NULL) {
506 		/*
507 		 * This should never happen because we are only called
508 		 * for undefined symbols.
509 		 */
510 		free(sym);
511 		INTERR;
512 		return;
513 	}
514 	RB_INSERT(collsyms, &collsyms, sym);
515 }
516 
517 collsym_t *
518 lookup_collsym(char *name)
519 {
520 	collsym_t	srch;
521 
522 	srch.name = name;
523 	return (RB_FIND(collsyms, &collsyms, &srch));
524 }
525 
526 collelem_t *
527 lookup_collelem(char *symbol)
528 {
529 	collelem_t	srch;
530 
531 	srch.symbol = symbol;
532 	return (RB_FIND(elem_by_symbol, &elem_by_symbol, &srch));
533 }
534 
535 static collundef_t *
536 get_collundef(char *name)
537 {
538 	collundef_t	srch;
539 	collundef_t	*ud;
540 	int		i;
541 
542 	srch.name = name;
543 	if ((ud = RB_FIND(collundefs, &collundefs, &srch)) == NULL) {
544 		if (((ud = calloc(1, sizeof(*ud))) == NULL) ||
545 		    ((ud->name = strdup(name)) == NULL)) {
546 			fprintf(stderr,"out of memory");
547 			free(ud);
548 			return (NULL);
549 		}
550 		for (i = 0; i < NUM_WT; i++) {
551 			ud->ref[i] = new_pri();
552 		}
553 		RB_INSERT(collundefs, &collundefs, ud);
554 	}
555 	add_charmap_undefined(name);
556 	return (ud);
557 }
558 
559 static collchar_t *
560 get_collchar(wchar_t wc, int create)
561 {
562 	collchar_t	srch;
563 	collchar_t	*cc;
564 	int		i;
565 
566 	srch.wc = wc;
567 	cc = RB_FIND(collchars, &collchars, &srch);
568 	if ((cc == NULL) && create) {
569 		if ((cc = calloc(1, sizeof(*cc))) == NULL) {
570 			fprintf(stderr, "out of memory");
571 			return (NULL);
572 		}
573 		for (i = 0; i < NUM_WT; i++) {
574 			cc->ref[i] = new_pri();
575 		}
576 		cc->wc = wc;
577 		RB_INSERT(collchars, &collchars, cc);
578 	}
579 	return (cc);
580 }
581 
582 void
583 end_order_collsym(collsym_t *sym)
584 {
585 	start_order(T_COLLSYM);
586 	/* update the weight */
587 
588 	set_pri(sym->ref, nextpri, RESOLVED);
589 	nextpri++;
590 }
591 
592 void
593 end_order(void)
594 {
595 	int		i;
596 	int32_t		pri;
597 	int32_t		ref;
598 	collpri_t	*p;
599 
600 	/* advance the priority/weight */
601 	pri = nextpri;
602 
603 	switch (currorder) {
604 	case T_CHAR:
605 		for (i = 0; i < NUM_WT; i++) {
606 			if (((ref = order_weights[i]) < 0) ||
607 			    ((p = get_pri(ref)) == NULL) ||
608 			    (p->pri == -1)) {
609 				/* unspecified weight is a self reference */
610 				set_pri(currchar->ref[i], pri, RESOLVED);
611 			} else {
612 				set_pri(currchar->ref[i], ref, REFER);
613 			}
614 			order_weights[i] = -1;
615 		}
616 
617 		/* leave a cookie trail in case next symbol is ellipsis */
618 		ellipsis_start = currchar->wc + 1;
619 		currchar = NULL;
620 		break;
621 
622 	case T_ELLIPSIS:
623 		/* save off the weights were we can find them */
624 		for (i = 0; i < NUM_WT; i++) {
625 			ellipsis_weights[i] = order_weights[i];
626 			order_weights[i] = -1;
627 		}
628 		break;
629 
630 	case T_COLLELEM:
631 		if (currelem == NULL) {
632 			INTERR;
633 		} else {
634 			for (i = 0; i < NUM_WT; i++) {
635 
636 				if (((ref = order_weights[i]) < 0) ||
637 				    ((p = get_pri(ref)) == NULL) ||
638 				    (p->pri == -1)) {
639 					set_pri(currelem->ref[i], pri,
640 					    RESOLVED);
641 				} else {
642 					set_pri(currelem->ref[i], ref, REFER);
643 				}
644 				order_weights[i] = -1;
645 			}
646 		}
647 		break;
648 
649 	case T_UNDEFINED:
650 		for (i = 0; i < NUM_WT; i++) {
651 			if (((ref = order_weights[i]) < 0) ||
652 			    ((p = get_pri(ref)) == NULL) ||
653 			    (p->pri == -1)) {
654 				set_pri(pri_undefined[i], -1, RESOLVED);
655 			} else {
656 				set_pri(pri_undefined[i], ref, REFER);
657 			}
658 			order_weights[i] = -1;
659 		}
660 		break;
661 
662 	case T_SYMBOL:
663 		for (i = 0; i < NUM_WT; i++) {
664 			if (((ref = order_weights[i]) < 0) ||
665 			    ((p = get_pri(ref)) == NULL) ||
666 			    (p->pri == -1)) {
667 				set_pri(currundef->ref[i], pri, RESOLVED);
668 			} else {
669 				set_pri(currundef->ref[i], ref, REFER);
670 			}
671 			order_weights[i] = -1;
672 		}
673 		break;
674 
675 	default:
676 		INTERR;
677 	}
678 
679 	nextpri++;
680 }
681 
682 static void
683 start_order(int type)
684 {
685 	int	i;
686 
687 	lastorder = currorder;
688 	currorder = type;
689 
690 	/* this is used to protect ELLIPSIS processing */
691 	if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) {
692 		fprintf(stderr, "character value expected");
693 	}
694 
695 	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
696 		order_weights[i] = -1;
697 	}
698 	curr_weight = 0;
699 }
700 
701 void
702 start_order_undefined(void)
703 {
704 	start_order(T_UNDEFINED);
705 }
706 
707 void
708 start_order_symbol(char *name)
709 {
710 	currundef = get_collundef(name);
711 	start_order(T_SYMBOL);
712 }
713 
714 void
715 start_order_char(wchar_t wc)
716 {
717 	collchar_t	*cc;
718 	int32_t		ref;
719 
720 	start_order(T_CHAR);
721 
722 	/*
723 	 * If we last saw an ellipsis, then we need to close the range.
724 	 * Handle that here.  Note that we have to be careful because the
725 	 * items *inside* the range are treated exclusiveley to the items
726 	 * outside of the range.  The ends of the range can have quite
727 	 * different weights than the range members.
728 	 */
729 	if (lastorder == T_ELLIPSIS) {
730 		int		i;
731 
732 		if (wc < ellipsis_start) {
733 			fprintf(stderr, "malformed range!");
734 			return;
735 		}
736 		while (ellipsis_start < wc) {
737 			/*
738 			 * pick all of the saved weights for the
739 			 * ellipsis.  note that -1 encodes for the
740 			 * ellipsis itself, which means to take the
741 			 * current relative priority.
742 			 */
743 			if ((cc = get_collchar(ellipsis_start, 1)) == NULL) {
744 				INTERR;
745 				return;
746 			}
747 			for (i = 0; i < NUM_WT; i++) {
748 				collpri_t *p;
749 				if (((ref = ellipsis_weights[i]) == -1) ||
750 				    ((p = get_pri(ref)) == NULL) ||
751 				    (p->pri == -1)) {
752 					set_pri(cc->ref[i], nextpri, RESOLVED);
753 				} else {
754 					set_pri(cc->ref[i], ref, REFER);
755 				}
756 				ellipsis_weights[i] = 0;
757 			}
758 			ellipsis_start++;
759 			nextpri++;
760 		}
761 	}
762 
763 	currchar = get_collchar(wc, 1);
764 }
765 
766 void
767 start_order_collelem(collelem_t *e)
768 {
769 	start_order(T_COLLELEM);
770 	currelem = e;
771 }
772 
773 void
774 start_order_ellipsis(void)
775 {
776 	int	i;
777 
778 	start_order(T_ELLIPSIS);
779 
780 	if (lastorder != T_CHAR) {
781 		fprintf(stderr, "illegal starting point for range");
782 		return;
783 	}
784 
785 	for (i = 0; i < NUM_WT; i++) {
786 		ellipsis_weights[i] = order_weights[i];
787 	}
788 }
789 
790 void
791 define_collelem(char *name, wchar_t *wcs)
792 {
793 	collelem_t	*e;
794 	int		i;
795 
796 	if (wcslen(wcs) >= COLLATE_STR_LEN) {
797 		fprintf(stderr,"expanded collation element too long");
798 		return;
799 	}
800 
801 	if ((e = calloc(1, sizeof(*e))) == NULL) {
802 		fprintf(stderr, "out of memory");
803 		return;
804 	}
805 	e->expand = wcs;
806 	e->symbol = name;
807 
808 	/*
809 	 * This is executed before the order statement, so we don't
810 	 * know how many priorities we *really* need.  We allocate one
811 	 * for each possible weight.  Not a big deal, as collating-elements
812 	 * prove to be quite rare.
813 	 */
814 	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
815 		e->ref[i] = new_pri();
816 	}
817 
818 	/* A character sequence can only reduce to one element. */
819 	if ((RB_FIND(elem_by_symbol, &elem_by_symbol, e) != NULL) ||
820 	    (RB_FIND(elem_by_expand, &elem_by_expand, e) != NULL)) {
821 		fprintf(stderr, "duplicate collating element definition");
822 		free(e);
823 		return;
824 	}
825 	RB_INSERT(elem_by_symbol, &elem_by_symbol, e);
826 	RB_INSERT(elem_by_expand, &elem_by_expand, e);
827 }
828 
829 void
830 add_order_bit(int kw)
831 {
832 	uint8_t bit = DIRECTIVE_UNDEF;
833 
834 	switch (kw) {
835 	case T_FORWARD:
836 		bit = DIRECTIVE_FORWARD;
837 		break;
838 	case T_BACKWARD:
839 		bit = DIRECTIVE_BACKWARD;
840 		break;
841 	case T_POSITION:
842 		bit = DIRECTIVE_POSITION;
843 		break;
844 	default:
845 		INTERR;
846 		break;
847 	}
848 	collinfo.directive[collinfo.directive_count] |= bit;
849 }
850 
851 void
852 add_order_directive(void)
853 {
854 	if (collinfo.directive_count >= COLL_WEIGHTS_MAX) {
855 		fprintf(stderr, "too many directives (max %d)\n", COLL_WEIGHTS_MAX);
856 		return;
857 	}
858 	collinfo.directive_count++;
859 }
860 
861 static void
862 add_order_pri(int32_t ref)
863 {
864 	if (curr_weight >= NUM_WT) {
865 		fprintf(stderr, "too many weights (max %d)\n", NUM_WT);
866 		return;
867 	}
868 	order_weights[curr_weight] = ref;
869 	curr_weight++;
870 }
871 
872 void
873 add_order_collsym(collsym_t *s)
874 {
875 	add_order_pri(s->ref);
876 }
877 
878 void
879 add_order_char(wchar_t wc)
880 {
881 	collchar_t *cc;
882 
883 	if ((cc = get_collchar(wc, 1)) == NULL) {
884 		INTERR;
885 		return;
886 	}
887 
888 	add_order_pri(cc->ref[curr_weight]);
889 }
890 
891 void
892 add_order_collelem(collelem_t *e)
893 {
894 	add_order_pri(e->ref[curr_weight]);
895 }
896 
897 void
898 add_order_ignore(void)
899 {
900 	add_order_pri(pri_ignore);
901 }
902 
903 void
904 add_order_symbol(char *sym)
905 {
906 	collundef_t *c;
907 	if ((c = get_collundef(sym)) == NULL) {
908 		INTERR;
909 		return;
910 	}
911 	add_order_pri(c->ref[curr_weight]);
912 }
913 
914 void
915 add_order_ellipsis(void)
916 {
917 	/* special NULL value indicates self reference */
918 	add_order_pri(0);
919 }
920 
921 void
922 add_order_subst(void)
923 {
924 	subst_t srch;
925 	subst_t	*s;
926 	int i;
927 
928 	(void) memset(&srch, 0, sizeof (srch));
929 	for (i = 0; i < curr_subst; i++) {
930 		srch.ref[i] = subst_weights[i];
931 		subst_weights[i] = 0;
932 	}
933 	s = RB_FIND(substs_ref, &substs_ref[curr_weight], &srch);
934 
935 	if (s == NULL) {
936 		if ((s = calloc(1, sizeof(*s))) == NULL) {
937 			fprintf(stderr,"out of memory");
938 			return;
939 		}
940 		s->key = new_pri();
941 
942 		/*
943 		 * We use a self reference for our key, but we set a
944 		 * high bit to indicate that this is a substitution
945 		 * reference.  This will expedite table lookups later,
946 		 * and prevent table lookups for situations that don't
947 		 * require it.  (In short, its a big win, because we
948 		 * can skip a lot of binary searching.)
949 		 */
950 		set_pri(s->key,
951 		    (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY),
952 		    RESOLVED);
953 		nextsubst[curr_weight] += 1;
954 
955 		for (i = 0; i < curr_subst; i++) {
956 			s->ref[i] = srch.ref[i];
957 		}
958 
959 		RB_INSERT(substs_ref, &substs_ref[curr_weight], s);
960 
961 		if (RB_FIND(substs, &substs[curr_weight], s) != NULL) {
962 			INTERR;
963 			return;
964 		}
965 		RB_INSERT(substs, &substs[curr_weight], s);
966 	}
967 	curr_subst = 0;
968 
969 
970 	/*
971 	 * We are using the current (unique) priority as a search key
972 	 * in the substitution table.
973 	 */
974 	add_order_pri(s->key);
975 }
976 
977 static void
978 add_subst_pri(int32_t ref)
979 {
980 	if (curr_subst >= COLLATE_STR_LEN) {
981 		fprintf(stderr,"substitution string is too long");
982 		return;
983 	}
984 	subst_weights[curr_subst] = ref;
985 	curr_subst++;
986 }
987 
988 void
989 add_subst_char(wchar_t wc)
990 {
991 	collchar_t *cc;
992 
993 
994 	if (((cc = get_collchar(wc, 1)) == NULL) ||
995 	    (cc->wc != wc)) {
996 		INTERR;
997 		return;
998 	}
999 	/* we take the weight for the character at that position */
1000 	add_subst_pri(cc->ref[curr_weight]);
1001 }
1002 
1003 void
1004 add_subst_collelem(collelem_t *e)
1005 {
1006 	add_subst_pri(e->ref[curr_weight]);
1007 }
1008 
1009 void
1010 add_subst_collsym(collsym_t *s)
1011 {
1012 	add_subst_pri(s->ref);
1013 }
1014 
1015 void
1016 add_subst_symbol(char *ptr)
1017 {
1018 	collundef_t *cu;
1019 
1020 	if ((cu = get_collundef(ptr)) != NULL) {
1021 		add_subst_pri(cu->ref[curr_weight]);
1022 	}
1023 }
1024 
1025 void
1026 add_weight(int32_t ref, int pass)
1027 {
1028 	weight_t srch;
1029 	weight_t *w;
1030 
1031 	srch.pri = resolve_pri(ref);
1032 
1033 	/* No translation of ignores */
1034 	if (srch.pri == 0)
1035 		return;
1036 
1037 	/* Substitution priorities are not weights */
1038 	if (srch.pri & COLLATE_SUBST_PRIORITY)
1039 		return;
1040 
1041 	if (RB_FIND(weights, &weights[pass], &srch) != NULL)
1042 		return;
1043 
1044 	if ((w = calloc(1, sizeof(*w))) == NULL) {
1045 		fprintf(stderr, "out of memory");
1046 		return;
1047 	}
1048 	w->pri = srch.pri;
1049 	RB_INSERT(weights, &weights[pass], w);
1050 }
1051 
1052 void
1053 add_weights(int32_t *refs)
1054 {
1055 	int i;
1056 	for (i = 0; i < NUM_WT; i++) {
1057 		add_weight(refs[i], i);
1058 	}
1059 }
1060 
1061 int32_t
1062 get_weight(int32_t ref, int pass)
1063 {
1064 	weight_t	srch;
1065 	weight_t	*w;
1066 	int32_t		pri;
1067 
1068 	pri = resolve_pri(ref);
1069 	if (pri & COLLATE_SUBST_PRIORITY) {
1070 		return (pri);
1071 	}
1072 	if (pri <= 0) {
1073 		return (pri);
1074 	}
1075 	srch.pri = pri;
1076 	if ((w = RB_FIND(weights, &weights[pass], &srch)) == NULL) {
1077 		INTERR;
1078 		return (-1);
1079 	}
1080 	return (w->opt);
1081 }
1082 
1083 wchar_t *
1084 wsncpy(wchar_t *s1, const wchar_t *s2, size_t n)
1085 {
1086 	wchar_t *os1 = s1;
1087 
1088 	n++;
1089 	while (--n > 0 && (*s1++ = htote(*s2++)) != 0)
1090 		continue;
1091 	if (n > 0)
1092 		while (--n > 0)
1093 			*s1++ = 0;
1094 	return (os1);
1095 }
1096 
1097 #define RB_COUNT(x, name, head, cnt) do { \
1098 	(cnt) = 0; \
1099 	RB_FOREACH(x, name, (head)) { \
1100 		(cnt)++; \
1101 	} \
1102 } while (0)
1103 
1104 #define RB_NUMNODES(type, name, head, cnt) do { \
1105 	type *t; \
1106 	cnt = 0; \
1107 	RB_FOREACH(t, name, head) { \
1108 		cnt++; \
1109 	} \
1110 } while (0)
1111 
1112 void
1113 dump_collate(void)
1114 {
1115 	FILE			*f;
1116 	int			i, j, n;
1117 	size_t			sz;
1118 	int32_t			pri;
1119 	collelem_t		*ce;
1120 	collchar_t		*cc;
1121 	subst_t			*sb;
1122 	char			vers[COLLATE_STR_LEN];
1123 	collate_char_t		chars[UCHAR_MAX + 1];
1124 	collate_large_t		*large;
1125 	collate_subst_t		*subst[COLL_WEIGHTS_MAX];
1126 	collate_chain_t		*chain;
1127 
1128 	/*
1129 	 * We have to run through a preliminary pass to identify all the
1130 	 * weights that we use for each sorting level.
1131 	 */
1132 	for (i = 0; i < NUM_WT; i++) {
1133 		add_weight(pri_ignore, i);
1134 	}
1135 	for (i = 0; i < NUM_WT; i++) {
1136 		RB_FOREACH(sb, substs, &substs[i]) {
1137 			for (j = 0; sb->ref[j]; j++) {
1138 				add_weight(sb->ref[j], i);
1139 			}
1140 		}
1141 	}
1142 	RB_FOREACH(ce, elem_by_expand, &elem_by_expand) {
1143 		add_weights(ce->ref);
1144 	}
1145 	RB_FOREACH(cc, collchars, &collchars) {
1146 		add_weights(cc->ref);
1147 	}
1148 
1149 	/*
1150 	 * Now we walk the entire set of weights, removing the gaps
1151 	 * in the weights.  This gives us optimum usage.  The walk
1152 	 * occurs in priority.
1153 	 */
1154 	for (i = 0; i < NUM_WT; i++) {
1155 		weight_t *w;
1156 		RB_FOREACH(w, weights, &weights[i]) {
1157 			w->opt = nweight[i];
1158 			nweight[i] += 1;
1159 		}
1160 	}
1161 
1162 	(void) memset(&chars, 0, sizeof (chars));
1163 	(void) memset(vers, 0, COLLATE_STR_LEN);
1164 	(void) strlcpy(vers, COLLATE_VERSION, sizeof (vers));
1165 
1166 	/*
1167 	 * We need to make sure we arrange for the UNDEFINED field
1168 	 * to show up.  Also, set the total weight counts.
1169 	 */
1170 	for (i = 0; i < NUM_WT; i++) {
1171 		if (resolve_pri(pri_undefined[i]) == -1) {
1172 			set_pri(pri_undefined[i], -1, RESOLVED);
1173 			/* they collate at the end of everything else */
1174 			collinfo.undef_pri[i] = htote(COLLATE_MAX_PRIORITY);
1175 		}
1176 		collinfo.pri_count[i] = htote(nweight[i]);
1177 	}
1178 
1179 	collinfo.pri_count[NUM_WT] = htote(max_wide());
1180 	collinfo.undef_pri[NUM_WT] = htote(COLLATE_MAX_PRIORITY);
1181 	collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED;
1182 
1183 	/*
1184 	 * Ordinary character priorities
1185 	 */
1186 	for (i = 0; i <= UCHAR_MAX; i++) {
1187 		if ((cc = get_collchar(i, 0)) != NULL) {
1188 			for (j = 0; j < NUM_WT; j++) {
1189 				chars[i].pri[j] =
1190 				    htote(get_weight(cc->ref[j], j));
1191 			}
1192 		} else {
1193 			for (j = 0; j < NUM_WT; j++) {
1194 				chars[i].pri[j] =
1195 				    htote(get_weight(pri_undefined[j], j));
1196 			}
1197 			/*
1198 			 * Per POSIX, for undefined characters, we
1199 			 * also have to add a last item, which is the
1200 			 * character code.
1201 			 */
1202 			chars[i].pri[NUM_WT] = htote(i);
1203 		}
1204 	}
1205 
1206 	/*
1207 	 * Substitution tables
1208 	 */
1209 	for (i = 0; i < NUM_WT; i++) {
1210 		collate_subst_t *st = NULL;
1211 		subst_t *temp;
1212 		RB_COUNT(temp, substs, &substs[i], n);
1213 		subst_count[i] = n;
1214 		if ((st = calloc(n, sizeof(collate_subst_t))) == NULL) {
1215 			fprintf(stderr, "out of memory");
1216 			return;
1217 		}
1218 		n = 0;
1219 		RB_FOREACH(sb, substs, &substs[i]) {
1220 			if ((st[n].key = resolve_pri(sb->key)) < 0) {
1221 				/* by definition these resolve! */
1222 				INTERR;
1223 			}
1224 			if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) {
1225 				INTERR;
1226 			}
1227 			st[n].key = htote(st[n].key);
1228 			for (j = 0; sb->ref[j]; j++) {
1229 				st[n].pri[j] = htote(get_weight(sb->ref[j],
1230 				    i));
1231 			}
1232 			n++;
1233 		}
1234 		if (n != subst_count[i])
1235 			INTERR;
1236 		subst[i] = st;
1237 	}
1238 
1239 
1240 	/*
1241 	 * Chains, i.e. collating elements
1242 	 */
1243 	RB_NUMNODES(collelem_t, elem_by_expand, &elem_by_expand, chain_count);
1244 	chain = calloc(chain_count, sizeof(collate_chain_t));
1245 	if (chain == NULL) {
1246 		fprintf(stderr, "out of memory");
1247 		return;
1248 	}
1249 	n = 0;
1250 	RB_FOREACH(ce, elem_by_expand, &elem_by_expand) {
1251 		(void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN);
1252 		for (i = 0; i < NUM_WT; i++) {
1253 			chain[n].pri[i] = htote(get_weight(ce->ref[i], i));
1254 		}
1255 		n++;
1256 	}
1257 	if (n != chain_count)
1258 		INTERR;
1259 
1260 	/*
1261 	 * Large (> UCHAR_MAX) character priorities
1262 	 */
1263 	RB_NUMNODES(collchar_t, collchars, &collchars, n);
1264 	large = calloc(n, sizeof(collate_large_t));
1265 	if (large == NULL) {
1266 		fprintf(stderr, "out of memory");
1267 		return;
1268 	}
1269 
1270 	i = 0;
1271 	RB_FOREACH(cc, collchars, &collchars) {
1272 		int	undef = 0;
1273 		/* we already gathered those */
1274 		if (cc->wc <= UCHAR_MAX)
1275 			continue;
1276 		for (j = 0; j < NUM_WT; j++) {
1277 			if ((pri = get_weight(cc->ref[j], j)) < 0) {
1278 				undef = 1;
1279 			}
1280 			if (undef && (pri >= 0)) {
1281 				/* if undefined, then all priorities are */
1282 				INTERR;
1283 			} else {
1284 				large[i].pri.pri[j] = htote(pri);
1285 			}
1286 		}
1287 		if (!undef) {
1288 			large[i].val = htote(cc->wc);
1289 			large_count = i++;
1290 		}
1291 	}
1292 
1293 	if ((f = open_category()) == NULL) {
1294 		return;
1295 	}
1296 
1297 	/* Time to write the entire data set out */
1298 
1299 	for (i = 0; i < NUM_WT; i++)
1300 		collinfo.subst_count[i] = htote(subst_count[i]);
1301 	collinfo.chain_count = htote(chain_count);
1302 	collinfo.large_count = htote(large_count);
1303 
1304 	if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) ||
1305 	    (wr_category(&collinfo, sizeof (collinfo), f) < 0) ||
1306 	    (wr_category(&chars, sizeof (chars), f) < 0)) {
1307 		return;
1308 	}
1309 
1310 	for (i = 0; i < NUM_WT; i++) {
1311 		sz = sizeof (collate_subst_t) * subst_count[i];
1312 		if (wr_category(subst[i], sz, f) < 0) {
1313 			return;
1314 		}
1315 	}
1316 	sz = sizeof (collate_chain_t) * chain_count;
1317 	if (wr_category(chain, sz, f) < 0) {
1318 		return;
1319 	}
1320 	sz = sizeof (collate_large_t) * large_count;
1321 	if (wr_category(large, sz, f) < 0) {
1322 		return;
1323 	}
1324 
1325 	close_category(f);
1326 }
1327