1 /*	$NetBSD: ucgendat.c,v 1.1.1.3 2010/12/12 15:21:57 adam Exp $	*/
2 
3 /* OpenLDAP: pkg/ldap/libraries/liblunicode/ucdata/ucgendat.c,v 1.39.2.5 2010/04/13 20:23:04 kurt Exp */
4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5  *
6  * Copyright 1998-2010 The OpenLDAP Foundation.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted only as authorized by the OpenLDAP
11  * Public License.
12  *
13  * A copy of this license is available in file LICENSE in the
14  * top-level directory of the distribution or, alternatively, at
15  * <http://www.OpenLDAP.org/license.html>.
16  */
17 /* Copyright 2001 Computing Research Labs, New Mexico State University
18  *
19  * Permission is hereby granted, free of charge, to any person obtaining a
20  * copy of this software and associated documentation files (the "Software"),
21  * to deal in the Software without restriction, including without limitation
22  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
23  * and/or sell copies of the Software, and to permit persons to whom the
24  * Software is furnished to do so, subject to the following conditions:
25  *
26  * The above copyright notice and this permission notice shall be included in
27  * all copies or substantial portions of the Software.
28  *
29  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
32  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
33  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
34  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
35  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
36  */
37 /* Id: ucgendat.c,v 1.4 2001/01/02 18:46:20 mleisher Exp" */
38 
39 #include "portable.h"
40 #include "ldap_config.h"
41 
42 #include <stdio.h>
43 #include <ac/ctype.h>
44 #include <ac/stdlib.h>
45 #include <ac/string.h>
46 #include <ac/unistd.h>
47 
48 #include <ac/bytes.h>
49 
50 #include <lutil.h>
51 
/*
 * Default HARDCODE_DATA to 1 unless the build has already defined it.
 * NOTE(review): nothing in this part of the file tests the macro; confirm
 * its effect against the code that does.
 */
#ifndef HARDCODE_DATA
#define	HARDCODE_DATA	1
#endif
55 
/*
 * Non-zero when cc is one of the characters 0-9, a-f or A-F.  The test is
 * done with plain comparisons, so it does not depend on the locale.  The
 * argument is evaluated more than once; do not pass an expression with side
 * effects.
 */
#undef ishdigit
#define ishdigit(cc) (('0' <= (cc) && (cc) <= '9') ||\
                      ('a' <= (cc) && (cc) <= 'f') ||\
                      ('A' <= (cc) && (cc) <= 'F'))
60 
/*
 * A header written to the output file with the byte-order-mark and the number
 * of property nodes.  Element 0 is the byte-order-mark 0xfeff; element 1
 * starts out as 0.
 * NOTE(review): element 1 is presumably filled in before the header is
 * written; the code that does so is outside this part of the file.
 */
static ac_uint2 hdr[2] = {0xfeff, 0};

/*
 * NUMPROPS is the number of entries in props[] and proptbl[].  NEEDPROPS
 * rounds NUMPROPS up to the next multiple of 4 (a full 4 is added when
 * NUMPROPS is already a multiple of 4); with NUMPROPS == 50 it is 52.
 */
#define NUMPROPS 50
#define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3)))
69 
/*
 * A property name together with its length in characters.
 */
typedef struct {
    char *name;     /* Property name, e.g. "Mn" or "L". */
    int len;        /* Number of characters in name (1 or 2 in props[]). */
} _prop_t;

/*
 * List of properties expected to be found in the Unicode Character Database
 * including some implementation specific properties.  The position of a
 * name in this table is also the index of its range list in proptbl[].
 *
 * The implementation specific properties are:
 * Cm = Composed (can be decomposed)
 * Nb = Non-breaking
 * Sy = Symmetric (has left and right forms)
 * Hd = Hex digit
 * Qm = Quote marks
 * Mr = Mirroring
 * Ss = Space, other
 * Cp = Defined character
 */
static _prop_t props[NUMPROPS] = {
    {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2}, {"No", 2},
    {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2}, {"Cf", 2}, {"Cs", 2},
    {"Co", 2}, {"Cn", 2}, {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2},
    {"Lo", 2}, {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2},
    {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L",  1}, {"R",  1},
    {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2}, {"B",  1},
    {"S",  1}, {"WS", 2}, {"ON", 2},
    {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2},
    {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2}, {"AL", 2}
};
100 
/*
 * A growable list of code ranges.  The ranges array is flat: each range
 * takes two consecutive elements, the first code followed by the last code.
 */
typedef struct {
    ac_uint4 *ranges;   /* Flat array of (start, end) pairs. */
    ac_uint2 used;      /* Number of ac_uint4 elements in use. */
    ac_uint2 size;      /* Number of ac_uint4 elements allocated. */
} _ranges_t;

/*
 * One range list per property; proptbl[i] belongs to props[i].
 */
static _ranges_t proptbl[NUMPROPS];

/*
 * Make sure this array is sized to be on a 4-byte boundary at compile time.
 * NOTE(review): the array is not touched in this part of the file;
 * presumably it holds per-property counts for the output file.
 */
static ac_uint2 propcnt[NEEDPROPS];
113 
/*
 * Array used to collect a decomposition before adding it to the decomposition
 * table.  read_cdata() fills it and sets dectmp_size to the number of codes
 * collected; add_decomp() copies that many codes out of it.
 */
static ac_uint4 dectmp[64];
static ac_uint4 dectmp_size;

/*
 * The decomposition of a single character.
 */
typedef struct {
    ac_uint4 code;      /* The character being decomposed. */
    ac_uint2 size;      /* Number of elements allocated for decomp. */
    ac_uint2 used;      /* Number of elements of decomp in use. */
    ac_uint4 *decomp;   /* The codes the character decomposes into. */
} _decomp_t;

/*
 * List of decomposition.  Created and expanded in order as the characters are
 * encountered. First list contains canonical mappings, second also includes
 * compatibility mappings.  Each list is kept sorted by character code; the
 * *_used variables count the entries in use and the *_size variables the
 * entries allocated.
 */
static _decomp_t *decomps;
static ac_uint4 decomps_used;
static ac_uint4 decomps_size;

static _decomp_t *kdecomps;
static ac_uint4 kdecomps_used;
static ac_uint4 kdecomps_size;
140 
141 /*
142  * Composition exclusion table stuff.
143  */
144 #define COMPEX_SET(c) (compexs[(c) >> 5] |= (1 << ((c) & 31)))
145 #define COMPEX_TEST(c) (compexs[(c) >> 5] & (1 << ((c) & 31)))
146 static ac_uint4 compexs[8192];
147 
/*
 * Struct for holding a composition pair, and array of composition pairs
 *
 * NOTE(review): the fields are not used in this part of the file; going by
 * their names, comp is presumably the composed character and code1/code2 the
 * pair it is composed from -- confirm against the code that fills comps.
 */
typedef struct {
    ac_uint4 comp;
    ac_uint4 count;
    ac_uint4 code1;
    ac_uint4 code2;
} _comp_t;

static _comp_t *comps;
/*
 * Incremented by add_decomp() each time it stores a canonical decomposition
 * that consists of exactly two codes.
 */
static ac_uint4 comps_used;
160 
/*
 * Types and lists for handling lists of case mappings.
 *
 * Each list is kept sorted by key.  What other1 and other2 hold depends on
 * the list the entry is in:
 *   upper: key = upper case, other1 = lower case, other2 = title case
 *   lower: key = lower case, other1 = upper case, other2 = title case
 *   title: key = title case, other1 = upper case, other2 = lower case
 */
typedef struct {
    ac_uint4 key;
    ac_uint4 other1;
    ac_uint4 other2;
} _case_t;

static _case_t *upper;
static _case_t *lower;
static _case_t *title;
/*
 * For each list, *_used is the number of entries in use and *_size the
 * number of entries allocated.
 */
static ac_uint4 upper_used;
static ac_uint4 upper_size;
static ac_uint4 lower_used;
static ac_uint4 lower_size;
static ac_uint4 title_used;
static ac_uint4 title_size;

/*
 * Array used to collect case mappings before adding them to a list:
 * cases[0] is the upper case, cases[1] the lower case and cases[2] the
 * title case mapping.
 */
static ac_uint4 cases[3];
184 
/*
 * An array to hold ranges for combining classes.
 *
 * The array is flat: every range takes three consecutive elements, the
 * first code, the last code and the combining class.  ccl_used is the
 * number of ac_uint4 elements in use and ccl_size the number allocated.
 */
static ac_uint4 *ccl;
static ac_uint4 ccl_used;
static ac_uint4 ccl_size;
191 
192 /*
193  * Structures for handling numbers.
194  */
195 typedef struct {
196     ac_uint4 code;
197     ac_uint4 idx;
198 } _codeidx_t;
199 
200 typedef struct {
201     short numerator;
202     short denominator;
203 } _num_t;
204 
205 /*
206  * Arrays to hold the mapping of codes to numbers.
207  */
208 static _codeidx_t *ncodes;
209 static ac_uint4 ncodes_used;
210 static ac_uint4 ncodes_size;
211 
212 static _num_t *nums;
213 static ac_uint4 nums_used;
214 static ac_uint4 nums_size;
215 
216 /*
217  * Array for holding numbers.
218  */
219 static _num_t *nums;
220 static ac_uint4 nums_used;
221 static ac_uint4 nums_size;
222 
223 static void
224 add_range(ac_uint4 start, ac_uint4 end, char *p1, char *p2)
225 {
226     int i, j, k, len;
227     _ranges_t *rlp;
228     char *name;
229 
230     for (k = 0; k < 2; k++) {
231         if (k == 0) {
232             name = p1;
233             len = 2;
234         } else {
235             if (p2 == 0)
236               break;
237 
238             name = p2;
239             len = 1;
240         }
241 
242         for (i = 0; i < NUMPROPS; i++) {
243             if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
244               break;
245         }
246 
247         if (i == NUMPROPS)
248           continue;
249 
250         rlp = &proptbl[i];
251 
252         /*
253          * Resize the range list if necessary.
254          */
255         if (rlp->used == rlp->size) {
256             if (rlp->size == 0)
257               rlp->ranges = (ac_uint4 *)
258                   malloc(sizeof(ac_uint4) << 3);
259             else
260               rlp->ranges = (ac_uint4 *)
261                   realloc((char *) rlp->ranges,
262                           sizeof(ac_uint4) * (rlp->size + 8));
263             rlp->size += 8;
264         }
265 
266         /*
267          * If this is the first code for this property list, just add it
268          * and return.
269          */
270         if (rlp->used == 0) {
271             rlp->ranges[0] = start;
272             rlp->ranges[1] = end;
273             rlp->used += 2;
274             continue;
275         }
276 
277         /*
278          * Optimize the case of adding the range to the end.
279          */
280         j = rlp->used - 1;
281         if (start > rlp->ranges[j]) {
282             j = rlp->used;
283             rlp->ranges[j++] = start;
284             rlp->ranges[j++] = end;
285             rlp->used = j;
286             continue;
287         }
288 
289         /*
290          * Need to locate the insertion point.
291          */
292         for (i = 0;
293              i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ;
294 
295         /*
296          * If the start value lies in the current range, then simply set the
297          * new end point of the range to the end value passed as a parameter.
298          */
299         if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) {
300             rlp->ranges[i + 1] = end;
301             return;
302         }
303 
304         /*
305          * Shift following values up by two.
306          */
307         for (j = rlp->used; j > i; j -= 2) {
308             rlp->ranges[j] = rlp->ranges[j - 2];
309             rlp->ranges[j + 1] = rlp->ranges[j - 1];
310         }
311 
312         /*
313          * Add the new range at the insertion point.
314          */
315         rlp->ranges[i] = start;
316         rlp->ranges[i + 1] = end;
317         rlp->used += 2;
318     }
319 }
320 
321 static void
322 ordered_range_insert(ac_uint4 c, char *name, int len)
323 {
324     int i, j;
325     ac_uint4 s, e;
326     _ranges_t *rlp;
327 
328     if (len == 0)
329       return;
330 
331     /*
332      * Deal with directionality codes introduced in Unicode 3.0.
333      */
334     if ((len == 2 && memcmp(name, "BN", 2) == 0) ||
335         (len == 3 &&
336          (memcmp(name, "NSM", 3) == 0 || memcmp(name, "PDF", 3) == 0 ||
337           memcmp(name, "LRE", 3) == 0 || memcmp(name, "LRO", 3) == 0 ||
338           memcmp(name, "RLE", 3) == 0 || memcmp(name, "RLO", 3) == 0))) {
339         /*
340          * Mark all of these as Other Neutral to preserve compatibility with
341          * older versions.
342          */
343         len = 2;
344         name = "ON";
345     }
346 
347     for (i = 0; i < NUMPROPS; i++) {
348         if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
349           break;
350     }
351 
352     if (i == NUMPROPS)
353       return;
354 
355     /*
356      * Have a match, so insert the code in order.
357      */
358     rlp = &proptbl[i];
359 
360     /*
361      * Resize the range list if necessary.
362      */
363     if (rlp->used == rlp->size) {
364         if (rlp->size == 0)
365           rlp->ranges = (ac_uint4 *)
366               malloc(sizeof(ac_uint4) << 3);
367         else
368           rlp->ranges = (ac_uint4 *)
369               realloc((char *) rlp->ranges,
370                       sizeof(ac_uint4) * (rlp->size + 8));
371         rlp->size += 8;
372     }
373 
374     /*
375      * If this is the first code for this property list, just add it
376      * and return.
377      */
378     if (rlp->used == 0) {
379         rlp->ranges[0] = rlp->ranges[1] = c;
380         rlp->used += 2;
381         return;
382     }
383 
384     /*
385      * Optimize the cases of extending the last range and adding new ranges to
386      * the end.
387      */
388     j = rlp->used - 1;
389     e = rlp->ranges[j];
390     s = rlp->ranges[j - 1];
391 
392     if (c == e + 1) {
393         /*
394          * Extend the last range.
395          */
396         rlp->ranges[j] = c;
397         return;
398     }
399 
400     if (c > e + 1) {
401         /*
402          * Start another range on the end.
403          */
404         j = rlp->used;
405         rlp->ranges[j] = rlp->ranges[j + 1] = c;
406         rlp->used += 2;
407         return;
408     }
409 
410     if (c >= s)
411       /*
412        * The code is a duplicate of a code in the last range, so just return.
413        */
414       return;
415 
416     /*
417      * The code should be inserted somewhere before the last range in the
418      * list.  Locate the insertion point.
419      */
420     for (i = 0;
421          i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ;
422 
423     s = rlp->ranges[i];
424     e = rlp->ranges[i + 1];
425 
426     if (c == e + 1)
427       /*
428        * Simply extend the current range.
429        */
430       rlp->ranges[i + 1] = c;
431     else if (c < s) {
432         /*
433          * Add a new entry before the current location.  Shift all entries
434          * before the current one up by one to make room.
435          */
436         for (j = rlp->used; j > i; j -= 2) {
437             rlp->ranges[j] = rlp->ranges[j - 2];
438             rlp->ranges[j + 1] = rlp->ranges[j - 1];
439         }
440         rlp->ranges[i] = rlp->ranges[i + 1] = c;
441 
442         rlp->used += 2;
443     }
444 }
445 
446 static void
447 add_decomp(ac_uint4 code, short compat)
448 {
449     ac_uint4 i, j, size;
450     _decomp_t **pdecomps;
451     ac_uint4 *pdecomps_used;
452     ac_uint4 *pdecomps_size;
453 
454     if (compat) {
455 	pdecomps = &kdecomps;
456 	pdecomps_used = &kdecomps_used;
457 	pdecomps_size = &kdecomps_size;
458     } else {
459 	pdecomps = &decomps;
460 	pdecomps_used = &decomps_used;
461 	pdecomps_size = &decomps_size;
462     }
463 
464     /*
465      * Add the code to the composite property.
466      */
467     if (!compat) {
468 	ordered_range_insert(code, "Cm", 2);
469     }
470 
471     /*
472      * Locate the insertion point for the code.
473      */
474     for (i = 0; i < *pdecomps_used && code > (*pdecomps)[i].code; i++) ;
475 
476     /*
477      * Allocate space for a new decomposition.
478      */
479     if (*pdecomps_used == *pdecomps_size) {
480         if (*pdecomps_size == 0)
481           *pdecomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
482         else
483           *pdecomps = (_decomp_t *)
484               realloc((char *) *pdecomps,
485                       sizeof(_decomp_t) * (*pdecomps_size + 8));
486         (void) memset((char *) (*pdecomps + *pdecomps_size), '\0',
487                       sizeof(_decomp_t) << 3);
488         *pdecomps_size += 8;
489     }
490 
491     if (i < *pdecomps_used && code != (*pdecomps)[i].code) {
492         /*
493          * Shift the decomps up by one if the codes don't match.
494          */
495         for (j = *pdecomps_used; j > i; j--)
496           (void) AC_MEMCPY((char *) &(*pdecomps)[j], (char *) &(*pdecomps)[j - 1],
497                         sizeof(_decomp_t));
498     }
499 
500     /*
501      * Insert or replace a decomposition.
502      */
503     size = dectmp_size + (4 - (dectmp_size & 3));
504     if ((*pdecomps)[i].size < size) {
505         if ((*pdecomps)[i].size == 0)
506           (*pdecomps)[i].decomp = (ac_uint4 *)
507               malloc(sizeof(ac_uint4) * size);
508         else
509           (*pdecomps)[i].decomp = (ac_uint4 *)
510               realloc((char *) (*pdecomps)[i].decomp,
511                       sizeof(ac_uint4) * size);
512         (*pdecomps)[i].size = size;
513     }
514 
515     if ((*pdecomps)[i].code != code)
516       (*pdecomps_used)++;
517 
518     (*pdecomps)[i].code = code;
519     (*pdecomps)[i].used = dectmp_size;
520     (void) AC_MEMCPY((char *) (*pdecomps)[i].decomp, (char *) dectmp,
521                   sizeof(ac_uint4) * dectmp_size);
522 
523     /*
524      * NOTICE: This needs changing later so it is more general than simply
525      * pairs.  This calculation is done here to simplify allocation elsewhere.
526      */
527     if (!compat && dectmp_size == 2)
528       comps_used++;
529 }
530 
531 static void
532 add_title(ac_uint4 code)
533 {
534     ac_uint4 i, j;
535 
536     /*
537      * Always map the code to itself.
538      */
539     cases[2] = code;
540 
541     if (title_used == title_size) {
542         if (title_size == 0)
543           title = (_case_t *) malloc(sizeof(_case_t) << 3);
544         else
545           title = (_case_t *) realloc((char *) title,
546                                       sizeof(_case_t) * (title_size + 8));
547         title_size += 8;
548     }
549 
550     /*
551      * Locate the insertion point.
552      */
553     for (i = 0; i < title_used && code > title[i].key; i++) ;
554 
555     if (i < title_used) {
556         /*
557          * Shift the array up by one.
558          */
559         for (j = title_used; j > i; j--)
560           (void) AC_MEMCPY((char *) &title[j], (char *) &title[j - 1],
561                         sizeof(_case_t));
562     }
563 
564     title[i].key = cases[2];    /* Title */
565     title[i].other1 = cases[0]; /* Upper */
566     title[i].other2 = cases[1]; /* Lower */
567 
568     title_used++;
569 }
570 
571 static void
572 add_upper(ac_uint4 code)
573 {
574     ac_uint4 i, j;
575 
576     /*
577      * Always map the code to itself.
578      */
579     cases[0] = code;
580 
581     /*
582      * If the title case character is not present, then make it the same as
583      * the upper case.
584      */
585     if (cases[2] == 0)
586       cases[2] = code;
587 
588     if (upper_used == upper_size) {
589         if (upper_size == 0)
590           upper = (_case_t *) malloc(sizeof(_case_t) << 3);
591         else
592           upper = (_case_t *) realloc((char *) upper,
593                                       sizeof(_case_t) * (upper_size + 8));
594         upper_size += 8;
595     }
596 
597     /*
598      * Locate the insertion point.
599      */
600     for (i = 0; i < upper_used && code > upper[i].key; i++) ;
601 
602     if (i < upper_used) {
603         /*
604          * Shift the array up by one.
605          */
606         for (j = upper_used; j > i; j--)
607           (void) AC_MEMCPY((char *) &upper[j], (char *) &upper[j - 1],
608                         sizeof(_case_t));
609     }
610 
611     upper[i].key = cases[0];    /* Upper */
612     upper[i].other1 = cases[1]; /* Lower */
613     upper[i].other2 = cases[2]; /* Title */
614 
615     upper_used++;
616 }
617 
618 static void
619 add_lower(ac_uint4 code)
620 {
621     ac_uint4 i, j;
622 
623     /*
624      * Always map the code to itself.
625      */
626     cases[1] = code;
627 
628     /*
629      * If the title case character is empty, then make it the same as the
630      * upper case.
631      */
632     if (cases[2] == 0)
633       cases[2] = cases[0];
634 
635     if (lower_used == lower_size) {
636         if (lower_size == 0)
637           lower = (_case_t *) malloc(sizeof(_case_t) << 3);
638         else
639           lower = (_case_t *) realloc((char *) lower,
640                                       sizeof(_case_t) * (lower_size + 8));
641         lower_size += 8;
642     }
643 
644     /*
645      * Locate the insertion point.
646      */
647     for (i = 0; i < lower_used && code > lower[i].key; i++) ;
648 
649     if (i < lower_used) {
650         /*
651          * Shift the array up by one.
652          */
653         for (j = lower_used; j > i; j--)
654           (void) AC_MEMCPY((char *) &lower[j], (char *) &lower[j - 1],
655                         sizeof(_case_t));
656     }
657 
658     lower[i].key = cases[1];    /* Lower */
659     lower[i].other1 = cases[0]; /* Upper */
660     lower[i].other2 = cases[2]; /* Title */
661 
662     lower_used++;
663 }
664 
665 static void
666 ordered_ccl_insert(ac_uint4 c, ac_uint4 ccl_code)
667 {
668     ac_uint4 i, j;
669 
670     if (ccl_used == ccl_size) {
671         if (ccl_size == 0)
672           ccl = (ac_uint4 *) malloc(sizeof(ac_uint4) * 24);
673         else
674           ccl = (ac_uint4 *)
675               realloc((char *) ccl, sizeof(ac_uint4) * (ccl_size + 24));
676         ccl_size += 24;
677     }
678 
679     /*
680      * Optimize adding the first item.
681      */
682     if (ccl_used == 0) {
683         ccl[0] = ccl[1] = c;
684         ccl[2] = ccl_code;
685         ccl_used += 3;
686         return;
687     }
688 
689     /*
690      * Handle the special case of extending the range on the end.  This
691      * requires that the combining class codes are the same.
692      */
693     if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) {
694         ccl[ccl_used - 2] = c;
695         return;
696     }
697 
698     /*
699      * Handle the special case of adding another range on the end.
700      */
701     if (c > ccl[ccl_used - 2] + 1 ||
702         (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) {
703         ccl[ccl_used++] = c;
704         ccl[ccl_used++] = c;
705         ccl[ccl_used++] = ccl_code;
706         return;
707     }
708 
709     /*
710      * Locate either the insertion point or range for the code.
711      */
712     for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ;
713 
714     if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) {
715         /*
716          * Extend an existing range.
717          */
718         ccl[i + 1] = c;
719         return;
720     } else if (c < ccl[i]) {
721         /*
722          * Start a new range before the current location.
723          */
724         for (j = ccl_used; j > i; j -= 3) {
725             ccl[j] = ccl[j - 3];
726             ccl[j - 1] = ccl[j - 4];
727             ccl[j - 2] = ccl[j - 5];
728         }
729         ccl[i] = ccl[i + 1] = c;
730         ccl[i + 2] = ccl_code;
731     }
732 }
733 
734 /*
735  * Adds a number if it does not already exist and returns an index value
736  * multiplied by 2.
737  */
738 static ac_uint4
739 make_number(short num, short denom)
740 {
741     ac_uint4 n;
742 
743     /*
744      * Determine if the number already exists.
745      */
746     for (n = 0; n < nums_used; n++) {
747         if (nums[n].numerator == num && nums[n].denominator == denom)
748           return n << 1;
749     }
750 
751     if (nums_used == nums_size) {
752         if (nums_size == 0)
753           nums = (_num_t *) malloc(sizeof(_num_t) << 3);
754         else
755           nums = (_num_t *) realloc((char *) nums,
756                                     sizeof(_num_t) * (nums_size + 8));
757         nums_size += 8;
758     }
759 
760     n = nums_used++;
761     nums[n].numerator = num;
762     nums[n].denominator = denom;
763 
764     return n << 1;
765 }
766 
767 static void
768 add_number(ac_uint4 code, short num, short denom)
769 {
770     ac_uint4 i, j;
771 
772     /*
773      * Insert the code in order.
774      */
775     for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ;
776 
777     /*
778      * Handle the case of the codes matching and simply replace the number
779      * that was there before.
780      */
781     if (i < ncodes_used && code == ncodes[i].code) {
782         ncodes[i].idx = make_number(num, denom);
783         return;
784     }
785 
786     /*
787      * Resize the array if necessary.
788      */
789     if (ncodes_used == ncodes_size) {
790         if (ncodes_size == 0)
791           ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3);
792         else
793           ncodes = (_codeidx_t *)
794               realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8));
795 
796         ncodes_size += 8;
797     }
798 
799     /*
800      * Shift things around to insert the code if necessary.
801      */
802     if (i < ncodes_used) {
803         for (j = ncodes_used; j > i; j--) {
804             ncodes[j].code = ncodes[j - 1].code;
805             ncodes[j].idx = ncodes[j - 1].idx;
806         }
807     }
808     ncodes[i].code = code;
809     ncodes[i].idx = make_number(num, denom);
810 
811     ncodes_used++;
812 }
813 
814 /*
815  * This routine assumes that the line is a valid Unicode Character Database
816  * entry.
817  */
818 static void
819 read_cdata(FILE *in)
820 {
821     ac_uint4 i, lineno, skip, code, ccl_code;
822     short wnum, neg, number[2], compat;
823     char line[512], *s, *e;
824 
825     lineno = skip = 0;
826     while (fgets(line, sizeof(line), in)) {
827 	if( (s=strchr(line, '\n')) ) *s = '\0';
828         lineno++;
829 
830         /*
831          * Skip blank lines and lines that start with a '#'.
832          */
833         if (line[0] == 0 || line[0] == '#')
834           continue;
835 
836         /*
837          * If lines need to be skipped, do it here.
838          */
839         if (skip) {
840             skip--;
841             continue;
842         }
843 
844         /*
845          * Collect the code.  The code can be up to 6 hex digits in length to
846          * allow surrogates to be specified.
847          */
848         for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) {
849             code <<= 4;
850             if (*s >= '0' && *s <= '9')
851               code += *s - '0';
852             else if (*s >= 'A' && *s <= 'F')
853               code += (*s - 'A') + 10;
854             else if (*s >= 'a' && *s <= 'f')
855               code += (*s - 'a') + 10;
856         }
857 
858         /*
859          * Handle the following special cases:
860          * 1. 4E00-9FA5 CJK Ideographs.
861          * 2. AC00-D7A3 Hangul Syllables.
862          * 3. D800-DFFF Surrogates.
863          * 4. E000-F8FF Private Use Area.
864          * 5. F900-FA2D Han compatibility.
865 	 * ...Plus additional ranges in newer Unicode versions...
866          */
867         switch (code) {
868 	  case 0x3400:
869 	    /* CJK Ideograph Extension A */
870             add_range(0x3400, 0x4db5, "Lo", "L");
871 
872             add_range(0x3400, 0x4db5, "Cp", 0);
873 
874 	    skip = 1;
875 	    break;
876           case 0x4e00:
877             /*
878              * The Han ideographs.
879              */
880             add_range(0x4e00, 0x9fff, "Lo", "L");
881 
882             /*
883              * Add the characters to the defined category.
884              */
885             add_range(0x4e00, 0x9fa5, "Cp", 0);
886 
887             skip = 1;
888             break;
889           case 0xac00:
890             /*
891              * The Hangul syllables.
892              */
893             add_range(0xac00, 0xd7a3, "Lo", "L");
894 
895             /*
896              * Add the characters to the defined category.
897              */
898             add_range(0xac00, 0xd7a3, "Cp", 0);
899 
900             skip = 1;
901             break;
902           case 0xd800:
903             /*
904              * Make a range of all surrogates and assume some default
905              * properties.
906              */
907             add_range(0x010000, 0x10ffff, "Cs", "L");
908             skip = 5;
909             break;
910           case 0xe000:
911             /*
912              * The Private Use area.  Add with a default set of properties.
913              */
914             add_range(0xe000, 0xf8ff, "Co", "L");
915             skip = 1;
916             break;
917           case 0xf900:
918             /*
919              * The CJK compatibility area.
920              */
921             add_range(0xf900, 0xfaff, "Lo", "L");
922 
923             /*
924              * Add the characters to the defined category.
925              */
926             add_range(0xf900, 0xfaff, "Cp", 0);
927 
928             skip = 1;
929 	    break;
930 	  case 0x20000:
931 	    /* CJK Ideograph Extension B */
932             add_range(0x20000, 0x2a6d6, "Lo", "L");
933 
934             add_range(0x20000, 0x2a6d6, "Cp", 0);
935 
936 	    skip = 1;
937 	    break;
938 	  case 0xf0000:
939 	    /* Plane 15 private use */
940 	    add_range(0xf0000, 0xffffd, "Co", "L");
941 	    skip = 1;
942 	    break;
943 
944 	  case 0x100000:
945 	    /* Plane 16 private use */
946 	    add_range(0x100000, 0x10fffd, "Co", "L");
947 	    skip = 1;
948 	    break;
949         }
950 
951         if (skip)
952           continue;
953 
954         /*
955          * Add the code to the defined category.
956          */
957         ordered_range_insert(code, "Cp", 2);
958 
959         /*
960          * Locate the first character property field.
961          */
962         for (i = 0; *s != 0 && i < 2; s++) {
963             if (*s == ';')
964               i++;
965         }
966         for (e = s; *e && *e != ';'; e++) ;
967 
968         ordered_range_insert(code, s, e - s);
969 
970         /*
971          * Locate the combining class code.
972          */
973         for (s = e; *s != 0 && i < 3; s++) {
974             if (*s == ';')
975               i++;
976         }
977 
978         /*
979          * Convert the combining class code from decimal.
980          */
981         for (ccl_code = 0, e = s; *e && *e != ';'; e++)
982           ccl_code = (ccl_code * 10) + (*e - '0');
983 
984         /*
985          * Add the code if it not 0.
986          */
987         if (ccl_code != 0)
988           ordered_ccl_insert(code, ccl_code);
989 
990         /*
991          * Locate the second character property field.
992          */
993         for (s = e; *s != 0 && i < 4; s++) {
994             if (*s == ';')
995               i++;
996         }
997         for (e = s; *e && *e != ';'; e++) ;
998 
999         ordered_range_insert(code, s, e - s);
1000 
1001         /*
1002          * Check for a decomposition.
1003          */
1004         s = ++e;
1005         if (*s != ';') {
1006 	    compat = *s == '<';
1007 	    if (compat) {
1008 		/*
1009 		 * Skip compatibility formatting tag.
1010 		 */
1011 		while (*s++ != '>');
1012 	    }
1013             /*
1014              * Collect the codes of the decomposition.
1015              */
1016             for (dectmp_size = 0; *s != ';'; ) {
1017                 /*
1018                  * Skip all leading non-hex digits.
1019                  */
1020                 while (!ishdigit(*s))
1021  		  s++;
1022 
1023                 for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) {
1024                     dectmp[dectmp_size] <<= 4;
1025                     if (*s >= '0' && *s <= '9')
1026                       dectmp[dectmp_size] += *s - '0';
1027                     else if (*s >= 'A' && *s <= 'F')
1028                       dectmp[dectmp_size] += (*s - 'A') + 10;
1029                     else if (*s >= 'a' && *s <= 'f')
1030                       dectmp[dectmp_size] += (*s - 'a') + 10;
1031                 }
1032                 dectmp_size++;
1033             }
1034 
1035             /*
1036              * If there are any codes in the temporary decomposition array,
1037              * then add the character with its decomposition.
1038              */
1039             if (dectmp_size > 0) {
1040 		if (!compat) {
1041 		    add_decomp(code, 0);
1042 		}
1043 		add_decomp(code, 1);
1044 	    }
1045         }
1046 
1047         /*
1048          * Skip to the number field.
1049          */
1050         for (i = 0; i < 3 && *s; s++) {
1051             if (*s == ';')
1052               i++;
1053         }
1054 
1055         /*
1056          * Scan the number in.
1057          */
1058         number[0] = number[1] = 0;
1059         for (e = s, neg = wnum = 0; *e && *e != ';'; e++) {
1060             if (*e == '-') {
1061                 neg = 1;
1062                 continue;
1063             }
1064 
1065             if (*e == '/') {
1066                 /*
1067                  * Move the the denominator of the fraction.
1068                  */
1069                 if (neg)
1070                   number[wnum] *= -1;
1071                 neg = 0;
1072                 e++;
1073                 wnum++;
1074             }
1075             number[wnum] = (number[wnum] * 10) + (*e - '0');
1076         }
1077 
1078         if (e > s) {
1079             /*
1080              * Adjust the denominator in case of integers and add the number.
1081              */
1082             if (wnum == 0)
1083               number[1] = 1;
1084 
1085             add_number(code, number[0], number[1]);
1086         }
1087 
1088         /*
1089          * Skip to the start of the possible case mappings.
1090          */
1091         for (s = e, i = 0; i < 4 && *s; s++) {
1092             if (*s == ';')
1093               i++;
1094         }
1095 
1096         /*
1097          * Collect the case mappings.
1098          */
1099         cases[0] = cases[1] = cases[2] = 0;
1100         for (i = 0; i < 3; i++) {
1101             while (ishdigit(*s)) {
1102                 cases[i] <<= 4;
1103                 if (*s >= '0' && *s <= '9')
1104                   cases[i] += *s - '0';
1105                 else if (*s >= 'A' && *s <= 'F')
1106                   cases[i] += (*s - 'A') + 10;
1107                 else if (*s >= 'a' && *s <= 'f')
1108                   cases[i] += (*s - 'a') + 10;
1109                 s++;
1110             }
1111             if (*s == ';')
1112               s++;
1113         }
1114         if (cases[0] && cases[1])
1115           /*
1116            * Add the upper and lower mappings for a title case character.
1117            */
1118           add_title(code);
1119         else if (cases[1])
1120           /*
1121            * Add the lower and title case mappings for the upper case
1122            * character.
1123            */
1124           add_upper(code);
1125         else if (cases[0])
1126           /*
1127            * Add the upper and title case mappings for the lower case
1128            * character.
1129            */
1130           add_lower(code);
1131     }
1132 }
1133 
1134 static _decomp_t *
1135 find_decomp(ac_uint4 code, short compat)
1136 {
1137     long l, r, m;
1138     _decomp_t *decs;
1139 
1140     l = 0;
1141     r = (compat ? kdecomps_used : decomps_used) - 1;
1142     decs = compat ? kdecomps : decomps;
1143     while (l <= r) {
1144         m = (l + r) >> 1;
1145         if (code > decs[m].code)
1146           l = m + 1;
1147         else if (code < decs[m].code)
1148           r = m - 1;
1149         else
1150           return &decs[m];
1151     }
1152     return 0;
1153 }
1154 
1155 static void
1156 decomp_it(_decomp_t *d, short compat)
1157 {
1158     ac_uint4 i;
1159     _decomp_t *dp;
1160 
1161     for (i = 0; i < d->used; i++) {
1162         if ((dp = find_decomp(d->decomp[i], compat)) != 0)
1163           decomp_it(dp, compat);
1164         else
1165           dectmp[dectmp_size++] = d->decomp[i];
1166     }
1167 }
1168 
1169 /*
1170  * Expand all decompositions by recursively decomposing each character
1171  * in the decomposition.
1172  */
1173 static void
1174 expand_decomp(void)
1175 {
1176     ac_uint4 i;
1177 
1178     for (i = 0; i < decomps_used; i++) {
1179         dectmp_size = 0;
1180         decomp_it(&decomps[i], 0);
1181         if (dectmp_size > 0)
1182           add_decomp(decomps[i].code, 0);
1183     }
1184 
1185     for (i = 0; i < kdecomps_used; i++) {
1186         dectmp_size = 0;
1187         decomp_it(&kdecomps[i], 1);
1188         if (dectmp_size > 0)
1189           add_decomp(kdecomps[i].code, 1);
1190     }
1191 }
1192 
1193 static int
1194 cmpcomps(const void *v_comp1, const void *v_comp2)
1195 {
1196 	const _comp_t *comp1 = v_comp1, *comp2 = v_comp2;
1197     long diff = comp1->code1 - comp2->code1;
1198 
1199     if (!diff)
1200 	diff = comp1->code2 - comp2->code2;
1201     return (int) diff;
1202 }
1203 
1204 /*
1205  * Load composition exclusion data
1206  */
1207 static void
1208 read_compexdata(FILE *in)
1209 {
1210     ac_uint2 i;
1211     ac_uint4 code;
1212     char line[512], *s;
1213 
1214     (void) memset((char *) compexs, 0, sizeof(compexs));
1215 
1216     while (fgets(line, sizeof(line), in)) {
1217 	if( (s=strchr(line, '\n')) ) *s = '\0';
1218         /*
1219          * Skip blank lines and lines that start with a '#'.
1220          */
1221         if (line[0] == 0 || line[0] == '#')
1222 	    continue;
1223 
1224 	/*
1225          * Collect the code.  Assume max 6 digits
1226          */
1227 
1228 	for (s = line, i = code = 0; *s != '#' && i < 6; i++, s++) {
1229 	    if (isspace((unsigned char)*s)) break;
1230             code <<= 4;
1231             if (*s >= '0' && *s <= '9')
1232 		code += *s - '0';
1233             else if (*s >= 'A' && *s <= 'F')
1234 		code += (*s - 'A') + 10;
1235             else if (*s >= 'a' && *s <= 'f')
1236 		code += (*s - 'a') + 10;
1237         }
1238         COMPEX_SET(code);
1239     }
1240 }
1241 
1242 /*
1243  * Creates array of compositions from decomposition array
1244  */
1245 static void
1246 create_comps(void)
1247 {
1248     ac_uint4 i, cu;
1249 
1250     comps = (_comp_t *) malloc(comps_used * sizeof(_comp_t));
1251 
1252     for (i = cu = 0; i < decomps_used; i++) {
1253 	if (decomps[i].used != 2 || COMPEX_TEST(decomps[i].code))
1254 	    continue;
1255 	comps[cu].comp = decomps[i].code;
1256 	comps[cu].count = 2;
1257 	comps[cu].code1 = decomps[i].decomp[0];
1258 	comps[cu].code2 = decomps[i].decomp[1];
1259 	cu++;
1260     }
1261     comps_used = cu;
1262     qsort(comps, comps_used, sizeof(_comp_t), cmpcomps);
1263 }
1264 
1265 #if HARDCODE_DATA
1266 static void
1267 write_case(FILE *out, _case_t *tab, int num, int first)
1268 {
1269     int i;
1270 
1271     for (i=0; i<num; i++) {
1272 	if (first) first = 0;
1273 	else fprintf(out, ",");
1274 	fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx",
1275 		(unsigned long) tab[i].key, (unsigned long) tab[i].other1,
1276 		(unsigned long) tab[i].other2);
1277     }
1278 }
1279 
1280 #define PREF "static const "
1281 
1282 #endif
1283 
1284 static void
1285 write_cdata(char *opath)
1286 {
1287     FILE *out;
1288 	ac_uint4 bytes;
1289     ac_uint4 i, idx, nprops;
1290 #if !(HARDCODE_DATA)
1291     ac_uint2 casecnt[2];
1292 #endif
1293     char path[BUFSIZ];
1294 #if HARDCODE_DATA
1295     int j, k;
1296 
1297     /*****************************************************************
1298      *
1299      * Generate the ctype data.
1300      *
1301      *****************************************************************/
1302 
1303     /*
1304      * Open the output file.
1305      */
1306     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "uctable.h", opath);
1307     if ((out = fopen(path, "w")) == 0)
1308       return;
1309 #else
1310     /*
1311      * Open the ctype.dat file.
1312      */
1313     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "ctype.dat", opath);
1314     if ((out = fopen(path, "wb")) == 0)
1315       return;
1316 #endif
1317 
1318     /*
1319      * Collect the offsets for the properties.  The offsets array is
1320      * on a 4-byte boundary to keep things efficient for architectures
1321      * that need such a thing.
1322      */
1323     for (i = idx = 0; i < NUMPROPS; i++) {
1324         propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff;
1325         idx += proptbl[i].used;
1326     }
1327 
1328     /*
1329      * Add the sentinel index which is used by the binary search as the upper
1330      * bound for a search.
1331      */
1332     propcnt[i] = idx;
1333 
1334     /*
1335      * Record the actual number of property lists.  This may be different than
1336      * the number of offsets actually written because of aligning on a 4-byte
1337      * boundary.
1338      */
1339     hdr[1] = NUMPROPS;
1340 
1341     /*
1342      * Calculate the byte count needed and pad the property counts array to a
1343      * 4-byte boundary.
1344      */
1345     if ((bytes = sizeof(ac_uint2) * (NUMPROPS + 1)) & 3)
1346       bytes += 4 - (bytes & 3);
1347     nprops = bytes / sizeof(ac_uint2);
1348     bytes += sizeof(ac_uint4) * idx;
1349 
1350 #if HARDCODE_DATA
1351     fprintf(out, PREF "ac_uint4 _ucprop_size = %d;\n\n", NUMPROPS);
1352 
1353     fprintf(out, PREF "ac_uint2 _ucprop_offsets[] = {");
1354 
1355     for (i = 0; i<nprops; i++) {
1356        if (i) fprintf(out, ",");
1357        if (!(i&7)) fprintf(out, "\n\t");
1358        else fprintf(out, " ");
1359        fprintf(out, "0x%04x", propcnt[i]);
1360     }
1361     fprintf(out, "\n};\n\n");
1362 
1363     fprintf(out, PREF "ac_uint4 _ucprop_ranges[] = {");
1364 
1365     k = 0;
1366     for (i = 0; i < NUMPROPS; i++) {
1367 	if (proptbl[i].used > 0) {
1368 	  for (j=0; j<proptbl[i].used; j++) {
1369 	    if (k) fprintf(out, ",");
1370 	    if (!(k&3)) fprintf(out,"\n\t");
1371 	    else fprintf(out, " ");
1372 	    k++;
1373 	    fprintf(out, "0x%08lx", (unsigned long) proptbl[i].ranges[j]);
1374 	  }
1375 	}
1376     }
1377     fprintf(out, "\n};\n\n");
1378 #else
1379     /*
1380      * Write the header.
1381      */
1382     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1383 
1384     /*
1385      * Write the byte count.
1386      */
1387     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1388 
1389     /*
1390      * Write the property list counts.
1391      */
1392     fwrite((char *) propcnt, sizeof(ac_uint2), nprops, out);
1393 
1394     /*
1395      * Write the property lists.
1396      */
1397     for (i = 0; i < NUMPROPS; i++) {
1398         if (proptbl[i].used > 0)
1399           fwrite((char *) proptbl[i].ranges, sizeof(ac_uint4),
1400                  proptbl[i].used, out);
1401     }
1402 
1403     fclose(out);
1404 #endif
1405 
1406     /*****************************************************************
1407      *
1408      * Generate the case mapping data.
1409      *
1410      *****************************************************************/
1411 
1412 #if HARDCODE_DATA
1413     fprintf(out, PREF "ac_uint4 _uccase_size = %ld;\n\n",
1414         (long) (upper_used + lower_used + title_used));
1415 
1416     fprintf(out, PREF "ac_uint2 _uccase_len[2] = {%ld, %ld};\n\n",
1417         (long) upper_used, (long) lower_used);
1418     fprintf(out, PREF "ac_uint4 _uccase_map[] = {");
1419 
1420     if (upper_used > 0)
1421       /*
1422        * Write the upper case table.
1423        */
1424       write_case(out, upper, upper_used, 1);
1425 
1426     if (lower_used > 0)
1427       /*
1428        * Write the lower case table.
1429        */
1430       write_case(out, lower, lower_used, !upper_used);
1431 
1432     if (title_used > 0)
1433       /*
1434        * Write the title case table.
1435        */
1436       write_case(out, title, title_used, !(upper_used||lower_used));
1437 
1438     if (!(upper_used || lower_used || title_used))
1439 	fprintf(out, "\t0");
1440 
1441     fprintf(out, "\n};\n\n");
1442 #else
1443     /*
1444      * Open the case.dat file.
1445      */
1446     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "case.dat", opath);
1447     if ((out = fopen(path, "wb")) == 0)
1448       return;
1449 
1450     /*
1451      * Write the case mapping tables.
1452      */
1453     hdr[1] = upper_used + lower_used + title_used;
1454     casecnt[0] = upper_used;
1455     casecnt[1] = lower_used;
1456 
1457     /*
1458      * Write the header.
1459      */
1460     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1461 
1462     /*
1463      * Write the upper and lower case table sizes.
1464      */
1465     fwrite((char *) casecnt, sizeof(ac_uint2), 2, out);
1466 
1467     if (upper_used > 0)
1468       /*
1469        * Write the upper case table.
1470        */
1471       fwrite((char *) upper, sizeof(_case_t), upper_used, out);
1472 
1473     if (lower_used > 0)
1474       /*
1475        * Write the lower case table.
1476        */
1477       fwrite((char *) lower, sizeof(_case_t), lower_used, out);
1478 
1479     if (title_used > 0)
1480       /*
1481        * Write the title case table.
1482        */
1483       fwrite((char *) title, sizeof(_case_t), title_used, out);
1484 
1485     fclose(out);
1486 #endif
1487 
1488     /*****************************************************************
1489      *
1490      * Generate the composition data.
1491      *
1492      *****************************************************************/
1493 
1494     /*
1495      * Create compositions from decomposition data
1496      */
1497     create_comps();
1498 
1499 #if HARDCODE_DATA
1500     fprintf(out, PREF "ac_uint4 _uccomp_size = %ld;\n\n",
1501         comps_used * 4L);
1502 
1503     fprintf(out, PREF "ac_uint4 _uccomp_data[] = {");
1504 
1505      /*
1506       * Now, if comps exist, write them out.
1507       */
1508     if (comps_used > 0) {
1509 	for (i=0; i<comps_used; i++) {
1510 	    if (i) fprintf(out, ",");
1511 	    fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx, 0x%08lx",
1512 	        (unsigned long) comps[i].comp, (unsigned long) comps[i].count,
1513 	        (unsigned long) comps[i].code1, (unsigned long) comps[i].code2);
1514 	}
1515     } else {
1516 	fprintf(out, "\t0");
1517     }
1518     fprintf(out, "\n};\n\n");
1519 #else
1520     /*
1521      * Open the comp.dat file.
1522      */
1523     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "comp.dat", opath);
1524     if ((out = fopen(path, "wb")) == 0)
1525 	return;
1526 
1527     /*
1528      * Write the header.
1529      */
1530     hdr[1] = (ac_uint2) comps_used * 4;
1531     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1532 
1533     /*
1534      * Write out the byte count to maintain header size.
1535      */
1536     bytes = comps_used * sizeof(_comp_t);
1537     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1538 
1539     /*
1540      * Now, if comps exist, write them out.
1541      */
1542     if (comps_used > 0)
1543         fwrite((char *) comps, sizeof(_comp_t), comps_used, out);
1544 
1545     fclose(out);
1546 #endif
1547 
1548     /*****************************************************************
1549      *
1550      * Generate the decomposition data.
1551      *
1552      *****************************************************************/
1553 
1554     /*
1555      * Fully expand all decompositions before generating the output file.
1556      */
1557     expand_decomp();
1558 
1559 #if HARDCODE_DATA
1560     fprintf(out, PREF "ac_uint4 _ucdcmp_size = %ld;\n\n",
1561         decomps_used * 2L);
1562 
1563     fprintf(out, PREF "ac_uint4 _ucdcmp_nodes[] = {");
1564 
1565     if (decomps_used) {
1566 	/*
1567 	 * Write the list of decomp nodes.
1568 	 */
1569 	for (i = idx = 0; i < decomps_used; i++) {
1570 	    fprintf(out, "\n\t0x%08lx, 0x%08lx,",
1571 	        (unsigned long) decomps[i].code, (unsigned long) idx);
1572 	    idx += decomps[i].used;
1573 	}
1574 
1575 	/*
1576 	 * Write the sentinel index as the last decomp node.
1577 	 */
1578 	fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx);
1579 
1580 	fprintf(out, PREF "ac_uint4 _ucdcmp_decomp[] = {");
1581 	/*
1582 	 * Write the decompositions themselves.
1583 	 */
1584 	k = 0;
1585 	for (i = 0; i < decomps_used; i++)
1586 	  for (j=0; j<decomps[i].used; j++) {
1587 	    if (k) fprintf(out, ",");
1588 	    if (!(k&3)) fprintf(out,"\n\t");
1589 	    else fprintf(out, " ");
1590 	    k++;
1591 	    fprintf(out, "0x%08lx", (unsigned long) decomps[i].decomp[j]);
1592 	  }
1593 	fprintf(out, "\n};\n\n");
1594     }
1595 #else
1596     /*
1597      * Open the decomp.dat file.
1598      */
1599     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "decomp.dat", opath);
1600     if ((out = fopen(path, "wb")) == 0)
1601       return;
1602 
1603     hdr[1] = decomps_used;
1604 
1605     /*
1606      * Write the header.
1607      */
1608     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1609 
1610     /*
1611      * Write a temporary byte count which will be calculated as the
1612      * decompositions are written out.
1613      */
1614     bytes = 0;
1615     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1616 
1617     if (decomps_used) {
1618         /*
1619          * Write the list of decomp nodes.
1620          */
1621         for (i = idx = 0; i < decomps_used; i++) {
1622             fwrite((char *) &decomps[i].code, sizeof(ac_uint4), 1, out);
1623             fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1624             idx += decomps[i].used;
1625         }
1626 
1627         /*
1628          * Write the sentinel index as the last decomp node.
1629          */
1630         fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1631 
1632         /*
1633          * Write the decompositions themselves.
1634          */
1635         for (i = 0; i < decomps_used; i++)
1636           fwrite((char *) decomps[i].decomp, sizeof(ac_uint4),
1637                  decomps[i].used, out);
1638 
1639         /*
1640          * Seek back to the beginning and write the byte count.
1641          */
1642         bytes = (sizeof(ac_uint4) * idx) +
1643             (sizeof(ac_uint4) * ((hdr[1] << 1) + 1));
1644         fseek(out, sizeof(ac_uint2) << 1, 0L);
1645         fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1646 
1647         fclose(out);
1648     }
1649 #endif
1650 
1651 #ifdef HARDCODE_DATA
1652     fprintf(out, PREF "ac_uint4 _uckdcmp_size = %ld;\n\n",
1653         kdecomps_used * 2L);
1654 
1655     fprintf(out, PREF "ac_uint4 _uckdcmp_nodes[] = {");
1656 
1657     if (kdecomps_used) {
1658 	/*
1659 	 * Write the list of kdecomp nodes.
1660 	 */
1661 	for (i = idx = 0; i < kdecomps_used; i++) {
1662 	    fprintf(out, "\n\t0x%08lx, 0x%08lx,",
1663 	        (unsigned long) kdecomps[i].code, (unsigned long) idx);
1664 	    idx += kdecomps[i].used;
1665 	}
1666 
1667 	/*
1668 	 * Write the sentinel index as the last decomp node.
1669 	 */
1670 	fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx);
1671 
1672 	fprintf(out, PREF "ac_uint4 _uckdcmp_decomp[] = {");
1673 
1674 	/*
1675 	 * Write the decompositions themselves.
1676 	 */
1677 	k = 0;
1678 	for (i = 0; i < kdecomps_used; i++)
1679 	  for (j=0; j<kdecomps[i].used; j++) {
1680 	    if (k) fprintf(out, ",");
1681 	    if (!(k&3)) fprintf(out,"\n\t");
1682 	    else fprintf(out, " ");
1683 	    k++;
1684 	    fprintf(out, "0x%08lx", (unsigned long) kdecomps[i].decomp[j]);
1685 	  }
1686 	fprintf(out, "\n};\n\n");
1687     }
1688 #else
1689     /*
1690      * Open the kdecomp.dat file.
1691      */
1692     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "kdecomp.dat", opath);
1693     if ((out = fopen(path, "wb")) == 0)
1694       return;
1695 
1696     hdr[1] = kdecomps_used;
1697 
1698     /*
1699      * Write the header.
1700      */
1701     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1702 
1703     /*
1704      * Write a temporary byte count which will be calculated as the
1705      * decompositions are written out.
1706      */
1707     bytes = 0;
1708     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1709 
1710     if (kdecomps_used) {
1711         /*
1712          * Write the list of kdecomp nodes.
1713          */
1714         for (i = idx = 0; i < kdecomps_used; i++) {
1715             fwrite((char *) &kdecomps[i].code, sizeof(ac_uint4), 1, out);
1716             fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1717             idx += kdecomps[i].used;
1718         }
1719 
1720         /*
1721          * Write the sentinel index as the last decomp node.
1722          */
1723         fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1724 
1725         /*
1726          * Write the decompositions themselves.
1727          */
1728         for (i = 0; i < kdecomps_used; i++)
1729           fwrite((char *) kdecomps[i].decomp, sizeof(ac_uint4),
1730                  kdecomps[i].used, out);
1731 
1732         /*
1733          * Seek back to the beginning and write the byte count.
1734          */
1735         bytes = (sizeof(ac_uint4) * idx) +
1736             (sizeof(ac_uint4) * ((hdr[1] << 1) + 1));
1737         fseek(out, sizeof(ac_uint2) << 1, 0L);
1738         fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1739 
1740         fclose(out);
1741     }
1742 #endif
1743 
1744     /*****************************************************************
1745      *
1746      * Generate the combining class data.
1747      *
1748      *****************************************************************/
1749 #ifdef HARDCODE_DATA
1750     fprintf(out, PREF "ac_uint4 _uccmcl_size = %ld;\n\n", (long) ccl_used);
1751 
1752     fprintf(out, PREF "ac_uint4 _uccmcl_nodes[] = {");
1753 
1754     if (ccl_used > 0) {
1755 	/*
1756 	 * Write the combining class ranges out.
1757 	 */
1758 	for (i = 0; i<ccl_used; i++) {
1759 	    if (i) fprintf(out, ",");
1760 	    if (!(i&3)) fprintf(out, "\n\t");
1761 	    else fprintf(out, " ");
1762 	    fprintf(out, "0x%08lx", (unsigned long) ccl[i]);
1763 	}
1764     } else {
1765 	fprintf(out, "\t0");
1766     }
1767     fprintf(out, "\n};\n\n");
1768 #else
1769     /*
1770      * Open the cmbcl.dat file.
1771      */
1772     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "cmbcl.dat", opath);
1773     if ((out = fopen(path, "wb")) == 0)
1774       return;
1775 
1776     /*
1777      * Set the number of ranges used.  Each range has a combining class which
1778      * means each entry is a 3-tuple.
1779      */
1780     hdr[1] = ccl_used / 3;
1781 
1782     /*
1783      * Write the header.
1784      */
1785     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1786 
1787     /*
1788      * Write out the byte count to maintain header size.
1789      */
1790     bytes = ccl_used * sizeof(ac_uint4);
1791     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1792 
1793     if (ccl_used > 0)
1794       /*
1795        * Write the combining class ranges out.
1796        */
1797       fwrite((char *) ccl, sizeof(ac_uint4), ccl_used, out);
1798 
1799     fclose(out);
1800 #endif
1801 
1802     /*****************************************************************
1803      *
1804      * Generate the number data.
1805      *
1806      *****************************************************************/
1807 
1808 #if HARDCODE_DATA
1809     fprintf(out, PREF "ac_uint4 _ucnum_size = %lu;\n\n",
1810         (unsigned long)ncodes_used<<1);
1811 
1812     fprintf(out, PREF "ac_uint4 _ucnum_nodes[] = {");
1813 
1814     /*
1815      * Now, if number mappings exist, write them out.
1816      */
1817     if (ncodes_used > 0) {
1818 	for (i = 0; i<ncodes_used; i++) {
1819 	    if (i) fprintf(out, ",");
1820 	    if (!(i&1)) fprintf(out, "\n\t");
1821 	    else fprintf(out, " ");
1822 	    fprintf(out, "0x%08lx, 0x%08lx",
1823 	        (unsigned long) ncodes[i].code, (unsigned long) ncodes[i].idx);
1824 	}
1825 	fprintf(out, "\n};\n\n");
1826 
1827 	fprintf(out, PREF "short _ucnum_vals[] = {");
1828 	for (i = 0; i<nums_used; i++) {
1829 	    if (i) fprintf(out, ",");
1830 	    if (!(i&3)) fprintf(out, "\n\t");
1831 	    else fprintf(out, " ");
1832 	    if (nums[i].numerator < 0) {
1833 		fprintf(out, "%6d, 0x%04x",
1834 		  nums[i].numerator, nums[i].denominator);
1835 	    } else {
1836 		fprintf(out, "0x%04x, 0x%04x",
1837 		  nums[i].numerator, nums[i].denominator);
1838 	    }
1839 	}
1840 	fprintf(out, "\n};\n\n");
1841     }
1842 #else
1843     /*
1844      * Open the num.dat file.
1845      */
1846     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "num.dat", opath);
1847     if ((out = fopen(path, "wb")) == 0)
1848       return;
1849 
1850     /*
1851      * The count part of the header will be the total number of codes that
1852      * have numbers.
1853      */
1854     hdr[1] = (ac_uint2) (ncodes_used << 1);
1855     bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t));
1856 
1857     /*
1858      * Write the header.
1859      */
1860     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1861 
1862     /*
1863      * Write out the byte count to maintain header size.
1864      */
1865     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1866 
1867     /*
1868      * Now, if number mappings exist, write them out.
1869      */
1870     if (ncodes_used > 0) {
1871         fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out);
1872         fwrite((char *) nums, sizeof(_num_t), nums_used, out);
1873     }
1874 #endif
1875 
1876     fclose(out);
1877 }
1878 
/*
 * Print the command-line synopsis for `prog` on stderr and terminate the
 * program with exit status 1.  Does not return.
 */
static void
usage(char *prog)
{
    /* Synopsis line. */
    fprintf(stderr,
            "Usage: %s [-o output-directory|-x composition-exclusions]"
            " datafile1 datafile2 ...\n\n",
            prog);

    /* Option descriptions. */
    fprintf(stderr,
            "-o output-directory\n\t\tWrite the output files to a different"
            " directory (default: .).\n");
    fprintf(stderr,
            "-x composition-exclusion\n\t\tFile of composition codes"
            " that should be excluded.\n");

    exit(1);
}
1893 
/*
 * Program entry point.
 *
 * Arguments are processed left to right:
 *   -o DIR   remember DIR as the output directory (default ".")
 *   -x FILE  load composition exclusions from FILE
 *   other    treated as a data file and passed to read_cdata()
 * Any other option, or -o/-x without an operand, prints the usage text
 * and exits.  Files that cannot be opened are reported on stderr and
 * skipped.  After all arguments are handled the tables are written with
 * write_cdata().
 *
 * Always returns 0.
 */
int
main(int argc, char *argv[])
{
    FILE *in;
    char *prog, *opath;

    prog = lutil_progname( "ucgendat", argc, argv );

    opath = 0;
    in = stdin;

    argc--;
    argv++;

    while (argc > 0) {
        if (argv[0][0] == '-') {
            switch (argv[0][1]) {
              case 'o':
                /*
                 * The option needs an operand; usage() does not return.
                 */
                if (argc < 2)
                  usage(prog);
                argc--;
                argv++;
                opath = argv[0];
                break;
              case 'x':
                /*
                 * Without this check a trailing "-x" would pass a null
                 * file name to fopen().
                 */
                if (argc < 2)
                  usage(prog);
                argc--;
                argv++;
                if ((in = fopen(argv[0], "r")) == 0)
                  fprintf(stderr,
                          "%s: unable to open composition exclusion file %s\n",
                          prog, argv[0]);
                else {
                    read_compexdata(in);
                    fclose(in);
                    in = 0;
                }
                break;
              default:
                usage(prog);
            }
        } else {
            if (in != stdin && in != NULL)
              fclose(in);
            if ((in = fopen(argv[0], "r")) == 0)
              fprintf(stderr, "%s: unable to open ctype file %s\n",
                      prog, argv[0]);
            else {
                read_cdata(in);
                fclose(in);
                in = 0;
            }
        }
        argc--;
        argv++;
    }

    if (opath == 0)
      opath = ".";
    write_cdata(opath);

    return 0;
}
1954