1 /* $OpenLDAP$ */
2 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
3  *
4  * Copyright 1998-2021 The OpenLDAP Foundation.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted only as authorized by the OpenLDAP
9  * Public License.
10  *
11  * A copy of this license is available in file LICENSE in the
12  * top-level directory of the distribution or, alternatively, at
13  * <http://www.OpenLDAP.org/license.html>.
14  */
15 /* Copyright 2001 Computing Research Labs, New Mexico State University
16  *
17  * Permission is hereby granted, free of charge, to any person obtaining a
18  * copy of this software and associated documentation files (the "Software"),
19  * to deal in the Software without restriction, including without limitation
20  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
21  * and/or sell copies of the Software, and to permit persons to whom the
22  * Software is furnished to do so, subject to the following conditions:
23  *
24  * The above copyright notice and this permission notice shall be included in
25  * all copies or substantial portions of the Software.
26  *
27  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
28  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
29  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
30  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
31  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
32  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
33  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
34  */
35 /* $Id: ucgendat.c,v 1.4 2001/01/02 18:46:20 mleisher Exp $" */
36 
37 #include "portable.h"
38 #include "ldap_config.h"
39 
40 #include <stdio.h>
41 #include <ac/ctype.h>
42 #include <ac/stdlib.h>
43 #include <ac/string.h>
44 #include <ac/unistd.h>
45 
46 #include <ac/bytes.h>
47 
48 #include <lutil.h>
49 
/* Default HARDCODE_DATA to 1 unless the build has already defined it. */
#if !defined(HARDCODE_DATA)
#define HARDCODE_DATA 1
#endif
53 
/*
 * True when cc is an ASCII hexadecimal digit (0-9, A-F or a-f).  The
 * argument is evaluated more than once, so it must not have side effects.
 */
#undef ishdigit
#define ishdigit(cc) (('0' <= (cc) && (cc) <= '9') || \
                      ('A' <= (cc) && (cc) <= 'F') || \
                      ('a' <= (cc) && (cc) <= 'f'))
58 
59 /*
60  * A header written to the output file with the byte-order-mark and the number
61  * of property nodes.
62  */
63 static ac_uint2 hdr[2] = {0xfeff, 0};
64 
/* Number of entries in the property name table. */
#define NUMPROPS 50
/*
 * NUMPROPS advanced to the next multiple of four; a full four is added when
 * NUMPROPS is already a multiple of four.
 */
#define NEEDPROPS ((NUMPROPS + 4) & ~3)
67 
/* One property name together with the number of characters in it. */
typedef struct {
    char *name;                 /* property name text */
    int len;                    /* length of name in characters */
} _prop_t;
72 
73 /*
74  * List of properties expected to be found in the Unicode Character Database
75  * including some implementation specific properties.
76  *
77  * The implementation specific properties are:
78  * Cm = Composed (can be decomposed)
79  * Nb = Non-breaking
80  * Sy = Symmetric (has left and right forms)
81  * Hd = Hex digit
82  * Qm = Quote marks
83  * Mr = Mirroring
84  * Ss = Space, other
85  * Cp = Defined character
86  */
87 static _prop_t props[NUMPROPS] = {
88     {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2}, {"No", 2},
89     {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2}, {"Cf", 2}, {"Cs", 2},
90     {"Co", 2}, {"Cn", 2}, {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2},
91     {"Lo", 2}, {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2},
92     {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L",  1}, {"R",  1},
93     {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2}, {"B",  1},
94     {"S",  1}, {"WS", 2}, {"ON", 2},
95     {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2},
96     {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2}, {"AL", 2}
97 };
98 
99 typedef struct {
100     ac_uint4 *ranges;
101     ac_uint2 used;
102     ac_uint2 size;
103 } _ranges_t;
104 
105 static _ranges_t proptbl[NUMPROPS];
106 
107 /*
108  * Make sure this array is sized to be on a 4-byte boundary at compile time.
109  */
110 static ac_uint2 propcnt[NEEDPROPS];
111 
112 /*
113  * Array used to collect a decomposition before adding it to the decomposition
114  * table.
115  */
116 static ac_uint4 dectmp[64];
117 static ac_uint4 dectmp_size;
118 
119 typedef struct {
120     ac_uint4 code;
121     ac_uint2 size;
122     ac_uint2 used;
123     ac_uint4 *decomp;
124 } _decomp_t;
125 
126 /*
127  * List of decomposition.  Created and expanded in order as the characters are
128  * encountered. First list contains canonical mappings, second also includes
129  * compatibility mappings.
130  */
131 static _decomp_t *decomps;
132 static ac_uint4 decomps_used;
133 static ac_uint4 decomps_size;
134 
135 static _decomp_t *kdecomps;
136 static ac_uint4 kdecomps_used;
137 static ac_uint4 kdecomps_size;
138 
139 /*
140  * Composition exclusion table stuff.
141  */
142 #define COMPEX_SET(c) (compexs[(c) >> 5] |= (1 << ((c) & 31)))
143 #define COMPEX_TEST(c) (compexs[(c) >> 5] & (1 << ((c) & 31)))
144 static ac_uint4 compexs[8192];
145 
146 /*
147  * Struct for holding a composition pair, and array of composition pairs
148  */
149 typedef struct {
150     ac_uint4 comp;
151     ac_uint4 count;
152     ac_uint4 code1;
153     ac_uint4 code2;
154 } _comp_t;
155 
156 static _comp_t *comps;
157 static ac_uint4 comps_used;
158 
159 /*
160  * Types and lists for handling lists of case mappings.
161  */
162 typedef struct {
163     ac_uint4 key;
164     ac_uint4 other1;
165     ac_uint4 other2;
166 } _case_t;
167 
168 static _case_t *upper;
169 static _case_t *lower;
170 static _case_t *title;
171 static ac_uint4 upper_used;
172 static ac_uint4 upper_size;
173 static ac_uint4 lower_used;
174 static ac_uint4 lower_size;
175 static ac_uint4 title_used;
176 static ac_uint4 title_size;
177 
178 /*
179  * Array used to collect case mappings before adding them to a list.
180  */
181 static ac_uint4 cases[3];
182 
183 /*
184  * An array to hold ranges for combining classes.
185  */
186 static ac_uint4 *ccl;
187 static ac_uint4 ccl_used;
188 static ac_uint4 ccl_size;
189 
190 /*
191  * Structures for handling numbers.
192  */
193 typedef struct {
194     ac_uint4 code;
195     ac_uint4 idx;
196 } _codeidx_t;
197 
198 typedef struct {
199     short numerator;
200     short denominator;
201 } _num_t;
202 
203 /*
204  * Arrays to hold the mapping of codes to numbers.
205  */
206 static _codeidx_t *ncodes;
207 static ac_uint4 ncodes_used;
208 static ac_uint4 ncodes_size;
209 
210 static _num_t *nums;
211 static ac_uint4 nums_used;
212 static ac_uint4 nums_size;
213 
214 /*
215  * Array for holding numbers.
216  */
217 static _num_t *nums;
218 static ac_uint4 nums_used;
219 static ac_uint4 nums_size;
220 
221 static void
add_range(ac_uint4 start,ac_uint4 end,char * p1,char * p2)222 add_range(ac_uint4 start, ac_uint4 end, char *p1, char *p2)
223 {
224     int i, j, k, len;
225     _ranges_t *rlp;
226     char *name;
227 
228     for (k = 0; k < 2; k++) {
229         if (k == 0) {
230             name = p1;
231             len = 2;
232         } else {
233             if (p2 == 0)
234               break;
235 
236             name = p2;
237             len = 1;
238         }
239 
240         for (i = 0; i < NUMPROPS; i++) {
241             if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
242               break;
243         }
244 
245         if (i == NUMPROPS)
246           continue;
247 
248         rlp = &proptbl[i];
249 
250         /*
251          * Resize the range list if necessary.
252          */
253         if (rlp->used == rlp->size) {
254             if (rlp->size == 0)
255               rlp->ranges = (ac_uint4 *)
256                   malloc(sizeof(ac_uint4) << 3);
257             else
258               rlp->ranges = (ac_uint4 *)
259                   realloc((char *) rlp->ranges,
260                           sizeof(ac_uint4) * (rlp->size + 8));
261             rlp->size += 8;
262         }
263 
264         /*
265          * If this is the first code for this property list, just add it
266          * and return.
267          */
268         if (rlp->used == 0) {
269             rlp->ranges[0] = start;
270             rlp->ranges[1] = end;
271             rlp->used += 2;
272             continue;
273         }
274 
275         /*
276          * Optimize the case of adding the range to the end.
277          */
278         j = rlp->used - 1;
279         if (start > rlp->ranges[j]) {
280             j = rlp->used;
281             rlp->ranges[j++] = start;
282             rlp->ranges[j++] = end;
283             rlp->used = j;
284             continue;
285         }
286 
287         /*
288          * Need to locate the insertion point.
289          */
290         for (i = 0;
291              i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ;
292 
293         /*
294          * If the start value lies in the current range, then simply set the
295          * new end point of the range to the end value passed as a parameter.
296          */
297         if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) {
298             rlp->ranges[i + 1] = end;
299             return;
300         }
301 
302         /*
303          * Shift following values up by two.
304          */
305         for (j = rlp->used; j > i; j -= 2) {
306             rlp->ranges[j] = rlp->ranges[j - 2];
307             rlp->ranges[j + 1] = rlp->ranges[j - 1];
308         }
309 
310         /*
311          * Add the new range at the insertion point.
312          */
313         rlp->ranges[i] = start;
314         rlp->ranges[i + 1] = end;
315         rlp->used += 2;
316     }
317 }
318 
319 static void
ordered_range_insert(ac_uint4 c,char * name,int len)320 ordered_range_insert(ac_uint4 c, char *name, int len)
321 {
322     int i, j;
323     ac_uint4 s, e;
324     _ranges_t *rlp;
325 
326     if (len == 0)
327       return;
328 
329     /*
330      * Deal with directionality codes introduced in Unicode 3.0.
331      */
332     if ((len == 2 && memcmp(name, "BN", 2) == 0) ||
333         (len == 3 &&
334          (memcmp(name, "NSM", 3) == 0 || memcmp(name, "PDF", 3) == 0 ||
335           memcmp(name, "LRE", 3) == 0 || memcmp(name, "LRO", 3) == 0 ||
336           memcmp(name, "RLE", 3) == 0 || memcmp(name, "RLO", 3) == 0))) {
337         /*
338          * Mark all of these as Other Neutral to preserve compatibility with
339          * older versions.
340          */
341         len = 2;
342         name = "ON";
343     }
344 
345     for (i = 0; i < NUMPROPS; i++) {
346         if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
347           break;
348     }
349 
350     if (i == NUMPROPS)
351       return;
352 
353     /*
354      * Have a match, so insert the code in order.
355      */
356     rlp = &proptbl[i];
357 
358     /*
359      * Resize the range list if necessary.
360      */
361     if (rlp->used == rlp->size) {
362         if (rlp->size == 0)
363           rlp->ranges = (ac_uint4 *)
364               malloc(sizeof(ac_uint4) << 3);
365         else
366           rlp->ranges = (ac_uint4 *)
367               realloc((char *) rlp->ranges,
368                       sizeof(ac_uint4) * (rlp->size + 8));
369         rlp->size += 8;
370     }
371 
372     /*
373      * If this is the first code for this property list, just add it
374      * and return.
375      */
376     if (rlp->used == 0) {
377         rlp->ranges[0] = rlp->ranges[1] = c;
378         rlp->used += 2;
379         return;
380     }
381 
382     /*
383      * Optimize the cases of extending the last range and adding new ranges to
384      * the end.
385      */
386     j = rlp->used - 1;
387     e = rlp->ranges[j];
388     s = rlp->ranges[j - 1];
389 
390     if (c == e + 1) {
391         /*
392          * Extend the last range.
393          */
394         rlp->ranges[j] = c;
395         return;
396     }
397 
398     if (c > e + 1) {
399         /*
400          * Start another range on the end.
401          */
402         j = rlp->used;
403         rlp->ranges[j] = rlp->ranges[j + 1] = c;
404         rlp->used += 2;
405         return;
406     }
407 
408     if (c >= s)
409       /*
410        * The code is a duplicate of a code in the last range, so just return.
411        */
412       return;
413 
414     /*
415      * The code should be inserted somewhere before the last range in the
416      * list.  Locate the insertion point.
417      */
418     for (i = 0;
419          i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ;
420 
421     s = rlp->ranges[i];
422     e = rlp->ranges[i + 1];
423 
424     if (c == e + 1)
425       /*
426        * Simply extend the current range.
427        */
428       rlp->ranges[i + 1] = c;
429     else if (c < s) {
430         /*
431          * Add a new entry before the current location.  Shift all entries
432          * before the current one up by one to make room.
433          */
434         for (j = rlp->used; j > i; j -= 2) {
435             rlp->ranges[j] = rlp->ranges[j - 2];
436             rlp->ranges[j + 1] = rlp->ranges[j - 1];
437         }
438         rlp->ranges[i] = rlp->ranges[i + 1] = c;
439 
440         rlp->used += 2;
441     }
442 }
443 
444 static void
add_decomp(ac_uint4 code,short compat)445 add_decomp(ac_uint4 code, short compat)
446 {
447     ac_uint4 i, j, size;
448     _decomp_t **pdecomps;
449     ac_uint4 *pdecomps_used;
450     ac_uint4 *pdecomps_size;
451 
452     if (compat) {
453 	pdecomps = &kdecomps;
454 	pdecomps_used = &kdecomps_used;
455 	pdecomps_size = &kdecomps_size;
456     } else {
457 	pdecomps = &decomps;
458 	pdecomps_used = &decomps_used;
459 	pdecomps_size = &decomps_size;
460     }
461 
462     /*
463      * Add the code to the composite property.
464      */
465     if (!compat) {
466 	ordered_range_insert(code, "Cm", 2);
467     }
468 
469     /*
470      * Locate the insertion point for the code.
471      */
472     for (i = 0; i < *pdecomps_used && code > (*pdecomps)[i].code; i++) ;
473 
474     /*
475      * Allocate space for a new decomposition.
476      */
477     if (*pdecomps_used == *pdecomps_size) {
478         if (*pdecomps_size == 0)
479           *pdecomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
480         else
481           *pdecomps = (_decomp_t *)
482               realloc((char *) *pdecomps,
483                       sizeof(_decomp_t) * (*pdecomps_size + 8));
484         (void) memset((char *) (*pdecomps + *pdecomps_size), '\0',
485                       sizeof(_decomp_t) << 3);
486         *pdecomps_size += 8;
487     }
488 
489     if (i < *pdecomps_used && code != (*pdecomps)[i].code) {
490         /*
491          * Shift the decomps up by one if the codes don't match.
492          */
493         for (j = *pdecomps_used; j > i; j--)
494           (void) AC_MEMCPY((char *) &(*pdecomps)[j], (char *) &(*pdecomps)[j - 1],
495                         sizeof(_decomp_t));
496     }
497 
498     /*
499      * Insert or replace a decomposition.
500      */
501     size = dectmp_size + (4 - (dectmp_size & 3));
502     if ((*pdecomps)[i].size < size) {
503         if ((*pdecomps)[i].size == 0)
504           (*pdecomps)[i].decomp = (ac_uint4 *)
505               malloc(sizeof(ac_uint4) * size);
506         else
507           (*pdecomps)[i].decomp = (ac_uint4 *)
508               realloc((char *) (*pdecomps)[i].decomp,
509                       sizeof(ac_uint4) * size);
510         (*pdecomps)[i].size = size;
511     }
512 
513     if ((*pdecomps)[i].code != code)
514       (*pdecomps_used)++;
515 
516     (*pdecomps)[i].code = code;
517     (*pdecomps)[i].used = dectmp_size;
518     (void) AC_MEMCPY((char *) (*pdecomps)[i].decomp, (char *) dectmp,
519                   sizeof(ac_uint4) * dectmp_size);
520 
521     /*
522      * NOTICE: This needs changing later so it is more general than simply
523      * pairs.  This calculation is done here to simplify allocation elsewhere.
524      */
525     if (!compat && dectmp_size == 2)
526       comps_used++;
527 }
528 
529 static void
add_title(ac_uint4 code)530 add_title(ac_uint4 code)
531 {
532     ac_uint4 i, j;
533 
534     /*
535      * Always map the code to itself.
536      */
537     cases[2] = code;
538 
539     /*
540      * If the upper case character is not present, then make it the same as
541      * the title case.
542      */
543     if (cases[0] == 0)
544       cases[0] = code;
545 
546     if (title_used == title_size) {
547         if (title_size == 0)
548           title = (_case_t *) malloc(sizeof(_case_t) << 3);
549         else
550           title = (_case_t *) realloc((char *) title,
551                                       sizeof(_case_t) * (title_size + 8));
552         title_size += 8;
553     }
554 
555     /*
556      * Locate the insertion point.
557      */
558     for (i = 0; i < title_used && code > title[i].key; i++) ;
559 
560     if (i < title_used) {
561         /*
562          * Shift the array up by one.
563          */
564         for (j = title_used; j > i; j--)
565           (void) AC_MEMCPY((char *) &title[j], (char *) &title[j - 1],
566                         sizeof(_case_t));
567     }
568 
569     title[i].key = cases[2];    /* Title */
570     title[i].other1 = cases[0]; /* Upper */
571     title[i].other2 = cases[1]; /* Lower */
572 
573     title_used++;
574 }
575 
576 static void
add_upper(ac_uint4 code)577 add_upper(ac_uint4 code)
578 {
579     ac_uint4 i, j;
580 
581     /*
582      * Always map the code to itself.
583      */
584     cases[0] = code;
585 
586     /*
587      * If the title case character is not present, then make it the same as
588      * the upper case.
589      */
590     if (cases[2] == 0)
591       cases[2] = code;
592 
593     if (upper_used == upper_size) {
594         if (upper_size == 0)
595           upper = (_case_t *) malloc(sizeof(_case_t) << 3);
596         else
597           upper = (_case_t *) realloc((char *) upper,
598                                       sizeof(_case_t) * (upper_size + 8));
599         upper_size += 8;
600     }
601 
602     /*
603      * Locate the insertion point.
604      */
605     for (i = 0; i < upper_used && code > upper[i].key; i++) ;
606 
607     if (i < upper_used) {
608         /*
609          * Shift the array up by one.
610          */
611         for (j = upper_used; j > i; j--)
612           (void) AC_MEMCPY((char *) &upper[j], (char *) &upper[j - 1],
613                         sizeof(_case_t));
614     }
615 
616     upper[i].key = cases[0];    /* Upper */
617     upper[i].other1 = cases[1]; /* Lower */
618     upper[i].other2 = cases[2]; /* Title */
619 
620     upper_used++;
621 }
622 
623 static void
add_lower(ac_uint4 code)624 add_lower(ac_uint4 code)
625 {
626     ac_uint4 i, j;
627 
628     /*
629      * Always map the code to itself.
630      */
631     cases[1] = code;
632 
633     /*
634      * If the title case character is empty, then make it the same as the
635      * upper case.
636      */
637     if (cases[2] == 0)
638       cases[2] = cases[0];
639 
640     if (lower_used == lower_size) {
641         if (lower_size == 0)
642           lower = (_case_t *) malloc(sizeof(_case_t) << 3);
643         else
644           lower = (_case_t *) realloc((char *) lower,
645                                       sizeof(_case_t) * (lower_size + 8));
646         lower_size += 8;
647     }
648 
649     /*
650      * Locate the insertion point.
651      */
652     for (i = 0; i < lower_used && code > lower[i].key; i++) ;
653 
654     if (i < lower_used) {
655         /*
656          * Shift the array up by one.
657          */
658         for (j = lower_used; j > i; j--)
659           (void) AC_MEMCPY((char *) &lower[j], (char *) &lower[j - 1],
660                         sizeof(_case_t));
661     }
662 
663     lower[i].key = cases[1];    /* Lower */
664     lower[i].other1 = cases[0]; /* Upper */
665     lower[i].other2 = cases[2]; /* Title */
666 
667     lower_used++;
668 }
669 
670 static void
ordered_ccl_insert(ac_uint4 c,ac_uint4 ccl_code)671 ordered_ccl_insert(ac_uint4 c, ac_uint4 ccl_code)
672 {
673     ac_uint4 i, j;
674 
675     if (ccl_used == ccl_size) {
676         if (ccl_size == 0)
677           ccl = (ac_uint4 *) malloc(sizeof(ac_uint4) * 24);
678         else
679           ccl = (ac_uint4 *)
680               realloc((char *) ccl, sizeof(ac_uint4) * (ccl_size + 24));
681         ccl_size += 24;
682     }
683 
684     /*
685      * Optimize adding the first item.
686      */
687     if (ccl_used == 0) {
688         ccl[0] = ccl[1] = c;
689         ccl[2] = ccl_code;
690         ccl_used += 3;
691         return;
692     }
693 
694     /*
695      * Handle the special case of extending the range on the end.  This
696      * requires that the combining class codes are the same.
697      */
698     if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) {
699         ccl[ccl_used - 2] = c;
700         return;
701     }
702 
703     /*
704      * Handle the special case of adding another range on the end.
705      */
706     if (c > ccl[ccl_used - 2] + 1 ||
707         (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) {
708         ccl[ccl_used++] = c;
709         ccl[ccl_used++] = c;
710         ccl[ccl_used++] = ccl_code;
711         return;
712     }
713 
714     /*
715      * Locate either the insertion point or range for the code.
716      */
717     for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ;
718 
719     if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) {
720         /*
721          * Extend an existing range.
722          */
723         ccl[i + 1] = c;
724         return;
725     } else if (c < ccl[i]) {
726         /*
727          * Start a new range before the current location.
728          */
729         for (j = ccl_used; j > i; j -= 3) {
730             ccl[j] = ccl[j - 3];
731             ccl[j - 1] = ccl[j - 4];
732             ccl[j - 2] = ccl[j - 5];
733         }
734         ccl[i] = ccl[i + 1] = c;
735         ccl[i + 2] = ccl_code;
736     }
737 }
738 
739 /*
740  * Adds a number if it does not already exist and returns an index value
741  * multiplied by 2.
742  */
743 static ac_uint4
make_number(short num,short denom)744 make_number(short num, short denom)
745 {
746     ac_uint4 n;
747 
748     /*
749      * Determine if the number already exists.
750      */
751     for (n = 0; n < nums_used; n++) {
752         if (nums[n].numerator == num && nums[n].denominator == denom)
753           return n << 1;
754     }
755 
756     if (nums_used == nums_size) {
757         if (nums_size == 0)
758           nums = (_num_t *) malloc(sizeof(_num_t) << 3);
759         else
760           nums = (_num_t *) realloc((char *) nums,
761                                     sizeof(_num_t) * (nums_size + 8));
762         nums_size += 8;
763     }
764 
765     n = nums_used++;
766     nums[n].numerator = num;
767     nums[n].denominator = denom;
768 
769     return n << 1;
770 }
771 
772 static void
add_number(ac_uint4 code,short num,short denom)773 add_number(ac_uint4 code, short num, short denom)
774 {
775     ac_uint4 i, j;
776 
777     /*
778      * Insert the code in order.
779      */
780     for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ;
781 
782     /*
783      * Handle the case of the codes matching and simply replace the number
784      * that was there before.
785      */
786     if (i < ncodes_used && code == ncodes[i].code) {
787         ncodes[i].idx = make_number(num, denom);
788         return;
789     }
790 
791     /*
792      * Resize the array if necessary.
793      */
794     if (ncodes_used == ncodes_size) {
795         if (ncodes_size == 0)
796           ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3);
797         else
798           ncodes = (_codeidx_t *)
799               realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8));
800 
801         ncodes_size += 8;
802     }
803 
804     /*
805      * Shift things around to insert the code if necessary.
806      */
807     if (i < ncodes_used) {
808         for (j = ncodes_used; j > i; j--) {
809             ncodes[j].code = ncodes[j - 1].code;
810             ncodes[j].idx = ncodes[j - 1].idx;
811         }
812     }
813     ncodes[i].code = code;
814     ncodes[i].idx = make_number(num, denom);
815 
816     ncodes_used++;
817 }
818 
819 /*
820  * This routine assumes that the line is a valid Unicode Character Database
821  * entry.
822  */
823 static void
read_cdata(FILE * in)824 read_cdata(FILE *in)
825 {
826     ac_uint4 i, lineno, skip, code, ccl_code;
827     short wnum, neg, number[2], compat;
828     char line[512], *s, *e, *first_prop;
829 
830     lineno = skip = 0;
831     while (fgets(line, sizeof(line), in)) {
832 	if( (s=strchr(line, '\n')) ) *s = '\0';
833         lineno++;
834 
835         /*
836          * Skip blank lines and lines that start with a '#'.
837          */
838         if (line[0] == 0 || line[0] == '#')
839           continue;
840 
841         /*
842          * If lines need to be skipped, do it here.
843          */
844         if (skip) {
845             skip--;
846             continue;
847         }
848 
849         /*
850          * Collect the code.  The code can be up to 6 hex digits in length to
851          * allow surrogates to be specified.
852          */
853         for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) {
854             code <<= 4;
855             if (*s >= '0' && *s <= '9')
856               code += *s - '0';
857             else if (*s >= 'A' && *s <= 'F')
858               code += (*s - 'A') + 10;
859             else if (*s >= 'a' && *s <= 'f')
860               code += (*s - 'a') + 10;
861         }
862 
863         /*
864          * Handle the following special cases:
865          * 1. 4E00-9FA5 CJK Ideographs.
866          * 2. AC00-D7A3 Hangul Syllables.
867          * 3. D800-DFFF Surrogates.
868          * 4. E000-F8FF Private Use Area.
869          * 5. F900-FA2D Han compatibility.
870 	 * ...Plus additional ranges in newer Unicode versions...
871          */
872         switch (code) {
873 	  case 0x3400:
874 	    /* CJK Ideograph Extension A */
875             add_range(0x3400, 0x4db5, "Lo", "L");
876 
877             add_range(0x3400, 0x4db5, "Cp", 0);
878 
879 	    skip = 1;
880 	    break;
881           case 0x4e00:
882             /*
883              * The Han ideographs.
884              */
885             add_range(0x4e00, 0x9fff, "Lo", "L");
886 
887             /*
888              * Add the characters to the defined category.
889              */
890             add_range(0x4e00, 0x9fa5, "Cp", 0);
891 
892             skip = 1;
893             break;
894           case 0xac00:
895             /*
896              * The Hangul syllables.
897              */
898             add_range(0xac00, 0xd7a3, "Lo", "L");
899 
900             /*
901              * Add the characters to the defined category.
902              */
903             add_range(0xac00, 0xd7a3, "Cp", 0);
904 
905             skip = 1;
906             break;
907           case 0xd800:
908             /*
909              * Make a range of all surrogates and assume some default
910              * properties.
911              */
912             add_range(0x010000, 0x10ffff, "Cs", "L");
913             skip = 5;
914             break;
915           case 0xe000:
916             /*
917              * The Private Use area.  Add with a default set of properties.
918              */
919             add_range(0xe000, 0xf8ff, "Co", "L");
920             skip = 1;
921             break;
922           case 0xf900:
923             /*
924              * The CJK compatibility area.
925              */
926             add_range(0xf900, 0xfaff, "Lo", "L");
927 
928             /*
929              * Add the characters to the defined category.
930              */
931             add_range(0xf900, 0xfaff, "Cp", 0);
932 
933             skip = 1;
934 	    break;
935 	  case 0x20000:
936 	    /* CJK Ideograph Extension B */
937             add_range(0x20000, 0x2a6d6, "Lo", "L");
938 
939             add_range(0x20000, 0x2a6d6, "Cp", 0);
940 
941 	    skip = 1;
942 	    break;
943 	  case 0xf0000:
944 	    /* Plane 15 private use */
945 	    add_range(0xf0000, 0xffffd, "Co", "L");
946 	    skip = 1;
947 	    break;
948 
949 	  case 0x100000:
950 	    /* Plane 16 private use */
951 	    add_range(0x100000, 0x10fffd, "Co", "L");
952 	    skip = 1;
953 	    break;
954         }
955 
956         if (skip)
957           continue;
958 
959         /*
960          * Add the code to the defined category.
961          */
962         ordered_range_insert(code, "Cp", 2);
963 
964         /*
965          * Locate the first character property field.
966          */
967         for (i = 0; *s != 0 && i < 2; s++) {
968             if (*s == ';')
969               i++;
970         }
971         for (e = s; *e && *e != ';'; e++) ;
972 
973         first_prop = s;
974 
975         ordered_range_insert(code, s, e - s);
976 
977         /*
978          * Locate the combining class code.
979          */
980         for (s = e; *s != 0 && i < 3; s++) {
981             if (*s == ';')
982               i++;
983         }
984 
985         /*
986          * Convert the combining class code from decimal.
987          */
988         for (ccl_code = 0, e = s; *e && *e != ';'; e++)
989           ccl_code = (ccl_code * 10) + (*e - '0');
990 
991         /*
992          * Add the code if it not 0.
993          */
994         if (ccl_code != 0)
995           ordered_ccl_insert(code, ccl_code);
996 
997         /*
998          * Locate the second character property field.
999          */
1000         for (s = e; *s != 0 && i < 4; s++) {
1001             if (*s == ';')
1002               i++;
1003         }
1004         for (e = s; *e && *e != ';'; e++) ;
1005 
1006         ordered_range_insert(code, s, e - s);
1007 
1008         /*
1009          * Check for a decomposition.
1010          */
1011         s = ++e;
1012         if (*s != ';') {
1013 	    compat = *s == '<';
1014 	    if (compat) {
1015 		/*
1016 		 * Skip compatibility formatting tag.
1017 		 */
1018 		while (*s++ != '>');
1019 	    }
1020             /*
1021              * Collect the codes of the decomposition.
1022              */
1023             for (dectmp_size = 0; *s != ';'; ) {
1024                 /*
1025                  * Skip all leading non-hex digits.
1026                  */
1027                 while (!ishdigit(*s))
1028  		  s++;
1029 
1030                 for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) {
1031                     dectmp[dectmp_size] <<= 4;
1032                     if (*s >= '0' && *s <= '9')
1033                       dectmp[dectmp_size] += *s - '0';
1034                     else if (*s >= 'A' && *s <= 'F')
1035                       dectmp[dectmp_size] += (*s - 'A') + 10;
1036                     else if (*s >= 'a' && *s <= 'f')
1037                       dectmp[dectmp_size] += (*s - 'a') + 10;
1038                 }
1039                 dectmp_size++;
1040             }
1041 
1042             /*
1043              * If there are any codes in the temporary decomposition array,
1044              * then add the character with its decomposition.
1045              */
1046             if (dectmp_size > 0) {
1047 		if (!compat) {
1048 		    add_decomp(code, 0);
1049 		}
1050 		add_decomp(code, 1);
1051 	    }
1052         }
1053 
1054         /*
1055          * Skip to the number field.
1056          */
1057         for (i = 0; i < 3 && *s; s++) {
1058             if (*s == ';')
1059               i++;
1060         }
1061 
1062         /*
1063          * Scan the number in.
1064          */
1065         number[0] = number[1] = 0;
1066         for (e = s, neg = wnum = 0; *e && *e != ';'; e++) {
1067             if (*e == '-') {
1068                 neg = 1;
1069                 continue;
1070             }
1071 
1072             if (*e == '/') {
1073                 /*
                 * Move to the denominator of the fraction.
1075                  */
1076                 if (neg)
1077                   number[wnum] *= -1;
1078                 neg = 0;
1079                 e++;
1080                 wnum++;
1081             }
1082             number[wnum] = (number[wnum] * 10) + (*e - '0');
1083         }
1084 
1085         if (e > s) {
1086             /*
1087              * Adjust the denominator in case of integers and add the number.
1088              */
1089             if (wnum == 0)
1090               number[1] = 1;
1091 
1092             add_number(code, number[0], number[1]);
1093         }
1094 
1095         /*
1096          * Skip to the start of the possible case mappings.
1097          */
1098         for (s = e, i = 0; i < 4 && *s; s++) {
1099             if (*s == ';')
1100               i++;
1101         }
1102 
1103         /*
1104          * Collect the case mappings.
1105          */
1106         cases[0] = cases[1] = cases[2] = 0;
1107         for (i = 0; i < 3; i++) {
1108             while (ishdigit(*s)) {
1109                 cases[i] <<= 4;
1110                 if (*s >= '0' && *s <= '9')
1111                   cases[i] += *s - '0';
1112                 else if (*s >= 'A' && *s <= 'F')
1113                   cases[i] += (*s - 'A') + 10;
1114                 else if (*s >= 'a' && *s <= 'f')
1115                   cases[i] += (*s - 'a') + 10;
1116                 s++;
1117             }
1118             if (*s == ';')
1119               s++;
1120         }
1121         if (!strncmp(first_prop,"Lt",2) && (cases[0] || cases[1]))
1122           /*
1123            * Add the upper and lower mappings for a title case character.
1124            */
1125           add_title(code);
1126         else if (cases[1])
1127           /*
1128            * Add the lower and title case mappings for the upper case
1129            * character.
1130            */
1131           add_upper(code);
1132         else if (cases[0])
1133           /*
1134            * Add the upper and title case mappings for the lower case
1135            * character.
1136            */
1137           add_lower(code);
1138     }
1139 }
1140 
1141 static _decomp_t *
find_decomp(ac_uint4 code,short compat)1142 find_decomp(ac_uint4 code, short compat)
1143 {
1144     long l, r, m;
1145     _decomp_t *decs;
1146 
1147     l = 0;
1148     r = (compat ? kdecomps_used : decomps_used) - 1;
1149     decs = compat ? kdecomps : decomps;
1150     while (l <= r) {
1151         m = (l + r) >> 1;
1152         if (code > decs[m].code)
1153           l = m + 1;
1154         else if (code < decs[m].code)
1155           r = m - 1;
1156         else
1157           return &decs[m];
1158     }
1159     return 0;
1160 }
1161 
1162 static void
decomp_it(_decomp_t * d,short compat)1163 decomp_it(_decomp_t *d, short compat)
1164 {
1165     ac_uint4 i;
1166     _decomp_t *dp;
1167 
1168     for (i = 0; i < d->used; i++) {
1169         if ((dp = find_decomp(d->decomp[i], compat)) != 0)
1170           decomp_it(dp, compat);
1171         else
1172           dectmp[dectmp_size++] = d->decomp[i];
1173     }
1174 }
1175 
1176 /*
1177  * Expand all decompositions by recursively decomposing each character
1178  * in the decomposition.
1179  */
1180 static void
expand_decomp(void)1181 expand_decomp(void)
1182 {
1183     ac_uint4 i;
1184 
1185     for (i = 0; i < decomps_used; i++) {
1186         dectmp_size = 0;
1187         decomp_it(&decomps[i], 0);
1188         if (dectmp_size > 0)
1189           add_decomp(decomps[i].code, 0);
1190     }
1191 
1192     for (i = 0; i < kdecomps_used; i++) {
1193         dectmp_size = 0;
1194         decomp_it(&kdecomps[i], 1);
1195         if (dectmp_size > 0)
1196           add_decomp(kdecomps[i].code, 1);
1197     }
1198 }
1199 
1200 static int
cmpcomps(const void * v_comp1,const void * v_comp2)1201 cmpcomps(const void *v_comp1, const void *v_comp2)
1202 {
1203 	const _comp_t *comp1 = v_comp1, *comp2 = v_comp2;
1204     long diff = comp1->code1 - comp2->code1;
1205 
1206     if (!diff)
1207 	diff = comp1->code2 - comp2->code2;
1208     return (int) diff;
1209 }
1210 
1211 /*
1212  * Load composition exclusion data
1213  */
1214 static void
read_compexdata(FILE * in)1215 read_compexdata(FILE *in)
1216 {
1217     ac_uint2 i;
1218     ac_uint4 code;
1219     char line[512], *s;
1220 
1221     (void) memset((char *) compexs, 0, sizeof(compexs));
1222 
1223     while (fgets(line, sizeof(line), in)) {
1224 	if( (s=strchr(line, '\n')) ) *s = '\0';
1225         /*
1226          * Skip blank lines and lines that start with a '#'.
1227          */
1228         if (line[0] == 0 || line[0] == '#')
1229 	    continue;
1230 
1231 	/*
1232          * Collect the code.  Assume max 6 digits
1233          */
1234 
1235 	for (s = line, i = code = 0; *s != '#' && i < 6; i++, s++) {
1236 	    if (isspace((unsigned char)*s)) break;
1237             code <<= 4;
1238             if (*s >= '0' && *s <= '9')
1239 		code += *s - '0';
1240             else if (*s >= 'A' && *s <= 'F')
1241 		code += (*s - 'A') + 10;
1242             else if (*s >= 'a' && *s <= 'f')
1243 		code += (*s - 'a') + 10;
1244         }
1245         COMPEX_SET(code);
1246     }
1247 }
1248 
1249 /*
1250  * Creates array of compositions from decomposition array
1251  */
1252 static void
create_comps(void)1253 create_comps(void)
1254 {
1255     ac_uint4 i, cu;
1256 
1257     comps = (_comp_t *) malloc(comps_used * sizeof(_comp_t));
1258 
1259     for (i = cu = 0; i < decomps_used; i++) {
1260 	if (decomps[i].used != 2 || COMPEX_TEST(decomps[i].code))
1261 	    continue;
1262 	comps[cu].comp = decomps[i].code;
1263 	comps[cu].count = 2;
1264 	comps[cu].code1 = decomps[i].decomp[0];
1265 	comps[cu].code2 = decomps[i].decomp[1];
1266 	cu++;
1267     }
1268     comps_used = cu;
1269     qsort(comps, comps_used, sizeof(_comp_t), cmpcomps);
1270 }
1271 
1272 #if HARDCODE_DATA
1273 static void
write_case(FILE * out,_case_t * tab,int num,int first)1274 write_case(FILE *out, _case_t *tab, int num, int first)
1275 {
1276     int i;
1277 
1278     for (i=0; i<num; i++) {
1279 	if (first) first = 0;
1280 	else fprintf(out, ",");
1281 	fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx",
1282 		(unsigned long) tab[i].key, (unsigned long) tab[i].other1,
1283 		(unsigned long) tab[i].other2);
1284     }
1285 }
1286 
1287 #define PREF "static const "
1288 
1289 #endif
1290 
1291 static void
write_cdata(char * opath)1292 write_cdata(char *opath)
1293 {
1294     FILE *out;
1295 	ac_uint4 bytes;
1296     ac_uint4 i, idx, nprops;
1297 #if !(HARDCODE_DATA)
1298     ac_uint2 casecnt[2];
1299 #endif
1300     char path[BUFSIZ];
1301 #if HARDCODE_DATA
1302     int j, k;
1303 
1304     /*****************************************************************
1305      *
1306      * Generate the ctype data.
1307      *
1308      *****************************************************************/
1309 
1310     /*
1311      * Open the output file.
1312      */
1313     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "uctable.h", opath);
1314     if ((out = fopen(path, "w")) == 0)
1315       return;
1316 #else
1317     /*
1318      * Open the ctype.dat file.
1319      */
1320     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "ctype.dat", opath);
1321     if ((out = fopen(path, "wb")) == 0)
1322       return;
1323 #endif
1324 
1325     /*
1326      * Collect the offsets for the properties.  The offsets array is
1327      * on a 4-byte boundary to keep things efficient for architectures
1328      * that need such a thing.
1329      */
1330     for (i = idx = 0; i < NUMPROPS; i++) {
1331         propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff;
1332         idx += proptbl[i].used;
1333     }
1334 
1335     /*
1336      * Add the sentinel index which is used by the binary search as the upper
1337      * bound for a search.
1338      */
1339     propcnt[i] = idx;
1340 
1341     /*
1342      * Record the actual number of property lists.  This may be different than
1343      * the number of offsets actually written because of aligning on a 4-byte
1344      * boundary.
1345      */
1346     hdr[1] = NUMPROPS;
1347 
1348     /*
1349      * Calculate the byte count needed and pad the property counts array to a
1350      * 4-byte boundary.
1351      */
1352     if ((bytes = sizeof(ac_uint2) * (NUMPROPS + 1)) & 3)
1353       bytes += 4 - (bytes & 3);
1354     nprops = bytes / sizeof(ac_uint2);
1355     bytes += sizeof(ac_uint4) * idx;
1356 
1357 #if HARDCODE_DATA
1358     fprintf(out, PREF "ac_uint4 _ucprop_size = %d;\n\n", NUMPROPS);
1359 
1360     fprintf(out, PREF "ac_uint2 _ucprop_offsets[] = {");
1361 
1362     for (i = 0; i<nprops; i++) {
1363        if (i) fprintf(out, ",");
1364        if (!(i&7)) fprintf(out, "\n\t");
1365        else fprintf(out, " ");
1366        fprintf(out, "0x%04x", propcnt[i]);
1367     }
1368     fprintf(out, "\n};\n\n");
1369 
1370     fprintf(out, PREF "ac_uint4 _ucprop_ranges[] = {");
1371 
1372     k = 0;
1373     for (i = 0; i < NUMPROPS; i++) {
1374 	if (proptbl[i].used > 0) {
1375 	  for (j=0; j<proptbl[i].used; j++) {
1376 	    if (k) fprintf(out, ",");
1377 	    if (!(k&3)) fprintf(out,"\n\t");
1378 	    else fprintf(out, " ");
1379 	    k++;
1380 	    fprintf(out, "0x%08lx", (unsigned long) proptbl[i].ranges[j]);
1381 	  }
1382 	}
1383     }
1384     fprintf(out, "\n};\n\n");
1385 #else
1386     /*
1387      * Write the header.
1388      */
1389     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1390 
1391     /*
1392      * Write the byte count.
1393      */
1394     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1395 
1396     /*
1397      * Write the property list counts.
1398      */
1399     fwrite((char *) propcnt, sizeof(ac_uint2), nprops, out);
1400 
1401     /*
1402      * Write the property lists.
1403      */
1404     for (i = 0; i < NUMPROPS; i++) {
1405         if (proptbl[i].used > 0)
1406           fwrite((char *) proptbl[i].ranges, sizeof(ac_uint4),
1407                  proptbl[i].used, out);
1408     }
1409 
1410     fclose(out);
1411 #endif
1412 
1413     /*****************************************************************
1414      *
1415      * Generate the case mapping data.
1416      *
1417      *****************************************************************/
1418 
1419 #if HARDCODE_DATA
1420     fprintf(out, PREF "ac_uint4 _uccase_size = %ld;\n\n",
1421         (long) (upper_used + lower_used + title_used));
1422 
1423     fprintf(out, PREF "ac_uint2 _uccase_len[2] = {%ld, %ld};\n\n",
1424         (long) upper_used, (long) lower_used);
1425     fprintf(out, PREF "ac_uint4 _uccase_map[] = {");
1426 
1427     if (upper_used > 0)
1428       /*
1429        * Write the upper case table.
1430        */
1431       write_case(out, upper, upper_used, 1);
1432 
1433     if (lower_used > 0)
1434       /*
1435        * Write the lower case table.
1436        */
1437       write_case(out, lower, lower_used, !upper_used);
1438 
1439     if (title_used > 0)
1440       /*
1441        * Write the title case table.
1442        */
1443       write_case(out, title, title_used, !(upper_used||lower_used));
1444 
1445     if (!(upper_used || lower_used || title_used))
1446 	fprintf(out, "\t0");
1447 
1448     fprintf(out, "\n};\n\n");
1449 #else
1450     /*
1451      * Open the case.dat file.
1452      */
1453     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "case.dat", opath);
1454     if ((out = fopen(path, "wb")) == 0)
1455       return;
1456 
1457     /*
1458      * Write the case mapping tables.
1459      */
1460     hdr[1] = upper_used + lower_used + title_used;
1461     casecnt[0] = upper_used;
1462     casecnt[1] = lower_used;
1463 
1464     /*
1465      * Write the header.
1466      */
1467     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1468 
1469     /*
1470      * Write the upper and lower case table sizes.
1471      */
1472     fwrite((char *) casecnt, sizeof(ac_uint2), 2, out);
1473 
1474     if (upper_used > 0)
1475       /*
1476        * Write the upper case table.
1477        */
1478       fwrite((char *) upper, sizeof(_case_t), upper_used, out);
1479 
1480     if (lower_used > 0)
1481       /*
1482        * Write the lower case table.
1483        */
1484       fwrite((char *) lower, sizeof(_case_t), lower_used, out);
1485 
1486     if (title_used > 0)
1487       /*
1488        * Write the title case table.
1489        */
1490       fwrite((char *) title, sizeof(_case_t), title_used, out);
1491 
1492     fclose(out);
1493 #endif
1494 
1495     /*****************************************************************
1496      *
1497      * Generate the composition data.
1498      *
1499      *****************************************************************/
1500 
1501     /*
1502      * Create compositions from decomposition data
1503      */
1504     create_comps();
1505 
1506 #if HARDCODE_DATA
1507     fprintf(out, PREF "ac_uint4 _uccomp_size = %ld;\n\n",
1508         comps_used * 4L);
1509 
1510     fprintf(out, PREF "ac_uint4 _uccomp_data[] = {");
1511 
1512      /*
1513       * Now, if comps exist, write them out.
1514       */
1515     if (comps_used > 0) {
1516 	for (i=0; i<comps_used; i++) {
1517 	    if (i) fprintf(out, ",");
1518 	    fprintf(out, "\n\t0x%08lx, 0x%08lx, 0x%08lx, 0x%08lx",
1519 	        (unsigned long) comps[i].comp, (unsigned long) comps[i].count,
1520 	        (unsigned long) comps[i].code1, (unsigned long) comps[i].code2);
1521 	}
1522     } else {
1523 	fprintf(out, "\t0");
1524     }
1525     fprintf(out, "\n};\n\n");
1526 #else
1527     /*
1528      * Open the comp.dat file.
1529      */
1530     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "comp.dat", opath);
1531     if ((out = fopen(path, "wb")) == 0)
1532 	return;
1533 
1534     /*
1535      * Write the header.
1536      */
1537     hdr[1] = (ac_uint2) comps_used * 4;
1538     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1539 
1540     /*
1541      * Write out the byte count to maintain header size.
1542      */
1543     bytes = comps_used * sizeof(_comp_t);
1544     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1545 
1546     /*
1547      * Now, if comps exist, write them out.
1548      */
1549     if (comps_used > 0)
1550         fwrite((char *) comps, sizeof(_comp_t), comps_used, out);
1551 
1552     fclose(out);
1553 #endif
1554 
1555     /*****************************************************************
1556      *
1557      * Generate the decomposition data.
1558      *
1559      *****************************************************************/
1560 
1561     /*
1562      * Fully expand all decompositions before generating the output file.
1563      */
1564     expand_decomp();
1565 
1566 #if HARDCODE_DATA
1567     fprintf(out, PREF "ac_uint4 _ucdcmp_size = %ld;\n\n",
1568         decomps_used * 2L);
1569 
1570     fprintf(out, PREF "ac_uint4 _ucdcmp_nodes[] = {");
1571 
1572     if (decomps_used) {
1573 	/*
1574 	 * Write the list of decomp nodes.
1575 	 */
1576 	for (i = idx = 0; i < decomps_used; i++) {
1577 	    fprintf(out, "\n\t0x%08lx, 0x%08lx,",
1578 	        (unsigned long) decomps[i].code, (unsigned long) idx);
1579 	    idx += decomps[i].used;
1580 	}
1581 
1582 	/*
1583 	 * Write the sentinel index as the last decomp node.
1584 	 */
1585 	fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx);
1586 
1587 	fprintf(out, PREF "ac_uint4 _ucdcmp_decomp[] = {");
1588 	/*
1589 	 * Write the decompositions themselves.
1590 	 */
1591 	k = 0;
1592 	for (i = 0; i < decomps_used; i++)
1593 	  for (j=0; j<decomps[i].used; j++) {
1594 	    if (k) fprintf(out, ",");
1595 	    if (!(k&3)) fprintf(out,"\n\t");
1596 	    else fprintf(out, " ");
1597 	    k++;
1598 	    fprintf(out, "0x%08lx", (unsigned long) decomps[i].decomp[j]);
1599 	  }
1600 	fprintf(out, "\n};\n\n");
1601     }
1602 #else
1603     /*
1604      * Open the decomp.dat file.
1605      */
1606     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "decomp.dat", opath);
1607     if ((out = fopen(path, "wb")) == 0)
1608       return;
1609 
1610     hdr[1] = decomps_used;
1611 
1612     /*
1613      * Write the header.
1614      */
1615     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1616 
1617     /*
1618      * Write a temporary byte count which will be calculated as the
1619      * decompositions are written out.
1620      */
1621     bytes = 0;
1622     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1623 
1624     if (decomps_used) {
1625         /*
1626          * Write the list of decomp nodes.
1627          */
1628         for (i = idx = 0; i < decomps_used; i++) {
1629             fwrite((char *) &decomps[i].code, sizeof(ac_uint4), 1, out);
1630             fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1631             idx += decomps[i].used;
1632         }
1633 
1634         /*
1635          * Write the sentinel index as the last decomp node.
1636          */
1637         fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1638 
1639         /*
1640          * Write the decompositions themselves.
1641          */
1642         for (i = 0; i < decomps_used; i++)
1643           fwrite((char *) decomps[i].decomp, sizeof(ac_uint4),
1644                  decomps[i].used, out);
1645 
1646         /*
1647          * Seek back to the beginning and write the byte count.
1648          */
1649         bytes = (sizeof(ac_uint4) * idx) +
1650             (sizeof(ac_uint4) * ((hdr[1] << 1) + 1));
1651         fseek(out, sizeof(ac_uint2) << 1, 0L);
1652         fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1653 
1654         fclose(out);
1655     }
1656 #endif
1657 
1658 #ifdef HARDCODE_DATA
1659     fprintf(out, PREF "ac_uint4 _uckdcmp_size = %ld;\n\n",
1660         kdecomps_used * 2L);
1661 
1662     fprintf(out, PREF "ac_uint4 _uckdcmp_nodes[] = {");
1663 
1664     if (kdecomps_used) {
1665 	/*
1666 	 * Write the list of kdecomp nodes.
1667 	 */
1668 	for (i = idx = 0; i < kdecomps_used; i++) {
1669 	    fprintf(out, "\n\t0x%08lx, 0x%08lx,",
1670 	        (unsigned long) kdecomps[i].code, (unsigned long) idx);
1671 	    idx += kdecomps[i].used;
1672 	}
1673 
1674 	/*
1675 	 * Write the sentinel index as the last decomp node.
1676 	 */
1677 	fprintf(out, "\n\t0x%08lx\n};\n\n", (unsigned long) idx);
1678 
1679 	fprintf(out, PREF "ac_uint4 _uckdcmp_decomp[] = {");
1680 
1681 	/*
1682 	 * Write the decompositions themselves.
1683 	 */
1684 	k = 0;
1685 	for (i = 0; i < kdecomps_used; i++)
1686 	  for (j=0; j<kdecomps[i].used; j++) {
1687 	    if (k) fprintf(out, ",");
1688 	    if (!(k&3)) fprintf(out,"\n\t");
1689 	    else fprintf(out, " ");
1690 	    k++;
1691 	    fprintf(out, "0x%08lx", (unsigned long) kdecomps[i].decomp[j]);
1692 	  }
1693 	fprintf(out, "\n};\n\n");
1694     }
1695 #else
1696     /*
1697      * Open the kdecomp.dat file.
1698      */
1699     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "kdecomp.dat", opath);
1700     if ((out = fopen(path, "wb")) == 0)
1701       return;
1702 
1703     hdr[1] = kdecomps_used;
1704 
1705     /*
1706      * Write the header.
1707      */
1708     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1709 
1710     /*
1711      * Write a temporary byte count which will be calculated as the
1712      * decompositions are written out.
1713      */
1714     bytes = 0;
1715     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1716 
1717     if (kdecomps_used) {
1718         /*
1719          * Write the list of kdecomp nodes.
1720          */
1721         for (i = idx = 0; i < kdecomps_used; i++) {
1722             fwrite((char *) &kdecomps[i].code, sizeof(ac_uint4), 1, out);
1723             fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1724             idx += kdecomps[i].used;
1725         }
1726 
1727         /*
1728          * Write the sentinel index as the last decomp node.
1729          */
1730         fwrite((char *) &idx, sizeof(ac_uint4), 1, out);
1731 
1732         /*
1733          * Write the decompositions themselves.
1734          */
1735         for (i = 0; i < kdecomps_used; i++)
1736           fwrite((char *) kdecomps[i].decomp, sizeof(ac_uint4),
1737                  kdecomps[i].used, out);
1738 
1739         /*
1740          * Seek back to the beginning and write the byte count.
1741          */
1742         bytes = (sizeof(ac_uint4) * idx) +
1743             (sizeof(ac_uint4) * ((hdr[1] << 1) + 1));
1744         fseek(out, sizeof(ac_uint2) << 1, 0L);
1745         fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1746 
1747         fclose(out);
1748     }
1749 #endif
1750 
1751     /*****************************************************************
1752      *
1753      * Generate the combining class data.
1754      *
1755      *****************************************************************/
1756 #ifdef HARDCODE_DATA
1757     fprintf(out, PREF "ac_uint4 _uccmcl_size = %ld;\n\n", (long) ccl_used);
1758 
1759     fprintf(out, PREF "ac_uint4 _uccmcl_nodes[] = {");
1760 
1761     if (ccl_used > 0) {
1762 	/*
1763 	 * Write the combining class ranges out.
1764 	 */
1765 	for (i = 0; i<ccl_used; i++) {
1766 	    if (i) fprintf(out, ",");
1767 	    if (!(i&3)) fprintf(out, "\n\t");
1768 	    else fprintf(out, " ");
1769 	    fprintf(out, "0x%08lx", (unsigned long) ccl[i]);
1770 	}
1771     } else {
1772 	fprintf(out, "\t0");
1773     }
1774     fprintf(out, "\n};\n\n");
1775 #else
1776     /*
1777      * Open the cmbcl.dat file.
1778      */
1779     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "cmbcl.dat", opath);
1780     if ((out = fopen(path, "wb")) == 0)
1781       return;
1782 
1783     /*
1784      * Set the number of ranges used.  Each range has a combining class which
1785      * means each entry is a 3-tuple.
1786      */
1787     hdr[1] = ccl_used / 3;
1788 
1789     /*
1790      * Write the header.
1791      */
1792     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1793 
1794     /*
1795      * Write out the byte count to maintain header size.
1796      */
1797     bytes = ccl_used * sizeof(ac_uint4);
1798     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1799 
1800     if (ccl_used > 0)
1801       /*
1802        * Write the combining class ranges out.
1803        */
1804       fwrite((char *) ccl, sizeof(ac_uint4), ccl_used, out);
1805 
1806     fclose(out);
1807 #endif
1808 
1809     /*****************************************************************
1810      *
1811      * Generate the number data.
1812      *
1813      *****************************************************************/
1814 
1815 #if HARDCODE_DATA
1816     fprintf(out, PREF "ac_uint4 _ucnum_size = %lu;\n\n",
1817         (unsigned long)ncodes_used<<1);
1818 
1819     fprintf(out, PREF "ac_uint4 _ucnum_nodes[] = {");
1820 
1821     /*
1822      * Now, if number mappings exist, write them out.
1823      */
1824     if (ncodes_used > 0) {
1825 	for (i = 0; i<ncodes_used; i++) {
1826 	    if (i) fprintf(out, ",");
1827 	    if (!(i&1)) fprintf(out, "\n\t");
1828 	    else fprintf(out, " ");
1829 	    fprintf(out, "0x%08lx, 0x%08lx",
1830 	        (unsigned long) ncodes[i].code, (unsigned long) ncodes[i].idx);
1831 	}
1832 	fprintf(out, "\n};\n\n");
1833 
1834 	fprintf(out, PREF "short _ucnum_vals[] = {");
1835 	for (i = 0; i<nums_used; i++) {
1836 	    if (i) fprintf(out, ",");
1837 	    if (!(i&3)) fprintf(out, "\n\t");
1838 	    else fprintf(out, " ");
1839 	    if (nums[i].numerator < 0) {
1840 		fprintf(out, "%6d, 0x%04x",
1841 		  nums[i].numerator, nums[i].denominator);
1842 	    } else {
1843 		fprintf(out, "0x%04x, 0x%04x",
1844 		  nums[i].numerator, nums[i].denominator);
1845 	    }
1846 	}
1847 	fprintf(out, "\n};\n\n");
1848     }
1849 #else
1850     /*
1851      * Open the num.dat file.
1852      */
1853     snprintf(path, sizeof path, "%s" LDAP_DIRSEP "num.dat", opath);
1854     if ((out = fopen(path, "wb")) == 0)
1855       return;
1856 
1857     /*
1858      * The count part of the header will be the total number of codes that
1859      * have numbers.
1860      */
1861     hdr[1] = (ac_uint2) (ncodes_used << 1);
1862     bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t));
1863 
1864     /*
1865      * Write the header.
1866      */
1867     fwrite((char *) hdr, sizeof(ac_uint2), 2, out);
1868 
1869     /*
1870      * Write out the byte count to maintain header size.
1871      */
1872     fwrite((char *) &bytes, sizeof(ac_uint4), 1, out);
1873 
1874     /*
1875      * Now, if number mappings exist, write them out.
1876      */
1877     if (ncodes_used > 0) {
1878         fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out);
1879         fwrite((char *) nums, sizeof(_num_t), nums_used, out);
1880     }
1881 #endif
1882 
1883     fclose(out);
1884 }
1885 
/*
 * Print the command line synopsis for `prog' on stderr and terminate the
 * program with exit status 1.  Does not return.
 */
static void
usage(char *prog)
{
    fprintf(stderr,
            "Usage: %s [-o output-directory|-x composition-exclusions]"
            " datafile1 datafile2 ...\n\n"
            "-o output-directory\n\t\tWrite the output files to a different"
            " directory (default: .).\n"
            "-x composition-exclusion\n\t\tFile of composition codes"
            " that should be excluded.\n",
            prog);
    exit(1);
}
1900 
/*
 * Program entry point.
 *
 * Arguments are processed left to right:
 *   -o dir   directory the output is written to (default ".");
 *   -x file  composition exclusion file, loaded with read_compexdata();
 *   other    character data file, loaded with read_cdata().
 * An unknown option, or -o / -x without a following argument, prints the
 * usage message and exits.  A file that cannot be opened is reported on
 * stderr and skipped.  Once all arguments have been handled the tables
 * are written with write_cdata().
 *
 * Always returns 0 when it returns at all.
 */
int
main(int argc, char *argv[])
{
    FILE *in;
    char *prog, *opath;

    prog = lutil_progname( "ucgendat", argc, argv );

    opath = 0;
    in = stdin;

    argc--;
    argv++;

    while (argc > 0) {
        if (argv[0][0] == '-') {
            switch (argv[0][1]) {
              case 'o':
                /*
                 * The option needs a value; without this check the
                 * terminating null entry of argv would be used.
                 */
                if (argc < 2)
                  usage(prog);
                argc--;
                argv++;
                opath = argv[0];
                break;
              case 'x':
                if (argc < 2)
                  usage(prog);
                argc--;
                argv++;
                if ((in = fopen(argv[0], "r")) == 0)
                  fprintf(stderr,
                          "%s: unable to open composition exclusion file %s\n",
                          prog, argv[0]);
                else {
                    read_compexdata(in);
                    fclose(in);
                    in = 0;
                }
                break;
              default:
                usage(prog);
            }
        } else {
            if (in != stdin && in != NULL)
              fclose(in);
            if ((in = fopen(argv[0], "r")) == 0)
              fprintf(stderr, "%s: unable to open ctype file %s\n",
                      prog, argv[0]);
            else {
                read_cdata(in);
                fclose(in);
                in = 0;
            }
        }
        argc--;
        argv++;
    }

    if (opath == 0)
      opath = ".";
    write_cdata(opath);

    return 0;
}
1961