1 /**
2 ***     Build a deterministic finite automaton to associate CCSIDs with
3 ***             character set names.
4 ***
5 ***     Compile on OS/400 with options SYSIFCOPT(*IFSIO).
6 ***
7 ***     See Copyright for the status of this software.
8 ***
9 ***     Author: Patrick Monnerat <pm@datasphere.ch>, DATASPHERE S.A.
10 **/
11 
12 #include <stdio.h>
13 #include <errno.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <fcntl.h>
17 #include <ctype.h>
18 
19 #include <iconv.h>
20 
21 
22 #ifdef OLDXML
23 #include "xml.h"
24 #else
25 #include <libxml/hash.h>
26 #include <libxml/parser.h>
27 #include <libxml/xpath.h>
28 #include <libxml/xpathInternals.h>
29 #endif
30 
31 
32 #ifdef __OS400__
33 #define iconv_open_error(cd)            ((cd).return_value == -1)
34 #define set_iconv_open_error(cd)        ((cd).return_value = -1)
35 #else
36 #define iconv_open_error(cd)            ((cd) == (iconv_t) -1)
37 #define set_iconv_open_error(cd)        ((cd) = (iconv_t) -1)
38 #endif
39 
40 
41 #define C_SOURCE_CCSID          500
42 #define C_UTF8_CCSID            1208
43 
44 
45 #define UTF8_SPACE      0x20
46 #define UTF8_HT         0x09
47 #define UTF8_0          0x30
48 #define UTF8_9          0x39
49 #define UTF8_A          0x41
50 #define UTF8_Z          0x5A
51 #define UTF8_a          0x61
52 #define UTF8_z          0x7A
53 
54 
55 #define GRANULE         128             /* Memory allocation granule. */
56 
57 #define EPSILON         0x100           /* Token for empty transition. */
58 
59 
60 #ifndef OFFSETOF
61 #define OFFSETOF(t, f)  ((unsigned int) ((char *) &((t *) 0)->f - (char *) 0))
62 #endif
63 
64 #ifndef OFFSETBY
65 #define OFFSETBY(t, p, o)       ((t *) ((char *) (p) + (unsigned int) (o)))
66 #endif
67 
68 
69 typedef struct t_transition     t_transition;   /* NFA/DFA transition. */
70 typedef struct t_state          t_state;        /* NFA/DFA state node. */
71 typedef struct t_symlist        t_symlist;      /* Symbol (i.e.: name) list. */
72 typedef struct t_chset          t_chset;        /* Character set. */
73 typedef struct t_stategroup     t_stategroup;   /* Optimization group. */
74 typedef unsigned char           utf8char;       /* UTF-8 character byte. */
75 typedef unsigned char           byte;           /* Untyped data byte. */
76 
77 
78 typedef struct {                        /* Set of pointers. */
79         unsigned int    p_size;         /* Current allocated size. */
80         unsigned int    p_card;         /* Current element count. */
81         void *          p_set[1];       /* Element array. */
82 }               t_powerset;
83 
84 
85 struct t_transition {
86         t_transition *  t_forwprev;     /* Head of forward transition list. */
87         t_transition *  t_forwnext;     /* Tail of forward transition list. */
88         t_transition *  t_backprev;     /* Head of backward transition list. */
89         t_transition *  t_backnext;     /* Tail of backward transition list. */
90         t_state *       t_from;         /* Incoming state. */
91         t_state *       t_to;           /* Destination state. */
92         unsigned short  t_token;        /* Transition token. */
93         unsigned int    t_index;        /* Transition array index. */
94 };
95 
96 
97 struct t_state {
98         t_state *       s_next;         /* Next state (for DFA construction). */
99         t_state *       s_stack;        /* Unprocessed DFA states stack. */
100         t_transition *  s_forward;      /* Forward transitions. */
101         t_transition *  s_backward;     /* Backward transitions. */
102         t_chset *       s_final;        /* Recognized character set. */
103         t_powerset *    s_nfastates;    /* Corresponding NFA states. */
104         unsigned int    s_index;        /* State index. */
105 };
106 
107 
108 struct t_symlist {
109         t_symlist *     l_next;         /* Next name in list. */
110         utf8char        l_symbol[1];    /* Name bytes. */
111 };
112 
113 
114 struct t_chset {
115         t_chset *       c_next;         /* Next character set. */
116         t_symlist *     c_names;        /* Character set name list. */
117         iconv_t         c_fromUTF8;     /* Conversion from UTF-8. */
118         unsigned int    c_ccsid;        /* IBM character set code. */
119         unsigned int    c_mibenum;      /* IANA character code. */
120 };
121 
122 
123 struct t_stategroup {
124         t_stategroup *  g_next;         /* Next group. */
125         t_state *       g_member;       /* Group member (s_stack) list. */
126         unsigned int    g_id;           /* Group ident. */
127 };
128 
129 
130 
131 t_chset *       chset_list;             /* Character set list. */
132 t_state *       initial_state;          /* Initial NFA state. */
133 iconv_t         job2utf8;               /* Job CCSID to UTF-8 conversion. */
134 iconv_t         utf82job;               /* UTF-8 to job CCSID conversion. */
135 t_state *       dfa_states;             /* List of DFA states. */
136 unsigned int    groupid;                /* Group ident counter. */
137 
138 
139 /**
140 ***     UTF-8 strings.
141 **/
142 
143 #pragma convert(819)
144 
145 static const utf8char   utf8_MIBenum[] = "MIBenum";
146 static const utf8char   utf8_mibenum[] = "mibenum";
147 static const utf8char   utf8_ibm_[] = "ibm-";
148 static const utf8char   utf8_IBMCCSID[] = "IBMCCSID";
149 static const utf8char   utf8_iana_[] = "iana-";
150 static const utf8char   utf8_Name[] = "Name";
151 static const utf8char   utf8_Pref_MIME_Name[] = "Preferred MIME Name";
152 static const utf8char   utf8_Aliases[] = "Aliases";
153 static const utf8char   utf8_html[] = "html";
154 static const utf8char   utf8_htmluri[] = "http://www.w3.org/1999/xhtml";
155 static const utf8char   utf8_A[] = "A";
156 static const utf8char   utf8_C[] = "C";
157 static const utf8char   utf8_M[] = "M";
158 static const utf8char   utf8_N[] = "N";
159 static const utf8char   utf8_P[] = "P";
160 static const utf8char   utf8_T[] = "T";
161 static const utf8char   utf8_ccsid[] = "ccsid";
162 static const utf8char   utf8_EBCDIC[] = "EBCDIC";
163 static const utf8char   utf8_ASCII[] = "ASCII";
164 static const utf8char   utf8_assocnodes[] = "/ccsid_mibenum/assoc[@ccsid]";
165 static const utf8char   utf8_aliastext[] =
166                                 "/ccsid_mibenum/assoc[@ccsid=$C]/alias/text()";
167 #ifdef OLDXML
168 static const utf8char   utf8_tablerows[] =
169                         "//table[@id='table-character-sets-1']/*/tr";
170 static const utf8char   utf8_headerpos[] =
171                 "count(th[text()=$T]/preceding-sibling::th)+1";
172 static const utf8char   utf8_getmibenum[] = "number(td[$M])";
173 static const utf8char   utf8_getprefname[] = "string(td[$P])";
174 static const utf8char   utf8_getname[] = "string(td[$N])";
175 static const utf8char   utf8_getaliases[] = "td[$A]/text()";
176 #else
177 static const utf8char   utf8_tablerows[] =
178                         "//html:table[@id='table-character-sets-1']/*/html:tr";
179 static const utf8char   utf8_headerpos[] =
180                 "count(html:th[text()=$T]/preceding-sibling::html:th)+1";
181 static const utf8char   utf8_getmibenum[] = "number(html:td[$M])";
182 static const utf8char   utf8_getprefname[] = "string(html:td[$P])";
183 static const utf8char   utf8_getname[] = "string(html:td[$N])";
184 static const utf8char   utf8_getaliases[] = "html:td[$A]/text()";
185 #endif
186 
187 #pragma convert(0)
188 
189 
190 /**
191 ***     UTF-8 character length table.
192 ***
193 ***     Index is first character byte, value is the character byte count.
194 **/
195 
196 static signed char      utf8_chlen[] = {
197 /* 00-07 */     1,      1,      1,      1,      1,      1,      1,      1,
198 /* 08-0F */     1,      1,      1,      1,      1,      1,      1,      1,
199 /* 10-17 */     1,      1,      1,      1,      1,      1,      1,      1,
200 /* 18-1F */     1,      1,      1,      1,      1,      1,      1,      1,
201 /* 20-27 */     1,      1,      1,      1,      1,      1,      1,      1,
202 /* 28-2F */     1,      1,      1,      1,      1,      1,      1,      1,
203 /* 30-37 */     1,      1,      1,      1,      1,      1,      1,      1,
204 /* 38-3F */     1,      1,      1,      1,      1,      1,      1,      1,
205 /* 40-47 */     1,      1,      1,      1,      1,      1,      1,      1,
206 /* 48-4F */     1,      1,      1,      1,      1,      1,      1,      1,
207 /* 50-57 */     1,      1,      1,      1,      1,      1,      1,      1,
208 /* 58-5F */     1,      1,      1,      1,      1,      1,      1,      1,
209 /* 60-67 */     1,      1,      1,      1,      1,      1,      1,      1,
210 /* 68-6F */     1,      1,      1,      1,      1,      1,      1,      1,
211 /* 70-77 */     1,      1,      1,      1,      1,      1,      1,      1,
212 /* 78-7F */     1,      1,      1,      1,      1,      1,      1,      1,
213 /* 80-87 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
214 /* 88-8F */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
215 /* 90-97 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
216 /* 98-9F */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
217 /* A0-A7 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
218 /* A8-AF */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
219 /* B0-B7 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
220 /* B8-BF */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
221 /* C0-C7 */     2,      2,      2,      2,      2,      2,      2,      2,
222 /* C8-CF */     2,      2,      2,      2,      2,      2,      2,      2,
223 /* D0-D7 */     2,      2,      2,      2,      2,      2,      2,      2,
224 /* D8-DF */     2,      2,      2,      2,      2,      2,      2,      2,
225 /* E0-E7 */     3,      3,      3,      3,      3,      3,      3,      3,
226 /* E8-EF */     3,      3,      3,      3,      3,      3,      3,      3,
227 /* F0-F7 */     4,      4,      4,      4,      4,      4,      4,      4,
228 /* F8-FF */     5,      5,      5,      5,      6,      6,      -1,     -1
229 };
230 
231 
232 
233 void
chknull(void * p)234 chknull(void * p)
235 
236 {
237         if (p)
238                 return;
239 
240         fprintf(stderr, "Not enough memory\n");
241         exit(1);
242 }
243 
244 
245 void
makecode(char * buf,unsigned int ccsid)246 makecode(char * buf, unsigned int ccsid)
247 
248 {
249         ccsid &= 0xFFFF;
250         memset(buf, 0, 32);
251         sprintf(buf, "IBMCCSID%05u0000000", ccsid);
252 }
253 
254 
255 iconv_t
iconv_open_ccsid(unsigned int ccsidout,unsigned int ccsidin,unsigned int nullflag)256 iconv_open_ccsid(unsigned int ccsidout,
257                                 unsigned int ccsidin, unsigned int nullflag)
258 
259 {
260         char fromcode[33];
261         char tocode[33];
262 
263         makecode(fromcode, ccsidin);
264         makecode(tocode, ccsidout);
265         memset(tocode + 13, 0, sizeof tocode - 13);
266 
267         if (nullflag)
268                 fromcode[18] = '1';
269 
270         return iconv_open(tocode, fromcode);
271 }
272 
273 
274 unsigned int
getnum(char ** cpp)275 getnum(char * * cpp)
276 
277 {
278         unsigned int n;
279         char * cp;
280 
281         cp = *cpp;
282         n = 0;
283 
284         while (isdigit(*cp))
285                 n = 10 * n + *cp++ - '0';
286 
287         *cpp = cp;
288         return n;
289 }
290 
291 
292 const utf8char *
hashBinaryKey(const byte * bytes,unsigned int len)293 hashBinaryKey(const byte * bytes, unsigned int len)
294 
295 {
296         const byte * bp;
297         utf8char * key;
298         utf8char * cp;
299         unsigned int n;
300         unsigned int n4;
301         unsigned int i;
302 
303         /**
304         ***     Encode binary data in character form to be used as hash
305         ***             table key.
306         **/
307 
308         n = (4 * len + 2) / 3;
309         key = (utf8char *) malloc(n + 1);
310         chknull(key);
311         bp = bytes;
312         cp = key;
313 
314         for (n4 = n >> 2; n4; n4--) {
315                 i = (bp[0] << 16) | (bp[1] << 8) | bp[2];
316                 *cp++ = 0x21 + ((i >> 18) & 0x3F);
317                 *cp++ = 0x21 + ((i >> 12) & 0x3F);
318                 *cp++ = 0x21 + ((i >> 6) & 0x3F);
319                 *cp++ = 0x21 + (i & 0x3F);
320                 bp += 3;
321                 }
322 
323         switch (n & 0x3) {
324 
325         case 2:
326                 *cp++ = 0x21 + ((*bp >> 2) & 0x3F);
327                 *cp++ = 0x21 + ((*bp << 4) & 0x3F);
328                 break;
329 
330         case 3:
331                 i = (bp[0] << 8) | bp[1];
332                 *cp++ = 0x21 + ((i >> 10) & 0x3F);
333                 *cp++ = 0x21 + ((i >> 4) & 0x3F);
334                 *cp++ = 0x21 + ((i << 2) & 0x3F);
335                 break;
336                 }
337 
338         *cp = '\0';
339         return key;
340 }
341 
342 
343 void *
hash_get(xmlHashTablePtr h,const void * binkey,unsigned int len)344 hash_get(xmlHashTablePtr h, const void * binkey, unsigned int len)
345 
346 {
347         const utf8char * key;
348         void * result;
349 
350         key = hashBinaryKey((const byte *) binkey, len);
351         result = xmlHashLookup(h, key);
352         free((char *) key);
353         return result;
354 }
355 
356 
357 int
hash_add(xmlHashTablePtr h,const void * binkey,unsigned int len,void * data)358 hash_add(xmlHashTablePtr h, const void * binkey, unsigned int len, void * data)
359 
360 {
361         const utf8char * key;
362         int result;
363 
364         key = hashBinaryKey((const byte *) binkey, len);
365         result = xmlHashAddEntry(h, key, data);
366         free((char *) key);
367         return result;
368 }
369 
370 
371 xmlDocPtr
loadXMLFile(const char * filename)372 loadXMLFile(const char * filename)
373 
374 {
375         struct stat sbuf;
376         byte * databuf;
377         int fd;
378         int i;
379         xmlDocPtr doc;
380 
381         if (stat(filename, &sbuf))
382                 return (xmlDocPtr) NULL;
383 
384         databuf = malloc(sbuf.st_size + 4);
385 
386         if (!databuf)
387                 return (xmlDocPtr) NULL;
388 
389         fd = open(filename, O_RDONLY
390 #ifdef O_BINARY
391                                          | O_BINARY
392 #endif
393                                                         );
394 
395         if (fd < 0) {
396                 free((char *) databuf);
397                 return (xmlDocPtr) NULL;
398                 }
399 
400         i = read(fd, (char *) databuf, sbuf.st_size);
401         close(fd);
402 
403         if (i != sbuf.st_size) {
404                 free((char *) databuf);
405                 return (xmlDocPtr) NULL;
406                 }
407 
408         databuf[i] = databuf[i + 1] = databuf[i + 2] = databuf[i + 3] = 0;
409         doc = xmlParseMemory((xmlChar *) databuf, i);
410         free((char *) databuf);
411         return doc;
412 }
413 
414 
415 int
match(char ** cpp,char * s)416 match(char * * cpp, char * s)
417 
418 {
419         char * cp;
420         int c1;
421         int c2;
422 
423         cp = *cpp;
424 
425         for (cp = *cpp; c2 = *s++; cp++) {
426                 c1 = *cp;
427 
428                 if (c1 != c2) {
429                         if (isupper(c1))
430                                 c1 = tolower(c1);
431 
432                         if (isupper(c2))
433                                 c2 = tolower(c2);
434                         }
435 
436                 if (c1 != c2)
437                         return 0;
438                 }
439 
440         c1 = *cp;
441 
442         while (c1 == ' ' || c1 == '\t')
443                 c1 = *++cp;
444 
445         *cpp = cp;
446         return 1;
447 }
448 
449 
450 t_state *
newstate(void)451 newstate(void)
452 
453 {
454         t_state * s;
455 
456         s = (t_state *) malloc(sizeof *s);
457         chknull(s);
458         memset((char *) s, 0, sizeof *s);
459         return s;
460 }
461 
462 
463 void
unlink_transition(t_transition * t)464 unlink_transition(t_transition * t)
465 
466 {
467         if (t->t_backnext)
468                 t->t_backnext->t_backprev = t->t_backprev;
469 
470         if (t->t_backprev)
471                 t->t_backprev->t_backnext = t->t_backnext;
472         else if (t->t_to)
473                 t->t_to->s_backward = t->t_backnext;
474 
475         if (t->t_forwnext)
476                 t->t_forwnext->t_forwprev = t->t_forwprev;
477 
478         if (t->t_forwprev)
479                 t->t_forwprev->t_forwnext = t->t_forwnext;
480         else if (t->t_from)
481                 t->t_from->s_forward = t->t_forwnext;
482 
483         t->t_backprev = (t_transition *) NULL;
484         t->t_backnext = (t_transition *) NULL;
485         t->t_forwprev = (t_transition *) NULL;
486         t->t_forwnext = (t_transition *) NULL;
487         t->t_from = (t_state *) NULL;
488         t->t_to = (t_state *) NULL;
489 }
490 
491 
492 void
link_transition(t_transition * t,t_state * from,t_state * to)493 link_transition(t_transition * t, t_state * from, t_state * to)
494 
495 {
496         if (!from)
497                 from = t->t_from;
498 
499         if (!to)
500                 to = t->t_to;
501 
502         unlink_transition(t);
503 
504         if ((t->t_from = from)) {
505                 if ((t->t_forwnext = from->s_forward))
506                         t->t_forwnext->t_forwprev = t;
507 
508                 from->s_forward = t;
509                 }
510 
511         if ((t->t_to = to)) {
512                 if ((t->t_backnext = to->s_backward))
513                         t->t_backnext->t_backprev = t;
514 
515                 to->s_backward = t;
516                 }
517 }
518 
519 
520 t_transition *
newtransition(unsigned int token,t_state * from,t_state * to)521 newtransition(unsigned int token, t_state * from, t_state * to)
522 
523 {
524         t_transition * t;
525 
526         t = (t_transition *) malloc(sizeof *t);
527         chknull(t);
528         memset((char *) t, 0, sizeof *t);
529         t->t_token = token;
530         link_transition(t, from, to);
531         return t;
532 }
533 
534 
535 t_transition *
uniquetransition(unsigned int token,t_state * from,t_state * to)536 uniquetransition(unsigned int token, t_state * from, t_state * to)
537 
538 {
539         t_transition * t;
540 
541         for (t = from->s_forward; t; t = t->t_forwnext)
542                 if (t->t_token == token && (t->t_to == to || !to))
543                         return t;
544 
545         return to? newtransition(token, from, to): (t_transition *) NULL;
546 }
547 
548 
549 int
set_position(t_powerset * s,void * e)550 set_position(t_powerset * s, void * e)
551 
552 {
553         unsigned int l;
554         unsigned int h;
555         unsigned int m;
556         int i;
557 
558         l = 0;
559         h = s->p_card;
560 
561         while (l < h) {
562                 m = (l + h) >> 1;
563 
564                 /**
565                 ***     If both pointers belong to different allocation arenas,
566                 ***             native comparison may find them neither
567                 ***             equal, nor greater, nor smaller.
568                 ***     We thus compare using memcmp() to get an orthogonal
569                 ***             result.
570                 **/
571 
572                 i = memcmp(&e, s->p_set + m, sizeof e);
573 
574                 if (i < 0)
575                         h = m;
576                 else if (!i)
577                         return m;
578                 else
579                         l = m + 1;
580                 }
581 
582         return l;
583 }
584 
585 
586 t_powerset *
set_include(t_powerset * s,void * e)587 set_include(t_powerset * s, void * e)
588 
589 {
590         unsigned int pos;
591         unsigned int n;
592 
593         if (!s) {
594                 s = (t_powerset *) malloc(sizeof *s +
595                     GRANULE * sizeof s->p_set);
596                 chknull(s);
597                 s->p_size = GRANULE;
598                 s->p_set[GRANULE] = (t_state *) NULL;
599                 s->p_set[0] = e;
600                 s->p_card = 1;
601                 return s;
602                 }
603 
604         pos = set_position(s, e);
605 
606         if (pos < s->p_card && s->p_set[pos] == e)
607                 return s;
608 
609         if (s->p_card >= s->p_size) {
610                 s->p_size += GRANULE;
611                 s = (t_powerset *) realloc(s,
612                     sizeof *s + s->p_size * sizeof s->p_set);
613                 chknull(s);
614                 s->p_set[s->p_size] = (t_state *) NULL;
615                 }
616 
617         n = s->p_card - pos;
618 
619         if (n)
620                 memmove((char *) (s->p_set + pos + 1),
621                     (char *) (s->p_set + pos), n * sizeof s->p_set[0]);
622 
623         s->p_set[pos] = e;
624         s->p_card++;
625         return s;
626 }
627 
628 
629 t_state *
nfatransition(t_state * to,byte token)630 nfatransition(t_state * to, byte token)
631 
632 {
633         t_state * from;
634 
635         from = newstate();
636         newtransition(token, from, to);
637         return from;
638 }
639 
640 
641 static t_state *        nfadevelop(t_state * from, t_state * final, iconv_t icc,
642                                 const utf8char * name, unsigned int len);
643 
644 
645 void
nfaslice(t_state ** from,t_state ** to,iconv_t icc,const utf8char * chr,unsigned int chlen,const utf8char * name,unsigned int len,t_state * final)646 nfaslice(t_state * * from, t_state * * to, iconv_t icc,
647                 const utf8char * chr, unsigned int chlen,
648                 const utf8char * name, unsigned int len, t_state * final)
649 
650 {
651         char * srcp;
652         char * dstp;
653         size_t srcc;
654         size_t dstc;
655         unsigned int cnt;
656         t_state * f;
657         t_state * t;
658         t_transition * tp;
659         byte bytebuf[8];
660 
661         srcp = (char *) chr;
662         srcc = chlen;
663         dstp = (char *) bytebuf;
664         dstc = sizeof bytebuf;
665         iconv(icc, &srcp, &srcc, &dstp, &dstc);
666         dstp = (char *) bytebuf;
667         cnt = sizeof bytebuf - dstc;
668         t = *to;
669         f = *from;
670 
671         /**
672         ***     Check for end of string.
673         **/
674 
675         if (!len)
676                 if (t && t != final)
677                         uniquetransition(EPSILON, t, final);
678                 else
679                         t = final;
680 
681         if (f)
682                 while (cnt) {
683                         tp = uniquetransition(*dstp, f, (t_state *) NULL);
684 
685                         if (!tp)
686                                 break;
687 
688                         f = tp->t_to;
689                         dstp++;
690                         cnt--;
691                         }
692 
693         if (!cnt) {
694                 if (!t)
695                         t = nfadevelop(f, final, icc, name, len);
696 
697                 *to = t;
698                 return;
699                 }
700 
701         if (!t) {
702                 t = nfadevelop((t_state *) NULL, final, icc, name, len);
703                 *to = t;
704                 }
705 
706         if (!f)
707                 *from = f = newstate();
708 
709         while (cnt > 1)
710                 t = nfatransition(t, dstp[--cnt]);
711 
712         newtransition(*dstp, f, t);
713 }
714 
715 
716 t_state *
nfadevelop(t_state * from,t_state * final,iconv_t icc,const utf8char * name,unsigned int len)717 nfadevelop(t_state * from, t_state * final, iconv_t icc,
718                                         const utf8char * name, unsigned int len)
719 
720 {
721         int chlen;
722         int i;
723         t_state * to;
724         int uccnt;
725         int lccnt;
726         utf8char chr;
727 
728         chlen = utf8_chlen[*name];
729 
730         for (i = 1; i < chlen; i++)
731                 if ((name[i] & 0xC0) != 0x80)
732                         break;
733 
734         if (i != chlen) {
735                 fprintf(stderr,
736                     "Invalid UTF8 character in character set name\n");
737                 return (t_state *) NULL;
738                 }
739 
740         to = (t_state *) NULL;
741         nfaslice(&from, &to,
742             icc, name, chlen, name + chlen, len - chlen, final);
743 
744         if (*name >= UTF8_a && *name <= UTF8_z)
745                 chr = *name - UTF8_a + UTF8_A;
746         else if (*name >= UTF8_A && *name <= UTF8_Z)
747                 chr = *name - UTF8_A + UTF8_a;
748         else
749                 return from;
750 
751         nfaslice(&from, &to, icc, &chr, 1, name + chlen, len - chlen, final);
752         return from;
753 }
754 
755 
756 
757 void
nfaenter(const utf8char * name,int len,t_chset * charset)758 nfaenter(const utf8char * name, int len, t_chset * charset)
759 
760 {
761         t_chset * s;
762         t_state * final;
763         t_state * sp;
764         t_symlist * lp;
765 
766         /**
767         ***     Enter case-insensitive `name' in NFA in all known
768         ***             character codes.
769         ***     Redundant shift state changes as well as shift state
770         ***             differences between uppercase and lowercase are
771         ***             not handled.
772         **/
773 
774         if (len < 0)
775                 len = strlen(name) + 1;
776 
777         for (lp = charset->c_names; lp; lp = lp->l_next)
778                 if (!memcmp(name, lp->l_symbol, len))
779                         return;         /* Already entered. */
780 
781         lp = (t_symlist *) malloc(sizeof *lp + len);
782         chknull(lp);
783         memcpy(lp->l_symbol, name, len);
784         lp->l_symbol[len] = '\0';
785         lp->l_next = charset->c_names;
786         charset->c_names = lp;
787         final = newstate();
788         final->s_final = charset;
789 
790         for (s = chset_list; s; s = s->c_next)
791                 if (!iconv_open_error(s->c_fromUTF8))
792                         sp = nfadevelop(initial_state, final,
793                             s->c_fromUTF8, name, len);
794 }
795 
796 
797 unsigned int
utf8_utostr(utf8char * s,unsigned int v)798 utf8_utostr(utf8char * s, unsigned int v)
799 
800 {
801         unsigned int d;
802         unsigned int i;
803 
804         d = v / 10;
805         v -= d * 10;
806         i = d? utf8_utostr(s, d): 0;
807         s[i++] = v + UTF8_0;
808         s[i] = '\0';
809         return i;
810 }
811 
812 
813 unsigned int
utf8_utostrpad(utf8char * s,unsigned int v,int digits)814 utf8_utostrpad(utf8char * s, unsigned int v, int digits)
815 
816 {
817         unsigned int i = utf8_utostr(s, v);
818         utf8char pad = UTF8_SPACE;
819 
820         if (digits < 0) {
821                 pad = UTF8_0;
822                 digits = -digits;
823                 }
824 
825         if (i >= digits)
826                 return i;
827 
828         memmove(s + digits - i, s, i + 1);
829         memset(s, pad, digits - i);
830         return digits;
831 }
832 
833 
834 unsigned int
utf8_strtou(const utf8char * s)835 utf8_strtou(const utf8char * s)
836 
837 {
838         unsigned int v;
839 
840         while (*s == UTF8_SPACE || *s == UTF8_HT)
841                 s++;
842 
843         for (v = 0; *s >= UTF8_0 && *s <= UTF8_9;)
844                 v = 10 * v + *s++ - UTF8_0;
845 
846         return v;
847 }
848 
849 
850 unsigned int
getNumAttr(xmlNodePtr node,const xmlChar * name)851 getNumAttr(xmlNodePtr node, const xmlChar * name)
852 
853 {
854         const xmlChar * s;
855         unsigned int val;
856 
857         s = xmlGetProp(node, name);
858 
859         if (!s)
860                 return 0;
861 
862         val = utf8_strtou(s);
863         xmlFree((xmlChar *) s);
864         return val;
865 }
866 
867 
868 void
read_assocs(const char * filename)869 read_assocs(const char * filename)
870 
871 {
872         xmlDocPtr doc;
873         xmlXPathContextPtr ctxt;
874         xmlXPathObjectPtr obj;
875         xmlNodePtr node;
876         t_chset * sp;
877         int i;
878         unsigned int ccsid;
879         unsigned int mibenum;
880         utf8char symbuf[32];
881 
882         doc = loadXMLFile(filename);
883 
884         if (!doc) {
885                 fprintf(stderr, "Cannot load file %s\n", filename);
886                 exit(1);
887                 }
888 
889         ctxt = xmlXPathNewContext(doc);
890         obj = xmlXPathEval(utf8_assocnodes, ctxt);
891 
892         if (!obj || obj->type != XPATH_NODESET || !obj->nodesetval ||
893             !obj->nodesetval->nodeTab || !obj->nodesetval->nodeNr) {
894                 fprintf(stderr, "No association found in %s\n", filename);
895                 exit(1);
896                 }
897 
898         for (i = 0; i < obj->nodesetval->nodeNr; i++) {
899                 node = obj->nodesetval->nodeTab[i];
900                 ccsid = getNumAttr(node, utf8_ccsid);
901                 mibenum = getNumAttr(node, utf8_mibenum);
902 
903                 /**
904                 ***     Check for duplicate.
905                 **/
906 
907                 for (sp = chset_list; sp; sp = sp->c_next)
908                         if (ccsid && ccsid == sp->c_ccsid ||
909                             mibenum && mibenum == sp->c_mibenum) {
910                                 fprintf(stderr, "Duplicate character set: ");
911                                 fprintf(stderr, "CCSID = %u/%u, ",
912                                     ccsid, sp->c_ccsid);
913                                 fprintf(stderr, "MIBenum = %u/%u\n",
914                                     mibenum, sp->c_mibenum);
915                                 break;
916                                 }
917 
918                 if (sp)
919                         continue;
920 
921                 /**
922                 ***     Allocate the new character set.
923                 **/
924 
925                 sp = (t_chset *) malloc(sizeof *sp);
926                 chknull(sp);
927                 memset(sp, 0, sizeof *sp);
928 
929                 if (!ccsid)     /* Do not attempt with current job CCSID. */
930                         set_iconv_open_error(sp->c_fromUTF8);
931                 else {
932                         sp->c_fromUTF8 =
933                             iconv_open_ccsid(ccsid, C_UTF8_CCSID, 0);
934 
935                         if (iconv_open_error(sp->c_fromUTF8) == -1)
936                                 fprintf(stderr,
937                                     "Cannot convert into CCSID %u: ignored\n",
938                                     ccsid);
939                         }
940 
941                 sp->c_ccsid = ccsid;
942                 sp->c_mibenum = mibenum;
943                 sp->c_next = chset_list;
944                 chset_list = sp;
945                 }
946 
947         xmlXPathFreeObject(obj);
948 
949         /**
950         ***     Enter aliases.
951         **/
952 
953         for (sp = chset_list; sp; sp = sp->c_next) {
954                 strcpy(symbuf, utf8_ibm_);
955                 utf8_utostr(symbuf + 4, sp->c_ccsid);
956                 nfaenter(symbuf, -1, sp);
957                 strcpy(symbuf, utf8_IBMCCSID);
958                 utf8_utostrpad(symbuf + 8, sp->c_ccsid, -5);
959                 nfaenter(symbuf, 13, sp);       /* Not null-terminated. */
960 
961                 if (sp->c_mibenum) {
962                         strcpy(symbuf, utf8_iana_);
963                         utf8_utostr(symbuf + 5, sp->c_mibenum);
964                         nfaenter(symbuf, -1, sp);
965                         }
966 
967                 xmlXPathRegisterVariable(ctxt, utf8_C,
968                     xmlXPathNewFloat((double) sp->c_ccsid));
969                 obj = xmlXPathEval(utf8_aliastext, ctxt);
970 
971                 if (!obj || obj->type != XPATH_NODESET) {
972                         fprintf(stderr, "getAlias failed in %s\n", filename);
973                         exit(1);
974                         }
975 
976                 if (obj->nodesetval &&
977                     obj->nodesetval->nodeTab && obj->nodesetval->nodeNr) {
978                         for (i = 0; i < obj->nodesetval->nodeNr; i++) {
979                                 node = obj->nodesetval->nodeTab[i];
980                                 nfaenter(node->content, -1, sp);
981                                 }
982                         }
983 
984                 xmlXPathFreeObject(obj);
985                 }
986 
987         xmlXPathFreeContext(ctxt);
988         xmlFreeDoc(doc);
989 }
990 
991 
992 unsigned int
columnPosition(xmlXPathContextPtr ctxt,const xmlChar * header)993 columnPosition(xmlXPathContextPtr ctxt, const xmlChar * header)
994 
995 {
996         xmlXPathObjectPtr obj;
997         unsigned int res = 0;
998 
999         xmlXPathRegisterVariable(ctxt, utf8_T, xmlXPathNewString(header));
1000         obj = xmlXPathEval(utf8_headerpos, ctxt);
1001 
1002         if (obj) {
1003                 if (obj->type == XPATH_NUMBER)
1004                         res = (unsigned int) obj->floatval;
1005 
1006                 xmlXPathFreeObject(obj);
1007                 }
1008 
1009         return res;
1010 }
1011 
1012 
1013 void
read_iana(const char * filename)1014 read_iana(const char * filename)
1015 
1016 {
1017         xmlDocPtr doc;
1018         xmlXPathContextPtr ctxt;
1019         xmlXPathObjectPtr obj1;
1020         xmlXPathObjectPtr obj2;
1021         xmlNodePtr node;
1022         int prefnamecol;
1023         int namecol;
1024         int mibenumcol;
1025         int aliascol;
1026         int mibenum;
1027         t_chset * sp;
1028         int n;
1029         int i;
1030 
1031         doc = loadXMLFile(filename);
1032 
1033         if (!doc) {
1034                 fprintf(stderr, "Cannot load file %s\n", filename);
1035                 exit(1);
1036                 }
1037 
1038         ctxt = xmlXPathNewContext(doc);
1039 
1040 #ifndef OLDXML
1041         xmlXPathRegisterNs(ctxt, utf8_html, utf8_htmluri);
1042 #endif
1043 
1044         obj1 = xmlXPathEval(utf8_tablerows, ctxt);
1045 
1046         if (!obj1 || obj1->type != XPATH_NODESET || !obj1->nodesetval ||
1047             !obj1->nodesetval->nodeTab || obj1->nodesetval->nodeNr <= 1) {
1048                 fprintf(stderr, "No data in %s\n", filename);
1049                 exit(1);
1050                 }
1051 
1052         /**
1053         ***     Identify columns.
1054         **/
1055 
1056         xmlXPathSetContextNode(obj1->nodesetval->nodeTab[0], ctxt);
1057         prefnamecol = columnPosition(ctxt, utf8_Pref_MIME_Name);
1058         namecol = columnPosition(ctxt, utf8_Name);
1059         mibenumcol = columnPosition(ctxt, utf8_MIBenum);
1060         aliascol = columnPosition(ctxt, utf8_Aliases);
1061 
1062         if (!prefnamecol || !namecol || !mibenumcol || !aliascol) {
1063                 fprintf(stderr, "Key column(s) missing in %s\n", filename);
1064                 exit(1);
1065                 }
1066 
1067         xmlXPathRegisterVariable(ctxt, utf8_P,
1068             xmlXPathNewFloat((double) prefnamecol));
1069         xmlXPathRegisterVariable(ctxt, utf8_N,
1070             xmlXPathNewFloat((double) namecol));
1071         xmlXPathRegisterVariable(ctxt, utf8_M,
1072             xmlXPathNewFloat((double) mibenumcol));
1073         xmlXPathRegisterVariable(ctxt, utf8_A,
1074             xmlXPathNewFloat((double) aliascol));
1075 
1076         /**
1077         ***     Process each row.
1078         **/
1079 
1080         for (n = 1; n < obj1->nodesetval->nodeNr; n++) {
1081                 xmlXPathSetContextNode(obj1->nodesetval->nodeTab[n], ctxt);
1082 
1083                 /**
1084                 ***     Get the MIBenum from current row.
1085                 */
1086 
1087                 obj2 = xmlXPathEval(utf8_getmibenum, ctxt);
1088 
1089                 if (!obj2 || obj2->type != XPATH_NUMBER) {
1090                         fprintf(stderr, "get MIBenum failed at row %u\n", n);
1091                         exit(1);
1092                         }
1093 
1094                 if (xmlXPathIsNaN(obj2->floatval) ||
1095                     obj2->floatval < 1.0 || obj2->floatval > 65535.0 ||
1096                     ((unsigned int) obj2->floatval) != obj2->floatval) {
1097                         fprintf(stderr, "invalid MIBenum at row %u\n", n);
1098                         xmlXPathFreeObject(obj2);
1099                         continue;
1100                         }
1101 
1102                 mibenum = obj2->floatval;
1103                 xmlXPathFreeObject(obj2);
1104 
1105                 /**
1106                 ***     Search the associations for a corresponding CCSID.
1107                 **/
1108 
1109                 for (sp = chset_list; sp; sp = sp->c_next)
1110                         if (sp->c_mibenum == mibenum)
1111                                 break;
1112 
1113                 if (!sp)
1114                         continue;       /* No CCSID for this MIBenum. */
1115 
1116                 /**
1117                 ***     Process preferred MIME name.
1118                 **/
1119 
1120                 obj2 = xmlXPathEval(utf8_getprefname, ctxt);
1121 
1122                 if (!obj2 || obj2->type != XPATH_STRING) {
1123                         fprintf(stderr,
1124                             "get Preferred_MIME_Name failed at row %u\n", n);
1125                         exit(1);
1126                         }
1127 
1128                 if (obj2->stringval && obj2->stringval[0])
1129                         nfaenter(obj2->stringval, -1, sp);
1130 
1131                 xmlXPathFreeObject(obj2);
1132 
1133                 /**
1134                 ***     Process name.
1135                 **/
1136 
1137                 obj2 = xmlXPathEval(utf8_getname, ctxt);
1138 
1139                 if (!obj2 || obj2->type != XPATH_STRING) {
1140                         fprintf(stderr, "get name failed at row %u\n", n);
1141                         exit(1);
1142                         }
1143 
1144                 if (obj2->stringval && obj2->stringval[0])
1145                         nfaenter(obj2->stringval, -1, sp);
1146 
1147                 xmlXPathFreeObject(obj2);
1148 
1149                 /**
1150                 ***     Process aliases.
1151                 **/
1152 
1153                 obj2 = xmlXPathEval(utf8_getaliases, ctxt);
1154 
1155                 if (!obj2 || obj2->type != XPATH_NODESET) {
1156                         fprintf(stderr, "get aliases failed at row %u\n", n);
1157                         exit(1);
1158                         }
1159 
1160                 if (obj2->nodesetval && obj2->nodesetval->nodeTab)
1161                         for (i = 0; i < obj2->nodesetval->nodeNr; i++) {
1162                                 node = obj2->nodesetval->nodeTab[i];
1163 
1164                                 if (node && node->content && node->content[0])
1165                                         nfaenter(node->content, -1, sp);
1166                                 }
1167 
1168                 xmlXPathFreeObject(obj2);
1169                 }
1170 
1171         xmlXPathFreeObject(obj1);
1172         xmlXPathFreeContext(ctxt);
1173         xmlFreeDoc(doc);
1174 }
1175 
1176 
1177 t_powerset *    closureset(t_powerset * dst, t_powerset * src);
1178 
1179 
1180 t_powerset *
closure(t_powerset * dst,t_state * src)1181 closure(t_powerset * dst, t_state * src)
1182 
1183 {
1184         t_transition * t;
1185         unsigned int oldcard;
1186 
1187         if (src->s_nfastates) {
1188                 /**
1189                 ***     Is a DFA state: return closure of set of equivalent
1190                 ***             NFA states.
1191                 **/
1192 
1193                 return closureset(dst, src->s_nfastates);
1194                 }
1195 
1196         /**
1197         ***     Compute closure of NFA state.
1198         **/
1199 
1200         dst = set_include(dst, src);
1201 
1202         for (t = src->s_forward; t; t = t->t_forwnext)
1203                 if (t->t_token == EPSILON) {
1204                         oldcard = dst->p_card;
1205                         dst = set_include(dst, t->t_to);
1206 
1207                         if (oldcard != dst->p_card)
1208                                 dst = closure(dst, t->t_to);
1209                         }
1210 
1211         return dst;
1212 }
1213 
1214 
1215 t_powerset *
closureset(t_powerset * dst,t_powerset * src)1216 closureset(t_powerset * dst, t_powerset * src)
1217 
1218 {
1219         unsigned int i;
1220 
1221         for (i = 0; i < src->p_card; i++)
1222                 dst = closure(dst, (t_state *) src->p_set[i]);
1223 
1224         return dst;
1225 }
1226 
1227 
1228 t_state *
get_dfa_state(t_state ** stack,t_powerset * nfastates,xmlHashTablePtr sethash)1229 get_dfa_state(t_state * * stack,
1230                         t_powerset * nfastates, xmlHashTablePtr sethash)
1231 
1232 {
1233         t_state * s;
1234 
1235         if (s = hash_get(sethash, nfastates->p_set,
1236             nfastates->p_card * sizeof nfastates->p_set[0])) {
1237                 /**
1238                 ***     DFA state already present.
1239                 ***     Release the NFA state set and return
1240                 ***             the address of the old DFA state.
1241                 **/
1242 
1243                 free((char *) nfastates);
1244                 return s;
1245                 }
1246 
1247         /**
1248         ***     Build the new state.
1249         **/
1250 
1251         s = newstate();
1252         s->s_nfastates = nfastates;
1253         s->s_next = dfa_states;
1254         dfa_states = s;
1255         s->s_stack = *stack;
1256         *stack = s;
1257 
1258         /**
1259         ***     Enter it in hash.
1260         **/
1261 
1262         if (hash_add(sethash, nfastates->p_set,
1263             nfastates->p_card * sizeof nfastates->p_set[0], s))
1264                 chknull(NULL);          /* Memory allocation error. */
1265 
1266         return s;
1267 }
1268 
1269 
1270 int
transcmp(const void * p1,const void * p2)1271 transcmp(const void * p1, const void * p2)
1272 
1273 {
1274         t_transition * t1;
1275         t_transition * t2;
1276 
1277         t1 = *(t_transition * *) p1;
1278         t2 = *(t_transition * *) p2;
1279         return ((int) t1->t_token) - ((int) t2->t_token);
1280 }
1281 
1282 
1283 void
builddfa(void)1284 builddfa(void)
1285 
1286 {
1287         t_powerset * transset;
1288         t_powerset * stateset;
1289         t_state * s;
1290         t_state * s2;
1291         unsigned int n;
1292         unsigned int i;
1293         unsigned int token;
1294         t_transition * t;
1295         t_state * stack;
1296         xmlHashTablePtr sethash;
1297         unsigned int nst;
1298 
1299         transset = set_include(NULL, NULL);
1300         chknull(transset);
1301         stateset = set_include(NULL, NULL);
1302         chknull(stateset);
1303         sethash = xmlHashCreate(1);
1304         chknull(sethash);
1305         dfa_states = (t_state *) NULL;
1306         stack = (t_state *) NULL;
1307         nst = 0;
1308 
1309         /**
1310         ***     Build the DFA initial state.
1311         **/
1312 
1313         get_dfa_state(&stack, closure(NULL, initial_state), sethash);
1314 
1315         /**
1316         ***     Build the other DFA states by looking at each
1317         ***             possible transition from stacked DFA states.
1318         **/
1319 
1320         do {
1321                 if (!(++nst % 100))
1322                         fprintf(stderr, "%u DFA states\n", nst);
1323 
1324                 s = stack;
1325                 stack = s->s_stack;
1326                 s->s_stack = (t_state *) NULL;
1327 
1328                 /**
1329                 ***     Build a set of all non-epsilon transitions from this
1330                 ***             state.
1331                 **/
1332 
1333                 transset->p_card = 0;
1334 
1335                 for (n = 0; n < s->s_nfastates->p_card; n++) {
1336                         s2 = s->s_nfastates->p_set[n];
1337 
1338                         for (t = s2->s_forward; t; t = t->t_forwnext)
1339                                 if (t->t_token != EPSILON) {
1340                                         transset = set_include(transset, t);
1341                                         chknull(transset);
1342                                         }
1343                         }
1344 
1345                 /**
1346                 ***     Sort transitions by token.
1347                 **/
1348 
1349                 qsort(transset->p_set, transset->p_card,
1350                     sizeof transset->p_set[0], transcmp);
1351 
1352                 /**
1353                 ***     Process all transitions, grouping them by token.
1354                 **/
1355 
1356                 stateset->p_card = 0;
1357                 token = EPSILON;
1358 
1359                 for (i = 0; i < transset->p_card; i++) {
1360                         t = transset->p_set[i];
1361 
1362                         if (token != t->t_token) {
1363                                 if (stateset->p_card) {
1364                                         /**
1365                                         ***     Get the equivalent DFA state
1366                                         ***             and create transition.
1367                                         **/
1368 
1369                                         newtransition(token, s,
1370                                             get_dfa_state(&stack,
1371                                             closureset(NULL, stateset),
1372                                             sethash));
1373                                         stateset->p_card = 0;
1374                                         }
1375 
1376                                 token = t->t_token;
1377                                 }
1378 
1379                         stateset = set_include(stateset, t->t_to);
1380                         }
1381 
1382                 if (stateset->p_card)
1383                         newtransition(token, s, get_dfa_state(&stack,
1384                             closureset(NULL, stateset), sethash));
1385         } while (stack);
1386 
1387         free((char *) transset);
1388         free((char *) stateset);
1389         xmlHashFree(sethash, NULL);
1390 
1391         /**
1392         ***     Reverse the state list to get the initial state first,
1393         ***             check for ambiguous prefixes, determine final states,
1394         ***             destroy NFA state sets.
1395         **/
1396 
1397         while (s = dfa_states) {
1398                 dfa_states = s->s_next;
1399                 s->s_next = stack;
1400                 stack = s;
1401                 stateset = s->s_nfastates;
1402                 s->s_nfastates = (t_powerset *) NULL;
1403 
1404                 for (n = 0; n < stateset->p_card; n++) {
1405                         s2 = (t_state *) stateset->p_set[n];
1406 
1407                         if (s2->s_final) {
1408                                 if (s->s_final && s->s_final != s2->s_final)
1409                                         fprintf(stderr,
1410                                             "Ambiguous name for CCSIDs %u/%u\n",
1411                                             s->s_final->c_ccsid,
1412                                             s2->s_final->c_ccsid);
1413 
1414                                 s->s_final = s2->s_final;
1415                                 }
1416                         }
1417 
1418                 free((char *) stateset);
1419                 }
1420 
1421         dfa_states = stack;
1422 }
1423 
1424 
1425 void
deletenfa(void)1426 deletenfa(void)
1427 
1428 {
1429         t_transition * t;
1430         t_state * s;
1431         t_state * u;
1432         t_state * stack;
1433 
1434         stack = initial_state;
1435         stack->s_stack = (t_state *) NULL;
1436 
1437         while ((s = stack)) {
1438                 stack = s->s_stack;
1439 
1440                 while ((t = s->s_forward)) {
1441                         u = t->t_to;
1442                         unlink_transition(t);
1443                         free((char *) t);
1444 
1445                         if (!u->s_backward) {
1446                                 u->s_stack = stack;
1447                                 stack = u;
1448                                 }
1449                         }
1450 
1451                 free((char *) s);
1452                 }
1453 }
1454 
1455 
1456 t_stategroup *
newgroup(void)1457 newgroup(void)
1458 
1459 {
1460         t_stategroup * g;
1461 
1462         g = (t_stategroup *) malloc(sizeof *g);
1463         chknull(g);
1464         memset((char *) g, 0, sizeof *g);
1465         g->g_id = groupid++;
1466         return g;
1467 }
1468 
1469 
1470 void
optimizedfa(void)1471 optimizedfa(void)
1472 
1473 {
1474         unsigned int i;
1475         xmlHashTablePtr h;
1476         t_state * s1;
1477         t_state * s2;
1478         t_state * finstates;
1479         t_state * * sp;
1480         t_stategroup * g1;
1481         t_stategroup * g2;
1482         t_stategroup * ghead;
1483         t_transition * t1;
1484         t_transition * t2;
1485         unsigned int done;
1486         unsigned int startgroup;
1487         unsigned int gtrans[1 << (8 * sizeof(unsigned char))];
1488 
1489         /**
1490         ***     Reduce DFA state count.
1491         **/
1492 
1493         groupid = 0;
1494         ghead = (t_stategroup *) NULL;
1495 
1496         /**
1497         ***     First split: non-final and each distinct final states.
1498         **/
1499 
1500         h = xmlHashCreate(4);
1501         chknull(h);
1502 
1503         for (s1 = dfa_states; s1; s1 = s1->s_next) {
1504                 if (!(g1 = hash_get(h, &s1->s_final, sizeof s1->s_final))) {
1505                         g1 = newgroup();
1506                         g1->g_next = ghead;
1507                         ghead = g1;
1508 
1509                         if (hash_add(h, &s1->s_final, sizeof s1->s_final, g1))
1510                                 chknull(NULL);  /* Memory allocation error. */
1511                         }
1512 
1513                 s1->s_index = g1->g_id;
1514                 s1->s_stack = g1->g_member;
1515                 g1->g_member = s1;
1516                 }
1517 
1518         xmlHashFree(h, NULL);
1519 
1520         /**
1521         ***     Subsequent splits: states that have the same forward
1522         ***             transition tokens to states in the same group.
1523         **/
1524 
1525         do {
1526                 done = 1;
1527 
1528                 for (g2 = ghead; g2; g2 = g2->g_next) {
1529                         s1 = g2->g_member;
1530 
1531                         if (!s1->s_stack)
1532                                 continue;
1533 
1534                         h = xmlHashCreate(1);
1535                         chknull(h);
1536 
1537                         /**
1538                         ***     Build the group transition map.
1539                         **/
1540 
1541                         memset((char *) gtrans, ~0, sizeof gtrans);
1542 
1543                         for (t1 = s1->s_forward; t1; t1 = t1->t_forwnext)
1544                                 gtrans[t1->t_token] = t1->t_to->s_index;
1545 
1546                         if (hash_add(h, gtrans, sizeof gtrans, g2))
1547                                 chknull(NULL);
1548 
1549                         /**
1550                         ***     Process other states in group.
1551                         **/
1552 
1553                         sp = &s1->s_stack;
1554                         s1 = *sp;
1555 
1556                         do {
1557                                 *sp = s1->s_stack;
1558 
1559                                 /**
1560                                 ***     Build the transition map.
1561                                 **/
1562 
1563                                 memset((char *) gtrans, ~0, sizeof gtrans);
1564 
1565                                 for (t1 = s1->s_forward;
1566                                     t1; t1 = t1->t_forwnext)
1567                                         gtrans[t1->t_token] = t1->t_to->s_index;
1568 
1569                                 g1 = hash_get(h, gtrans, sizeof gtrans);
1570 
1571                                 if (g1 == g2) {
1572                                         *sp = s1;
1573                                         sp = &s1->s_stack;
1574                                         }
1575                                 else {
1576                                         if (!g1) {
1577                                                 g1 = newgroup();
1578                                                 g1->g_next = ghead;
1579                                                 ghead = g1;
1580 
1581                                                 if (hash_add(h, gtrans,
1582                                                     sizeof gtrans, g1))
1583                                                         chknull(NULL);
1584                                                 }
1585 
1586                                         s1->s_index = g1->g_id;
1587                                         s1->s_stack = g1->g_member;
1588                                         g1->g_member = s1;
1589                                         done = 0;
1590                                         }
1591                         } while (s1 = *sp);
1592 
1593                         xmlHashFree(h, NULL);
1594                         }
1595         } while (!done);
1596 
1597         /**
1598         ***     Establish group leaders and remap transitions.
1599         **/
1600 
1601         startgroup = dfa_states->s_index;
1602 
1603         for (g1 = ghead; g1; g1 = g1->g_next)
1604                 for (s1 = g1->g_member->s_stack; s1; s1 = s1->s_stack)
1605                         for (t1 = s1->s_backward; t1; t1 = t2) {
1606                                 t2 = t1->t_backnext;
1607                                 link_transition(t1, NULL, g1->g_member);
1608                                 }
1609 
1610         /**
1611         ***     Remove redundant states and transitions.
1612         **/
1613 
1614         for (g1 = ghead; g1; g1 = g1->g_next) {
1615                 g1->g_member->s_next = (t_state *) NULL;
1616 
1617                 while ((s1 = g1->g_member->s_stack)) {
1618                         g1->g_member->s_stack = s1->s_stack;
1619 
1620                         for (t1 = s1->s_forward; t1; t1 = t2) {
1621                                 t2 = t1->t_forwnext;
1622                                 unlink_transition(t1);
1623                                 free((char *) t1);
1624                                 }
1625 
1626                         free((char *) s1);
1627                         }
1628                 }
1629 
1630         /**
1631         ***     Remove group support and relink DFA states.
1632         **/
1633 
1634         dfa_states = (t_state *) NULL;
1635         s2 = (t_state *) NULL;
1636         finstates = (t_state *) NULL;
1637 
1638         while (g1 = ghead) {
1639                 ghead = g1->g_next;
1640                 s1 = g1->g_member;
1641 
1642                 if (g1->g_id == startgroup)
1643                         dfa_states = s1;        /* Keep start state first. */
1644                 else if (s1->s_final) {         /* Then final states. */
1645                         s1->s_next = finstates;
1646                         finstates = s1;
1647                         }
1648                 else {                  /* Finish with non-final states. */
1649                         s1->s_next = s2;
1650                         s2 = s1;
1651                         }
1652 
1653                 free((char *) g1);
1654                 }
1655 
1656         for (dfa_states->s_next = finstates; finstates->s_next;)
1657                 finstates = finstates->s_next;
1658 
1659         finstates->s_next = s2;
1660 }
1661 
1662 
1663 const char *
inttype(unsigned long max)1664 inttype(unsigned long max)
1665 
1666 {
1667         int i;
1668 
1669         for (i = 0; max; i++)
1670                 max >>= 1;
1671 
1672         if (i > 8 * sizeof(unsigned int))
1673                 return "unsigned long";
1674 
1675         if (i > 8 * sizeof(unsigned short))
1676                 return "unsigned int";
1677 
1678         if (i > 8 * sizeof(unsigned char))
1679                 return "unsigned short";
1680 
1681         return "unsigned char";
1682 }
1683 
1684 
listids(FILE * fp)1685 listids(FILE * fp)
1686 
1687 {
1688         unsigned int pos;
1689         t_chset * cp;
1690         t_symlist * lp;
1691         char * srcp;
1692         char * dstp;
1693         size_t srcc;
1694         size_t dstc;
1695         char buf[80];
1696 
1697         fprintf(fp, "/**\n***     CCSID   For arg   Recognized name.\n");
1698         pos = 0;
1699 
1700         for (cp = chset_list; cp; cp = cp->c_next) {
1701                 if (pos) {
1702                         fprintf(fp, "\n");
1703                         pos = 0;
1704                         }
1705 
1706                 if (!cp->c_names)
1707                         continue;
1708 
1709                 pos = fprintf(fp, "***     %5u      %c     ", cp->c_ccsid,
1710                     iconv_open_error(cp->c_fromUTF8)? ' ': 'X');
1711 
1712                 for (lp = cp->c_names; lp; lp = lp->l_next) {
1713                         srcp = (char *) lp->l_symbol;
1714                         srcc = strlen(srcp);
1715                         dstp = buf;
1716                         dstc = sizeof buf;
1717                         iconv(utf82job, &srcp, &srcc, &dstp, &dstc);
1718                         srcc = dstp - buf;
1719 
1720                         if (pos + srcc > 79) {
1721                                 fprintf(fp, "\n***%22c", ' ');
1722                                 pos = 25;
1723                                 }
1724 
1725                         pos += fprintf(fp, " %.*s", srcc, buf);
1726                         }
1727                 }
1728 
1729         if (pos)
1730                 fprintf(fp, "\n");
1731 
1732         fprintf(fp, "**/\n\n");
1733 }
1734 
1735 
1736 void
generate(FILE * fp)1737 generate(FILE * fp)
1738 
1739 {
1740         unsigned int nstates;
1741         unsigned int ntrans;
1742         unsigned int maxfinal;
1743         t_state * s;
1744         t_transition * t;
1745         unsigned int i;
1746         unsigned int pos;
1747         char * ns;
1748 
1749         /**
1750         ***     Assign indexes to states and transitions.
1751         **/
1752 
1753         nstates = 0;
1754         ntrans = 0;
1755         maxfinal = 0;
1756 
1757         for (s = dfa_states; s; s = s->s_next) {
1758                 s->s_index = nstates++;
1759 
1760                 if (s->s_final)
1761                         maxfinal = nstates;
1762 
1763                 for (t = s->s_forward; t; t = t->t_forwnext)
1764                         t->t_index = ntrans++;
1765                 }
1766 
1767         fprintf(fp,
1768             "/**\n***     %u states, %u finals, %u transitions.\n**/\n\n",
1769             nstates, maxfinal, ntrans);
1770         fprintf(stderr, "%u states, %u finals, %u transitions.\n",
1771             nstates, maxfinal, ntrans);
1772 
1773         /**
1774         ***     Generate types.
1775         **/
1776 
1777         fprintf(fp, "typedef unsigned short          t_ccsid;\n");
1778         fprintf(fp, "typedef %-23s t_staterange;\n", inttype(nstates));
1779         fprintf(fp, "typedef %-23s t_transrange;\n\n", inttype(ntrans));
1780 
1781         /**
1782         ***     Generate first transition index for each state.
1783         **/
1784 
1785         fprintf(fp, "static t_transrange     trans_array[] = {\n");
1786         pos = 0;
1787         ntrans = 0;
1788 
1789         for (s = dfa_states; s; s = s->s_next) {
1790                 pos += fprintf(fp, " %u,", ntrans);
1791 
1792                 if (pos > 72) {
1793                         fprintf(fp, "\n");
1794                         pos = 0;
1795                         }
1796 
1797                 for (t = s->s_forward; t; t = t->t_forwnext)
1798                         ntrans++;
1799                 }
1800 
1801         fprintf(fp, " %u\n};\n\n", ntrans);
1802 
1803         /**
1804         ***     Generate final state info.
1805         **/
1806 
1807         fprintf(fp, "static t_ccsid          final_array[] = {\n");
1808         pos = 0;
1809         ns ="";
1810         i = 0;
1811 
1812         for (s = dfa_states; s && i++ < maxfinal; s = s->s_next) {
1813                 pos += fprintf(fp, "%s", ns);
1814                 ns = ",";
1815 
1816                 if (pos > 72) {
1817                         fprintf(fp, "\n");
1818                         pos = 0;
1819                         }
1820 
1821                 pos += fprintf(fp, " %u",
1822                     s->s_final? s->s_final->c_ccsid + 1: 0);
1823                 }
1824 
1825         fprintf(fp, "\n};\n\n");
1826 
1827         /**
1828         ***     Generate goto table.
1829         **/
1830 
1831         fprintf(fp, "static t_staterange     goto_array[] = {\n");
1832         pos = 0;
1833 
1834         for (s = dfa_states; s; s = s->s_next)
1835                 for (t = s->s_forward; t; t = t->t_forwnext) {
1836                         pos += fprintf(fp, " %u,", t->t_to->s_index);
1837 
1838                         if (pos > 72) {
1839                                 fprintf(fp, "\n");
1840                                 pos = 0;
1841                                 }
1842                         }
1843 
1844         fprintf(fp, " %u\n};\n\n", nstates);
1845 
1846         /**
1847         ***     Generate transition label table.
1848         **/
1849 
1850         fprintf(fp, "static unsigned char    label_array[] = {\n");
1851         pos = 0;
1852         ns ="";
1853 
1854         for (s = dfa_states; s; s = s->s_next)
1855                 for (t = s->s_forward; t; t = t->t_forwnext) {
1856                         pos += fprintf(fp, "%s", ns);
1857                         ns = ",";
1858 
1859                         if (pos > 72) {
1860                                 fprintf(fp, "\n");
1861                                 pos = 0;
1862                                 }
1863 
1864                         pos += fprintf(fp, " 0x%02X", t->t_token);
1865                         }
1866 
1867         fprintf(fp, "\n};\n", nstates);
1868 }
1869 
1870 
main(argc,argv)1871 main(argc, argv)
1872 int argc;
1873 char * * argv;
1874 
1875 {
1876         FILE * fp;
1877         t_chset * csp;
1878         char symbuf[20];
1879 
1880         chset_list = (t_chset *) NULL;
1881         initial_state = newstate();
1882         job2utf8 = iconv_open_ccsid(C_UTF8_CCSID, C_SOURCE_CCSID, 0);
1883         utf82job = iconv_open_ccsid(C_SOURCE_CCSID, C_UTF8_CCSID, 0);
1884 
1885         if (argc != 4) {
1886                 fprintf(stderr, "Usage: %s <ccsid-mibenum file> ", *argv);
1887                 fprintf(stderr, "<iana-character-set file> <output file>\n");
1888                 exit(1);
1889                 }
1890 
1891         /**
1892         ***     Read CCSID/MIBenum associations. Define special names.
1893         **/
1894 
1895         read_assocs(argv[1]);
1896 
1897         /**
1898         ***     Read character set names and establish the case-independent
1899         ***             name DFA in all possible CCSIDs.
1900         **/
1901 
1902         read_iana(argv[2]);
1903 
1904         /**
1905         ***     Build DFA from NFA.
1906         **/
1907 
1908         builddfa();
1909 
1910         /**
1911         ***     Delete NFA.
1912         **/
1913 
1914         deletenfa();
1915 
1916         /**
1917         ***     Minimize the DFA state count.
1918         **/
1919 
1920         optimizedfa();
1921 
1922         /**
1923         ***     Generate the table.
1924         **/
1925 
1926         fp = fopen(argv[3], "w+");
1927 
1928         if (!fp) {
1929                 perror(argv[3]);
1930                 exit(1);
1931                 }
1932 
1933         fprintf(fp, "/**\n");
1934         fprintf(fp, "***     Character set names table.\n");
1935         fprintf(fp, "***     Generated by program BLDCSNDFA from");
1936         fprintf(fp, " IANA character set assignment file\n");
1937         fprintf(fp, "***          and CCSID/MIBenum equivalence file.\n");
1938         fprintf(fp, "***     *** Do not edit by hand ***\n");
1939         fprintf(fp, "**/\n\n");
1940         listids(fp);
1941         generate(fp);
1942 
1943         if (ferror(fp)) {
1944                 perror(argv[3]);
1945                 fclose(fp);
1946                 exit(1);
1947                 }
1948 
1949         fclose(fp);
1950         iconv_close(job2utf8);
1951         iconv_close(utf82job);
1952         exit(0);
1953 }
1954