1 /**
2 *** Build a deterministic finite automaton to associate CCSIDs with
3 *** character set names.
4 ***
5 *** Compile on OS/400 with options SYSIFCOPT(*IFSIO).
6 ***
7 *** See Copyright for the status of this software.
8 ***
9 *** Author: Patrick Monnerat <pm@datasphere.ch>, DATASPHERE S.A.
10 **/
11
12 #include <stdio.h>
13 #include <errno.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <fcntl.h>
17 #include <ctype.h>
18
19 #include <iconv.h>
20
21
22 #ifdef OLDXML
23 #include "xml.h"
24 #else
25 #include <libxml/hash.h>
26 #include <libxml/parser.h>
27 #include <libxml/xpath.h>
28 #include <libxml/xpathInternals.h>
29 #endif
30
31
32 #ifdef __OS400__
33 #define iconv_open_error(cd) ((cd).return_value == -1)
34 #define set_iconv_open_error(cd) ((cd).return_value = -1)
35 #else
36 #define iconv_open_error(cd) ((cd) == (iconv_t) -1)
37 #define set_iconv_open_error(cd) ((cd) = (iconv_t) -1)
38 #endif
39
40
41 #define C_SOURCE_CCSID 500
42 #define C_UTF8_CCSID 1208
43
44
45 #define UTF8_SPACE 0x20
46 #define UTF8_HT 0x09
47 #define UTF8_0 0x30
48 #define UTF8_9 0x39
49 #define UTF8_A 0x41
50 #define UTF8_Z 0x5A
51 #define UTF8_a 0x61
52 #define UTF8_z 0x7A
53
54
55 #define GRANULE 128 /* Memory allocation granule. */
56
57 #define EPSILON 0x100 /* Token for empty transition. */
58
59
60 #ifndef OFFSETOF
61 #define OFFSETOF(t, f) ((unsigned int) ((char *) &((t *) 0)->f - (char *) 0))
62 #endif
63
64 #ifndef OFFSETBY
65 #define OFFSETBY(t, p, o) ((t *) ((char *) (p) + (unsigned int) (o)))
66 #endif
67
68
69 typedef struct t_transition t_transition; /* NFA/DFA transition. */
70 typedef struct t_state t_state; /* NFA/DFA state node. */
71 typedef struct t_symlist t_symlist; /* Symbol (i.e.: name) list. */
72 typedef struct t_chset t_chset; /* Character set. */
73 typedef struct t_stategroup t_stategroup; /* Optimization group. */
74 typedef unsigned char utf8char; /* UTF-8 character byte. */
75 typedef unsigned char byte; /* Untyped data byte. */
76
77
78 typedef struct { /* Set of pointers. */
79 unsigned int p_size; /* Current allocated size. */
80 unsigned int p_card; /* Current element count. */
81 void * p_set[1]; /* Element array. */
82 } t_powerset;
83
84
85 struct t_transition {
86 t_transition * t_forwprev; /* Head of forward transition list. */
87 t_transition * t_forwnext; /* Tail of forward transition list. */
88 t_transition * t_backprev; /* Head of backward transition list. */
89 t_transition * t_backnext; /* Tail of backward transition list. */
90 t_state * t_from; /* Incoming state. */
91 t_state * t_to; /* Destination state. */
92 unsigned short t_token; /* Transition token. */
93 unsigned int t_index; /* Transition array index. */
94 };
95
96
97 struct t_state {
98 t_state * s_next; /* Next state (for DFA construction). */
99 t_state * s_stack; /* Unprocessed DFA states stack. */
100 t_transition * s_forward; /* Forward transitions. */
101 t_transition * s_backward; /* Backward transitions. */
102 t_chset * s_final; /* Recognized character set. */
103 t_powerset * s_nfastates; /* Corresponding NFA states. */
104 unsigned int s_index; /* State index. */
105 };
106
107
108 struct t_symlist {
109 t_symlist * l_next; /* Next name in list. */
110 utf8char l_symbol[1]; /* Name bytes. */
111 };
112
113
114 struct t_chset {
115 t_chset * c_next; /* Next character set. */
116 t_symlist * c_names; /* Character set name list. */
117 iconv_t c_fromUTF8; /* Conversion from UTF-8. */
118 unsigned int c_ccsid; /* IBM character set code. */
119 unsigned int c_mibenum; /* IANA character code. */
120 };
121
122
123 struct t_stategroup {
124 t_stategroup * g_next; /* Next group. */
125 t_state * g_member; /* Group member (s_stack) list. */
126 unsigned int g_id; /* Group ident. */
127 };
128
129
130
131 t_chset * chset_list; /* Character set list. */
132 t_state * initial_state; /* Initial NFA state. */
133 iconv_t job2utf8; /* Job CCSID to UTF-8 conversion. */
134 iconv_t utf82job; /* UTF-8 to job CCSID conversion. */
135 t_state * dfa_states; /* List of DFA states. */
136 unsigned int groupid; /* Group ident counter. */
137
138
139 /**
140 *** UTF-8 strings.
141 **/
142
143 #pragma convert(819)
144
145 static const utf8char utf8_MIBenum[] = "MIBenum";
146 static const utf8char utf8_mibenum[] = "mibenum";
147 static const utf8char utf8_ibm_[] = "ibm-";
148 static const utf8char utf8_IBMCCSID[] = "IBMCCSID";
149 static const utf8char utf8_iana_[] = "iana-";
150 static const utf8char utf8_Name[] = "Name";
151 static const utf8char utf8_Pref_MIME_Name[] = "Preferred MIME Name";
152 static const utf8char utf8_Aliases[] = "Aliases";
153 static const utf8char utf8_html[] = "html";
154 static const utf8char utf8_htmluri[] = "http://www.w3.org/1999/xhtml";
155 static const utf8char utf8_A[] = "A";
156 static const utf8char utf8_C[] = "C";
157 static const utf8char utf8_M[] = "M";
158 static const utf8char utf8_N[] = "N";
159 static const utf8char utf8_P[] = "P";
160 static const utf8char utf8_T[] = "T";
161 static const utf8char utf8_ccsid[] = "ccsid";
162 static const utf8char utf8_EBCDIC[] = "EBCDIC";
163 static const utf8char utf8_ASCII[] = "ASCII";
164 static const utf8char utf8_assocnodes[] = "/ccsid_mibenum/assoc[@ccsid]";
165 static const utf8char utf8_aliastext[] =
166 "/ccsid_mibenum/assoc[@ccsid=$C]/alias/text()";
167 #ifdef OLDXML
168 static const utf8char utf8_tablerows[] =
169 "//table[@id='table-character-sets-1']/*/tr";
170 static const utf8char utf8_headerpos[] =
171 "count(th[text()=$T]/preceding-sibling::th)+1";
172 static const utf8char utf8_getmibenum[] = "number(td[$M])";
173 static const utf8char utf8_getprefname[] = "string(td[$P])";
174 static const utf8char utf8_getname[] = "string(td[$N])";
175 static const utf8char utf8_getaliases[] = "td[$A]/text()";
176 #else
177 static const utf8char utf8_tablerows[] =
178 "//html:table[@id='table-character-sets-1']/*/html:tr";
179 static const utf8char utf8_headerpos[] =
180 "count(html:th[text()=$T]/preceding-sibling::html:th)+1";
181 static const utf8char utf8_getmibenum[] = "number(html:td[$M])";
182 static const utf8char utf8_getprefname[] = "string(html:td[$P])";
183 static const utf8char utf8_getname[] = "string(html:td[$N])";
184 static const utf8char utf8_getaliases[] = "html:td[$A]/text()";
185 #endif
186
187 #pragma convert(0)
188
189
190 /**
191 *** UTF-8 character length table.
192 ***
193 *** Index is first character byte, value is the character byte count.
194 **/
195
196 static signed char utf8_chlen[] = {
197 /* 00-07 */ 1, 1, 1, 1, 1, 1, 1, 1,
198 /* 08-0F */ 1, 1, 1, 1, 1, 1, 1, 1,
199 /* 10-17 */ 1, 1, 1, 1, 1, 1, 1, 1,
200 /* 18-1F */ 1, 1, 1, 1, 1, 1, 1, 1,
201 /* 20-27 */ 1, 1, 1, 1, 1, 1, 1, 1,
202 /* 28-2F */ 1, 1, 1, 1, 1, 1, 1, 1,
203 /* 30-37 */ 1, 1, 1, 1, 1, 1, 1, 1,
204 /* 38-3F */ 1, 1, 1, 1, 1, 1, 1, 1,
205 /* 40-47 */ 1, 1, 1, 1, 1, 1, 1, 1,
206 /* 48-4F */ 1, 1, 1, 1, 1, 1, 1, 1,
207 /* 50-57 */ 1, 1, 1, 1, 1, 1, 1, 1,
208 /* 58-5F */ 1, 1, 1, 1, 1, 1, 1, 1,
209 /* 60-67 */ 1, 1, 1, 1, 1, 1, 1, 1,
210 /* 68-6F */ 1, 1, 1, 1, 1, 1, 1, 1,
211 /* 70-77 */ 1, 1, 1, 1, 1, 1, 1, 1,
212 /* 78-7F */ 1, 1, 1, 1, 1, 1, 1, 1,
213 /* 80-87 */ -1, -1, -1, -1, -1, -1, -1, -1,
214 /* 88-8F */ -1, -1, -1, -1, -1, -1, -1, -1,
215 /* 90-97 */ -1, -1, -1, -1, -1, -1, -1, -1,
216 /* 98-9F */ -1, -1, -1, -1, -1, -1, -1, -1,
217 /* A0-A7 */ -1, -1, -1, -1, -1, -1, -1, -1,
218 /* A8-AF */ -1, -1, -1, -1, -1, -1, -1, -1,
219 /* B0-B7 */ -1, -1, -1, -1, -1, -1, -1, -1,
220 /* B8-BF */ -1, -1, -1, -1, -1, -1, -1, -1,
221 /* C0-C7 */ 2, 2, 2, 2, 2, 2, 2, 2,
222 /* C8-CF */ 2, 2, 2, 2, 2, 2, 2, 2,
223 /* D0-D7 */ 2, 2, 2, 2, 2, 2, 2, 2,
224 /* D8-DF */ 2, 2, 2, 2, 2, 2, 2, 2,
225 /* E0-E7 */ 3, 3, 3, 3, 3, 3, 3, 3,
226 /* E8-EF */ 3, 3, 3, 3, 3, 3, 3, 3,
227 /* F0-F7 */ 4, 4, 4, 4, 4, 4, 4, 4,
228 /* F8-FF */ 5, 5, 5, 5, 6, 6, -1, -1
229 };
230
231
232
233 void
chknull(void * p)234 chknull(void * p)
235
236 {
237 if (p)
238 return;
239
240 fprintf(stderr, "Not enough memory\n");
241 exit(1);
242 }
243
244
245 void
makecode(char * buf,unsigned int ccsid)246 makecode(char * buf, unsigned int ccsid)
247
248 {
249 ccsid &= 0xFFFF;
250 memset(buf, 0, 32);
251 sprintf(buf, "IBMCCSID%05u0000000", ccsid);
252 }
253
254
255 iconv_t
iconv_open_ccsid(unsigned int ccsidout,unsigned int ccsidin,unsigned int nullflag)256 iconv_open_ccsid(unsigned int ccsidout,
257 unsigned int ccsidin, unsigned int nullflag)
258
259 {
260 char fromcode[33];
261 char tocode[33];
262
263 makecode(fromcode, ccsidin);
264 makecode(tocode, ccsidout);
265 memset(tocode + 13, 0, sizeof tocode - 13);
266
267 if (nullflag)
268 fromcode[18] = '1';
269
270 return iconv_open(tocode, fromcode);
271 }
272
273
274 unsigned int
getnum(char ** cpp)275 getnum(char * * cpp)
276
277 {
278 unsigned int n;
279 char * cp;
280
281 cp = *cpp;
282 n = 0;
283
284 while (isdigit(*cp))
285 n = 10 * n + *cp++ - '0';
286
287 *cpp = cp;
288 return n;
289 }
290
291
292 const utf8char *
hashBinaryKey(const byte * bytes,unsigned int len)293 hashBinaryKey(const byte * bytes, unsigned int len)
294
295 {
296 const byte * bp;
297 utf8char * key;
298 utf8char * cp;
299 unsigned int n;
300 unsigned int n4;
301 unsigned int i;
302
303 /**
304 *** Encode binary data in character form to be used as hash
305 *** table key.
306 **/
307
308 n = (4 * len + 2) / 3;
309 key = (utf8char *) malloc(n + 1);
310 chknull(key);
311 bp = bytes;
312 cp = key;
313
314 for (n4 = n >> 2; n4; n4--) {
315 i = (bp[0] << 16) | (bp[1] << 8) | bp[2];
316 *cp++ = 0x21 + ((i >> 18) & 0x3F);
317 *cp++ = 0x21 + ((i >> 12) & 0x3F);
318 *cp++ = 0x21 + ((i >> 6) & 0x3F);
319 *cp++ = 0x21 + (i & 0x3F);
320 bp += 3;
321 }
322
323 switch (n & 0x3) {
324
325 case 2:
326 *cp++ = 0x21 + ((*bp >> 2) & 0x3F);
327 *cp++ = 0x21 + ((*bp << 4) & 0x3F);
328 break;
329
330 case 3:
331 i = (bp[0] << 8) | bp[1];
332 *cp++ = 0x21 + ((i >> 10) & 0x3F);
333 *cp++ = 0x21 + ((i >> 4) & 0x3F);
334 *cp++ = 0x21 + ((i << 2) & 0x3F);
335 break;
336 }
337
338 *cp = '\0';
339 return key;
340 }
341
342
343 void *
hash_get(xmlHashTablePtr h,const void * binkey,unsigned int len)344 hash_get(xmlHashTablePtr h, const void * binkey, unsigned int len)
345
346 {
347 const utf8char * key;
348 void * result;
349
350 key = hashBinaryKey((const byte *) binkey, len);
351 result = xmlHashLookup(h, key);
352 free((char *) key);
353 return result;
354 }
355
356
357 int
hash_add(xmlHashTablePtr h,const void * binkey,unsigned int len,void * data)358 hash_add(xmlHashTablePtr h, const void * binkey, unsigned int len, void * data)
359
360 {
361 const utf8char * key;
362 int result;
363
364 key = hashBinaryKey((const byte *) binkey, len);
365 result = xmlHashAddEntry(h, key, data);
366 free((char *) key);
367 return result;
368 }
369
370
371 xmlDocPtr
loadXMLFile(const char * filename)372 loadXMLFile(const char * filename)
373
374 {
375 struct stat sbuf;
376 byte * databuf;
377 int fd;
378 int i;
379 xmlDocPtr doc;
380
381 if (stat(filename, &sbuf))
382 return (xmlDocPtr) NULL;
383
384 databuf = malloc(sbuf.st_size + 4);
385
386 if (!databuf)
387 return (xmlDocPtr) NULL;
388
389 fd = open(filename, O_RDONLY
390 #ifdef O_BINARY
391 | O_BINARY
392 #endif
393 );
394
395 if (fd < 0) {
396 free((char *) databuf);
397 return (xmlDocPtr) NULL;
398 }
399
400 i = read(fd, (char *) databuf, sbuf.st_size);
401 close(fd);
402
403 if (i != sbuf.st_size) {
404 free((char *) databuf);
405 return (xmlDocPtr) NULL;
406 }
407
408 databuf[i] = databuf[i + 1] = databuf[i + 2] = databuf[i + 3] = 0;
409 doc = xmlParseMemory((xmlChar *) databuf, i);
410 free((char *) databuf);
411 return doc;
412 }
413
414
415 int
match(char ** cpp,char * s)416 match(char * * cpp, char * s)
417
418 {
419 char * cp;
420 int c1;
421 int c2;
422
423 cp = *cpp;
424
425 for (cp = *cpp; c2 = *s++; cp++) {
426 c1 = *cp;
427
428 if (c1 != c2) {
429 if (isupper(c1))
430 c1 = tolower(c1);
431
432 if (isupper(c2))
433 c2 = tolower(c2);
434 }
435
436 if (c1 != c2)
437 return 0;
438 }
439
440 c1 = *cp;
441
442 while (c1 == ' ' || c1 == '\t')
443 c1 = *++cp;
444
445 *cpp = cp;
446 return 1;
447 }
448
449
450 t_state *
newstate(void)451 newstate(void)
452
453 {
454 t_state * s;
455
456 s = (t_state *) malloc(sizeof *s);
457 chknull(s);
458 memset((char *) s, 0, sizeof *s);
459 return s;
460 }
461
462
463 void
unlink_transition(t_transition * t)464 unlink_transition(t_transition * t)
465
466 {
467 if (t->t_backnext)
468 t->t_backnext->t_backprev = t->t_backprev;
469
470 if (t->t_backprev)
471 t->t_backprev->t_backnext = t->t_backnext;
472 else if (t->t_to)
473 t->t_to->s_backward = t->t_backnext;
474
475 if (t->t_forwnext)
476 t->t_forwnext->t_forwprev = t->t_forwprev;
477
478 if (t->t_forwprev)
479 t->t_forwprev->t_forwnext = t->t_forwnext;
480 else if (t->t_from)
481 t->t_from->s_forward = t->t_forwnext;
482
483 t->t_backprev = (t_transition *) NULL;
484 t->t_backnext = (t_transition *) NULL;
485 t->t_forwprev = (t_transition *) NULL;
486 t->t_forwnext = (t_transition *) NULL;
487 t->t_from = (t_state *) NULL;
488 t->t_to = (t_state *) NULL;
489 }
490
491
492 void
link_transition(t_transition * t,t_state * from,t_state * to)493 link_transition(t_transition * t, t_state * from, t_state * to)
494
495 {
496 if (!from)
497 from = t->t_from;
498
499 if (!to)
500 to = t->t_to;
501
502 unlink_transition(t);
503
504 if ((t->t_from = from)) {
505 if ((t->t_forwnext = from->s_forward))
506 t->t_forwnext->t_forwprev = t;
507
508 from->s_forward = t;
509 }
510
511 if ((t->t_to = to)) {
512 if ((t->t_backnext = to->s_backward))
513 t->t_backnext->t_backprev = t;
514
515 to->s_backward = t;
516 }
517 }
518
519
520 t_transition *
newtransition(unsigned int token,t_state * from,t_state * to)521 newtransition(unsigned int token, t_state * from, t_state * to)
522
523 {
524 t_transition * t;
525
526 t = (t_transition *) malloc(sizeof *t);
527 chknull(t);
528 memset((char *) t, 0, sizeof *t);
529 t->t_token = token;
530 link_transition(t, from, to);
531 return t;
532 }
533
534
535 t_transition *
uniquetransition(unsigned int token,t_state * from,t_state * to)536 uniquetransition(unsigned int token, t_state * from, t_state * to)
537
538 {
539 t_transition * t;
540
541 for (t = from->s_forward; t; t = t->t_forwnext)
542 if (t->t_token == token && (t->t_to == to || !to))
543 return t;
544
545 return to? newtransition(token, from, to): (t_transition *) NULL;
546 }
547
548
549 int
set_position(t_powerset * s,void * e)550 set_position(t_powerset * s, void * e)
551
552 {
553 unsigned int l;
554 unsigned int h;
555 unsigned int m;
556 int i;
557
558 l = 0;
559 h = s->p_card;
560
561 while (l < h) {
562 m = (l + h) >> 1;
563
564 /**
565 *** If both pointers belong to different allocation arenas,
566 *** native comparison may find them neither
567 *** equal, nor greater, nor smaller.
568 *** We thus compare using memcmp() to get an orthogonal
569 *** result.
570 **/
571
572 i = memcmp(&e, s->p_set + m, sizeof e);
573
574 if (i < 0)
575 h = m;
576 else if (!i)
577 return m;
578 else
579 l = m + 1;
580 }
581
582 return l;
583 }
584
585
586 t_powerset *
set_include(t_powerset * s,void * e)587 set_include(t_powerset * s, void * e)
588
589 {
590 unsigned int pos;
591 unsigned int n;
592
593 if (!s) {
594 s = (t_powerset *) malloc(sizeof *s +
595 GRANULE * sizeof s->p_set);
596 chknull(s);
597 s->p_size = GRANULE;
598 s->p_set[GRANULE] = (t_state *) NULL;
599 s->p_set[0] = e;
600 s->p_card = 1;
601 return s;
602 }
603
604 pos = set_position(s, e);
605
606 if (pos < s->p_card && s->p_set[pos] == e)
607 return s;
608
609 if (s->p_card >= s->p_size) {
610 s->p_size += GRANULE;
611 s = (t_powerset *) realloc(s,
612 sizeof *s + s->p_size * sizeof s->p_set);
613 chknull(s);
614 s->p_set[s->p_size] = (t_state *) NULL;
615 }
616
617 n = s->p_card - pos;
618
619 if (n)
620 memmove((char *) (s->p_set + pos + 1),
621 (char *) (s->p_set + pos), n * sizeof s->p_set[0]);
622
623 s->p_set[pos] = e;
624 s->p_card++;
625 return s;
626 }
627
628
629 t_state *
nfatransition(t_state * to,byte token)630 nfatransition(t_state * to, byte token)
631
632 {
633 t_state * from;
634
635 from = newstate();
636 newtransition(token, from, to);
637 return from;
638 }
639
640
641 static t_state * nfadevelop(t_state * from, t_state * final, iconv_t icc,
642 const utf8char * name, unsigned int len);
643
644
645 void
nfaslice(t_state ** from,t_state ** to,iconv_t icc,const utf8char * chr,unsigned int chlen,const utf8char * name,unsigned int len,t_state * final)646 nfaslice(t_state * * from, t_state * * to, iconv_t icc,
647 const utf8char * chr, unsigned int chlen,
648 const utf8char * name, unsigned int len, t_state * final)
649
650 {
651 char * srcp;
652 char * dstp;
653 size_t srcc;
654 size_t dstc;
655 unsigned int cnt;
656 t_state * f;
657 t_state * t;
658 t_transition * tp;
659 byte bytebuf[8];
660
661 srcp = (char *) chr;
662 srcc = chlen;
663 dstp = (char *) bytebuf;
664 dstc = sizeof bytebuf;
665 iconv(icc, &srcp, &srcc, &dstp, &dstc);
666 dstp = (char *) bytebuf;
667 cnt = sizeof bytebuf - dstc;
668 t = *to;
669 f = *from;
670
671 /**
672 *** Check for end of string.
673 **/
674
675 if (!len)
676 if (t && t != final)
677 uniquetransition(EPSILON, t, final);
678 else
679 t = final;
680
681 if (f)
682 while (cnt) {
683 tp = uniquetransition(*dstp, f, (t_state *) NULL);
684
685 if (!tp)
686 break;
687
688 f = tp->t_to;
689 dstp++;
690 cnt--;
691 }
692
693 if (!cnt) {
694 if (!t)
695 t = nfadevelop(f, final, icc, name, len);
696
697 *to = t;
698 return;
699 }
700
701 if (!t) {
702 t = nfadevelop((t_state *) NULL, final, icc, name, len);
703 *to = t;
704 }
705
706 if (!f)
707 *from = f = newstate();
708
709 while (cnt > 1)
710 t = nfatransition(t, dstp[--cnt]);
711
712 newtransition(*dstp, f, t);
713 }
714
715
716 t_state *
nfadevelop(t_state * from,t_state * final,iconv_t icc,const utf8char * name,unsigned int len)717 nfadevelop(t_state * from, t_state * final, iconv_t icc,
718 const utf8char * name, unsigned int len)
719
720 {
721 int chlen;
722 int i;
723 t_state * to;
724 int uccnt;
725 int lccnt;
726 utf8char chr;
727
728 chlen = utf8_chlen[*name];
729
730 for (i = 1; i < chlen; i++)
731 if ((name[i] & 0xC0) != 0x80)
732 break;
733
734 if (i != chlen) {
735 fprintf(stderr,
736 "Invalid UTF8 character in character set name\n");
737 return (t_state *) NULL;
738 }
739
740 to = (t_state *) NULL;
741 nfaslice(&from, &to,
742 icc, name, chlen, name + chlen, len - chlen, final);
743
744 if (*name >= UTF8_a && *name <= UTF8_z)
745 chr = *name - UTF8_a + UTF8_A;
746 else if (*name >= UTF8_A && *name <= UTF8_Z)
747 chr = *name - UTF8_A + UTF8_a;
748 else
749 return from;
750
751 nfaslice(&from, &to, icc, &chr, 1, name + chlen, len - chlen, final);
752 return from;
753 }
754
755
756
757 void
nfaenter(const utf8char * name,int len,t_chset * charset)758 nfaenter(const utf8char * name, int len, t_chset * charset)
759
760 {
761 t_chset * s;
762 t_state * final;
763 t_state * sp;
764 t_symlist * lp;
765
766 /**
767 *** Enter case-insensitive `name' in NFA in all known
768 *** character codes.
769 *** Redundant shift state changes as well as shift state
770 *** differences between uppercase and lowercase are
771 *** not handled.
772 **/
773
774 if (len < 0)
775 len = strlen(name) + 1;
776
777 for (lp = charset->c_names; lp; lp = lp->l_next)
778 if (!memcmp(name, lp->l_symbol, len))
779 return; /* Already entered. */
780
781 lp = (t_symlist *) malloc(sizeof *lp + len);
782 chknull(lp);
783 memcpy(lp->l_symbol, name, len);
784 lp->l_symbol[len] = '\0';
785 lp->l_next = charset->c_names;
786 charset->c_names = lp;
787 final = newstate();
788 final->s_final = charset;
789
790 for (s = chset_list; s; s = s->c_next)
791 if (!iconv_open_error(s->c_fromUTF8))
792 sp = nfadevelop(initial_state, final,
793 s->c_fromUTF8, name, len);
794 }
795
796
797 unsigned int
utf8_utostr(utf8char * s,unsigned int v)798 utf8_utostr(utf8char * s, unsigned int v)
799
800 {
801 unsigned int d;
802 unsigned int i;
803
804 d = v / 10;
805 v -= d * 10;
806 i = d? utf8_utostr(s, d): 0;
807 s[i++] = v + UTF8_0;
808 s[i] = '\0';
809 return i;
810 }
811
812
813 unsigned int
utf8_utostrpad(utf8char * s,unsigned int v,int digits)814 utf8_utostrpad(utf8char * s, unsigned int v, int digits)
815
816 {
817 unsigned int i = utf8_utostr(s, v);
818 utf8char pad = UTF8_SPACE;
819
820 if (digits < 0) {
821 pad = UTF8_0;
822 digits = -digits;
823 }
824
825 if (i >= digits)
826 return i;
827
828 memmove(s + digits - i, s, i + 1);
829 memset(s, pad, digits - i);
830 return digits;
831 }
832
833
834 unsigned int
utf8_strtou(const utf8char * s)835 utf8_strtou(const utf8char * s)
836
837 {
838 unsigned int v;
839
840 while (*s == UTF8_SPACE || *s == UTF8_HT)
841 s++;
842
843 for (v = 0; *s >= UTF8_0 && *s <= UTF8_9;)
844 v = 10 * v + *s++ - UTF8_0;
845
846 return v;
847 }
848
849
850 unsigned int
getNumAttr(xmlNodePtr node,const xmlChar * name)851 getNumAttr(xmlNodePtr node, const xmlChar * name)
852
853 {
854 const xmlChar * s;
855 unsigned int val;
856
857 s = xmlGetProp(node, name);
858
859 if (!s)
860 return 0;
861
862 val = utf8_strtou(s);
863 xmlFree((xmlChar *) s);
864 return val;
865 }
866
867
868 void
read_assocs(const char * filename)869 read_assocs(const char * filename)
870
871 {
872 xmlDocPtr doc;
873 xmlXPathContextPtr ctxt;
874 xmlXPathObjectPtr obj;
875 xmlNodePtr node;
876 t_chset * sp;
877 int i;
878 unsigned int ccsid;
879 unsigned int mibenum;
880 utf8char symbuf[32];
881
882 doc = loadXMLFile(filename);
883
884 if (!doc) {
885 fprintf(stderr, "Cannot load file %s\n", filename);
886 exit(1);
887 }
888
889 ctxt = xmlXPathNewContext(doc);
890 obj = xmlXPathEval(utf8_assocnodes, ctxt);
891
892 if (!obj || obj->type != XPATH_NODESET || !obj->nodesetval ||
893 !obj->nodesetval->nodeTab || !obj->nodesetval->nodeNr) {
894 fprintf(stderr, "No association found in %s\n", filename);
895 exit(1);
896 }
897
898 for (i = 0; i < obj->nodesetval->nodeNr; i++) {
899 node = obj->nodesetval->nodeTab[i];
900 ccsid = getNumAttr(node, utf8_ccsid);
901 mibenum = getNumAttr(node, utf8_mibenum);
902
903 /**
904 *** Check for duplicate.
905 **/
906
907 for (sp = chset_list; sp; sp = sp->c_next)
908 if (ccsid && ccsid == sp->c_ccsid ||
909 mibenum && mibenum == sp->c_mibenum) {
910 fprintf(stderr, "Duplicate character set: ");
911 fprintf(stderr, "CCSID = %u/%u, ",
912 ccsid, sp->c_ccsid);
913 fprintf(stderr, "MIBenum = %u/%u\n",
914 mibenum, sp->c_mibenum);
915 break;
916 }
917
918 if (sp)
919 continue;
920
921 /**
922 *** Allocate the new character set.
923 **/
924
925 sp = (t_chset *) malloc(sizeof *sp);
926 chknull(sp);
927 memset(sp, 0, sizeof *sp);
928
929 if (!ccsid) /* Do not attempt with current job CCSID. */
930 set_iconv_open_error(sp->c_fromUTF8);
931 else {
932 sp->c_fromUTF8 =
933 iconv_open_ccsid(ccsid, C_UTF8_CCSID, 0);
934
935 if (iconv_open_error(sp->c_fromUTF8) == -1)
936 fprintf(stderr,
937 "Cannot convert into CCSID %u: ignored\n",
938 ccsid);
939 }
940
941 sp->c_ccsid = ccsid;
942 sp->c_mibenum = mibenum;
943 sp->c_next = chset_list;
944 chset_list = sp;
945 }
946
947 xmlXPathFreeObject(obj);
948
949 /**
950 *** Enter aliases.
951 **/
952
953 for (sp = chset_list; sp; sp = sp->c_next) {
954 strcpy(symbuf, utf8_ibm_);
955 utf8_utostr(symbuf + 4, sp->c_ccsid);
956 nfaenter(symbuf, -1, sp);
957 strcpy(symbuf, utf8_IBMCCSID);
958 utf8_utostrpad(symbuf + 8, sp->c_ccsid, -5);
959 nfaenter(symbuf, 13, sp); /* Not null-terminated. */
960
961 if (sp->c_mibenum) {
962 strcpy(symbuf, utf8_iana_);
963 utf8_utostr(symbuf + 5, sp->c_mibenum);
964 nfaenter(symbuf, -1, sp);
965 }
966
967 xmlXPathRegisterVariable(ctxt, utf8_C,
968 xmlXPathNewFloat((double) sp->c_ccsid));
969 obj = xmlXPathEval(utf8_aliastext, ctxt);
970
971 if (!obj || obj->type != XPATH_NODESET) {
972 fprintf(stderr, "getAlias failed in %s\n", filename);
973 exit(1);
974 }
975
976 if (obj->nodesetval &&
977 obj->nodesetval->nodeTab && obj->nodesetval->nodeNr) {
978 for (i = 0; i < obj->nodesetval->nodeNr; i++) {
979 node = obj->nodesetval->nodeTab[i];
980 nfaenter(node->content, -1, sp);
981 }
982 }
983
984 xmlXPathFreeObject(obj);
985 }
986
987 xmlXPathFreeContext(ctxt);
988 xmlFreeDoc(doc);
989 }
990
991
992 unsigned int
columnPosition(xmlXPathContextPtr ctxt,const xmlChar * header)993 columnPosition(xmlXPathContextPtr ctxt, const xmlChar * header)
994
995 {
996 xmlXPathObjectPtr obj;
997 unsigned int res = 0;
998
999 xmlXPathRegisterVariable(ctxt, utf8_T, xmlXPathNewString(header));
1000 obj = xmlXPathEval(utf8_headerpos, ctxt);
1001
1002 if (obj) {
1003 if (obj->type == XPATH_NUMBER)
1004 res = (unsigned int) obj->floatval;
1005
1006 xmlXPathFreeObject(obj);
1007 }
1008
1009 return res;
1010 }
1011
1012
1013 void
read_iana(const char * filename)1014 read_iana(const char * filename)
1015
1016 {
1017 xmlDocPtr doc;
1018 xmlXPathContextPtr ctxt;
1019 xmlXPathObjectPtr obj1;
1020 xmlXPathObjectPtr obj2;
1021 xmlNodePtr node;
1022 int prefnamecol;
1023 int namecol;
1024 int mibenumcol;
1025 int aliascol;
1026 int mibenum;
1027 t_chset * sp;
1028 int n;
1029 int i;
1030
1031 doc = loadXMLFile(filename);
1032
1033 if (!doc) {
1034 fprintf(stderr, "Cannot load file %s\n", filename);
1035 exit(1);
1036 }
1037
1038 ctxt = xmlXPathNewContext(doc);
1039
1040 #ifndef OLDXML
1041 xmlXPathRegisterNs(ctxt, utf8_html, utf8_htmluri);
1042 #endif
1043
1044 obj1 = xmlXPathEval(utf8_tablerows, ctxt);
1045
1046 if (!obj1 || obj1->type != XPATH_NODESET || !obj1->nodesetval ||
1047 !obj1->nodesetval->nodeTab || obj1->nodesetval->nodeNr <= 1) {
1048 fprintf(stderr, "No data in %s\n", filename);
1049 exit(1);
1050 }
1051
1052 /**
1053 *** Identify columns.
1054 **/
1055
1056 xmlXPathSetContextNode(obj1->nodesetval->nodeTab[0], ctxt);
1057 prefnamecol = columnPosition(ctxt, utf8_Pref_MIME_Name);
1058 namecol = columnPosition(ctxt, utf8_Name);
1059 mibenumcol = columnPosition(ctxt, utf8_MIBenum);
1060 aliascol = columnPosition(ctxt, utf8_Aliases);
1061
1062 if (!prefnamecol || !namecol || !mibenumcol || !aliascol) {
1063 fprintf(stderr, "Key column(s) missing in %s\n", filename);
1064 exit(1);
1065 }
1066
1067 xmlXPathRegisterVariable(ctxt, utf8_P,
1068 xmlXPathNewFloat((double) prefnamecol));
1069 xmlXPathRegisterVariable(ctxt, utf8_N,
1070 xmlXPathNewFloat((double) namecol));
1071 xmlXPathRegisterVariable(ctxt, utf8_M,
1072 xmlXPathNewFloat((double) mibenumcol));
1073 xmlXPathRegisterVariable(ctxt, utf8_A,
1074 xmlXPathNewFloat((double) aliascol));
1075
1076 /**
1077 *** Process each row.
1078 **/
1079
1080 for (n = 1; n < obj1->nodesetval->nodeNr; n++) {
1081 xmlXPathSetContextNode(obj1->nodesetval->nodeTab[n], ctxt);
1082
1083 /**
1084 *** Get the MIBenum from current row.
1085 */
1086
1087 obj2 = xmlXPathEval(utf8_getmibenum, ctxt);
1088
1089 if (!obj2 || obj2->type != XPATH_NUMBER) {
1090 fprintf(stderr, "get MIBenum failed at row %u\n", n);
1091 exit(1);
1092 }
1093
1094 if (xmlXPathIsNaN(obj2->floatval) ||
1095 obj2->floatval < 1.0 || obj2->floatval > 65535.0 ||
1096 ((unsigned int) obj2->floatval) != obj2->floatval) {
1097 fprintf(stderr, "invalid MIBenum at row %u\n", n);
1098 xmlXPathFreeObject(obj2);
1099 continue;
1100 }
1101
1102 mibenum = obj2->floatval;
1103 xmlXPathFreeObject(obj2);
1104
1105 /**
1106 *** Search the associations for a corresponding CCSID.
1107 **/
1108
1109 for (sp = chset_list; sp; sp = sp->c_next)
1110 if (sp->c_mibenum == mibenum)
1111 break;
1112
1113 if (!sp)
1114 continue; /* No CCSID for this MIBenum. */
1115
1116 /**
1117 *** Process preferred MIME name.
1118 **/
1119
1120 obj2 = xmlXPathEval(utf8_getprefname, ctxt);
1121
1122 if (!obj2 || obj2->type != XPATH_STRING) {
1123 fprintf(stderr,
1124 "get Preferred_MIME_Name failed at row %u\n", n);
1125 exit(1);
1126 }
1127
1128 if (obj2->stringval && obj2->stringval[0])
1129 nfaenter(obj2->stringval, -1, sp);
1130
1131 xmlXPathFreeObject(obj2);
1132
1133 /**
1134 *** Process name.
1135 **/
1136
1137 obj2 = xmlXPathEval(utf8_getname, ctxt);
1138
1139 if (!obj2 || obj2->type != XPATH_STRING) {
1140 fprintf(stderr, "get name failed at row %u\n", n);
1141 exit(1);
1142 }
1143
1144 if (obj2->stringval && obj2->stringval[0])
1145 nfaenter(obj2->stringval, -1, sp);
1146
1147 xmlXPathFreeObject(obj2);
1148
1149 /**
1150 *** Process aliases.
1151 **/
1152
1153 obj2 = xmlXPathEval(utf8_getaliases, ctxt);
1154
1155 if (!obj2 || obj2->type != XPATH_NODESET) {
1156 fprintf(stderr, "get aliases failed at row %u\n", n);
1157 exit(1);
1158 }
1159
1160 if (obj2->nodesetval && obj2->nodesetval->nodeTab)
1161 for (i = 0; i < obj2->nodesetval->nodeNr; i++) {
1162 node = obj2->nodesetval->nodeTab[i];
1163
1164 if (node && node->content && node->content[0])
1165 nfaenter(node->content, -1, sp);
1166 }
1167
1168 xmlXPathFreeObject(obj2);
1169 }
1170
1171 xmlXPathFreeObject(obj1);
1172 xmlXPathFreeContext(ctxt);
1173 xmlFreeDoc(doc);
1174 }
1175
1176
1177 t_powerset * closureset(t_powerset * dst, t_powerset * src);
1178
1179
1180 t_powerset *
closure(t_powerset * dst,t_state * src)1181 closure(t_powerset * dst, t_state * src)
1182
1183 {
1184 t_transition * t;
1185 unsigned int oldcard;
1186
1187 if (src->s_nfastates) {
1188 /**
1189 *** Is a DFA state: return closure of set of equivalent
1190 *** NFA states.
1191 **/
1192
1193 return closureset(dst, src->s_nfastates);
1194 }
1195
1196 /**
1197 *** Compute closure of NFA state.
1198 **/
1199
1200 dst = set_include(dst, src);
1201
1202 for (t = src->s_forward; t; t = t->t_forwnext)
1203 if (t->t_token == EPSILON) {
1204 oldcard = dst->p_card;
1205 dst = set_include(dst, t->t_to);
1206
1207 if (oldcard != dst->p_card)
1208 dst = closure(dst, t->t_to);
1209 }
1210
1211 return dst;
1212 }
1213
1214
1215 t_powerset *
closureset(t_powerset * dst,t_powerset * src)1216 closureset(t_powerset * dst, t_powerset * src)
1217
1218 {
1219 unsigned int i;
1220
1221 for (i = 0; i < src->p_card; i++)
1222 dst = closure(dst, (t_state *) src->p_set[i]);
1223
1224 return dst;
1225 }
1226
1227
1228 t_state *
get_dfa_state(t_state ** stack,t_powerset * nfastates,xmlHashTablePtr sethash)1229 get_dfa_state(t_state * * stack,
1230 t_powerset * nfastates, xmlHashTablePtr sethash)
1231
1232 {
1233 t_state * s;
1234
1235 if (s = hash_get(sethash, nfastates->p_set,
1236 nfastates->p_card * sizeof nfastates->p_set[0])) {
1237 /**
1238 *** DFA state already present.
1239 *** Release the NFA state set and return
1240 *** the address of the old DFA state.
1241 **/
1242
1243 free((char *) nfastates);
1244 return s;
1245 }
1246
1247 /**
1248 *** Build the new state.
1249 **/
1250
1251 s = newstate();
1252 s->s_nfastates = nfastates;
1253 s->s_next = dfa_states;
1254 dfa_states = s;
1255 s->s_stack = *stack;
1256 *stack = s;
1257
1258 /**
1259 *** Enter it in hash.
1260 **/
1261
1262 if (hash_add(sethash, nfastates->p_set,
1263 nfastates->p_card * sizeof nfastates->p_set[0], s))
1264 chknull(NULL); /* Memory allocation error. */
1265
1266 return s;
1267 }
1268
1269
1270 int
transcmp(const void * p1,const void * p2)1271 transcmp(const void * p1, const void * p2)
1272
1273 {
1274 t_transition * t1;
1275 t_transition * t2;
1276
1277 t1 = *(t_transition * *) p1;
1278 t2 = *(t_transition * *) p2;
1279 return ((int) t1->t_token) - ((int) t2->t_token);
1280 }
1281
1282
1283 void
builddfa(void)1284 builddfa(void)
1285
1286 {
1287 t_powerset * transset;
1288 t_powerset * stateset;
1289 t_state * s;
1290 t_state * s2;
1291 unsigned int n;
1292 unsigned int i;
1293 unsigned int token;
1294 t_transition * t;
1295 t_state * stack;
1296 xmlHashTablePtr sethash;
1297 unsigned int nst;
1298
1299 transset = set_include(NULL, NULL);
1300 chknull(transset);
1301 stateset = set_include(NULL, NULL);
1302 chknull(stateset);
1303 sethash = xmlHashCreate(1);
1304 chknull(sethash);
1305 dfa_states = (t_state *) NULL;
1306 stack = (t_state *) NULL;
1307 nst = 0;
1308
1309 /**
1310 *** Build the DFA initial state.
1311 **/
1312
1313 get_dfa_state(&stack, closure(NULL, initial_state), sethash);
1314
1315 /**
1316 *** Build the other DFA states by looking at each
1317 *** possible transition from stacked DFA states.
1318 **/
1319
1320 do {
1321 if (!(++nst % 100))
1322 fprintf(stderr, "%u DFA states\n", nst);
1323
1324 s = stack;
1325 stack = s->s_stack;
1326 s->s_stack = (t_state *) NULL;
1327
1328 /**
1329 *** Build a set of all non-epsilon transitions from this
1330 *** state.
1331 **/
1332
1333 transset->p_card = 0;
1334
1335 for (n = 0; n < s->s_nfastates->p_card; n++) {
1336 s2 = s->s_nfastates->p_set[n];
1337
1338 for (t = s2->s_forward; t; t = t->t_forwnext)
1339 if (t->t_token != EPSILON) {
1340 transset = set_include(transset, t);
1341 chknull(transset);
1342 }
1343 }
1344
1345 /**
1346 *** Sort transitions by token.
1347 **/
1348
1349 qsort(transset->p_set, transset->p_card,
1350 sizeof transset->p_set[0], transcmp);
1351
1352 /**
1353 *** Process all transitions, grouping them by token.
1354 **/
1355
1356 stateset->p_card = 0;
1357 token = EPSILON;
1358
1359 for (i = 0; i < transset->p_card; i++) {
1360 t = transset->p_set[i];
1361
1362 if (token != t->t_token) {
1363 if (stateset->p_card) {
1364 /**
1365 *** Get the equivalent DFA state
1366 *** and create transition.
1367 **/
1368
1369 newtransition(token, s,
1370 get_dfa_state(&stack,
1371 closureset(NULL, stateset),
1372 sethash));
1373 stateset->p_card = 0;
1374 }
1375
1376 token = t->t_token;
1377 }
1378
1379 stateset = set_include(stateset, t->t_to);
1380 }
1381
1382 if (stateset->p_card)
1383 newtransition(token, s, get_dfa_state(&stack,
1384 closureset(NULL, stateset), sethash));
1385 } while (stack);
1386
1387 free((char *) transset);
1388 free((char *) stateset);
1389 xmlHashFree(sethash, NULL);
1390
1391 /**
1392 *** Reverse the state list to get the initial state first,
1393 *** check for ambiguous prefixes, determine final states,
1394 *** destroy NFA state sets.
1395 **/
1396
1397 while (s = dfa_states) {
1398 dfa_states = s->s_next;
1399 s->s_next = stack;
1400 stack = s;
1401 stateset = s->s_nfastates;
1402 s->s_nfastates = (t_powerset *) NULL;
1403
1404 for (n = 0; n < stateset->p_card; n++) {
1405 s2 = (t_state *) stateset->p_set[n];
1406
1407 if (s2->s_final) {
1408 if (s->s_final && s->s_final != s2->s_final)
1409 fprintf(stderr,
1410 "Ambiguous name for CCSIDs %u/%u\n",
1411 s->s_final->c_ccsid,
1412 s2->s_final->c_ccsid);
1413
1414 s->s_final = s2->s_final;
1415 }
1416 }
1417
1418 free((char *) stateset);
1419 }
1420
1421 dfa_states = stack;
1422 }
1423
1424
1425 void
deletenfa(void)1426 deletenfa(void)
1427
1428 {
1429 t_transition * t;
1430 t_state * s;
1431 t_state * u;
1432 t_state * stack;
1433
1434 stack = initial_state;
1435 stack->s_stack = (t_state *) NULL;
1436
1437 while ((s = stack)) {
1438 stack = s->s_stack;
1439
1440 while ((t = s->s_forward)) {
1441 u = t->t_to;
1442 unlink_transition(t);
1443 free((char *) t);
1444
1445 if (!u->s_backward) {
1446 u->s_stack = stack;
1447 stack = u;
1448 }
1449 }
1450
1451 free((char *) s);
1452 }
1453 }
1454
1455
1456 t_stategroup *
newgroup(void)1457 newgroup(void)
1458
1459 {
1460 t_stategroup * g;
1461
1462 g = (t_stategroup *) malloc(sizeof *g);
1463 chknull(g);
1464 memset((char *) g, 0, sizeof *g);
1465 g->g_id = groupid++;
1466 return g;
1467 }
1468
1469
1470 void
optimizedfa(void)1471 optimizedfa(void)
1472
1473 {
1474 unsigned int i;
1475 xmlHashTablePtr h;
1476 t_state * s1;
1477 t_state * s2;
1478 t_state * finstates;
1479 t_state * * sp;
1480 t_stategroup * g1;
1481 t_stategroup * g2;
1482 t_stategroup * ghead;
1483 t_transition * t1;
1484 t_transition * t2;
1485 unsigned int done;
1486 unsigned int startgroup;
1487 unsigned int gtrans[1 << (8 * sizeof(unsigned char))];
1488
1489 /**
1490 *** Reduce DFA state count.
1491 **/
1492
1493 groupid = 0;
1494 ghead = (t_stategroup *) NULL;
1495
1496 /**
1497 *** First split: non-final and each distinct final states.
1498 **/
1499
1500 h = xmlHashCreate(4);
1501 chknull(h);
1502
1503 for (s1 = dfa_states; s1; s1 = s1->s_next) {
1504 if (!(g1 = hash_get(h, &s1->s_final, sizeof s1->s_final))) {
1505 g1 = newgroup();
1506 g1->g_next = ghead;
1507 ghead = g1;
1508
1509 if (hash_add(h, &s1->s_final, sizeof s1->s_final, g1))
1510 chknull(NULL); /* Memory allocation error. */
1511 }
1512
1513 s1->s_index = g1->g_id;
1514 s1->s_stack = g1->g_member;
1515 g1->g_member = s1;
1516 }
1517
1518 xmlHashFree(h, NULL);
1519
1520 /**
1521 *** Subsequent splits: states that have the same forward
1522 *** transition tokens to states in the same group.
1523 **/
1524
1525 do {
1526 done = 1;
1527
1528 for (g2 = ghead; g2; g2 = g2->g_next) {
1529 s1 = g2->g_member;
1530
1531 if (!s1->s_stack)
1532 continue;
1533
1534 h = xmlHashCreate(1);
1535 chknull(h);
1536
1537 /**
1538 *** Build the group transition map.
1539 **/
1540
1541 memset((char *) gtrans, ~0, sizeof gtrans);
1542
1543 for (t1 = s1->s_forward; t1; t1 = t1->t_forwnext)
1544 gtrans[t1->t_token] = t1->t_to->s_index;
1545
1546 if (hash_add(h, gtrans, sizeof gtrans, g2))
1547 chknull(NULL);
1548
1549 /**
1550 *** Process other states in group.
1551 **/
1552
1553 sp = &s1->s_stack;
1554 s1 = *sp;
1555
1556 do {
1557 *sp = s1->s_stack;
1558
1559 /**
1560 *** Build the transition map.
1561 **/
1562
1563 memset((char *) gtrans, ~0, sizeof gtrans);
1564
1565 for (t1 = s1->s_forward;
1566 t1; t1 = t1->t_forwnext)
1567 gtrans[t1->t_token] = t1->t_to->s_index;
1568
1569 g1 = hash_get(h, gtrans, sizeof gtrans);
1570
1571 if (g1 == g2) {
1572 *sp = s1;
1573 sp = &s1->s_stack;
1574 }
1575 else {
1576 if (!g1) {
1577 g1 = newgroup();
1578 g1->g_next = ghead;
1579 ghead = g1;
1580
1581 if (hash_add(h, gtrans,
1582 sizeof gtrans, g1))
1583 chknull(NULL);
1584 }
1585
1586 s1->s_index = g1->g_id;
1587 s1->s_stack = g1->g_member;
1588 g1->g_member = s1;
1589 done = 0;
1590 }
1591 } while (s1 = *sp);
1592
1593 xmlHashFree(h, NULL);
1594 }
1595 } while (!done);
1596
1597 /**
1598 *** Establish group leaders and remap transitions.
1599 **/
1600
1601 startgroup = dfa_states->s_index;
1602
1603 for (g1 = ghead; g1; g1 = g1->g_next)
1604 for (s1 = g1->g_member->s_stack; s1; s1 = s1->s_stack)
1605 for (t1 = s1->s_backward; t1; t1 = t2) {
1606 t2 = t1->t_backnext;
1607 link_transition(t1, NULL, g1->g_member);
1608 }
1609
1610 /**
1611 *** Remove redundant states and transitions.
1612 **/
1613
1614 for (g1 = ghead; g1; g1 = g1->g_next) {
1615 g1->g_member->s_next = (t_state *) NULL;
1616
1617 while ((s1 = g1->g_member->s_stack)) {
1618 g1->g_member->s_stack = s1->s_stack;
1619
1620 for (t1 = s1->s_forward; t1; t1 = t2) {
1621 t2 = t1->t_forwnext;
1622 unlink_transition(t1);
1623 free((char *) t1);
1624 }
1625
1626 free((char *) s1);
1627 }
1628 }
1629
1630 /**
1631 *** Remove group support and relink DFA states.
1632 **/
1633
1634 dfa_states = (t_state *) NULL;
1635 s2 = (t_state *) NULL;
1636 finstates = (t_state *) NULL;
1637
1638 while (g1 = ghead) {
1639 ghead = g1->g_next;
1640 s1 = g1->g_member;
1641
1642 if (g1->g_id == startgroup)
1643 dfa_states = s1; /* Keep start state first. */
1644 else if (s1->s_final) { /* Then final states. */
1645 s1->s_next = finstates;
1646 finstates = s1;
1647 }
1648 else { /* Finish with non-final states. */
1649 s1->s_next = s2;
1650 s2 = s1;
1651 }
1652
1653 free((char *) g1);
1654 }
1655
1656 for (dfa_states->s_next = finstates; finstates->s_next;)
1657 finstates = finstates->s_next;
1658
1659 finstates->s_next = s2;
1660 }
1661
1662
1663 const char *
inttype(unsigned long max)1664 inttype(unsigned long max)
1665
1666 {
1667 int i;
1668
1669 for (i = 0; max; i++)
1670 max >>= 1;
1671
1672 if (i > 8 * sizeof(unsigned int))
1673 return "unsigned long";
1674
1675 if (i > 8 * sizeof(unsigned short))
1676 return "unsigned int";
1677
1678 if (i > 8 * sizeof(unsigned char))
1679 return "unsigned short";
1680
1681 return "unsigned char";
1682 }
1683
1684
listids(FILE * fp)1685 listids(FILE * fp)
1686
1687 {
1688 unsigned int pos;
1689 t_chset * cp;
1690 t_symlist * lp;
1691 char * srcp;
1692 char * dstp;
1693 size_t srcc;
1694 size_t dstc;
1695 char buf[80];
1696
1697 fprintf(fp, "/**\n*** CCSID For arg Recognized name.\n");
1698 pos = 0;
1699
1700 for (cp = chset_list; cp; cp = cp->c_next) {
1701 if (pos) {
1702 fprintf(fp, "\n");
1703 pos = 0;
1704 }
1705
1706 if (!cp->c_names)
1707 continue;
1708
1709 pos = fprintf(fp, "*** %5u %c ", cp->c_ccsid,
1710 iconv_open_error(cp->c_fromUTF8)? ' ': 'X');
1711
1712 for (lp = cp->c_names; lp; lp = lp->l_next) {
1713 srcp = (char *) lp->l_symbol;
1714 srcc = strlen(srcp);
1715 dstp = buf;
1716 dstc = sizeof buf;
1717 iconv(utf82job, &srcp, &srcc, &dstp, &dstc);
1718 srcc = dstp - buf;
1719
1720 if (pos + srcc > 79) {
1721 fprintf(fp, "\n***%22c", ' ');
1722 pos = 25;
1723 }
1724
1725 pos += fprintf(fp, " %.*s", srcc, buf);
1726 }
1727 }
1728
1729 if (pos)
1730 fprintf(fp, "\n");
1731
1732 fprintf(fp, "**/\n\n");
1733 }
1734
1735
1736 void
generate(FILE * fp)1737 generate(FILE * fp)
1738
1739 {
1740 unsigned int nstates;
1741 unsigned int ntrans;
1742 unsigned int maxfinal;
1743 t_state * s;
1744 t_transition * t;
1745 unsigned int i;
1746 unsigned int pos;
1747 char * ns;
1748
1749 /**
1750 *** Assign indexes to states and transitions.
1751 **/
1752
1753 nstates = 0;
1754 ntrans = 0;
1755 maxfinal = 0;
1756
1757 for (s = dfa_states; s; s = s->s_next) {
1758 s->s_index = nstates++;
1759
1760 if (s->s_final)
1761 maxfinal = nstates;
1762
1763 for (t = s->s_forward; t; t = t->t_forwnext)
1764 t->t_index = ntrans++;
1765 }
1766
1767 fprintf(fp,
1768 "/**\n*** %u states, %u finals, %u transitions.\n**/\n\n",
1769 nstates, maxfinal, ntrans);
1770 fprintf(stderr, "%u states, %u finals, %u transitions.\n",
1771 nstates, maxfinal, ntrans);
1772
1773 /**
1774 *** Generate types.
1775 **/
1776
1777 fprintf(fp, "typedef unsigned short t_ccsid;\n");
1778 fprintf(fp, "typedef %-23s t_staterange;\n", inttype(nstates));
1779 fprintf(fp, "typedef %-23s t_transrange;\n\n", inttype(ntrans));
1780
1781 /**
1782 *** Generate first transition index for each state.
1783 **/
1784
1785 fprintf(fp, "static t_transrange trans_array[] = {\n");
1786 pos = 0;
1787 ntrans = 0;
1788
1789 for (s = dfa_states; s; s = s->s_next) {
1790 pos += fprintf(fp, " %u,", ntrans);
1791
1792 if (pos > 72) {
1793 fprintf(fp, "\n");
1794 pos = 0;
1795 }
1796
1797 for (t = s->s_forward; t; t = t->t_forwnext)
1798 ntrans++;
1799 }
1800
1801 fprintf(fp, " %u\n};\n\n", ntrans);
1802
1803 /**
1804 *** Generate final state info.
1805 **/
1806
1807 fprintf(fp, "static t_ccsid final_array[] = {\n");
1808 pos = 0;
1809 ns ="";
1810 i = 0;
1811
1812 for (s = dfa_states; s && i++ < maxfinal; s = s->s_next) {
1813 pos += fprintf(fp, "%s", ns);
1814 ns = ",";
1815
1816 if (pos > 72) {
1817 fprintf(fp, "\n");
1818 pos = 0;
1819 }
1820
1821 pos += fprintf(fp, " %u",
1822 s->s_final? s->s_final->c_ccsid + 1: 0);
1823 }
1824
1825 fprintf(fp, "\n};\n\n");
1826
1827 /**
1828 *** Generate goto table.
1829 **/
1830
1831 fprintf(fp, "static t_staterange goto_array[] = {\n");
1832 pos = 0;
1833
1834 for (s = dfa_states; s; s = s->s_next)
1835 for (t = s->s_forward; t; t = t->t_forwnext) {
1836 pos += fprintf(fp, " %u,", t->t_to->s_index);
1837
1838 if (pos > 72) {
1839 fprintf(fp, "\n");
1840 pos = 0;
1841 }
1842 }
1843
1844 fprintf(fp, " %u\n};\n\n", nstates);
1845
1846 /**
1847 *** Generate transition label table.
1848 **/
1849
1850 fprintf(fp, "static unsigned char label_array[] = {\n");
1851 pos = 0;
1852 ns ="";
1853
1854 for (s = dfa_states; s; s = s->s_next)
1855 for (t = s->s_forward; t; t = t->t_forwnext) {
1856 pos += fprintf(fp, "%s", ns);
1857 ns = ",";
1858
1859 if (pos > 72) {
1860 fprintf(fp, "\n");
1861 pos = 0;
1862 }
1863
1864 pos += fprintf(fp, " 0x%02X", t->t_token);
1865 }
1866
1867 fprintf(fp, "\n};\n", nstates);
1868 }
1869
1870
main(argc,argv)1871 main(argc, argv)
1872 int argc;
1873 char * * argv;
1874
1875 {
1876 FILE * fp;
1877 t_chset * csp;
1878 char symbuf[20];
1879
1880 chset_list = (t_chset *) NULL;
1881 initial_state = newstate();
1882 job2utf8 = iconv_open_ccsid(C_UTF8_CCSID, C_SOURCE_CCSID, 0);
1883 utf82job = iconv_open_ccsid(C_SOURCE_CCSID, C_UTF8_CCSID, 0);
1884
1885 if (argc != 4) {
1886 fprintf(stderr, "Usage: %s <ccsid-mibenum file> ", *argv);
1887 fprintf(stderr, "<iana-character-set file> <output file>\n");
1888 exit(1);
1889 }
1890
1891 /**
1892 *** Read CCSID/MIBenum associations. Define special names.
1893 **/
1894
1895 read_assocs(argv[1]);
1896
1897 /**
1898 *** Read character set names and establish the case-independent
1899 *** name DFA in all possible CCSIDs.
1900 **/
1901
1902 read_iana(argv[2]);
1903
1904 /**
1905 *** Build DFA from NFA.
1906 **/
1907
1908 builddfa();
1909
1910 /**
1911 *** Delete NFA.
1912 **/
1913
1914 deletenfa();
1915
1916 /**
1917 *** Minimize the DFA state count.
1918 **/
1919
1920 optimizedfa();
1921
1922 /**
1923 *** Generate the table.
1924 **/
1925
1926 fp = fopen(argv[3], "w+");
1927
1928 if (!fp) {
1929 perror(argv[3]);
1930 exit(1);
1931 }
1932
1933 fprintf(fp, "/**\n");
1934 fprintf(fp, "*** Character set names table.\n");
1935 fprintf(fp, "*** Generated by program BLDCSNDFA from");
1936 fprintf(fp, " IANA character set assignment file\n");
1937 fprintf(fp, "*** and CCSID/MIBenum equivalence file.\n");
1938 fprintf(fp, "*** *** Do not edit by hand ***\n");
1939 fprintf(fp, "**/\n\n");
1940 listids(fp);
1941 generate(fp);
1942
1943 if (ferror(fp)) {
1944 perror(argv[3]);
1945 fclose(fp);
1946 exit(1);
1947 }
1948
1949 fclose(fp);
1950 iconv_close(job2utf8);
1951 iconv_close(utf82job);
1952 exit(0);
1953 }
1954