1 /**** << kanji code converter >> ****
2 *
3 * kcc.c
4 * Aug 13 1992
5 * mod: Jul 1 1994
6 ************************************************** tonooka ***********/
7 /*
8 * Copyright (c) 1994 Yasuhiro Tonooka (tonooka@msi.co.jp)
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2, or (at your option)
13 * any later version.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24 #if !defined lint
25 static char sccsid[] = "@(#)kcc.c 2.3 (Y.Tonooka) 7/1/94";
26 static char copyright[] = "@(#)Copyright (c) 1992 Yasuhiro Tonooka";
27 #endif
28
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <sys/types.h>
#include <sys/stat.h>
33
34 /*-
35 * OPTIONS
36 * -O or -IO
37 * I specifies input code and O specifies output code.
38 * When input code is not specified, kcc guesses kanji
39 * code.
40 * I is one of:
41 * e EUC (with 7-bit JIS)
42 * d DEC (with 7-bit JIS)
43 * s shift-JIS (with 7-bit JIS)
44 * j, 7 or k
45 * 7-bit JIS
46 * 8 8-bit JIS
47 * O is one of:
48 * e EUC
49 * d DEC
50 * s shift-JIS
51 * jXY, 7XY
52 * 7-bit JIS
53 * kXY 7-bit JIS using "ESC(I" for JIS katakana
54 * 8XY 8-bit JIS
55 * where X is:
56 * B "ESC$B" as kanji shift in
57 * @ "ESC$@" as kanji shift in
58 * + "ESC&@ESC$B" as kanji shift in
59 * Y is:
60 * B "ESC(B" as kanji shift out
61 * J "ESC(J" as kanji shift out
62 * H "ESC(H" as kanji shift out
63 *
64 * -c Check: Only tells file type to stdout. Overrides any
65 * other option except -x and -z. In this mode, kcc
66 * reads all thru the input file to guess the file type.
67 *
68 * -v Verbose: Prints to stderr which kanji code system
69 * kcc guessed the input code is in.
70 *
71 * -x Extended mode: In this mode, recognized area of each
72 * code system is extended in guessing.
73 *
74 * -z Reduced mode: In this mode, only zenkaku characters
75 * are recognized with EUC and shift-JIS in guessing.
76 * (Reduces ambiguity).
77 *
78 * -n Gaiji & undefined hankaku kana characters are replaced
79 * with padding character.
80 *
81 * -h Hiragana is used instead of katakana when converting
82 * hankaku kana to DEC zenkaku.
83 *
84 * -b xxx Specify hold buffer size to xxx (byte).
85 */
86
#define LENLINE     (BUFSIZ * 4)    /* size of the line buffers */
#define HOLDBUFSIZ  8192            /* default size of hold buffer */

/* Control bytes that switch character sets. */
#define ESC     0x1b
#define SO      0x0e
#define SI      0x0f
#define SS2     0x8e            /* EUC single shift 2 */
#define SS3     0x8f            /* EUC single shift 3 */

#define ZENPAD  0x2222          /* padding char for zenkaku */
#define HANPAD  0x25            /* padding char for hankaku */

typedef int bool;

/*
 * Bit that records the final byte `c' of an escape sequence ('@' is
 * bit 0, 'A' is bit 1, ...).  The result is an unsigned long so that it
 * matches the type of the masks it is OR-ed into, and so that no signed
 * value is ever shifted into the sign bit.
 */
#define bitflag(c)  (1UL << ((c) - '@'))

/*
 * Bits of a "code" word describing what has been seen / what is still
 * possible for the input.
 */
#define NONASCII    0x01        /* non-ASCII character */
#define JIS         0x02        /* JIS */
#define ESCI        0x04        /* "ESC(I" */
#define ASSUME      0x08        /* assumed EUC (or DEC) */
#define EUC         0x10
#define DEC         0x20
#define SJIS        0x40
#define JIS8        0x80        /* 8-bit JIS */
#define BIT8        (EUC | DEC | SJIS | JIS8)

/* Shift state of a JIS stream. */
enum mode {
    M_ASCII,
    M_KANJI,
    M_GAIJI,
    M_SO,                       /* hankaku kana with SO */
    M_ESCI                      /* hankaku kana with "ESC(I" */
};
120
121 char *progname;
122 char *filename = NULL;
123 char shiftin[7] = "\033$B";
124 char shiftout[4] = "\033(J";
125 unsigned incode = 0;
126 unsigned outcode = JIS;
127 bool verbose = 0;
128 bool docheck = 0;
129 bool extend = 0;
130 bool zenkaku = 0;
131 bool nogaiji = 0;
132
133 extern unsigned short katakana[];
134 extern unsigned short hiragana[];
135 unsigned short *kanatbl = katakana;
136
137 void error(char* fmt, ...);
138
139 /**********************************************************************
140 * *
141 * Main Routines *
142 * *
143 **********************************************************************/
144 /*---------------------------------------------------------------------
145 NAME
146 main
147 ---------------------------------------------------------------------*/
main(c,v)148 main(c, v)
149 register int c;
150 register char *v[];
151 {
152 register char *s;
153 bool codeopt = 0; /* code option is read */
154 FILE *iop;
155 int status;
156 int dev, ino = -1;
157 struct stat sbuf;
158 unsigned size = HOLDBUFSIZ;
159 void filter();
160 void check();
161 void buffalloc();
162 void setfunc();
163
164 progname = *v++;
165 /*
166 * Process options here.
167 */
168 for (; --c; v++) {
169 /*
170 * With -, input is taken from stdin like cat(1).
171 */
172 if ((*v)[0] != '-' || (*v)[1] == '\0')
173 break;
174 /*
175 * Size of hold buf can be changed with "-b size".
176 */
177 if (strcmp(*v, "-b") == 0) {
178 if (--c == 0)
179 error("%s option must have an argument", *v);
180 v++;
181 if ((size = atoi(*v)) <= 0)
182 error("bad buffer size");
183 continue;
184 }
185 /*
186 * Options:
187 */
188 for (s = *v + 1; *s; s++) {
189 if (strchr("esdj7k8", *s)) {
190 if (codeopt)
191 error("%s: duplicate code specification", *v);
192 codeopt = 1;
193 if (s[1] && strchr("esdj7k8", s[1]))
194 /*
195 * Input code: e, s, j, 7, k or 8.
196 */
197 switch (*s++) {
198 case 'e': /* EUC with JIS */
199 incode = EUC | NONASCII;
200 break;
201 case 's': /* shift-JIS with JIS */
202 incode = SJIS | NONASCII;
203 break;
204 case 'd': /* DEC with JIS */
205 incode = DEC | NONASCII;
206 break;
207 case 'j': /* JIS */
208 case '7': /* equivalent to 'j' */
209 case 'k': /* JIS */
210 incode = JIS;
211 break;
212 case '8': /* 8-bit JIS */
213 incode = JIS | JIS8 | NONASCII;
214 break;
215 }
216 /*
217 * Output code: e, s, d, jXY, 7XY, kXY or 8XY.
218 */
219 switch (*s) {
220 case 'e': /* EUC */
221 outcode = EUC;
222 continue;
223 case 's': /* shift-JIS */
224 outcode = SJIS;
225 continue;
226 case 'd': /* EUC */
227 outcode = DEC;
228 continue;
229 case 'j': /* 7-bit JIS using SO & SI */
230 case '7': /* equivalent to 'j' */
231 outcode = JIS;
232 break;
233 case 'k': /* 7-bit JIS using "ESC(I" */
234 outcode = JIS | ESCI;
235 break;
236 case '8': /* 8-bit JIS */
237 outcode = JIS | JIS8;
238 break;
239 }
240 /*
241 * Process "XY" part of options j, 7, k & 8.
242 */
243 if ((s[1] == 'B' || s[1] == '@' || s[1] == '+') &&
244 (s[2] == 'B' || s[2] == 'J' || s[2] == 'H')) {
245 if (s[1] == '+')
246 sprintf(shiftin, "\033&@\033$B");
247 else
248 sprintf(shiftin, "\033$%c", s[1]);
249 sprintf(shiftout, "\033(%c", s[2]);
250 s += 2;
251 }
252 continue;
253 }
254 /*
255 * Other one letter options:
256 */
257 switch (*s) {
258 case 'c': /* check */
259 docheck = 1;
260 break;
261 case 'h': /* hiragana for hankaku->DEC */
262 kanatbl = hiragana;
263 break;
264 case 'n': /* no gaiji */
265 nogaiji = 1;
266 break;
267 case 'v': /* verbose */
268 verbose = 1;
269 break;
270 case 'x': /* extended mode */
271 extend = 1;
272 break;
273 case 'z': /* reduced mode */
274 zenkaku = 1;
275 break;
276 default:
277 error("-%c: bad option", *s);
278 }
279 }
280 }
281 if (extend && zenkaku)
282 error("-x and -z can't go together");
283 if (!docheck) {
284 buffalloc(size); /* allocate hold buf */
285 setfunc();
286 }
287 /*
288 * Get some info on output file.
289 */
290 if (fstat(fileno(stdout), &sbuf) == 0) {
291 sbuf.st_mode &= S_IFMT;
292 if (sbuf.st_mode != S_IFCHR && sbuf.st_mode != S_IFBLK) {
293 dev = sbuf.st_dev;
294 ino = sbuf.st_ino;
295 }
296 }
297 /*
298 * Main loop.
299 */
300 status = 0;
301 do {
302 if (c == 0 || strcmp(*v, "-") == 0) {
303 /*
304 * Stdin: If tty and at EOF, clear EOF.
305 */
306 if (isatty(fileno(stdin)) && feof(stdin))
307 clearerr(stdin);
308 iop = stdin;
309 } else
310 /*
311 * Open a file.
312 */
313 if ((iop = fopen(*v, "r")) == NULL) {
314 perror(*v);
315 status |= 1;
316 continue;
317 }
318 if (c)
319 filename = *v;
320 if (fstat(fileno(iop), &sbuf) == 0) {
321 /*
322 * Get some info on input file, and see if it is a
323 * directory.
324 */
325 if ((sbuf.st_mode & S_IFMT) == S_IFDIR) {
326 fprintf(stderr,
327 "%s: read error on %s: Is a directory\n",
328 progname, c ? *v : "standard input");
329 if (iop != stdin)
330 fclose(iop);
331 status |= 1;
332 continue;
333 }
334 /*
335 * Compare the info of input with that of output, and see
336 * if they are identical.
337 */
338 if ((sbuf.st_mode & S_IFMT) == S_IFREG &&
339 sbuf.st_dev == dev && sbuf.st_ino == ino) {
340 fprintf(stderr, "%s: input %s is output\n", progname,
341 c ? *v : "-");
342 if (iop != stdin)
343 fclose(iop);
344 status |= 1;
345 continue;
346 }
347 }
348 /*
349 * Do the job here!
350 */
351 if (docheck)
352 check(iop);
353 else
354 filter(iop);
355 if (iop != stdin)
356 fclose(iop);
357 } while (v++, --c > 0);
358 if (ferror(stdout))
359 error("output write error");
360 return (status);
361 }
362
363 /*---------------------------------------------------------------------
364 NAME
365 error - print formatted error message on stderr and die
366 ---------------------------------------------------------------------*/
367 #include <stdarg.h>
368
error(char * fmt,...)369 void error(char *fmt, ...)
370 {
371 va_list ap;
372
373 va_start(ap, fmt);
374 fprintf(stderr, "%s: ", progname);
375 vfprintf(stderr, fmt, ap);
376 putc('\n', stderr);
377 va_end(ap);
378 exit(1);
379 }
380
381 /**********************************************************************
382 * *
383 * Filter *
384 * *
385 **********************************************************************/
386 enum mode gsmode; /* guess: M_ASCII M_KANJI M_SO */
387 enum mode inmode; /* input: M_ASCII M_KANJI M_GAIJI
388 * M_SO M_ESCI */
389 enum mode outmode; /* output: M_ASCII M_KANJI M_GAIJI
390 * M_SO M_ESCI */
391
392 unsigned long insi; /* JIS shift-in sequence flag */
393 unsigned long inso; /* JIS shift-out sequence flag
394 * including "ESC(I" */
395 unsigned long innj; /* JIS 1990 sequence flag */
396 unsigned long ingj; /* JIS 1990 aux flag */
397
398 /*---------------------------------------------------------------------
399 NAME
400 filter - filtering routine
401 ---------------------------------------------------------------------*/
filter(fp)402 void filter(fp)
403 FILE *fp;
404 {
405 register bool hold;
406 register unsigned code, c;
407 register int len;
408 char str[LENLINE];
409 unsigned guess();
410 bool append();
411 void flush();
412 unsigned out();
413 void showcode();
414
415 code = incode ? incode : extend ? BIT8 : BIT8 & ~DEC;
416 gsmode = inmode = outmode = M_ASCII;
417 insi = inso = innj = ingj = 0;
418 hold = 0;
419 while (len = getstr(str, sizeof str, fp)) {
420 if (!(code & NONASCII) && code & BIT8 ||
421 code & (EUC | DEC) && code & SJIS && !(code & ASSUME)) {
422 /*
423 * So far, no kanji has been seen, or ambiguous.
424 */
425 c = guess(str, len);
426 code |= c & (JIS | NONASCII), code &= c | ~BIT8;
427 if (code & NONASCII && code & (EUC | DEC) && code & SJIS) {
428 /*
429 * If ambiguous, store the line in hold buffer.
430 */
431 if (append(str, len)) {
432 hold = 1;
433 continue;
434 }
435 /*
436 * When buffer is full, assume EUC/DEC.
437 */
438 code |= ASSUME;
439 }
440 }
441 if (hold) {
442 /*
443 * Flush hold buffer.
444 */
445 flush(code);
446 hold = 0;
447 }
448 c = out(str, len, code);
449 code |= c & JIS, code &= c | ~BIT8;
450 }
451 if (hold)
452 /*
453 * Assume EUC.
454 */
455 flush(code |= ASSUME);
456 if (verbose)
457 showcode(code, stderr);
458 }
459
460 /*---------------------------------------------------------------------
461 NAME
462 check
463 ---------------------------------------------------------------------*/
check(fp)464 void check(fp)
465 FILE *fp;
466 {
467 register unsigned code, c;
468 register int len;
469 char str[LENLINE];
470 void showcode();
471 unsigned guess();
472
473 code = extend ? BIT8 : BIT8 & ~DEC;
474 gsmode = M_ASCII;
475 insi = inso = innj = ingj = 0;
476 while (len = getstr(str, sizeof str, fp)) {
477 c = guess(str, len);
478 code |= c & (JIS | NONASCII), code &= c | ~BIT8;
479 if (code & NONASCII && !(code & BIT8))
480 break;
481 }
482 showcode(code, stdout);
483 }
484
485 /*---------------------------------------------------------------------
486 NAME
487 showcode
488 ---------------------------------------------------------------------*/
showcode(code,fp)489 void showcode(code, fp)
490 register unsigned code;
491 register FILE *fp;
492 {
493 char *s;
494 void showjis();
495
496 if (filename)
497 if (fprintf(fp, "%s:\t", filename) < 9)
498 putc('\t', fp);
499 if (!(code & NONASCII)) {
500 /*
501 * 7-bit JIS / ASCII.
502 */
503 if (code & JIS) {
504 showjis('7', fp);
505 putc('\n', fp);
506 } else
507 fputs("ASCII\n", fp);
508 return;
509 } else if (code & (EUC | DEC)) {
510 s = code & EUC ? code & DEC ? "EUC/DEC" : "EUC" : "DEC";
511 if (code & SJIS) {
512 /*
513 * Ambiguous.
514 */
515 fprintf(fp, "ambiguous (%s", s);
516 if (code & JIS8) {
517 fputs(code & JIS ?
518 " with 7-bit JIS, or " : ", shift-JIS or ", fp);
519 showjis('8', fp);
520 if (code & ASSUME)
521 fprintf(fp, "; assumed %s",
522 code & JIS ? "the former" : s);
523 fputs(")\n", fp);
524 return;
525 }
526 fputs(" or shift-JIS", fp);
527 if (code & ASSUME)
528 fprintf(fp, "; assumed %s", s);
529 fputs(")", fp);
530 } else
531 /*
532 * EUC/DEC.
533 */
534 fputs(s, fp);
535 } else if (code & JIS8) {
536 /*
537 * 8-bit JIS / shift-JIS or 8-bit JIS.
538 */
539 if (!(code & JIS))
540 fputs("shift-JIS or ", fp);
541 showjis('8', fp);
542 putc('\n', fp);
543 return;
544 } else if (code & SJIS)
545 /*
546 * Shift-JIS.
547 */
548 fputs("shift-JIS", fp);
549 else {
550 /*
551 * Non-ASCII deteced but neither EUC/DEC nor SJIS.
552 */
553 fputs("data\n", fp);
554 return;
555 }
556 if (code & JIS) {
557 fputs(" with ", fp);
558 showjis('7', fp);
559 }
560 putc('\n', fp);
561 }
562
563 /*---------------------------------------------------------------------
564 NAME
565 showjis
566 ---------------------------------------------------------------------*/
showjis(bit,fp)567 void showjis(bit, fp)
568 int bit; /* 8-bit or 7-bit */
569 FILE *fp;
570 {
571 bool comma;
572 bool showesc();
573
574 fprintf(fp, "%c-bit JIS [", bit);
575 comma = showesc("ESC$", insi, 0, fp);
576 comma = showesc("ESC&@ESC$", innj, comma, fp);
577 comma = showesc("ESC(", inso, comma, fp);
578 showesc("ESC$(", ingj, comma, fp);
579 putc(']', fp);
580 }
581
/*---------------------------------------------------------------------
    NAME
        showesc - list the escape sequences recorded in a mask
    DESCRIPTION
        For every bit set in `mask' (bit 0 stands for '@', bit 1 for
        'A', ...) writes `str' followed by that letter to fp.  Entries
        are separated by ", "; `comma' tells whether a separator is
        already needed before the first entry.
    RETURNS
        Non-zero if a separator is needed before whatever is listed
        next, i.e. `comma' itself when nothing was written.
 ---------------------------------------------------------------------*/
bool showesc(char *str, unsigned long mask, bool comma, FILE *fp)
{
    int letter;

    for (letter = '@'; mask != 0; mask >>= 1, letter++) {
        if (!(mask & 1UL))
            continue;
        if (comma)
            fputs(", ", fp);
        comma = 1;
        fputs(str, fp);
        putc(letter, fp);
    }
    return (comma);
}
606
/*---------------------------------------------------------------------
    NAME
        getstr - read one line, or at most n - 1 bytes
    DESCRIPTION
        Copies bytes from fp into str until a newline has been stored,
        n - 1 bytes have been stored, or end of file is reached.  The
        result is NOT NUL-terminated.
    RETURNS
        The number of bytes stored; 0 at end of file.
 ---------------------------------------------------------------------*/
int getstr(char *str, int n, FILE *fp)
{
    int count = 0;
    int ch;

    while (count < n - 1) {
        ch = getc(fp);
        if (ch == EOF)
            break;
        str[count++] = ch;
        if (ch == '\n')
            break;
    }
    return (count);
}
624
625 /**********************************************************************
626 * *
627 * Hold Buffer Operations *
628 * *
629 **********************************************************************/
char *holdbuf;                  /* start of the hold buffer */
char *bufend;                   /* one past its last byte */
char *bufp;                     /* next free byte in the hold buffer */
632
633 /*---------------------------------------------------------------------
634 NAME
635 buffalloc
636 ---------------------------------------------------------------------*/
buffalloc(len)637 void buffalloc(len)
638 unsigned len;
639 {
640 if ((bufp = holdbuf = (char *) malloc(len)) == NULL)
641 error("out of memory");
642 bufend = holdbuf + len;
643 }
644
645 /*---------------------------------------------------------------------
646 NAME
647 append
648 ---------------------------------------------------------------------*/
append(s,len)649 bool append(s, len)
650 register char *s;
651 register int len;
652 {
653 if (bufp + len > bufend)
654 return (0);
655 for (; len; --len)
656 *bufp++ = *(u_char *) s++;
657 return (1);
658 }
659
660 /*---------------------------------------------------------------------
661 NAME
662 flush
663 ---------------------------------------------------------------------*/
flush(code)664 void flush(code)
665 unsigned code;
666 {
667 unsigned out();
668
669 out(holdbuf, bufp - holdbuf, code);
670 bufp = holdbuf;
671 }
672
673 /**********************************************************************
674 * *
675 * General *
676 * *
677 **********************************************************************/
/*---------------------------------------------------------------------
    NAME
        compare - test whether str begins with s
    DESCRIPTION
        `s' is a NUL-terminated pattern.  `str' is read only as far as
        needed: up to the first mismatch, or for the length of `s'.
    RETURNS
        1 if every character of `s' matches the start of `str' (always
        the case for an empty `s'), otherwise 0.
 ---------------------------------------------------------------------*/
bool compare(char *s, char *str)
{
    for (; *s != '\0'; s++, str++) {
        if (*s != *str)
            return (0);
    }
    return (1);
}
690
691 /**********************************************************************
692 * *
693 * Guessing *
694 * *
695 **********************************************************************/
696 /*---------------------------------------------------------------------
697 NAME
698 guess - distinguish code system
699 ---------------------------------------------------------------------*/
guess(str,len)700 unsigned guess(str, len)
701 char *str;
702 int len;
703 {
704 register char *s;
705 register int euc, sjis, dec;
706 bool jis8;
707 register unsigned code;
708 register int i;
709 enum mode old;
710
711 euc = sjis = 1;
712 dec = extend ? 1 : 0;
713 jis8 = 1;
714 code = 0;
715 for (s = str; s < str + len; s += i) {
716 i = 1;
717 switch (*(u_char *) s) {
718 case ESC:
719 if (gsmode == M_SO)
720 continue;
721 old = gsmode;
722 if (compare("$B", s + 1) || compare("$@", s + 1)) {
723 gsmode = M_KANJI; /* kanji */
724 insi |= bitflag(((u_char *) s)[2]);
725 i = 3;
726 } else if (compare("&@\033$B", s + 1)) {
727 gsmode = M_KANJI; /* kanji 1990 */
728 innj |= bitflag('B');
729 i = 6;
730 } else if (compare("(B", s + 1) ||
731 compare("(J", s + 1) || compare("(H", s + 1)) {
732 gsmode = M_ASCII; /* kanji end */
733 inso |= bitflag(((u_char *) s)[2]);
734 i = 3;
735 } else if (compare("(I", s + 1)) {
736 gsmode = M_KANJI; /* "ESC(I" */
737 inso |= bitflag('I');
738 i = 3;
739 } else if (compare("$(D", s + 1)) {
740 gsmode = M_KANJI; /* gaiji */
741 ingj |= bitflag('D');
742 i = 4;
743 } else
744 break;
745 code |= JIS;
746 if (old != M_ASCII)
747 continue;
748 break;
749 case SO:
750 if (gsmode == M_ASCII) {
751 code |= JIS;
752 gsmode = M_SO;
753 break;
754 }
755 continue;
756 case SI:
757 if (gsmode == M_SO) {
758 gsmode = M_ASCII;
759 continue;
760 }
761 /* fall thru */
762 default:
763 if (gsmode != M_ASCII)
764 continue;
765 break;
766 }
767 if (*(u_char *) s & 0x80)
768 code |= NONASCII;
769 switch (euc) {
770 case 1:
771 /*
772 * EUC first byte.
773 */
774 if (*(u_char *) s & 0x80) {
775 if (0xa0 < *(u_char *) s && *(u_char *) s < 0xff ||
776 !zenkaku && *(u_char *) s == SS2) {
777 euc = 2;
778 break;
779 }
780 if (extend)
781 if (*(u_char *) s == SS3) {
782 euc = 2;
783 break;
784 } else if (*(u_char *) s < 0xa0)
785 break;
786 euc = 0; /* not EUC */
787 }
788 break;
789 case 2:
790 /*
791 * EUC second byte or third byte of CS3.
792 */
793 if (((u_char *) s)[-1] == SS2) {
794 if (0xa0 < *(u_char *) s &&
795 *(u_char *) s < (extend ? 0xff : 0xe0)) {
796 euc = 1; /* hankaku kana */
797 break;
798 }
799 } else
800 if (0xa0 < *(u_char *) s && *(u_char *) s < 0xff) {
801 if (((u_char *) s)[-1] != SS3)
802 euc = 1;/* zenkaku */
803 break;
804 }
805 euc = 0; /* not EUC */
806 break;
807 }
808 if (extend)
809 switch (dec) {
810 case 1:
811 /*
812 * DEC first byte.
813 */
814 if (*(u_char *) s & 0x80) {
815 if (0xa0 < *(u_char *) s && *(u_char *) s < 0xff) {
816 dec = 2;
817 break;
818 } else if (*(u_char *) s < 0xa0)
819 break;
820 dec = 0; /* not DEC */
821 }
822 break;
823 case 2:
824 /*
825 * DEC second byte.
826 */
827 if (0x20 < (*(u_char *) s & 0x7f) &&
828 (*(u_char *) s & 0x7f) < 0x7f) {
829 dec = 1;
830 } else
831 dec = 0; /* not DEC */
832 break;
833 }
834 switch (sjis) {
835 case 1:
836 /*
837 * shift-JIS first byte.
838 */
839 if (*(u_char *) s & 0x80) {
840 if (0xa0 < *(u_char *) s && *(u_char *) s < 0xe0) {
841 if (!zenkaku)
842 break; /* hankaku */
843 } else if (*(u_char *) s != 0x80 &&
844 *(u_char *) s != 0xa0 &&
845 *(u_char *) s <= (extend ? 0xfc : 0xef)) {
846 sjis = 2; /* zenkaku */
847 jis8 = 0;
848 break;
849 }
850 sjis = 0; /* not SJIS */
851 }
852 break;
853 case 2:
854 /*
855 * shift-JIS second byte.
856 */
857 if (0x40 <= *(u_char *) s && *(u_char *) s != 0x7f &&
858 *(u_char *) s <= 0xfc)
859 sjis = 1;
860 else
861 sjis = 0; /* not SJIS */
862 break;
863 }
864 }
865 if (euc == 1)
866 code |= EUC;
867 if (dec == 1)
868 code |= DEC;
869 if (sjis == 1)
870 code |= zenkaku || !jis8 ? SJIS : SJIS | JIS8;
871 return (code);
872 }
873
874 /**********************************************************************
875 * *
876 * Output Routines *
877 * *
878 **********************************************************************/
/*
 * Output converters for the selected output code; set by setfunc().
 */
void (*outascii)();             /* one ASCII (or other single) byte */
void (*outkanji)();             /* one kanji, as two 7-bit JIS bytes */
void (*outgaiji)();             /* one gaiji, as two bytes */
void (*outkana)();              /* one hankaku kana */
880
881 /*---------------------------------------------------------------------
882 NAME
883 out
884 ---------------------------------------------------------------------*/
out(str,len,code)885 unsigned out(str, len, code)
886 char *str;
887 int len;
888 register unsigned code;
889 {
890 register char *s;
891 register int i;
892 void outsjis();
893
894 for (s = str; s < str + len; s += i) {
895 i = 1;
896 switch (*(u_char *) s) {
897 case ESC:
898 if (inmode == M_SO)
899 break;
900 if (compare("$B", s + 1) || compare("$@", s + 1)) {
901 inmode = M_KANJI; /* kanji */
902 insi |= bitflag(((u_char *) s)[2]);
903 i = 3;
904 } else if (compare("&@\033$B", s + 1)) {
905 inmode = M_KANJI; /* kanji 1990 */
906 innj |= bitflag('B');
907 i = 6;
908 } else if (compare("(B", s + 1) || compare("(J", s + 1) ||
909 compare("(H", s + 1)) {
910 inmode = M_ASCII; /* kanji end */
911 inso |= bitflag(((u_char *) s)[2]);
912 i = 3;
913 } else if (compare("(I", s + 1)) {
914 inmode = M_ESCI; /* "ESC(I" */
915 inso |= bitflag('I');
916 i = 3;
917 } else if (compare("$(D", s + 1)) {
918 inmode = M_GAIJI; /* gaiji */
919 ingj |= bitflag('D');
920 i = 4;
921 } else
922 break;
923 code |= JIS;
924 continue;
925 case SO:
926 if (inmode == M_ASCII) {
927 code |= JIS;
928 inmode = M_SO;
929 continue;
930 }
931 break;
932 case SI:
933 if (inmode == M_SO) {
934 inmode = M_ASCII;
935 continue;
936 }
937 break;
938 }
939 if (inmode != M_ASCII) {
940 if (0x20 < ((u_char *) s)[0] && ((u_char *) s)[0] < 0x7f)
941 switch (inmode) {
942 case M_KANJI:
943 (*outkanji)(((u_char *) s)[0],
944 ((u_char *) s)[1] & 0x7f);
945 i = 2;
946 continue;
947 case M_GAIJI:
948 (*outgaiji)(((u_char *) s)[0],
949 ((u_char *) s)[1] & 0x7f);
950 i = 2;
951 continue;
952 case M_SO:
953 case M_ESCI:
954 (*outkana)(((u_char *) s)[0]);
955 continue;
956 }
957 } else if (((u_char *) s)[0] & 0x80)
958 if (code & (EUC | DEC)) {
959 /*
960 * EUC or DEC:
961 */
962 if (0xa0 < ((u_char *) s)[0] &&
963 ((u_char *) s)[0] < 0xff) {
964 if (!(((u_char *) s)[1] & 0x80) && code & DEC) {
965 /*
966 * DEC gaiji:
967 */
968 code &= ~EUC; /* definitely DEC */
969 (*outgaiji)(((u_char *) s)[0] & 0x7f,
970 ((u_char *) s)[1]);
971 } else
972 /*
973 * EUC code set 1 (kanji), DEC kanji:
974 */
975 (*outkanji)(((u_char *) s)[0] & 0x7f,
976 ((u_char *) s)[1] & 0x7f);
977 } else if (((u_char *) s)[0] == SS2 && code & EUC &&
978 0xa0 < ((u_char *) s)[1] &&
979 ((u_char *) s)[1] < 0xff) {
980 /*
981 * EUC code set 2 (hankaku kana):
982 */
983 code &= ~DEC; /* probably EUC */
984 (*outkana)(((u_char *) s)[1] & 0x7f);
985 } else if (((u_char *) s)[0] == SS3 && code & EUC &&
986 0xa0 < ((u_char *) s)[1] &&
987 ((u_char *) s)[1] < 0xff &&
988 0xa0 < ((u_char *) s)[2] &&
989 ((u_char *) s)[2] < 0xff) {
990 /*
991 * EUC code set 3 (gaiji):
992 */
993 code &= ~DEC; /* probably EUC */
994 (*outgaiji)(((u_char *) s)[1] & 0x7f,
995 ((u_char *) s)[2] & 0x7f);
996 i = 3;
997 continue;
998 } else {
999 /*
1000 * Control character (C1):
1001 */
1002 if (outcode != SJIS && (outcode != EUC ||
1003 ((u_char *) s)[0] != SS2 &&
1004 ((u_char *) s)[0] != SS3))
1005 putchar(((u_char *) s)[0]);
1006 continue;
1007 }
1008 i = 2;
1009 continue;
1010 } else if (code & (SJIS | JIS8)) {
1011 /*
1012 * Shift-JIS or JIS8:
1013 */
1014 if (!(code & SJIS) || 0xa0 < ((u_char *) s)[0] &&
1015 ((u_char *) s)[0] < 0xe0)
1016 /*
1017 * Hankaku kana:
1018 */
1019 (*outkana)(((u_char *) s)[0] & 0x7f);
1020 else {
1021 /*
1022 * Shift-JIS kanji:
1023 */
1024 code &= ~JIS8; /* definitely shift-JIS */
1025 outsjis(((u_char *) s)[0], ((u_char *) s)[1]);
1026 i = 2;
1027 }
1028 continue;
1029 }
1030 (*outascii)(((u_char *) s)[0]);
1031 }
1032 return (code);
1033 }
1034
1035 /*---------------------------------------------------------------------
1036 NAME
1037 outsjis
1038 ---------------------------------------------------------------------*/
outsjis(c1,c2)1039 void outsjis(c1, c2)
1040 register int c1, c2;
1041 {
1042 register int c;
1043
1044 c = c1 * 2 - (c1 <= 0x9f ? 0x00e1 : (c1 < 0xf0 ? 0x0161 : 0x01bf));
1045 if (c2 < 0x9f)
1046 c2 = c2 - (c2 > 0x7f ? 0x20 : 0x1f);
1047 else {
1048 c2 = c2 - 0x7e;
1049 c++;
1050 }
1051 (*(c1 <= 0xef ? outkanji : outgaiji))(c, c2);
1052 }
1053
1054 /**********************************************************************
1055 * *
1056 * Conversion Routines *
1057 * *
1058 **********************************************************************/
/* Plain pass-through. */
void outchar();

/* JIS output. */
void jisascii();
void jiskanji();
void jisgaiji();
void jiskana();
void jiskanak();
void jiskana8();

/* EUC output. */
void euckanji();
void eucgaiji();
void euckana();

/* Shift-JIS output. */
void sjiskanji();
void sjisgaiji();
void sjiskana();

/* DEC output. */
void decascii();
void deckanji();
void decgaiji();
void deckana();

int lastkana = 0;               /* last hankaku kana for DEC */
1067
1068 /*---------------------------------------------------------------------
1069 NAME
1070 setfunc
1071 ---------------------------------------------------------------------*/
setfunc()1072 void setfunc()
1073 {
1074 switch (outcode) {
1075 case EUC:
1076 outascii = outchar;
1077 outkanji = euckanji;
1078 outgaiji = eucgaiji;
1079 outkana = euckana;
1080 break;
1081 case DEC:
1082 outascii = decascii;
1083 outkanji = deckanji;
1084 outgaiji = decgaiji;
1085 outkana = deckana;
1086 break;
1087 case SJIS:
1088 outascii = outchar;
1089 outkanji = sjiskanji;
1090 outgaiji = sjisgaiji;
1091 outkana = sjiskana;
1092 break;
1093 default:
1094 outascii = jisascii;
1095 outkanji = jiskanji;
1096 outgaiji = jisgaiji;
1097 switch (outcode) {
1098 case JIS: /* mode: M_ASCII M_KANJI M_GAIJI
1099 * M_SO */
1100 outkana = jiskana;
1101 break;
1102 case JIS | ESCI: /* mode: M_ASCII M_KANJI M_GAIJI
1103 * M_ESCI */
1104 outkana = jiskanak;
1105 break;
1106 case JIS | JIS8: /* mode: M_ASCII M_KANJI M_GAIJI */
1107 outkana = jiskana8;
1108 break;
1109 }
1110 break;
1111 }
1112 }
1113
/*---------------------------------------------------------------------
    NAME
        outchar - write one byte to stdout unchanged
 ---------------------------------------------------------------------*/
void outchar(int c)
{
    putc(c, stdout);
}
1123
1124 /*---------------------------------------------------------------------
1125 NAME
1126 jisascii
1127 ---------------------------------------------------------------------*/
jisascii(c)1128 void jisascii(c)
1129 register int c;
1130 {
1131 switch (outmode) {
1132 case M_ASCII:
1133 break;
1134 case M_SO:
1135 putchar(SI);
1136 outmode = M_ASCII;
1137 break;
1138 default:
1139 fputs(shiftout, stdout);
1140 outmode = M_ASCII;
1141 break;
1142 }
1143 putchar(c);
1144 }
1145
1146 /*---------------------------------------------------------------------
1147 NAME
1148 jiskanji
1149 ---------------------------------------------------------------------*/
jiskanji(c1,c2)1150 void jiskanji(c1, c2)
1151 register int c1, c2;
1152 {
1153 if (outmode != M_KANJI) {
1154 if (outmode == M_SO)
1155 putchar(SI);
1156 fputs(shiftin, stdout);
1157 outmode = M_KANJI;
1158 }
1159 putchar(c1);
1160 putchar(c2);
1161 }
1162
1163 /*---------------------------------------------------------------------
1164 NAME
1165 jisgaiji
1166 ---------------------------------------------------------------------*/
jisgaiji(c1,c2)1167 void jisgaiji(c1, c2)
1168 register int c1, c2;
1169 {
1170 if (nogaiji)
1171 jiskanji(ZENPAD >> 8, ZENPAD & 0xff);
1172 else {
1173 if (outmode != M_GAIJI) {
1174 if (outmode == M_SO)
1175 putchar(SI);
1176 fputs("\033$(D", stdout);
1177 outmode = M_GAIJI;
1178 }
1179 putchar(c1);
1180 putchar(c2);
1181 }
1182 }
1183
1184 /*---------------------------------------------------------------------
1185 NAME
1186 jiskana
1187 ---------------------------------------------------------------------*/
jiskana(c)1188 void jiskana(c)
1189 register int c;
1190 {
1191 if (outmode != M_SO) {
1192 if (outmode != M_ASCII)
1193 fputs(shiftout, stdout);
1194 putchar(SO);
1195 outmode = M_SO;
1196 }
1197 putchar(!nogaiji || 0x20 < c && c < 0x60 ? c : HANPAD);
1198 }
1199
1200 /*---------------------------------------------------------------------
1201 NAME
1202 jiskanak
1203 ---------------------------------------------------------------------*/
jiskanak(c)1204 void jiskanak(c)
1205 register int c;
1206 {
1207 if (outmode != M_ESCI) {
1208 fputs("\033(I", stdout);
1209 outmode = M_ESCI;
1210 }
1211 putchar(!nogaiji || 0x20 < c && c < 0x60 ? c : HANPAD);
1212 }
1213
1214 /*---------------------------------------------------------------------
1215 NAME
1216 jiskana8
1217 ---------------------------------------------------------------------*/
jiskana8(c)1218 void jiskana8(c)
1219 register int c;
1220 {
1221 if (outmode != M_ASCII) {
1222 fputs(shiftout, stdout);
1223 outmode = M_ASCII;
1224 }
1225 putchar((!nogaiji || 0x20 < c && c < 0x60 ? c : HANPAD) | 0x80);
1226 }
1227
/*---------------------------------------------------------------------
    NAME
        euckanji - write a kanji in EUC output
    DESCRIPTION
        Writes both bytes with the top bit set.
 ---------------------------------------------------------------------*/
void euckanji(int c1, int c2)
{
    int first = c1 | 0x80;
    int second = c2 | 0x80;

    putc(first, stdout);
    putc(second, stdout);
}
1238
1239 /*---------------------------------------------------------------------
1240 NAME
1241 eucgaiji
1242 ---------------------------------------------------------------------*/
eucgaiji(c1,c2)1243 void eucgaiji(c1, c2)
1244 register int c1, c2;
1245 {
1246 if (nogaiji) {
1247 putchar(ZENPAD >> 8 | 0x80);
1248 putchar(ZENPAD & 0xff | 0x80);
1249 } else {
1250 putchar(SS3);
1251 putchar(c1 | 0x80);
1252 putchar(c2 | 0x80);
1253 }
1254 }
1255
1256 /*---------------------------------------------------------------------
1257 NAME
1258 euckana
1259 ---------------------------------------------------------------------*/
euckana(c)1260 void euckana(c)
1261 register int c;
1262 {
1263 putchar(SS2);
1264 putchar((!nogaiji || 0x20 < c && c < 0x60 ? c : HANPAD) | 0x80);
1265 }
1266
/*---------------------------------------------------------------------
    NAME
        sjiskanji - write a kanji in shift-JIS output
    DESCRIPTION
        Converts the two-byte 7-bit code (c1, c2) to its shift-JIS byte
        pair and writes it.  Two consecutive values of c1 share one lead
        byte; an odd c1 uses the lower half of the trail byte range, an
        even c1 the upper half.
 ---------------------------------------------------------------------*/
void sjiskanji(int c1, int c2)
{
    int lead;
    int trail;

    lead = ((c1 - 1) >> 1) + (c1 <= 0x5e ? 0x71 : 0xb1);
    if (!(c1 & 1))
        trail = c2 + 0x7e;
    else if (c2 < 0x60)
        trail = c2 + 0x1f;
    else
        trail = c2 + 0x20;

    putc(lead, stdout);
    putc(trail, stdout);
}
1277
1278 /*---------------------------------------------------------------------
1279 NAME
1280 sjisgaiji
1281 DESCRIPTION
1282 Characters are mapped as follows:
1283 0x2121 to 0x3a7e --> 0xf040 to 0xfcfc
1284 0x3b21 to 0x7e7e --> 0xfcfc
1285 ---------------------------------------------------------------------*/
sjisgaiji(c1,c2)1286 void sjisgaiji(c1, c2)
1287 register int c1, c2;
1288 {
1289 if (nogaiji)
1290 sjiskanji(ZENPAD >> 8, ZENPAD & 0xff);
1291 else {
1292 putchar(c1 < 0x3b ? (c1 - 1 >> 1) + 0xe0 : 0xfc);
1293 putchar(c1 < 0x3b ? c2 +
1294 (c1 & 1 ? (c2 < 0x60 ? 0x1f : 0x20) : 0x7e) : 0xfc);
1295 }
1296 }
1297
1298 /*---------------------------------------------------------------------
1299 NAME
1300 sjiskana
1301 ---------------------------------------------------------------------*/
sjiskana(c)1302 void sjiskana(c)
1303 register int c;
1304 {
1305 putchar(0x20 < c && c < 0x60 ? c | 0x80 : HANPAD | 0x80);
1306 }
1307
1308 /*---------------------------------------------------------------------
1309 NAME
1310 decascii
1311 ---------------------------------------------------------------------*/
decascii(c)1312 void decascii(c)
1313 register int c;
1314 {
1315 if (lastkana) {
1316 putchar(kanatbl[lastkana] >> 8);
1317 putchar(kanatbl[lastkana] & 0xff);
1318 lastkana = 0;
1319 }
1320 putchar(c);
1321 }
1322
1323 /*---------------------------------------------------------------------
1324 NAME
1325 deckanji
1326 ---------------------------------------------------------------------*/
deckanji(c1,c2)1327 void deckanji(c1, c2)
1328 register int c1, c2;
1329 {
1330 if (lastkana) {
1331 putchar(kanatbl[lastkana] >> 8);
1332 putchar(kanatbl[lastkana] & 0xff);
1333 lastkana = 0;
1334 }
1335 putchar(c1 | 0x80);
1336 putchar(c2 | 0x80);
1337 }
1338
1339 /*---------------------------------------------------------------------
1340 NAME
1341 decgaiji
1342 ---------------------------------------------------------------------*/
decgaiji(c1,c2)1343 void decgaiji(c1, c2)
1344 register int c1, c2;
1345 {
1346 if (lastkana) {
1347 putchar(kanatbl[lastkana] >> 8);
1348 putchar(kanatbl[lastkana] & 0xff);
1349 lastkana = 0;
1350 }
1351 if (nogaiji) {
1352 putchar(ZENPAD >> 8 | 0x80);
1353 putchar(ZENPAD & 0xff | 0x80);
1354 } else {
1355 putchar(c1 | 0x80);
1356 putchar(c2);
1357 }
1358 }
1359
1360 /*---------------------------------------------------------------------
1361 NAME
1362 deckana
1363 ---------------------------------------------------------------------*/
deckana(c)1364 void deckana(c)
1365 register int c;
1366 {
1367 register int cc;
1368 int i;
1369 extern unsigned char dakuon[];
1370
1371 if (lastkana) {
1372 cc = kanatbl[lastkana];
1373 if ((c == 0x5e || c == 0x5f) &&
1374 (i = dakuon[lastkana] & (c == 0x5e ? 1 : 2))) {
1375 cc += i;
1376 c = -1;
1377 }
1378 putchar(cc >> 8);
1379 putchar(cc & 0xff);
1380 }
1381 if (c < 0x21 || 0x5f < c) {
1382 if (c > 0) {
1383 putchar(ZENPAD >> 8);
1384 putchar(ZENPAD & 0xff);
1385 }
1386 lastkana = 0;
1387 } else
1388 lastkana = c - 0x20;
1389 }
1390
1391 /*---------------------------------------------------------------------
1392 TYPE
1393 table
1394 NAME
1395 katakana, hiragana, dakuon - JIS X0201 kana to JIS kanji in DEC
1396 ---------------------------------------------------------------------*/
/*
 * Indexed by (kana code - 0x20), as deckana() computes it; slot 0 is
 * unused.  Each entry is a two-byte code, high byte first, with bit 7
 * already set on both bytes.  The range beside each row is the kana
 * code range it covers.
 */
unsigned short katakana[] = {
    0,      0xa1a3, 0xa1d6, 0xa1d7,     /* 0x20-0x23 */
    0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1,     /* 0x24-0x27 */
    0xa5a3, 0xa5a5, 0xa5a7, 0xa5a9,     /* 0x28-0x2b */
    0xa5e3, 0xa5e5, 0xa5e7, 0xa5c3,     /* 0x2c-0x2f */
    0xa1bc, 0xa5a2, 0xa5a4, 0xa5a6,     /* 0x30-0x33 */
    0xa5a8, 0xa5aa, 0xa5ab, 0xa5ad,     /* 0x34-0x37 */
    0xa5af, 0xa5b1, 0xa5b3, 0xa5b5,     /* 0x38-0x3b */
    0xa5b7, 0xa5b9, 0xa5bb, 0xa5bd,     /* 0x3c-0x3f */
    0xa5bf, 0xa5c1, 0xa5c4, 0xa5c6,     /* 0x40-0x43 */
    0xa5c8, 0xa5ca, 0xa5cb, 0xa5cc,     /* 0x44-0x47 */
    0xa5cd, 0xa5ce, 0xa5cf, 0xa5d2,     /* 0x48-0x4b */
    0xa5d5, 0xa5d8, 0xa5db, 0xa5de,     /* 0x4c-0x4f */
    0xa5df, 0xa5e0, 0xa5e1, 0xa5e2,     /* 0x50-0x53 */
    0xa5e4, 0xa5e6, 0xa5e8, 0xa5e9,     /* 0x54-0x57 */
    0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed,     /* 0x58-0x5b */
    0xa5ef, 0xa5f3, 0xa1ab, 0xa1ac,     /* 0x5c-0x5f */
};
1407
/*
 * Same layout and indexing as katakana[] (kana code - 0x20, slot 0
 * unused).  The entries whose high byte is 0xa5 there carry 0xa4
 * here; the 0xa1xx entries are identical in both tables.
 */
unsigned short hiragana[] = {
    0,      0xa1a3, 0xa1d6, 0xa1d7,     /* 0x20-0x23 */
    0xa1a2, 0xa1a6, 0xa4f2, 0xa4a1,     /* 0x24-0x27 */
    0xa4a3, 0xa4a5, 0xa4a7, 0xa4a9,     /* 0x28-0x2b */
    0xa4e3, 0xa4e5, 0xa4e7, 0xa4c3,     /* 0x2c-0x2f */
    0xa1bc, 0xa4a2, 0xa4a4, 0xa4a6,     /* 0x30-0x33 */
    0xa4a8, 0xa4aa, 0xa4ab, 0xa4ad,     /* 0x34-0x37 */
    0xa4af, 0xa4b1, 0xa4b3, 0xa4b5,     /* 0x38-0x3b */
    0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd,     /* 0x3c-0x3f */
    0xa4bf, 0xa4c1, 0xa4c4, 0xa4c6,     /* 0x40-0x43 */
    0xa4c8, 0xa4ca, 0xa4cb, 0xa4cc,     /* 0x44-0x47 */
    0xa4cd, 0xa4ce, 0xa4cf, 0xa4d2,     /* 0x48-0x4b */
    0xa4d5, 0xa4d8, 0xa4db, 0xa4de,     /* 0x4c-0x4f */
    0xa4df, 0xa4e0, 0xa4e1, 0xa4e2,     /* 0x50-0x53 */
    0xa4e4, 0xa4e6, 0xa4e8, 0xa4e9,     /* 0x54-0x57 */
    0xa4ea, 0xa4eb, 0xa4ec, 0xa4ed,     /* 0x58-0x5b */
    0xa4ef, 0xa4f3, 0xa1ab, 0xa1ac,     /* 0x5c-0x5f */
};
1418
/*
 * Per-kana flags read by deckana(), indexed by (kana code - 0x20).
 * Bit 0 set: a following 0x5e is merged, adding 1 to the table code.
 * Bit 1 set: a following 0x5f is merged, adding 2 to the table code.
 * An entry of 3 therefore accepts either mark; 0 accepts neither.
 */
unsigned char dakuon[] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0x20-0x2f */
    0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x30-0x3f */
    1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 0,     /* 0x40-0x4f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50-0x5f */
};
1429