1 /*
2 * KAKASI (Kanji Kana Simple inversion program)
3 * $Header: kanjiio.c,v 2.0 92/07/18 16:11:09 takahasi Exp $
4 * Copyright (C) 1992
5 * Hironobu Takahashi (takahasi@tiny.or.jp)
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either versions 2, or (at your option)
10 * any later version.
11 *
12 * This program is distributed in the hope that it will be useful
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with KAKASI, see the file COPYING. If not, write to the Free
19 * Software Foundation Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 */
21 /* $Log: kanjiio.c,v $
22 * Revision 2.0 92/07/18 16:11:09 takahasi
23 * *** empty log message ***
24 *
25 * Revision 2.0 92/07/18 15:38:40 takahasi
26 *
27 */
28 #include <stdio.h>
29 #include <string.h>
30 #include "kakasi.h"
31
32 int input_term_type = UNKNOWN;
33
34 int input_GL = SETG0;
35 int input_GR = SETG3;
36 int input_G[5] = {ASCII, KATAKANA, KATAKANA, JIS83, SJKANA};
37
38 int output_term_type = UNKNOWN;
39
40 int output_GL = SETG0;
41 int output_GR = SETG3;
42 int output_G[5] = {ASCII, KATAKANA, KATAKANA, JIS83, SJKANA};
43
44 /* 1 ʸ���Хåե� */
45
46 static Character kanji_buf={OTHER, 0, 0};
47 static int kanji_buf_set = 0;
48
ungetkanji(c)49 void ungetkanji(c)
50 Character *c;
51 {
52 kanji_buf.type = c->type;
53 kanji_buf.c1 = c->c1;
54 kanji_buf.c2 = c->c2;
55 kanji_buf_set = 1;
56 }
57
/* Single-byte input with a LIFO pushback stack */

static unsigned char input_stack[1024];
static int input_stack_depth = 0;   /* number of bytes currently pushed back */

/*
 * Return the next input byte.
 * Bytes pushed back with unget1byte() are returned first, most recently
 * pushed first; once the stack is empty the byte comes from getchar(),
 * so EOF is returned at end of input.
 */
static int get1byte()
{
    if (input_stack_depth > 0)
        return input_stack[--input_stack_depth];

    return getchar();
}

/*
 * Push one byte back onto the input.
 * The value is stored as an unsigned char, so a pushed-back EOF comes
 * back from get1byte() as a byte value (0xff where EOF is -1) rather
 * than as EOF.
 * If the stack is already full the byte is dropped instead of being
 * written past the end of the buffer.
 */
static void unget1byte(c)
     int c;
{
    if (input_stack_depth >= (int) sizeof(input_stack))
        return;

    input_stack[input_stack_depth++] = c;
}
76
getc0set1(gn)77 static int getc0set1(gn)
78 int gn;
79 {
80 int c3;
81 int set;
82
83 switch(c3 = get1byte()) {
84 case 'B':
85 set = ASCII; break;
86 case 'J':
87 set = JISROMAN; break;
88 case 'O':
89 set = GRAPHIC; break;
90 case 'I':
91 set = KATAKANA; break;
92 default:
93 unget1byte(c3); return -1;
94 }
95 input_G[gn] = set;
96 return 0;
97 }
98
set_terms(type,term_type,GL,GR,G)99 static void set_terms(type, term_type, GL, GR, G)
100 int type;
101 int *term_type;
102 int *GL;
103 int *GR;
104 int *G;
105 {
106 *term_type = type;
107
108 switch(type) {
109 case OLDJIS:
110 *GL=SETG0, *GR=SETG1,
111 G[0]=JISROMAN, G[1]=KATAKANA, G[2]=KATAKANA, G[3]=KATAKANA;
112 break;
113 case NEWJIS:
114 *GL=SETG0, *GR=SETG1,
115 G[0]=ASCII, G[1]=KATAKANA, G[2]=KATAKANA, G[3]=KATAKANA;
116 break;
117 case DEC:
118 *GL=SETG0, *GR=SETG3,
119 G[0]=ASCII, G[1]=GRAPHIC, G[2]=KATAKANA, G[3]=JIS83;
120 break;
121 case EUC:
122 *GL=SETG0, *GR=SETG3,
123 G[0]=ASCII, G[1]=KATAKANA, G[2]=KATAKANA, G[3]=JIS83;
124 break;
125 case MSKANJI:
126 *GL=SETG0, *GR=SJKANA,
127 G[0]=ASCII, G[1]=KATAKANA, G[2]=KATAKANA, G[3]=KATAKANA;
128 break;
129 }
130 }
131
set_input_term(type)132 void set_input_term(type)
133 int type;
134 {
135 set_terms(type, &input_term_type, &input_GL, &input_GR, input_G);
136 }
137
set_output_term(type)138 void set_output_term(type)
139 int type;
140 {
141 set_terms(type, &output_term_type, &output_GL, &output_GR, output_G);
142 }
143
getc0set2(gn)144 static int getc0set2(gn)
145 int gn;
146 {
147 int c4;
148 int set;
149
150 switch(c4 = get1byte()) {
151 case '@':
152 set = JIS78;
153 if (input_term_type == UNKNOWN)
154 set_input_term(OLDJIS);
155 if (output_term_type == UNKNOWN)
156 set_output_term(OLDJIS);
157 break;
158 case 'B':
159 set = JIS83;
160 if (input_term_type == UNKNOWN)
161 set_input_term(NEWJIS);
162 if (output_term_type == UNKNOWN)
163 set_output_term(NEWJIS);
164 break;
165 default:
166 unget1byte(c4); return -1;
167 }
168 input_G[gn] = set;
169 return 0;
170 }
171
getc0(c,c1)172 static void getc0(c, c1)
173 Character *c;
174 int c1;
175 {
176 int c2, c3;
177 int GL_save, GR_save;
178
179 switch(c1) {
180 case '\033':
181 switch(c2 = get1byte()) {
182 case '(':
183 if (getc0set1(SETG0) != 0) {
184 unget1byte(c2); c->type = OTHER; c->c1 = c1; return;
185 }
186 break;
187 case ')':
188 if (getc0set1(SETG1) != 0) {
189 unget1byte(c2); c->type = OTHER; c->c1 = c1; return;
190 }
191 break;
192 case '*':
193 if (getc0set1(SETG2) != 0) {
194 unget1byte(c2); c->type = OTHER; c->c1 = c1; return;
195 }
196 break;
197 case '+':
198 if (getc0set1(SETG3) != 0) {
199 unget1byte(c2); c->type = OTHER; c->c1 = c1; return;
200 }
201 break;
202 case '$':
203 switch(c3 = get1byte()) {
204 case '@':
205 if (input_term_type == UNKNOWN)
206 set_input_term(OLDJIS);
207 if (output_term_type == UNKNOWN)
208 set_output_term(OLDJIS);
209 input_G[SETG0] = JIS78;
210 break;
211 case 'B':
212 if (input_term_type == UNKNOWN)
213 set_input_term(NEWJIS);
214 if (output_term_type == UNKNOWN)
215 set_output_term(NEWJIS);
216 input_G[SETG0] = JIS83;
217 break;
218 case '(':
219 if (getc0set2(SETG0) != 0) {
220 unget1byte(c3); unget1byte(c2);
221 c->type = OTHER; c->c1 = c1; return;
222 }
223 break;
224 case ')':
225 if (getc0set2(SETG1) != 0) {
226 unget1byte(c3); unget1byte(c2);
227 c->type = OTHER; c->c1 = c1; return;
228 }
229 break;
230 case '*':
231 if (getc0set2(SETG2) != 0) {
232 unget1byte(c3); unget1byte(c2);
233 c->type = OTHER; c->c1 = c1; return;
234 }
235 break;
236 case '+':
237 if (getc0set2(SETG3) != 0) {
238 unget1byte(c3); unget1byte(c2);
239 c->type = OTHER; c->c1 = c1; return;
240 }
241 break;
242 default:
243 unget1byte(c3);
244 unget1byte(c2);
245 c->type = OTHER; c->c1 = c1; return;
246 }
247 break;
248 case 'n':
249 input_GL = SETG2;
250 break;
251 case 'o':
252 input_GL = SETG3;
253 break;
254 case '~':
255 input_GR = SETG1;
256 break;
257 case '}':
258 input_GR = SETG2;
259 break;
260 case '|':
261 input_GR = SETG3;
262 break;
263 case 'N':
264 GL_save = input_GL;
265 GR_save = input_GR;
266 input_GL = SETG2;
267 input_GR = SETG2;
268 getkanji(c);
269 input_GL = GL_save;
270 input_GR = GR_save;
271 return;
272 case 'O':
273 GL_save = input_GL;
274 GR_save = input_GR;
275 input_GL = SETG3;
276 input_GR = SETG3;
277 getkanji(c);
278 input_GL = GL_save;
279 input_GR = GR_save;
280 return;
281 default:
282 unget1byte(c2);
283 c->type = OTHER; c->c1 = c1; return;
284 }
285 break;
286 case 0xe:
287 input_GL = SETG1;
288 break;
289 case 0xf:
290 input_GL = SETG0;
291 break;
292 case EOF:
293 c->type = OTHER; c->c1 = 0xff; return;
294 default:
295 c->type = OTHER; c->c1 = c1; return;
296 }
297 getkanji(c);
298 }
299
getc1(c,c1)300 static void getc1(c, c1)
301 Character *c;
302 int c1;
303 {
304 int GL_save, GR_save;
305
306 switch(c1) {
307 case 0x8e:
308 GL_save = input_GL;
309 GR_save = input_GR;
310 input_GL = SETG2;
311 input_GR = SETG2;
312 getkanji(c);
313 input_GL = GL_save;
314 input_GR = GR_save;
315 return;
316 case 0x8f:
317 GL_save = input_GL;
318 GR_save = input_GR;
319 input_GL = SETG3;
320 input_GR = SETG3;
321 getkanji(c);
322 input_GL = GL_save;
323 input_GR = GR_save;
324 return;
325 default:
326 c->type = OTHER; c->c1 = c1; return;
327 }
328 }
329
getkanji(c)330 void getkanji(c)
331 Character *c;
332 {
333 int c1;
334
335 if (kanji_buf_set) {
336 c->type = kanji_buf.type;
337 c->c1 = kanji_buf.c1;
338 c->c2 = kanji_buf.c2;
339 kanji_buf_set = 0;
340 return;
341 }
342
343 c1 = get1byte();
344 if (c1 < 0x20) { /* C0 */
345 getc0(c, c1);
346 } else if (c1 < 0x7f) { /* GL */
347 c->type = input_G[input_GL];
348 switch(c->type) {
349 case JIS78:
350 c->c1 = c1|0x80; c->c2 = get1byte()|0x80;
351 exc78_83(c);
352 break;
353 case JIS83:
354 c->c1 = c1|0x80; c->c2 = get1byte()|0x80;
355 break;
356 default:
357 c->c1 = c1;
358 }
359 } else if (c1 == 0x7f) { /* C0 */
360 c->type = OTHER; c->c1 = c1;
361 } else { /* 0x80 - 0xff */
362 if (input_term_type == UNKNOWN) {
363 int c2, term_type;
364
365 c2 = get1byte(); unget1byte(c2);
366 if ((c1 <= 0x9f) && (c1 >= 0x81) &&
367 (c2 >= 0x40) && (c2 <= 0xfc) && (c2 != 0x7f))
368 term_type = MSKANJI;
369 else if ((c1 <= 0xe9) && (c1 >= 0xe0) &&
370 (c2 >= 0x40) && (c2 <= 0xfc) && (c2 != 0x7f))
371 term_type = MSKANJI;
372 else if ((c1 == 0xea) &&
373 (c2 >= 0x40) && (c2 <= 0x0a5) && (c2 != 0x7f))
374 term_type = MSKANJI;
375 else if ((c1 <= 0xf4) && (c1 >= 0xa1) &&
376 (c2 >= 0xa1) && (c2 <= 0xfe))
377 term_type = DEC;
378 else
379 term_type = NEWJIS;
380 set_input_term(term_type);
381 if (output_term_type == UNKNOWN) {
382 set_output_term(term_type);
383 }
384 }
385
386 if (input_term_type == MSKANJI) {
387 if ((0xa0 <= c1) && (c1 <= 0xdf)) {
388 c->type=KATAKANA; c->c1 = c1&0x7f;
389 } else if ((0x81 <= c1) && (c1 <= 0xea)) {
390 int o1, o2, c2;
391
392 c2 = get1byte();
393 if (c2 >= 0x9f) {
394 if (c1 >= 0xe0) o1 = c1*2 - 0xe0;
395 else o1 = c1*2 - 0x60;
396 o2 = c2 + 2;
397 } else {
398 if (c1 >= 0xe0) o1 = c1*2 - 0xe1;
399 else o1 = c1*2 - 0x61;
400 if (c2 >= 0x7f) o2 = c2 + 0x60;
401 else o2 = c2 + 0x61;
402 }
403 c->type=JIS83;
404 c->c1 = o1;
405 c->c2 = o2;
406 } else {
407 c->type=OTHER; c->c1 = c1;
408 }
409 } else {
410 if (c1 < 0xa0) { /* C1 */
411 getc1(c, c1);
412 } else if (c1 < 0xff) { /* GR */
413 c->type = input_G[input_GR];
414 switch(c->type) {
415 case JIS78:
416 c->c1 = c1; c->c2 = get1byte()|0x80;
417 exc78_83(c);
418 case JIS83:
419 c->c1 = c1; c->c2 = get1byte()|0x80;
420 break;
421 default:
422 c->c1 = c1 & 0x7f;
423 }
424 } else if (c1 == 0xff) { /* C1 */
425 c->type = OTHER; c->c1 = c1;
426 }
427 }
428 }
429 }
430
separator_proc(c)431 static void separator_proc(c)
432 Character *c;
433 {
434 Character sep;
435
436 switch(c->type) {
437 case OTHER:
438 case ASCII:
439 case JISROMAN:
440 switch(c->c1) {
441 case ' ':
442 case '\011':
443 case '\015':
444 separator_out = 0;
445 return;
446 }
447 }
448
449 if (separator_out != 2) {
450 separator_out = 1;
451 return;
452 }
453
454 sep.type = OTHER;
455 sep.c1 = ' ';
456 putkanji(&sep);
457 separator_out = 1;
458 }
459
460 /* 1 ʸ������ */
461
putkanji(c)462 void putkanji(c)
463 Character *c;
464 {
465 if (bunkatu_mode) {
466 separator_proc(c);
467 }
468
469 switch(output_term_type) {
470 case UNKNOWN:
471 switch(c->type) {
472 case OTHER:
473 case ASCII:
474 case JISROMAN:
475 if ((output_G[0] != ASCII) && (output_G[0] != JISROMAN)) {
476 putchar('\033');putchar('(');putchar('J');
477 output_G[0] = JISROMAN;}
478 if (output_GL != SETG0) {
479 putchar(0xf); output_GL = SETG0;}
480 putchar(c->c1);
481 break;
482 case KATAKANA:
483 if (output_G[0] != KATAKANA) {
484 putchar('\033');putchar('(');putchar('I');
485 output_G[0] = KATAKANA;}
486 if (output_GL != SETG0) {
487 putchar(0xf); output_GL = SETG0;}
488 putchar(c->c1);
489 break;
490 case JIS83:
491 case JIS78:
492 if ((output_G[0] != JIS78) && (output_G[0] != JIS83)) {
493 putchar('\033');putchar('$');putchar('@');
494 output_G[0] = JIS78;}
495 if (output_GL != SETG0) {
496 putchar(0xf); output_GL = SETG0;}
497 putchar((c->c1)&0x7f);
498 putchar((c->c2)&0x7f);
499 break;
500 }
501 break;
502 case OLDJIS:
503 switch(c->type) {
504 case OTHER:
505 if ((output_G[0] != ASCII) && (output_G[0] != JISROMAN)) {
506 putchar('\033');putchar('(');putchar('J');
507 output_G[0] = JISROMAN;}
508 if (output_GL != SETG0) {
509 putchar(0xf); output_GL = SETG0;}
510 putchar(c->c1);
511 break;
512 case ASCII:
513 if (output_G[0] != ASCII) {
514 putchar('\033');putchar('(');putchar('B');
515 output_G[0] = ASCII;}
516 if (output_GL != SETG0) {
517 putchar(0xf); output_GL = SETG0;}
518 putchar((c->c1)&0x7f);
519 break;
520 case JISROMAN:
521 if (output_G[0] != JISROMAN) {
522 putchar('\033');putchar('(');putchar('J');
523 output_G[0] = JISROMAN;}
524 if (output_GL != SETG0) {
525 putchar(0xf); output_GL = SETG0;}
526 putchar((c->c1)&0x7f);
527 break;
528 case KATAKANA:
529 if (output_G[0] != KATAKANA) {
530 putchar('\033');putchar('(');putchar('I');
531 output_G[0] = KATAKANA;}
532 if (output_GL != SETG0) {
533 putchar(0xf); output_GL = SETG0;}
534 putchar(c->c1);
535 break;
536 case JIS83:
537 exc78_83(c);
538 case JIS78:
539 if (output_G[0] != JIS78) {
540 putchar('\033');putchar('$');putchar('@');
541 output_G[0] = JIS78;}
542 if (output_GL != SETG0) {
543 putchar(0xf); output_GL = SETG0;}
544 putchar((c->c1)&0x7f);
545 putchar((c->c2)&0x7f);
546 break;
547 }
548 break;
549 case NEWJIS:
550 switch(c->type) {
551 case OTHER:
552 if ((output_G[0] != ASCII) && (output_G[0] != JISROMAN)) {
553 putchar('\033');putchar('(');putchar('B');
554 output_G[0] = ASCII;}
555 if (output_GL != SETG0) {
556 putchar(0xf); output_GL = SETG0;}
557 putchar(c->c1);
558 break;
559 case ASCII:
560 if (output_G[0] != ASCII) {
561 putchar('\033');putchar('(');putchar('B');
562 output_G[0] = ASCII;}
563 if (output_GL != SETG0) {
564 putchar(0xf); output_GL = SETG0;}
565 putchar((c->c1)&0x7f);
566 break;
567 case JISROMAN:
568 if (output_G[0] != JISROMAN) {
569 putchar('\033');putchar('(');putchar('J');
570 output_G[0] = JISROMAN;}
571 if (output_GL != SETG0) {
572 putchar(0xf); output_GL = SETG0;}
573 putchar((c->c1)&0x7f);
574 break;
575 case KATAKANA:
576 if (output_G[0] != KATAKANA) {
577 putchar('\033');putchar('(');putchar('I');
578 output_G[0] = KATAKANA;}
579 if (output_GL != SETG0) {
580 putchar(0xf); output_GL = SETG0;}
581 putchar(c->c1);
582 break;
583 case JIS78:
584 exc78_83(c);
585 case JIS83:
586 if (output_G[0] != JIS83) {
587 putchar('\033');putchar('$');putchar('B');
588 output_G[0] = JIS83;}
589 if (output_GL != SETG0) {
590 putchar(0xf); output_GL = SETG0;}
591 putchar((c->c1)&0x7f);
592 putchar((c->c2)&0x7f);
593 break;
594 }
595 break;
596 case DEC:
597 switch(c->type) {
598 case OTHER:
599 if ((output_G[0] != ASCII) && (output_G[0] != JISROMAN)) {
600 putchar('\033');putchar('(');putchar('B');
601 output_G[0] = ASCII;}
602 if (output_GL != SETG0) {
603 putchar(0xf); output_GL = SETG0;}
604 putchar(c->c1);
605 break;
606 case ASCII:
607 if (output_G[0] != ASCII) {
608 putchar('\033');putchar('(');putchar('B');
609 output_G[0] = ASCII;}
610 if (output_GL != SETG0) {
611 putchar(0xf); output_GL = SETG0;}
612 putchar((c->c1)&0x7f);
613 break;
614 case JISROMAN:
615 if (output_G[0] != JISROMAN) {
616 putchar('\033');putchar('(');putchar('J');
617 output_G[0] = JISROMAN;}
618 if (output_GL != SETG0) {
619 putchar(0xf); output_GL = SETG0;}
620 putchar((c->c1)&0x7f);
621 break;
622 case KATAKANA:
623 if (output_G[2] != KATAKANA) {
624 putchar('\033');putchar('*');putchar('I');
625 output_G[2] = KATAKANA;}
626 if (output_GR != SETG2) {
627 putchar('\033');putchar('}');output_GR=SETG2;}
628 putchar((c->c1)|0x80);
629 break;
630 case GRAPHIC:
631 if (output_G[1] != GRAPHIC) {
632 putchar('\033');putchar(')');putchar('0');
633 output_G[2] = GRAPHIC;}
634 if (output_GR != SETG1) {
635 putchar('\033');putchar('~');output_GR=SETG1;}
636 putchar((c->c1)|0x80);
637 break;
638 case JIS78:
639 exc78_83(c);
640 case JIS83:
641 if (output_G[3] != JIS83) {
642 putchar('\033');putchar('$');putchar('+');putchar('B');
643 output_G[3] = JIS83;}
644 if (output_GR != SETG3) {
645 putchar('\033'); putchar('|'); output_GR = SETG3;}
646 putchar((c->c1)|0x80);
647 putchar((c->c2)|0x80);
648 break;
649 }
650 break;
651 case EUC:
652 switch(c->type) {
653 case OTHER:
654 if ((output_G[0] != ASCII) && (output_G[0] != JISROMAN)) {
655 putchar('\033');putchar('(');putchar('B');
656 output_G[0] = ASCII;}
657 if (output_GL != SETG0) {
658 putchar(0xf); output_GL = SETG0;}
659 putchar(c->c1);
660 break;
661 case ASCII:
662 if (output_G[0] != ASCII) {
663 putchar('\033');putchar('(');putchar('B');
664 output_G[0] = ASCII;}
665 if (output_GL != SETG0) {
666 putchar(0xf); output_GL = SETG0;}
667 putchar((c->c1)&0x7f);
668 break;
669 case JISROMAN:
670 if (output_G[0] != JISROMAN) {
671 putchar('\033');putchar('(');putchar('J');
672 output_G[0] = JISROMAN;}
673 if (output_GL != SETG0) {
674 putchar(0xf); output_GL = SETG0;}
675 putchar((c->c1)&0x7f);
676 break;
677 case KATAKANA:
678 if (output_G[2] != KATAKANA) {
679 putchar('\033');putchar('*');putchar('I');
680 output_G[2] = KATAKANA;}
681 putchar(0x8e);
682 putchar((c->c1)|0x80);
683 break;
684 case JIS78:
685 exc78_83(c);
686 case JIS83:
687 if (output_G[3] != JIS83) {
688 putchar('\033');putchar('$');putchar('+');putchar('B');
689 output_G[3] = JIS83;}
690 if (output_GR != SETG3) {
691 putchar('\033'); putchar('|'); output_GR = SETG3;}
692 putchar((c->c1)|0x80);
693 putchar((c->c2)|0x80);
694 break;
695 }
696 break;
697 case MSKANJI:
698 switch(c->type) {
699 case OTHER:
700 if ((output_G[0] != ASCII) && (output_G[0] != JISROMAN)) {
701 putchar('\033');putchar('(');putchar('B');
702 output_G[0] = ASCII;}
703 if (output_GL != SETG0) {
704 putchar(0xf); output_GL = SETG0;}
705 putchar(c->c1);
706 break;
707 case ASCII:
708 case JISROMAN:
709 putchar((c->c1)&0x7f);
710 break;
711 case KATAKANA:
712 putchar((c->c1)|0x80);
713 break;
714 case JIS78:
715 exc78_83(c);
716 case JIS83:
717 {
718 int o1, o2;
719
720 if ((c->c1) & 1) {
721 o1 = c->c1/2 + ((c->c1 < 0xdf) ? 0x31 : 0x71);
722 o2 = c->c2 - ((c->c2 >= 0xe0) ? 0x60 : 0x61);
723 } else {
724 o1 = c->c1/2 + ((c->c1 < 0xdf) ? 0x30 : 0x70);
725 o2 = c->c2 - 2;
726 }
727 putchar(o1);
728 putchar(o2);
729 break;
730 }
731 }
732 break;
733 }
734 }
735
term_type_str(str)736 int term_type_str(str)
737 char *str;
738 {
739 if ((strncmp(str, "oldjis", 6) == 0) ||
740 (strncmp(str, "jisold", 6) == 0))
741 return OLDJIS;
742 if (strncmp(str, "dec", 6) == 0)
743 return DEC;
744 if ((strncmp(str, "euc", 6) == 0) ||
745 (strncmp(str, "att", 6) == 0))
746 return EUC;
747 if ((strncmp(str, "sjis", 6) == 0) ||
748 (strncmp(str, "msjis", 6) == 0) ||
749 (strncmp(str, "shiftjis", 6) == 0))
750 return MSKANJI;
751
752 return NEWJIS;
753 }
754