1 /*
2 * Copyright (c) 1994-2005 Kazushi (Jam) Marukawa
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice in the documentation and/or other materials provided with
12 * the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
20 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
21 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
23 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
24 * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27
/*
 * Routines to manipulate a buffer that holds a string of multi-byte
 * characters.  They detect the character set of an input string and
 * convert it to internal codes, and convert those internal codes to
 * other codes in order to display them.
 */
33
34 #include "defines.h"
35 #include "less.h"
36
37 #include <stdio.h>
38 #include <assert.h>
39
40 #if STDC_HEADERS
41 #include <stdlib.h>
42 #include <string.h>
43 #endif
44
45 #if JAPANESE
46 #include "kanji_map.h"
47 #include "unicode_map.h"
48 #endif
49 #include "unicode_type.h"
50
51 #define LESS 1
52
53 /* TODO: remove caller control_char(), change_control_char() and ecalloc() */
54 extern int control_char ();
55 extern void change_control_char ();
56 extern void* ecalloc ();
57
58
59 #if ISO
60
61 static void multi_reparse();
62 static int check_ft();
63
64
65 #if JAPANESE
66
/*
 * When non-zero, wrongucs() and wrongchar() replace rejected input with the
 * two-byte wrong mark written by put_wrongmark(); when zero, the rejected
 * bytes are kept and tagged as wrong instead.
 */
int markwrongchar = 1;
68
69
/*
 * Byte predicates used for character set detection.
 *
 * The "...1" variants look only at the first byte of a sequence; the
 * multi-argument variants test a whole candidate sequence.  Arguments may
 * be evaluated more than once, so pass side-effect free expressions.
 */

/* 7-bit JIS and EUC-JP (UJIS) */
#define ISJIS(c)			((c) >= 0x21 && (c) <= 0x7e)
#define ISUJIS(c)			((c) >= 0xa1 && (c) <= 0xfe)
#define ISUJISSS(c)			((c) == 0x8e || (c) == 0x8f)
#define ISUJISKANJI1(c)			(ISUJIS(c))
#define ISUJISKANJI(c1,c2)		(ISUJISKANJI1(c1) && ISUJIS(c2))
#define ISUJISKANA1(c)			((c) == 0x8e)
#define ISUJISKANA(c1,c2)		(ISUJISKANA1(c1) && ISUJIS(c2))
#define ISUJISKANJISUP1(c)		((c) == 0x8f)
#define ISUJISKANJISUP(c1,c2,c3)	(ISUJISKANJISUP1(c1) && \
					 ISUJIS(c2) && ISUJIS(c3))

/* Shift JIS */
#define ISSJISKANJI1(c)			(((c) >= 0x81 && (c) <= 0x9f) || \
					 ((c) >= 0xe0 && (c) <= 0xfc))
#define ISSJISKANJI(c1,c2)		(ISSJISKANJI1(c1) && \
					 ((c2) >= 0x40 && (c2) <= 0xfc && \
					  (c2) != 0x7f))
#define ISSJISKANA(c)			((c) >= 0xa1 && (c) <= 0xdf)

/* UTF-8, including the historical 5 and 6 byte forms */
#define ISUTF8_HEAD(c)			((c) >= 0xc0 && (c) < 0xfe)
#define ISUTF8_REST(c)			(((c) & 0xc0) == 0x80)
#define ISUTF8_1(c)			((c) <= 0x7f)
#define ISUTF8_2(c1,c2) \
	(((c1) & 0xe0) == 0xc0 && ISUTF8_REST(c2))
#define ISUTF8_3(c1,c2,c3) \
	(((c1) & 0xf0) == 0xe0 && ISUTF8_REST(c2) && ISUTF8_REST(c3))
#define ISUTF8_4(c1,c2,c3,c4) \
	(((c1) & 0xf8) == 0xf0 && ISUTF8_REST(c2) && ISUTF8_REST(c3) && \
	 ISUTF8_REST(c4))
#define ISUTF8_5(c1,c2,c3,c4,c5) \
	(((c1) & 0xfc) == 0xf8 && ISUTF8_REST(c2) && ISUTF8_REST(c3) && \
	 ISUTF8_REST(c4) && ISUTF8_REST(c5))
#define ISUTF8_6(c1,c2,c3,c4,c5,c6) \
	(((c1) & 0xfe) == 0xfc && ISUTF8_REST(c2) && ISUTF8_REST(c3) && \
	 ISUTF8_REST(c4) && ISUTF8_REST(c5) && ISUTF8_REST(c6))

/*
 * Assemble the code point from the bytes of an N byte UTF-8 sequence:
 * the payload bits of the head byte followed by six bits per rest byte.
 */
#define UTF8_2(c0, c1) \
	((((c0) & 0x1f) << 6) | ((c1) & 0x3f))
#define UTF8_3(c0, c1, c2) \
	((((c0) & 0x0f) << 12) | (((c1) & 0x3f) << 6) | ((c2) & 0x3f))
#define UTF8_4(c0, c1, c2, c3) \
	((((c0) & 0x07) << 18) | (((c1) & 0x3f) << 12) | \
	 (((c2) & 0x3f) << 6) | ((c3) & 0x3f))
#define UTF8_5(c0, c1, c2, c3, c4) \
	((((c0) & 0x03) << 24) | (((c1) & 0x3f) << 18) | \
	 (((c2) & 0x3f) << 12) | (((c3) & 0x3f) << 6) | ((c4) & 0x3f))
#define UTF8_6(c0, c1, c2, c3, c4, c5) \
	((((c0) & 0x01) << 30) | (((c1) & 0x3f) << 24) | \
	 (((c2) & 0x3f) << 18) | (((c3) & 0x3f) << 12) | \
	 (((c4) & 0x3f) << 6) | ((c5) & 0x3f))
123 #endif
124
125
/*
 * Parser states for escape sequences.
 * less understands the following escape sequences:
 *  ESC 2/4 2/8,2/9,2/10,2/11,2/13,2/14,2/15 F
 *  ESC 2/4 4/0,4/1,4/2
 *  ESC 2/6 F
 *  ESC 2/8,2/9,2/10,2/11,2/13,2/14,2/15 F
 *  ESC 2/12 F		This is used in MULE.  Less supports this as input.
 *  0/14,0/15
 *  ESC 4/14,4/15,6/14,6/15,7/12,7/13,7/14
 *  8/14,8/15
 *
 * Each enumerator names the prefix that has been read so far.
 */
enum escape_sequence {
	NOESC,		/* not inside an escape sequence */
	ESC_,		/* ^[ */
	ESC_2_4,	/* ^[$ */
	ESC_2_4_8,	/* ^[$( */
	ESC_2_4_9,	/* ^[$) */
	ESC_2_4_10,	/* ^[$* */
	ESC_2_4_11,	/* ^[$+ */
	ESC_2_4_13,	/* ^[$- */
	ESC_2_4_14,	/* ^[$. */
	ESC_2_4_15,	/* ^[$/ */
	ESC_2_6,	/* ^[& */
	ESC_2_8,	/* ^[( */
	ESC_2_9,	/* ^[) */
	ESC_2_10,	/* ^[* */
	ESC_2_11,	/* ^[+ */
	ESC_2_12,	/* ^[, */
	ESC_2_13,	/* ^[- */
	ESC_2_14,	/* ^[. */
	ESC_2_15,	/* ^[/ */
	ESC_5_11,	/* ^[[ */
};
150
151
152 static SETCHARSET def_scs = SCSASCII | SCSOTHERISO;
153 static ENCSET def_input = ESISO7; /* Default character set of left plane */
154 static ENCSET def_inputr = ESISO8; /* Default character set of right plane */
155 static int def_gs[4] = {
156 ASCII, /* Default g0 plane status */
157 WRONGCS, /* Default g1 plane status */
158 WRONGCS, /* Default g2 plane status */
159 WRONGCS /* Default g3 plane status */
160 };
161
162 static ENCSET output = ESISO8; /* Character set for output */
163 #if JAPANESE
164 static J_PRIORITY def_priority = PUTF8; /* Which code was given priority. */
165 #endif
166
167 static UWidth utfwidth = UWIDTH_NORMAL; /* default UTF-8 Width */
168
169 typedef POSITION m_position;
170 #define M_NULL_POS ((POSITION)(-1))
171
/*
 * Character set state.
 *
 * Holds the current character sets together with the other information
 * needed to keep track of ISO-2022 escape sequences.
 */
struct m_status {
	/* Graphic sets */
	int gs[4];	/* Current g0..g3 plane sets. */

	/* gl, gr, and sg each refer to one of the 4 planes. */
	int gl;		/* Current gl plane status */
	int gr;		/* Current gr plane status */
	int sg;		/* Current status of single-shifted plane */

	int irr;	/* Identify revised registration number */
};

/* Value of a plane selector (gl, gr or sg) that selects no plane. */
#define WRONGPLANE		(-1)

/* True when the selector named `plane' of mp->ms refers to a plane. */
#define ISVALIDPLANE(mp,plane)	((mp)->ms->plane != WRONGPLANE)

/*
 * Character set that applies to byte c: the single-shifted plane when one
 * is set, otherwise gr for bytes with the high bit set and gl for the rest.
 */
#define FINDCS(mp,c) \
	((mp)->ms->gs[(ISVALIDPLANE((mp), sg) \
			? (mp)->ms->sg \
			: ((c) & 0x80) ? (mp)->ms->gr : (mp)->ms->gl)])

/* Character set currently designated to the selector named `plane'. */
#define PLANE2CS(mp,plane)	((mp)->ms->gs[(mp)->ms->plane])
193
194 struct multibuf {
195 struct {
196 SETCHARSET scs;
197 ENCSET input;
198 ENCSET inputr;
199 } io;
200
201 ENCSET orig_io_right;
202 int rotation_io_right;
203
204 enum escape_sequence eseq;
205 /*
206 * Variables to control of escape sequences as output.
207 */
208 int cs; /* Current character set */
209 struct m_status* ms;
210 #if JAPANESE
211 J_PRIORITY priority; /* Which code was given priority. */
212 int sequence_counter; /* Special counter for detect UJIS KANJI. */
213 #endif
214
215 CHARSET icharset; /* Last non ASCII character set of input */
216
217 /*
218 * Small buffers to hold all parsing bytes of multi-byte characters.
219 *
220 * multi_parse() function receive a sequence of byte and buffer it.
221 * Each time multi_parse() recognize full data sequence to represent
222 * one character, it converts the data into internal data and returns
223 * converted data.
224 *
225 * Caller must buffer it somewhere and output it using outbuf() of
226 * outchar(). Those output functions() converts internal data into
227 * appropriate data stream for choosen output device.
228 *
229 * As internal data, we use char[] and CHARSET[] to keep byte and
230 * additional information, respectively. We choose ISO-2022 style
231 * data format as our internal data format because it is most easy
232 * to work with. It has completely separated planes for each
233 * character set. This helps code conversion and others alot.
234 * For example, we don't need to work to separate Chinese and
235 * Japanese because they are separated from the beginning in ISO-2022
236 * although UTF-8 uses only single plane with all CJK character sets.
237 */
238 /*
239 * Buffer for input/parsing
240 */
241 m_position lastpos; /* position of last byte */
242 m_position startpos; /* position of first byte buffered */
243 unsigned char inbuf[80];
244 m_position laststartpos; /* position of first byte buffered last time */
245 int lastsg; /* last single-shifted plane (ms->sg) */
246 /*
247 * Buffer for internalized/converted data
248 */
249 unsigned char multiint[80]; /* Byte data */
250 CHARSET multics[80]; /* Character set data (no UJIS/SJIS/UTF */
251 /* because all of them are converted into */
252 /* internal data format) */
253 int intindex; /* Index of multiint */
254 };
255
/*
 * Accessors for the circular input buffer.  Positions are reduced modulo
 * the size of inbuf, so absolute stream positions can be used directly.
 *
 * INBUFI(mp,i)	byte stored for position i
 * INBUF(mp)	byte at lastpos (the most recently buffered byte)
 * INBUFn(mp)	byte at startpos + n
 */
#define INBUFI(mp,i)	((mp)->inbuf[(i)%sizeof((mp)->inbuf)])
#define INBUF(mp)	INBUFI((mp), (mp)->lastpos)
#define INBUF0(mp)	INBUFI((mp), (mp)->startpos)
#define INBUF1(mp)	INBUFI((mp), (mp)->startpos+1)
#define INBUF2(mp)	INBUFI((mp), (mp)->startpos+2)
#define INBUF3(mp)	INBUFI((mp), (mp)->startpos+3)
#define INBUF4(mp)	INBUFI((mp), (mp)->startpos+4)
#define INBUF5(mp)	INBUFI((mp), (mp)->startpos+5)
264
unicode_type(c)265 static int unicode_type(c)
266 int c;
267 {
268 if (c < 0) {
269 return 0;
270 } else if (c < 0x20000) {
271 return utype_map[c];
272 } else if (c < 0x40000) {
273 return (UTYPE_EXIST | UTYPE_WIDE);
274 } else if (c < 0xe0000) {
275 return 0;
276 } else if (c < 0xe0080) {
277 return (UTYPE_EXIST | UTYPE_FORMAT);
278 } else if (c < 0xe01f0) {
279 return (UTYPE_EXIST | UTYPE_NSP_MODIFIER);
280 } else if (c >= 0xf0000 && c <= 0xffffd) {
281 return (UTYPE_EXIST | UTYPE_AMBIGUOUS);
282 } else if (c >= 0x100000 && c <= 0x10fffd) {
283 return (UTYPE_EXIST | UTYPE_AMBIGUOUS);
284 } else {
285 return 0;
286 }
287 }
288
289 static int
get_utfwidth(uc)290 get_utfwidth(uc)
291 int uc;
292 {
293 int utype = unicode_type(uc);
294
295 if (!(utype * UTYPE_EXIST))
296 return WRONGUCS_H;
297 if (utype & UTYPE_CONTROL)
298 return WRONGUCS_H;
299 if (utype & (UTYPE_NSP_MODIFIER | UTYPE_FORMAT | UTYPE_SEPARATOR))
300 return UTF8Z;
301
302 switch (utfwidth) {
303 case UWIDTH_NONE:
304 break;
305 case UWIDTH_NORMAL:
306 if (utype & UTYPE_WIDE) return UTF8W;
307 break;
308 case UWIDTH_CJK:
309 if (utype & (UTYPE_WIDE | UTYPE_AMBIGUOUS)) return UTF8W;
310 break;
311 case UWIDTH_JA:
312 if (utype & (UTYPE_WIDE | UTYPE_AMBIGUOUS | UTYPE_JA)) return UTF8W;
313 break;
314 case UWIDTH_ALMOST:
315 if (uc >= 0x80) return UTF8W;
316 break;
317 case UWIDTH_ALL:
318 return UTF8W;
319 }
320
321 return UTF8;
322 }
323
code_length(mp,cs)324 static int code_length(mp, cs)
325 MULBUF* mp;
326 CHARSET cs;
327 {
328 #if JAPANESE
329 unsigned char c;
330 #endif
331
332 if (CSISWRONG(cs))
333 return 1;
334
335 #if JAPANESE
336 switch (CS2CHARSET(cs)) {
337 case UTF8:
338 case UTF8Z:
339 case UTF8W:
340 c = INBUF0(mp);
341 if (c < 0xC0) return 1;
342 if (c < 0xe0) return 2;
343 if (c < 0xf0) return 3;
344 if (c < 0xf8) return 4;
345 if (c < 0xfc) return 5;
346 if (c < 0xfe) return 6;
347 return 1;
348 case UJIS:
349 case UJIS2000:
350 case UJIS2004:
351 c = INBUF0(mp);
352 if (ISUJISKANJI1(c)) return 2;
353 if (ISUJISKANA1(c)) return 2;
354 if (ISUJISKANJISUP1(c)) return 3;
355 return 1;
356 case SJIS:
357 case SJIS2000:
358 case SJIS2004:
359 c = INBUF0(mp);
360 if (ISSJISKANJI1(c)) return 2;
361 if (ISSJISKANA(c)) return 1;
362 return 1;
363 }
364 #endif
365
366 switch (CS2TYPE(cs))
367 {
368 case TYPE_94_CHARSET:
369 case TYPE_96_CHARSET:
370 return 1;
371 case TYPE_94N_CHARSET:
372 case TYPE_96N_CHARSET:
373 switch (CS2FT(cs) & 0x70)
374 {
375 case 0x30: return 2; /* for private use */
376 case 0x40:
377 case 0x50: return 2;
378 case 0x60: return 3;
379 case 0x70: return 4; /* or more bytes */
380 }
381 }
382 assert(0);
383 return (0);
384 }
385
386 /*
387 * Convert first byte of buffered data as one byte ASCII data
388 * without any conversion.
389 */
noconv1(mp)390 static void noconv1(mp)
391 MULBUF *mp;
392 {
393 mp->multiint[mp->intindex] = INBUF0(mp);
394 mp->multics[mp->intindex] = ASCII;
395 mp->intindex++;
396 mp->startpos++;
397 }
398
399 /*
400 * Convert first byte of buffered data as one byte WRONGCS data
401 * without any conversion.
402 */
wrongcs1(mp)403 static void wrongcs1(mp)
404 MULBUF *mp;
405 {
406 mp->multiint[mp->intindex] = INBUF0(mp);
407 mp->multics[mp->intindex] = WRONGCS;
408 mp->intindex++;
409 mp->startpos++;
410 }
411
412 /*
413 * Write a wrongmark on out buffer.
414 */
put_wrongmark(mp)415 static void put_wrongmark(mp)
416 MULBUF *mp;
417 {
418 mp->multiint[mp->intindex + 0] = '"';
419 mp->multiint[mp->intindex + 1] = '.';
420 mp->multics[mp->intindex + 0] = JISX0208KANJI;
421 mp->multics[mp->intindex + 1] = REST_MASK | JISX0208KANJI;
422 mp->intindex += 2;
423 /* flush buffer */
424 mp->startpos = mp->lastpos + 1;
425 }
426
427 /*
428 * Write WRONGUCS characters
429 */
wrongucs(mp,uc)430 static void wrongucs(mp, uc)
431 MULBUF *mp;
432 int uc;
433 {
434 if (markwrongchar) {
435 put_wrongmark(mp);
436 return;
437 }
438
439 if (uc < 0x80) {
440 wrongcs1(mp);
441 } else if (uc < 0x800) {
442 mp->multiint[mp->intindex] = INBUF0(mp) & 0x9f;
443 mp->multics[mp->intindex] = WRONGUCS_H;
444 mp->multiint[mp->intindex + 1] = INBUF1(mp);
445 mp->multics[mp->intindex + 1] = WRONGUCS_T | REST_MASK;
446 mp->intindex += 2;
447 } else if (uc < 0x10000) {
448 mp->multiint[mp->intindex] = INBUF0(mp) & 0x8f;
449 mp->multics[mp->intindex] = WRONGUCS_H;
450 mp->multiint[mp->intindex + 1] = INBUF1(mp);
451 mp->multics[mp->intindex + 1] = WRONGUCS_M | REST_MASK;
452 mp->multiint[mp->intindex + 2] = INBUF2(mp);
453 mp->multics[mp->intindex + 2] = WRONGUCS_T | REST_MASK;
454 mp->intindex += 3;
455 } else if (uc < 0x200000) {
456 mp->multiint[mp->intindex] = INBUF0(mp) & 0x87;
457 mp->multics[mp->intindex] = WRONGUCS_H;
458 mp->multiint[mp->intindex + 1] = INBUF1(mp);
459 mp->multics[mp->intindex + 1] = WRONGUCS_M | REST_MASK;
460 mp->multiint[mp->intindex + 2] = INBUF2(mp);
461 mp->multics[mp->intindex + 2] = WRONGUCS_M | REST_MASK;
462 mp->multiint[mp->intindex + 3] = INBUF3(mp);
463 mp->multics[mp->intindex + 3] = WRONGUCS_T | REST_MASK;
464 mp->intindex += 4;
465 } else if (uc < 0x4000000) {
466 mp->multiint[mp->intindex] = INBUF0(mp) & 0x83;
467 mp->multics[mp->intindex] = WRONGUCS_H;
468 mp->multiint[mp->intindex + 1] = INBUF1(mp);
469 mp->multics[mp->intindex + 1] = WRONGUCS_M | REST_MASK;
470 mp->multiint[mp->intindex + 2] = INBUF2(mp);
471 mp->multics[mp->intindex + 2] = WRONGUCS_M | REST_MASK;
472 mp->multiint[mp->intindex + 3] = INBUF3(mp);
473 mp->multics[mp->intindex + 3] = WRONGUCS_M | REST_MASK;
474 mp->multiint[mp->intindex + 4] = INBUF4(mp);
475 mp->multics[mp->intindex + 4] = WRONGUCS_T | REST_MASK;
476 mp->intindex += 5;
477 } else {
478 mp->multiint[mp->intindex] = INBUF0(mp) & 0x81;
479 mp->multics[mp->intindex] = WRONGUCS_H;
480 mp->multiint[mp->intindex + 1] = INBUF1(mp);
481 mp->multics[mp->intindex + 1] = WRONGUCS_M | REST_MASK;
482 mp->multiint[mp->intindex + 2] = INBUF2(mp);
483 mp->multics[mp->intindex + 2] = WRONGUCS_M | REST_MASK;
484 mp->multiint[mp->intindex + 3] = INBUF3(mp);
485 mp->multics[mp->intindex + 3] = WRONGUCS_M | REST_MASK;
486 mp->multiint[mp->intindex + 4] = INBUF3(mp);
487 mp->multics[mp->intindex + 4] = WRONGUCS_M | REST_MASK;
488 mp->multiint[mp->intindex + 5] = INBUF5(mp);
489 mp->multics[mp->intindex + 5] = WRONGUCS_T | REST_MASK;
490 mp->intindex += 5;
491 }
492
493 /* flush buffer */
494 mp->startpos = mp->lastpos + 1;
495 }
496
497 /*
498 * Convert first several bytes of buffered data.
499 *
500 * If less is in marking mode, it erase several bytes of data (depend on
501 * the current character set) and write "?" mark on output buffer.
502 * If less is not in marking mode, it calls wrongcs1().
503 */
wrongchar(mp)504 static void wrongchar(mp)
505 MULBUF *mp;
506 {
507 if (markwrongchar) {
508 switch (CS2CHARSET(mp->multics[mp->intindex])) {
509 case JISX0201KANA:
510 case JISX0201ROMAN:
511 case LATIN1:
512 case LATIN2:
513 case LATIN3:
514 case LATIN4:
515 case GREEK:
516 case ARABIC:
517 case HEBREW:
518 case CYRILLIC:
519 case LATIN5:
520 /* Should I use one byte character, like '?' or '_'? */
521 put_wrongmark(mp);
522 break;
523 case JISX0208_78KANJI:
524 case JISX0208KANJI:
525 case JISX0208_90KANJI:
526 case JISX0212KANJISUP:
527 case JISX0213KANJI1:
528 case JISX0213KANJI2:
529 case JISX02132004KANJI1:
530 case UJIS:
531 case UJIS2000:
532 case UJIS2004:
533 case SJIS:
534 case SJIS2000:
535 case SJIS2004:
536 case UTF8Z:
537 case UTF8:
538 case UTF8W:
539 put_wrongmark(mp);
540 break;
541 case GB2312:
542 case KSC5601:
543 default:
544 put_wrongmark(mp);
545 break;
546 }
547 } else {
548 while (mp->startpos <= mp->lastpos) {
549 wrongcs1(mp);
550 }
551 }
552 }
553
554 /*
555 * Internalize input stream.
556 * We recognized input data as using ISO coding set.
557 */
internalize_iso(mp)558 static void internalize_iso(mp)
559 MULBUF *mp;
560 {
561 register int i;
562 m_position pos;
563 m_position to;
564 int intindex;
565 int dummy;
566
567 /*
568 * If character set points empty character set, reject buffered data.
569 */
570 if (CSISWRONG(mp->cs)) {
571 wrongcs1(mp);
572 return;
573 }
574
575 /*
576 * If character set points 94 or 94x94 character set, reject
577 * DEL and SPACE codes in buffered data.
578 */
579 if (CS2TYPE(mp->cs) == TYPE_94_CHARSET ||
580 CS2TYPE(mp->cs) == TYPE_94N_CHARSET) {
581 unsigned char c = INBUF(mp);
582 if ((c & 0x7f) == 0x7f) {
583 if (mp->lastpos - mp->startpos + 1 == 1) {
584 wrongcs1(mp);
585 } else {
586 wrongcs1(mp);
587 multi_reparse(mp);
588 }
589 return;
590 } else if ((c & 0x7f) == 0x20) {
591 /*
592 * A 0x20 (SPACE) code is wrong, but I treat it as
593 * a SPACE.
594 */
595 if (mp->lastpos - mp->startpos + 1 == 1) {
596 noconv1(mp);
597 } else {
598 wrongcs1(mp);
599 multi_reparse(mp);
600 }
601 return;
602 }
603 }
604
605 /*
606 * Otherwise, keep buffering.
607 */
608 pos = mp->startpos;
609 to = pos + code_length(mp, mp->cs) - 1;
610 if (mp->lastpos < to) {
611 return; /* Not enough, so go back to fetch next data. */
612 }
613
614 /*
615 * We buffered enough data for one character of multi byte characters.
616 * Therefore, start to convert this buffered data into a first character.
617 */
618 intindex = mp->intindex;
619 mp->multiint[intindex] = INBUFI(mp, pos) & 0x7f;
620 mp->multics[intindex] = mp->cs;
621 intindex++;
622 for (pos++; pos <= to; pos++) {
623 mp->multiint[intindex] = INBUFI(mp, pos) & 0x7f;
624 mp->multics[intindex] = REST_MASK | mp->cs;
625 intindex++;
626 }
627
628 /*
629 * codeset JIS X 0208:1990 validation
630 */
631 if (mp->cs == JISX0208_90KANJI && !(mp->io.scs & SCSJISX0208_1990)) {
632 wrongchar(mp);
633 return;
634 }
635
636 /*
637 * Check newly converted code. If it is not valid code,
638 * less may mark it as not valid code.
639 */
640 if (chisvalid_cs(mp->io.scs,
641 &mp->multiint[mp->intindex],
642 &mp->multics[mp->intindex])) {
643 mp->icharset = mp->cs;
644 mp->intindex = intindex;
645 mp->startpos = pos;
646 } else {
647 /*
648 * less ignore the undefined codes
649 */
650 wrongchar(mp);
651 }
652 }
653
654 #if JAPANESE
655 /*
656 * Internalize input stream encoded by UJIS encoding scheme.
657 *
658 * Return 1 if input is recognized well.
659 * Return 0 if input is rejected.
660 */
internalize_ujis(mp)661 static int internalize_ujis(mp)
662 MULBUF *mp;
663 {
664 if (mp->lastpos - mp->startpos + 1 == 1) {
665 /* do nothing. return 1 to get next byte */
666 return 1;
667 } else if (mp->lastpos - mp->startpos + 1 == 2) {
668 int c0 = INBUF0(mp);
669 int c1 = INBUF1(mp);
670 if (ISUJISKANA(c0, c1)) {
671 mp->cs = JISX0201KANA;
672 mp->icharset = UJIS;
673 mp->multiint[mp->intindex] = c1 & 0x7f;
674 mp->multics[mp->intindex] = mp->cs;
675 mp->intindex += 1;
676 mp->startpos = mp->lastpos + 1;
677 return 1;
678 } else if (ISUJISKANJI(c0, c1)) {
679 int ch = checkKANJI(mp->io.scs,
680 SCSJISX0208_1983 | SCSJISX0208_1990
681 | SCSJISX0213_2000 | SCSJISX0213_2004,
682 c0, c1);
683 if (ch == 0) {
684 /* undefined. less ignore them */
685 wrongchar(mp);
686 return 1;
687 }
688 mp->icharset = UJIS;
689 mp->cs = (ch >> 16) & 0x7fff;
690 mp->multiint[mp->intindex] = (ch >> 8) & 0x7f;
691 mp->multics[mp->intindex] = mp->cs;
692 mp->multiint[mp->intindex + 1] = ch & 0x7f;
693 mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
694 mp->intindex += 2;
695 mp->startpos = mp->lastpos + 1;
696 return 1;
697 } else if (ISUJISKANJISUP(c0, c1, 0xa1)) {
698 /* do nothing. return 1 to get next byte */
699 mp->multics[mp->intindex] = UJIS;
700 return 1;
701 }
702 } else if (mp->lastpos - mp->startpos + 1 == 3) {
703 int c0 = INBUF0(mp);
704 int c1 = INBUF1(mp);
705 int c2 = INBUF2(mp);
706 if (ISUJISKANJISUP(c0, c1, c2)) {
707 int ch = checkKANJI(mp->io.scs,
708 SCSJISX0213_2ND | SCSJISX0212_1990,
709 c1, c2);
710 if (ch == 0) {
711 /* undefined. less ignore them */
712 wrongchar(mp);
713 return 1;
714 }
715 mp->icharset = UJIS;
716 mp->cs = (ch >> 16) & 0x7fff;
717 mp->multiint[mp->intindex] = (ch >> 8) & 0x7f;
718 mp->multics[mp->intindex] = mp->cs;
719 mp->multiint[mp->intindex + 1] = ch & 0x7f;
720 mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
721 mp->intindex += 2;
722 mp->startpos = mp->lastpos + 1;
723 return 1;
724 }
725 }
726 /* return 0 because this data sequence is not matched to UJIS */
727 return 0;
728 }
729
730 /*
731 * Internalize input stream encoded by SJIS encoding scheme.
732 *
733 * Return 1 if input is recognized well.
734 * Return 0 if input is rejected.
735 */
internalize_sjis(mp)736 static int internalize_sjis(mp)
737 MULBUF *mp;
738 {
739 if (mp->lastpos - mp->startpos + 1 == 1) {
740 int c0 = INBUF(mp);
741 if (ISSJISKANA(c0)) {
742 mp->cs = JISX0201KANA;
743 mp->icharset = SJIS;
744 mp->multiint[mp->intindex] = c0 & 0x7f;
745 mp->multics[mp->intindex] = mp->cs;
746 mp->intindex += 1;
747 mp->startpos = mp->lastpos + 1;
748 return 1;
749 } else {
750 /* do nothing. return 1 to get next byte */
751 return 1;
752 }
753 } else if (mp->lastpos - mp->startpos + 1 == 2) {
754 int c0 = INBUF0(mp);
755 int c1 = INBUF1(mp);
756 if (ISSJISKANJI(c0, c1)) {
757 int ktype;
758
759 if (c0 < 0xf0) {
760 /* JIS X 0213:2000 plane 1 or JIS X 0208:1997 */
761 if (c0 <= 0x9f) c0 = (c0-0x81)*2 + 0x21;
762 else c0 = (c0-0xc1)*2 + 0x21;
763 if (c1 <= 0x7e) c1 -= 0x1f;
764 else if (c1 <= 0x9e) c1 -= 0x20;
765 else c1 -= 0x7e, c0 += 1;
766
767 int ch = checkKANJI(mp->io.scs,
768 SCSJISX0208_1983 | SCSJISX0208_1990
769 | SCSJISX0213_2000 | SCSJISX0213_2004,
770 c0, c1);
771 if (ch == 0) {
772 /* undefined. less ignore them */
773 wrongchar(mp);
774 return 1;
775 }
776 mp->icharset = SJIS;
777 mp->cs = (ch >> 16) & 0x7fff;
778 mp->multiint[mp->intindex] = (ch >> 8) & 0x7f;
779 mp->multics[mp->intindex] = mp->cs;
780 mp->multiint[mp->intindex + 1] = ch & 0x7f;
781 mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
782 mp->intindex += 2;
783 mp->startpos = mp->lastpos + 1;
784 return 1;
785 } else {
786 /* JIS X 0213:2000 plane 2 */
787 if (c0 == 0xf0)
788 if (c1 <= 0x9e) c0 = 0x21;
789 else c0 = 0x27;
790 else if (c0 == 0xf1) c0 = 0x23;
791 else if (c0 == 0xf2)
792 if (c1 <= 0x9e) c0 = 0x25;
793 else c0 = 0x2b;
794 else if (c0 == 0xf3) c0 = 0x2d;
795 else if (c0 == 0xf4)
796 if (c1 <= 0x9e) c0 = 0x2f;
797 else c0 = 0x6d;
798 else c0 = (c0 - 0xf5) * 2 + 0x6f;
799 if (c1 <= 0x7e) c1 -= 0x1f;
800 else if (c1 <= 0x9e) c1 -= 0x20;
801 else c1 -= 0x7e, c0 += 1;
802
803 int ch = checkKANJI(mp->io.scs, SCSJISX0213_2ND, c0, c1);
804 if (ch == 0) {
805 /* undefined. less ignore them */
806 wrongchar(mp);
807 return 1;
808 }
809 mp->icharset = SJIS;
810 mp->cs = (ch >> 16) & 0x7fff;
811 mp->multiint[mp->intindex] = (ch >> 8) & 0x7f;
812 mp->multics[mp->intindex] = mp->cs;
813 mp->multiint[mp->intindex + 1] = ch & 0x7f;
814 mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
815 mp->intindex += 2;
816 mp->startpos = mp->lastpos + 1;
817 return 1;
818 }
819 /* data are recognized as kanji or wrong data, so return 1 */
820 return 1;
821 }
822 }
823 /* return 0 because this data sequence is not matched to SJIS */
824 return 0;
825 }
826
827 /*
828 * Internalize input stream encoded by CP932 encoding scheme.
829 *
830 * Return 1 if input is recognized well.
831 * Return 0 if input is rejected.
832 */
internalize_cp932(mp)833 static int internalize_cp932(mp)
834 MULBUF *mp;
835 {
836 if (mp->lastpos - mp->startpos + 1 == 1) {
837 int c0 = INBUF(mp);
838 if (ISSJISKANA(c0)) {
839 mp->cs = JISX0201KANA;
840 mp->icharset = CP932;
841 mp->multiint[mp->intindex] = c0 & 0x7f;
842 mp->multics[mp->intindex] = mp->cs;
843 mp->intindex += 1;
844 mp->startpos = mp->lastpos + 1;
845 return 1;
846 } else {
847 /* do nothing. return 1 to get next byte */
848 return 1;
849 }
850 } else if (mp->lastpos - mp->startpos + 1 == 2) {
851 int c0 = INBUF0(mp);
852 int c1 = INBUF1(mp);
853 if (ISSJISKANJI(c0, c1)) {
854 int ofs;
855
856 if (c0 <= 0x9f) c0 = (c0-0x81)*2 + 0x21;
857 else c0 = (c0-0xc1)*2 + 0x21;
858 if (c1 <= 0x7e) c1 -= 0x1f;
859 else if (c1 <= 0x9e) c1 -= 0x20;
860 else c1 -= 0x7e, c0 += 1;
861
862 ofs = (c0 - 0x21) * 94 + (c1 - 0x21);
863 if ((c0 < 0x30 && c0 != 0x2d
864 && ucode_kanji1[ofs] == ucode_cp932[ofs])
865 || (c0 >= 0x30 && c0 <= 0x74)) {
866 int ch = checkKANJI(mp->io.scs,
867 SCSJISX0208_1983 | SCSJISX0208_1990,
868 c0, c1);
869 if (ch == 0) {
870 /* undefined. less ignore them */
871 wrongchar(mp);
872 return 1;
873 }
874 mp->icharset = CP932;
875 mp->cs = (ch >> 16) & 0x7fff;
876 mp->multiint[mp->intindex] = (ch >> 8) & 0x7f;
877 mp->multics[mp->intindex] = mp->cs;
878 mp->multiint[mp->intindex + 1] = ch & 0x7f;
879 mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
880 mp->intindex += 2;
881 mp->startpos = mp->lastpos + 1;
882 return 1;
883 } else {
884 if (c0 > 0x7e) c0 -= 0x4f;
885 int ch = checkKANJI(mp->io.scs, SCSCP932EX, c0, c1);
886 if (ch == 0) {
887 /* undefined. less ignore them */
888 wrongchar(mp);
889 return 1;
890 }
891 mp->icharset = CP932;
892 mp->cs = (ch >> 16) & 0x7fff;
893 mp->multiint[mp->intindex] = (ch >> 8) & 0x7f;
894 mp->multics[mp->intindex] = mp->cs;
895 mp->multiint[mp->intindex + 1] = ch & 0x7f;
896 mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
897 mp->intindex += 2;
898 mp->startpos = mp->lastpos + 1;
899 return 1;
900 }
901 /* data are recognized as kanji or wrong data, so return 1 */
902 return 1;
903 }
904 }
905 /* return 0 because this data sequence is not matched to CP932 */
906 return 0;
907 }
908
909 /*
910 * Internalize UTF-8 character to traditional Codeset
911 *
912 * Return 1 if input has convetred well.
913 * Return 0 if input has failed.
914 */
ucs2codeset(mp,uc)915 static int ucs2codeset(mp, uc)
916 MULBUF *mp;
917 int uc;
918 {
919 int plane = (uc & 0x7ffff0000) >> 16;
920 int code = uc & 0xffff;
921 int umap;
922 int dummy;
923 int cs;
924 int cc;
925 int intindex;
926
927 /*
928 * lookup unicode table
929 */
930 if (plane == 0)
931 umap = unicode0_map[code];
932 else if (plane == 2)
933 umap = unicode2_map[code];
934 else
935 return 0;
936 if (umap == U_error)
937 return 0;
938
939 /*
940 * check codeset
941 */
942 cs = UMAP_CS(umap);
943
944 /*
945 * buffering
946 */
947 cc = UMAP_CHAR(umap);
948 switch (CS2TYPE(cs))
949 {
950 case TYPE_94_CHARSET:
951 case TYPE_96_CHARSET:
952 mp->icharset = UTF8;
953 mp->multiint[mp->intindex] = cc & 0x7f;
954 mp->multics[mp->intindex] = cs;
955 mp->intindex += 1;
956 return 1;
957 case TYPE_94N_CHARSET:
958 case TYPE_96N_CHARSET:
959 mp->icharset = UTF8;
960 mp->multiint[mp->intindex] = (cc / 94) + 0x21;
961 mp->multics[mp->intindex] = cs;
962 mp->multiint[mp->intindex + 1] = (cc % 94) + 0x21;
963 mp->multics[mp->intindex + 1] = REST_MASK | cs;
964 mp->intindex += 2;
965 return 1;
966 }
967
968 return 0;
969 }
970
/*
 * Pairs of UCS code points (u1 followed by u2) that together form a single
 * JISX0213KANJI1 character.  c1 and c2 are the two bytes of that character
 * less 0x20; ucs2codeset_combind() adds the 0x20 back when storing them.
 * The table ends with an entry whose c1 is 0.
 */
static struct st_ucs_combining {
	int c1;
	int c2;
	int u1;
	int u2;
} jisx0213_comb[] = {
	/* c1 == 4: u1 in 0x304b..0x3053, u2 == 0x309a */
	{  4, 87, 0x304B, 0x309A },
	{  4, 88, 0x304D, 0x309A },
	{  4, 89, 0x304F, 0x309A },
	{  4, 90, 0x3051, 0x309A },
	{  4, 91, 0x3053, 0x309A },

	/* c1 == 5: u1 in 0x30ab..0x30c8, u2 == 0x309a */
	{  5, 87, 0x30AB, 0x309A },
	{  5, 88, 0x30AD, 0x309A },
	{  5, 89, 0x30AF, 0x309A },
	{  5, 90, 0x30B1, 0x309A },
	{  5, 91, 0x30B3, 0x309A },
	{  5, 92, 0x30BB, 0x309A },
	{  5, 93, 0x30C4, 0x309A },
	{  5, 94, 0x30C8, 0x309A },

	/* c1 == 6 */
	{  6, 88, 0x31F7, 0x309A },

	/* c1 == 11: u2 is 0x0300 or 0x0301, except for the last two entries */
	{ 11, 36, 0x00E6, 0x0300 },
	{ 11, 40, 0x0254, 0x0300 },
	{ 11, 41, 0x0254, 0x0301 },
	{ 11, 42, 0x028C, 0x0300 },
	{ 11, 43, 0x028C, 0x0301 },
	{ 11, 44, 0x0259, 0x0300 },
	{ 11, 45, 0x0259, 0x0301 },
	{ 11, 46, 0x025A, 0x0300 },
	{ 11, 47, 0x025A, 0x0301 },
	{ 11, 69, 0x02E9, 0x02E5 },
	{ 11, 70, 0x02E5, 0x02E9 },

	/* terminator */
	{  0,  0, 0, 0 },
};
1004
1005 static int pending_ucs = 0;
1006 static int
flush_pending_ucs(mp)1007 flush_pending_ucs(mp)
1008 MULBUF *mp;
1009 {
1010 if (pending_ucs) {
1011 ucs2codeset(mp, pending_ucs);
1012 pending_ucs = 0;
1013 return 1;
1014 }
1015 return 0;
1016 }
1017
1018 static int
ucs2codeset_combind(mp,uc)1019 ucs2codeset_combind(mp, uc)
1020 MULBUF *mp;
1021 int uc;
1022 {
1023 struct st_ucs_combining *p;
1024
1025 if (uc < 0) {
1026 pending_ucs = 0;
1027 return 1;
1028 }
1029
1030 if (uc == 0) {
1031 if (pending_ucs)
1032 ucs2codeset(mp, pending_ucs);
1033 pending_ucs = 0;
1034 return 1;
1035 }
1036
1037 if (!(mp->io.scs & (SCSJISX0213_2000 | SCSJISX0213_2004)))
1038 return ucs2codeset(mp, uc);
1039
1040 if (pending_ucs) {
1041 for (p = jisx0213_comb; p->c1; ++ p) {
1042 if (p->u1 == pending_ucs && p->u2 == uc) {
1043 mp->icharset = UTF8;
1044 mp->cs = JISX0213KANJI1;
1045 mp->multiint[mp->intindex] = p->c1 + 0x20;
1046 mp->multics[mp->intindex] = JISX0213KANJI1;
1047 mp->multiint[mp->intindex + 1] = p->c2 + 0x20;
1048 mp->multics[mp->intindex + 1] = REST_MASK | JISX0213KANJI1;
1049 mp->intindex += 2;
1050 mp->startpos = mp->lastpos + 1;
1051 pending_ucs = 0;
1052 return 1;
1053 }
1054 }
1055 ucs2codeset(mp, pending_ucs);
1056 pending_ucs = 0;
1057 }
1058
1059 for (p = jisx0213_comb; p->c1; ++ p) {
1060 if (p->u1 == uc) {
1061 mp->startpos = mp->lastpos + 1;
1062 pending_ucs = uc;
1063 return 1;
1064 }
1065 }
1066
1067 mp->startpos = mp->lastpos + 1;
1068 return ucs2codeset(mp, uc);
1069 }
1070
1071 /*
1072 * Internalize input stream encoded by UTF8 encoding scheme.
1073 *
1074 * Return 1 if input is recognized well.
1075 * Return 0 if input is rejected.
1076 */
internalize_utf8(mp)1077 static int internalize_utf8(mp)
1078 MULBUF *mp;
1079 {
1080 int uc;
1081 int cs;
1082
1083 if (mp->lastpos - mp->startpos + 1 == 1) {
1084 /* do nothing. return 1 to get next byte */
1085 return 1;
1086 } else if (mp->lastpos - mp->startpos + 1 == 2) {
1087 int c0 = INBUF0(mp);
1088 int c1 = INBUF1(mp);
1089 if (ISUTF8_2(c0, c1)) {
1090 uc = UTF8_2(c0, c1);
1091 if (ucs2codeset_combind(mp, uc))
1092 return 1;
1093 if (!(mp->io.scs & SCSUTF8)) {
1094 wrongucs(mp, uc);
1095 return 1;
1096 }
1097 cs = get_utfwidth(uc);
1098 if (CSISWRONG(cs)) {
1099 wrongucs(mp, uc);
1100 return 1;
1101 }
1102 mp->cs = cs;
1103 mp->icharset = UTF8;
1104 mp->multiint[mp->intindex] = c0;
1105 mp->multics[mp->intindex] = mp->cs;
1106 mp->multiint[mp->intindex + 1] = c1;
1107 mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
1108 mp->intindex += 2;
1109 mp->startpos = mp->lastpos + 1;
1110 return 1;
1111 } else if (ISUTF8_HEAD(c0) && ISUTF8_REST(c1)) {
1112 /* do nothing. return 1 to get next byte */
1113 return 1;
1114 }
1115 } else if (mp->lastpos - mp->startpos + 1 == 3) {
1116 int c0 = INBUF0(mp);
1117 int c1 = INBUF1(mp);
1118 int c2 = INBUF2(mp);
1119 if (ISUTF8_3(c0, c1, c2)) {
1120 uc = UTF8_3(c0, c1, c2);
1121 if (ucs2codeset_combind(mp, uc))
1122 return 1;
1123 if (!(mp->io.scs & SCSUTF8)) {
1124 wrongucs(mp, uc);
1125 return 1;
1126 }
1127 cs = get_utfwidth(uc);
1128 if (CSISWRONG(cs)) {
1129 wrongucs(mp, uc);
1130 return 1;
1131 }
1132 mp->cs = cs;
1133 mp->icharset = UTF8;
1134 mp->multiint[mp->intindex] = c0;
1135 mp->multics[mp->intindex] = mp->cs;
1136 mp->multiint[mp->intindex + 1] = c1;
1137 mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
1138 mp->multiint[mp->intindex + 2] = c2;
1139 mp->multics[mp->intindex + 2] = REST_MASK | mp->cs;
1140 mp->intindex += 3;
1141 mp->startpos = mp->lastpos + 1;
1142 return 1;
1143 } else if (ISUTF8_HEAD(c0) && ISUTF8_REST(c1) && ISUTF8_REST(c2)) {
1144 /* do nothing. return 1 to get next byte */
1145 return 1;
1146 }
1147 } else if (mp->lastpos - mp->startpos + 1 == 4) {
1148 int c0 = INBUF0(mp);
1149 int c1 = INBUF1(mp);
1150 int c2 = INBUF2(mp);
1151 int c3 = INBUF3(mp);
1152 if (ISUTF8_4(c0, c1, c2, c3)) {
1153 uc = UTF8_4(c0, c1, c2, c3);
1154 if (ucs2codeset_combind(mp, uc))
1155 return 1;
1156 if (!(mp->io.scs & SCSUTF8)) {
1157 wrongucs(mp, uc);
1158 return 1;
1159 }
1160 cs = get_utfwidth(uc);
1161 if (CSISWRONG(cs)) {
1162 wrongucs(mp, uc);
1163 return 1;
1164 }
1165 mp->cs = cs;
1166 mp->icharset = UTF8;
1167 mp->multiint[mp->intindex] = c0;
1168 mp->multics[mp->intindex] = mp->cs;
1169 mp->multiint[mp->intindex + 1] = c1;
1170 mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
1171 mp->multiint[mp->intindex + 2] = c2;
1172 mp->multics[mp->intindex + 2] = REST_MASK | mp->cs;
1173 mp->multiint[mp->intindex + 3] = c3;
1174 mp->multics[mp->intindex + 3] = REST_MASK | mp->cs;
1175 mp->intindex += 4;
1176 mp->startpos = mp->lastpos + 1;
1177 return 1;
1178 } else if (ISUTF8_HEAD(c0) && ISUTF8_REST(c1) && ISUTF8_REST(c2) &&
1179 ISUTF8_REST(c3)) {
1180 /* do nothing. return 1 to get next byte */
1181 return 1;
1182 }
1183 } else if (mp->lastpos - mp->startpos + 1 == 5) {
1184 int c0 = INBUF0(mp);
1185 int c1 = INBUF1(mp);
1186 int c2 = INBUF2(mp);
1187 int c3 = INBUF3(mp);
1188 int c4 = INBUF4(mp);
1189 if (ISUTF8_5(c0, c1, c2, c3, c4)) {
1190 uc = UTF8_5(c0, c1, c2, c3, c4);
1191 if (ucs2codeset_combind(mp, uc))
1192 return 1;
1193 if (!(mp->io.scs & SCSUTF8)) {
1194 wrongucs(mp, uc);
1195 return 1;
1196 }
1197 cs = get_utfwidth(uc);
1198 if (CSISWRONG(cs)) {
1199 wrongucs(mp, uc);
1200 return 1;
1201 }
1202 mp->cs = cs;
1203 mp->icharset = UTF8;
1204 mp->multiint[mp->intindex] = c0;
1205 mp->multics[mp->intindex] = mp->cs;
1206 mp->multiint[mp->intindex + 1] = c1;
1207 mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
1208 mp->multiint[mp->intindex + 2] = c2;
1209 mp->multics[mp->intindex + 2] = REST_MASK | mp->cs;
1210 mp->multiint[mp->intindex + 3] = c3;
1211 mp->multics[mp->intindex + 3] = REST_MASK | mp->cs;
1212 mp->multiint[mp->intindex + 4] = c4;
1213 mp->multics[mp->intindex + 4] = REST_MASK | mp->cs;
1214 mp->intindex += 5;
1215 mp->startpos = mp->lastpos + 1;
1216 return 1;
1217 } else if (ISUTF8_HEAD(c0) && ISUTF8_REST(c1) && ISUTF8_REST(c2) &&
1218 ISUTF8_REST(c3) && ISUTF8_REST(c4)) {
1219 /* do nothing. return 1 to get next byte */
1220 return 1;
1221 }
1222 } else if (mp->lastpos - mp->startpos + 1 == 6) {
1223 int c0 = INBUF0(mp);
1224 int c1 = INBUF1(mp);
1225 int c2 = INBUF2(mp);
1226 int c3 = INBUF3(mp);
1227 int c4 = INBUF4(mp);
1228 int c5 = INBUF5(mp);
1229 if (ISUTF8_6(c0, c1, c2, c3, c4, c5)) {
1230 uc = UTF8_6(c0, c1, c2, c3, c4, c5);
1231 if (ucs2codeset_combind(mp, uc))
1232 return 1;
1233 if (!(mp->io.scs & SCSUTF8)) {
1234 wrongucs(mp, uc);
1235 return 1;
1236 }
1237 cs = get_utfwidth(uc);
1238 if (CSISWRONG(cs)) {
1239 wrongucs(mp, uc);
1240 return 1;
1241 }
1242 mp->cs = cs;
1243 mp->icharset = UTF8;
1244 mp->multiint[mp->intindex] = c0;
1245 mp->multics[mp->intindex] = mp->cs;
1246 mp->multiint[mp->intindex + 1] = c1;
1247 mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
1248 mp->multiint[mp->intindex + 2] = c2;
1249 mp->multics[mp->intindex + 2] = REST_MASK | mp->cs;
1250 mp->multiint[mp->intindex + 3] = c3;
1251 mp->multics[mp->intindex + 3] = REST_MASK | mp->cs;
1252 mp->multiint[mp->intindex + 4] = c4;
1253 mp->multics[mp->intindex + 4] = REST_MASK | mp->cs;
1254 mp->multiint[mp->intindex + 5] = c5;
1255 mp->multics[mp->intindex + 5] = REST_MASK | mp->cs;
1256 mp->intindex += 6;
1257 mp->startpos = mp->lastpos + 1;
1258 return 1;
1259 }
1260 }
1261 /* return 0 because this data sequence is not matched to UTF8 */
1262 return 0;
1263 }
1264
1265 #endif
1266
internalize(mp)1267 static void internalize(mp)
1268 MULBUF *mp;
1269 {
1270 int c = INBUF(mp);
1271
1272 if (mp->lastpos - mp->startpos + 1 == 1) {
1273 if ((c <= 0x7f && mp->io.input == ESNOCONV) ||
1274 (c >= 0x80 && mp->io.inputr == ESNOCONV)) {
1275 #if JAPANESE
1276 mp->sequence_counter = 0;
1277 flush_pending_ucs(mp);
1278 #endif
1279 if (control_char(c)) {
1280 wrongcs1(mp);
1281 } else {
1282 noconv1(mp);
1283 }
1284 return;
1285 } else if (c >= 0x80 && mp->io.inputr == ESNONE) {
1286 #if JAPANESE
1287 mp->sequence_counter = 0;
1288 flush_pending_ucs(mp);
1289 #endif
1290 wrongcs1(mp);
1291 return;
1292 }
1293
1294 mp->cs = ASCII;
1295 if (c < 0x20) {
1296 #if JAPANESE
1297 mp->sequence_counter = 0;
1298 flush_pending_ucs(mp);
1299 #endif
1300 wrongcs1(mp);
1301 return;
1302 } else if (c <= 0x7f ||
1303 ((mp->io.inputr & ESISO8)
1304 && (0xa0 <= c && c <= 0xff)
1305 && (mp->ms->sg != WRONGPLANE
1306 || !CSISWRONG(mp->ms->gs[mp->ms->gr])))) {
1307 #if JAPANESE
1308 mp->sequence_counter = 0;
1309 flush_pending_ucs(mp);
1310 #endif
1311 /*
1312 * Decide current character set.
1313 */
1314 mp->cs = FINDCS(mp, c);
1315
1316 /*
1317 * Check cs that fit for output code set.
1318 */
1319 /* JIS cannot output JISX0212, JISX0213_2, or ISO2022 */
1320 if ((output == ESJIS83) &&
1321 mp->cs != ASCII &&
1322 mp->cs != JISX0201KANA &&
1323 mp->cs != JISX0201ROMAN &&
1324 mp->cs != JISX0208_78KANJI &&
1325 mp->cs != JISX0208KANJI &&
1326 mp->cs != JISX0208_90KANJI &&
1327 mp->cs != JISX0213KANJI1 &&
1328 mp->cs != JISX02132004KANJI1) {
1329 wrongcs1(mp);
1330 multi_reparse(mp);
1331 return;
1332 }
1333
1334 /* UJIS cannot output regular ISO2022 except JIS */
1335 if ((output == ESUJIS) &&
1336 mp->cs != ASCII &&
1337 mp->cs != JISX0201KANA &&
1338 mp->cs != JISX0201ROMAN &&
1339 mp->cs != JISX0208_78KANJI &&
1340 mp->cs != JISX0208KANJI &&
1341 mp->cs != JISX0208_90KANJI &&
1342 mp->cs != JISX0212KANJISUP &&
1343 mp->cs != JISX0213KANJI1 &&
1344 mp->cs != JISX0213KANJI2 &&
1345 mp->cs != JISX02132004KANJI1) {
1346 wrongcs1(mp);
1347 multi_reparse(mp);
1348 return;
1349 }
1350
1351 /* SJIS cannot output JISX0212 or ISO2022 */
1352 if ((output == ESSJIS) &&
1353 mp->cs != ASCII &&
1354 mp->cs != JISX0201KANA &&
1355 mp->cs != JISX0201ROMAN &&
1356 mp->cs != JISX0208_78KANJI &&
1357 mp->cs != JISX0208KANJI &&
1358 mp->cs != JISX0208_90KANJI &&
1359 mp->cs != JISX0213KANJI1 &&
1360 mp->cs != JISX0213KANJI2 &&
1361 mp->cs != JISX02132004KANJI1) {
1362 wrongcs1(mp);
1363 multi_reparse(mp);
1364 return;
1365 }
1366
1367 /* CP932 cannot output regular ISO2022 except JIS */
1368 if ((output == ESCP932) &&
1369 mp->cs != ASCII &&
1370 mp->cs != JISX0201KANA &&
1371 mp->cs != JISX0201ROMAN &&
1372 mp->cs != JISX0208_78KANJI &&
1373 mp->cs != JISX0208KANJI &&
1374 mp->cs != JISX0208_90KANJI &&
1375 mp->cs != JISX0212KANJISUP &&
1376 mp->cs != JISX0213KANJI1 &&
1377 mp->cs != JISX0213KANJI2 &&
1378 mp->cs != JISX02132004KANJI1 &&
1379 mp->cs != CP932) {
1380 wrongcs1(mp);
1381 multi_reparse(mp);
1382 return;
1383 }
1384
1385 if (mp->cs != ASCII)
1386 mp->icharset = mp->cs;
1387
1388 internalize_iso(mp);
1389 return;
1390 } else if (control_char(c)) {
1391 #if JAPANESE
1392 mp->sequence_counter = 0;
1393 flush_pending_ucs(mp);
1394 #endif
1395 wrongcs1(mp);
1396 return;
1397 }
1398 #if JAPANESE
1399 if (mp->priority == PSJIS && ISSJISKANA(c)) {
1400 if (mp->io.inputr & ESUJIS) {
1401 mp->sequence_counter++;
1402 if (mp->sequence_counter % 2 == 1 &&
1403 INBUF0(mp) != 0xa4) /* ???? */
1404 {
1405 mp->sequence_counter = 0;
1406 }
1407 if (mp->sequence_counter >= 6)
1408 /*
1409 * It looks like a sequence of UJIS
1410 * hiragana. Thus we give priority
1411 * to not PSJIS.
1412 */
1413 mp->priority = PUJIS;
1414 }
1415 flush_pending_ucs(mp);
1416 if (mp->io.inputr & ESCP932)
1417 internalize_cp932(mp);
1418 else if (mp->io.inputr & ESSJIS)
1419 internalize_sjis(mp);
1420 return;
1421 } else if (mp->io.inputr & (ESUJIS | ESSJIS | ESUTF8 | ESCP932)) {
1422 mp->sequence_counter = 0;
1423 return;
1424 }
1425 mp->sequence_counter = 0;
1426 #endif
1427 wrongcs1(mp);
1428 return;
1429 }
1430
1431 #if JAPANESE
1432 assert(mp->sequence_counter == 0);
1433 #endif
1434 if (c < 0x20) {
1435 flush_pending_ucs(mp);
1436 wrongcs1(mp);
1437 multi_reparse(mp);
1438 return;
1439 } else if (mp->cs != ASCII &&
1440 (c <= 0x7f ||
1441 ((mp->io.inputr & ESISO8)
1442 && (0xa0 <= c && c <= 0xff)
1443 && (mp->ms->sg != WRONGPLANE
1444 || !CSISWRONG(mp->ms->gs[mp->ms->gr]))))) {
1445 flush_pending_ucs(mp);
1446 if (mp->cs != FINDCS(mp, c)) {
1447 wrongcs1(mp);
1448 multi_reparse(mp);
1449 } else {
1450 internalize_iso(mp);
1451 }
1452 return;
1453 } else if (control_char(c)) {
1454 flush_pending_ucs(mp);
1455 wrongcs1(mp);
1456 multi_reparse(mp);
1457 return;
1458 }
1459 #if JAPANESE
1460 if (mp->lastpos - mp->startpos + 1 == 2) {
1461 if (mp->priority == PSJIS) {
1462 if (mp->io.inputr & ESCP932) {
1463 if (internalize_cp932(mp))
1464 return;
1465 } else {
1466 if (internalize_sjis(mp)) {
1467 return;
1468 }
1469 }
1470 } else if (mp->priority == PUJIS) {
1471 if (internalize_ujis(mp)) {
1472 return;
1473 }
1474 } else if (mp->priority == PUTF8) {
1475 if (internalize_utf8(mp)) {
1476 return;
1477 }
1478 flush_pending_ucs(mp);
1479 }
1480
1481 if (mp->io.inputr & ESUTF8) {
1482 if (internalize_utf8(mp)) {
1483 mp->priority = PUTF8;
1484 return;
1485 }
1486 flush_pending_ucs(mp);
1487 }
1488 if (mp->io.inputr & ESUJIS) {
1489 if (internalize_ujis(mp)) {
1490 mp->priority = PUJIS;
1491 return;
1492 }
1493 }
1494 if (mp->io.inputr & ESSJIS) {
1495 flush_pending_ucs(mp);
1496 if (internalize_sjis(mp)) {
1497 mp->priority = PSJIS;
1498 return;
1499 }
1500 }
1501 if (mp->io.inputr & ESCP932) {
1502 flush_pending_ucs(mp);
1503 if (internalize_cp932(mp)) {
1504 mp->priority = PSJIS;
1505 return;
1506 }
1507 }
1508 } else if (mp->lastpos - mp->startpos + 1 == 3) {
1509 if (mp->priority == PUJIS) {
1510 if (internalize_ujis(mp)) {
1511 return;
1512 }
1513 } else if (mp->priority == PUTF8) {
1514 if (internalize_utf8(mp)) {
1515 return;
1516 }
1517 flush_pending_ucs(mp);
1518 }
1519
1520 if (mp->io.inputr & ESUTF8) {
1521 if (internalize_utf8(mp)) {
1522 mp->priority = PUTF8;
1523 return;
1524 }
1525 flush_pending_ucs(mp);
1526 }
1527 if (mp->io.inputr & ESUJIS) {
1528 if (internalize_ujis(mp)) {
1529 mp->priority = PUJIS;
1530 return;
1531 }
1532 }
1533 } else if (mp->lastpos - mp->startpos + 1 <= 6) {
1534 if (mp->io.inputr & ESUTF8) {
1535 if (internalize_utf8(mp)) {
1536 mp->priority = PUTF8;
1537 return;
1538 }
1539 flush_pending_ucs(mp);
1540 }
1541 }
1542 #endif
1543 wrongcs1(mp);
1544 multi_reparse(mp);
1545 }
1546
1547 /*
1548 * Check routines
1549 */
check_ft(mp,c,type,plane)1550 static int check_ft(mp, c, type, plane)
1551 MULBUF *mp;
1552 register int c;
1553 int type;
1554 int *plane;
1555 {
1556 if (type == TYPE_94_CHARSET) {
1557 switch (c) {
1558 case 'B': /* ASCII */
1559 goto ok;
1560 case 'I': /* JIS X 0201 right half (Katakana) */
1561 case 'J': /* JIS X 0201 left half (Roman) */
1562 if (mp->io.scs & SCSJISX0201_1976) goto ok;
1563 }
1564 } else if (type == TYPE_94N_CHARSET) {
1565 switch (c) {
1566 case '@': /* JIS C 6226-1978 */
1567 if (mp->io.scs & SCSJISC6226_1978) goto ok;
1568 break;
1569 case 'B': /* JIS X 0208-1983, JIS X 0208:1990, or JIS X 0208:1997 */
1570 if (mp->io.scs & (SCSJISX0208_1983 | SCSJISX0208_1990)) goto ok;
1571 break;
1572 case 'D': /* JIS X 0212:1990 */
1573 if (mp->io.scs & SCSJISX0212_1990) goto ok;
1574 break;
1575 case 'O': /* JIS X 0213:2000 plane 1 */
1576 if (mp->io.scs & SCSJISX0213_2000) goto ok;
1577 break;
1578 case 'P': /* JIS X 0213:2000 plane 2 or JIS X 0213:2004 plane 2 */
1579 if (mp->io.scs & (SCSJISX0213_2000 | SCSJISX0213_2004)) goto ok;
1580 break;
1581 case 'Q': /* JIS X 0213:2004 plane 1 */
1582 if (mp->io.scs & SCSJISX0213_2004) goto ok;
1583 break;
1584 }
1585 }
1586 if ((mp->io.scs & SCSOTHERISO) && 0x30 <= c && c <= 0x7e) {
1587 /* accepting all other ISO, so OK */
1588 goto ok;
1589 }
1590 return (-1);
1591 ok:
1592 *plane = (mp->ms->irr ? IRR2CS(mp->ms->irr) : 0) | TYPE2CS(type) | FT2CS(c);
1593 mp->ms->irr = 0;
1594 mp->eseq = NOESC;
1595 return (0);
1596 }
1597
check_irr(mp,c)1598 static int check_irr(mp, c)
1599 MULBUF *mp;
1600 register int c;
1601 {
1602 if (0x40 <= c && c <= 0x7e) {
1603 mp->ms->irr = CODE2IRR(c);
1604 mp->eseq = NOESC;
1605 return (0);
1606 }
1607 return (-1);
1608 }
1609
fix_status_for_escape_sequence(mp)1610 static void fix_status_for_escape_sequence(mp)
1611 MULBUF *mp;
1612 {
1613 if (mp->eseq == NOESC) {
1614 switch (CS2TYPE(ISVALIDPLANE(mp, sg) ? PLANE2CS(mp, sg) :
1615 PLANE2CS(mp, gl))) {
1616 case TYPE_96_CHARSET:
1617 case TYPE_96N_CHARSET:
1618 change_control_char(0177, 0);
1619 break;
1620 case TYPE_94_CHARSET:
1621 case TYPE_94N_CHARSET:
1622 change_control_char(0177, 1);
1623 break;
1624 }
1625 switch (CS2TYPE(ISVALIDPLANE(mp, sg) ? PLANE2CS(mp, sg) :
1626 PLANE2CS(mp, gr))) {
1627 case TYPE_96_CHARSET:
1628 case TYPE_96N_CHARSET:
1629 change_control_char(0377, 0);
1630 break;
1631 case TYPE_94_CHARSET:
1632 case TYPE_94N_CHARSET:
1633 change_control_char(0377, 1);
1634 break;
1635 }
1636 }
1637 }
1638
check_escape_sequence(mp)1639 static int check_escape_sequence(mp)
1640 MULBUF *mp;
1641 {
1642 int c = INBUF(mp);
1643
1644 switch (mp->eseq) {
1645 case ESC_:
1646 switch (c) {
1647 case '$': mp->eseq = ESC_2_4; break;
1648 case '&': mp->eseq = ESC_2_6; break;
1649 case '(': mp->eseq = ESC_2_8; break;
1650 case ')': mp->eseq = ESC_2_9; break;
1651 case '*': mp->eseq = ESC_2_10; break;
1652 case '+': mp->eseq = ESC_2_11; break;
1653 case ',': mp->eseq = ESC_2_12; break;
1654 case '-': mp->eseq = ESC_2_13; break;
1655 case '.': mp->eseq = ESC_2_14; break;
1656 case '/': mp->eseq = ESC_2_15; break;
1657 case 'N': mp->ms->sg = 2; mp->eseq = NOESC; /*SS2*/break;
1658 case 'O': mp->ms->sg = 3; mp->eseq = NOESC; /*SS3*/break;
1659 case 'n': mp->ms->gl = 2; mp->eseq = NOESC; break;
1660 case 'o': mp->ms->gl = 3; mp->eseq = NOESC; break;
1661 case '|': if (!(mp->io.inputr & ESISO8)) goto wrong;
1662 mp->ms->gr = 3; mp->eseq = NOESC; break;
1663 case '}': if (!(mp->io.inputr & ESISO8)) goto wrong;
1664 mp->ms->gr = 2; mp->eseq = NOESC; break;
1665 case '~': if (!(mp->io.inputr & ESISO8)) goto wrong;
1666 mp->ms->gr = 1; mp->eseq = NOESC; break;
1667 case '[': mp->eseq = ESC_5_11; break;
1668 default: goto wrong;
1669 }
1670 break;
1671 case ESC_2_4:
1672 switch (c) {
1673 case '(': mp->eseq = ESC_2_4_8; break;
1674 case ')': mp->eseq = ESC_2_4_9; break;
1675 case '*': mp->eseq = ESC_2_4_10; break;
1676 case '+': mp->eseq = ESC_2_4_11; break;
1677 case '-': mp->eseq = ESC_2_4_13; break;
1678 case '.': mp->eseq = ESC_2_4_14; break;
1679 case '/': mp->eseq = ESC_2_4_15; break;
1680 case '@':
1681 case 'A':
1682 case 'B': if (check_ft(mp, c, TYPE_94N_CHARSET, &(mp->ms->gs[0])) == 0)
1683 break;
1684 default: goto wrong;
1685 }
1686 break;
1687 case ESC_2_6:
1688 if (check_irr(mp, c) == 0) break;
1689 goto wrong;
1690 case ESC_2_8:
1691 if (check_ft(mp, c, TYPE_94_CHARSET, &(mp->ms->gs[0])) == 0) break;
1692 goto wrong;
1693 case ESC_2_9:
1694 if (check_ft(mp, c, TYPE_94_CHARSET, &(mp->ms->gs[1])) == 0) break;
1695 goto wrong;
1696 case ESC_2_10:
1697 if (check_ft(mp, c, TYPE_94_CHARSET, &(mp->ms->gs[2])) == 0) break;
1698 goto wrong;
1699 case ESC_2_11:
1700 if (check_ft(mp, c, TYPE_94_CHARSET, &(mp->ms->gs[3])) == 0) break;
1701 goto wrong;
1702 case ESC_2_12:
1703 if (check_ft(mp, c, TYPE_96_CHARSET, &(mp->ms->gs[0])) == 0) break;
1704 goto wrong;
1705 case ESC_2_13:
1706 if (check_ft(mp, c, TYPE_96_CHARSET, &(mp->ms->gs[1])) == 0) break;
1707 goto wrong;
1708 case ESC_2_14:
1709 if (check_ft(mp, c, TYPE_96_CHARSET, &(mp->ms->gs[2])) == 0) break;
1710 goto wrong;
1711 case ESC_2_15:
1712 if (check_ft(mp, c, TYPE_96_CHARSET, &(mp->ms->gs[3])) == 0) break;
1713 goto wrong;
1714 case ESC_2_4_8:
1715 if (check_ft(mp, c, TYPE_94N_CHARSET, &(mp->ms->gs[0])) == 0) break;
1716 goto wrong;
1717 case ESC_2_4_9:
1718 if (check_ft(mp, c, TYPE_94N_CHARSET, &(mp->ms->gs[1])) == 0) break;
1719 goto wrong;
1720 case ESC_2_4_10:
1721 if (check_ft(mp, c, TYPE_94N_CHARSET, &(mp->ms->gs[2])) == 0) break;
1722 goto wrong;
1723 case ESC_2_4_11:
1724 if (check_ft(mp, c, TYPE_94N_CHARSET, &(mp->ms->gs[3])) == 0) break;
1725 goto wrong;
1726 case ESC_2_4_13:
1727 if (check_ft(mp, c, TYPE_96N_CHARSET, &(mp->ms->gs[1])) == 0) break;
1728 goto wrong;
1729 case ESC_2_4_14:
1730 if (check_ft(mp, c, TYPE_96N_CHARSET, &(mp->ms->gs[2])) == 0) break;
1731 goto wrong;
1732 case ESC_2_4_15:
1733 if (check_ft(mp, c, TYPE_96N_CHARSET, &(mp->ms->gs[3])) == 0) break;
1734 goto wrong;
1735 case ESC_5_11:
1736 if (mp->lastpos - 20 > mp->startpos) /* ESC sequence to long */
1737 goto wrong;
1738 if (is_ansi_end(c))
1739 goto disp_esc;
1740 if (!is_ansi_middle(c))
1741 goto wrong;
1742 break;
1743 case NOESC:
1744 /*
1745 * This sequence is wrong if we buffered some data.
1746 */
1747 if (mp->lastpos > mp->startpos) {
1748 switch (c) {
1749 case 0033:
1750 case 0016:
1751 case 0017:
1752 case 0031: goto wrong;
1753 default: goto wrongone;
1754 }
1755 }
1756 /*
1757 * Nothing is buffered. So, check this sequence.
1758 */
1759 switch (c) {
1760 case 0033: mp->eseq = ESC_; break;
1761 case 0016: mp->ms->gl = 1; mp->eseq = NOESC; break;
1762 case 0017: mp->ms->gl = 0; mp->eseq = NOESC; break;
1763 case 0031: mp->ms->sg = 2; mp->eseq = NOESC; /*SS2*/ break;
1764 case 0216:
1765 if (!(mp->io.inputr & ESISO8) || CSISWRONG(mp->ms->gs[2]))
1766 goto wrongone;
1767 mp->ms->sg = 2;
1768 mp->eseq = NOESC; /*SS2*/
1769 break;
1770 case 0217:
1771 if (!(mp->io.inputr & ESISO8) || CSISWRONG(mp->ms->gs[3]))
1772 goto wrongone;
1773 mp->ms->sg = 3;
1774 mp->eseq = NOESC; /*SS3*/
1775 break;
1776 default: goto wrongone;
1777 }
1778 break;
1779 default:
1780 assert(0);
1781 }
1782 if (mp->eseq == NOESC) {
1783 fix_status_for_escape_sequence(mp);
1784 mp->startpos = mp->lastpos + 1;
1785 return (0);
1786 }
1787 return (0);
1788 disp_esc:
1789 if (mp->eseq != NOESC) {
1790 mp->eseq = NOESC;
1791 fix_status_for_escape_sequence(mp);
1792 }
1793 wrongcs1(mp);
1794 multi_reparse(mp);
1795 return (0);
1796 wrong:
1797 if (mp->eseq != NOESC) {
1798 mp->eseq = NOESC;
1799 fix_status_for_escape_sequence(mp);
1800 }
1801 mp->multiint[mp->intindex] = INBUF0(mp);
1802 mp->multics[mp->intindex] = WRONG_ESC;
1803 mp->intindex++;
1804 mp->startpos++;
1805 multi_reparse(mp);
1806 return (0);
1807 wrongone:
1808 assert(mp->eseq == NOESC);
1809 return (-1);
1810 }
1811
/*
 * A named preset of ISO 2022 designations.
 * `planeset' is a string of designation sequences in which the two
 * characters \e stand for the ESC byte (see set_planeset()).
 */
struct planeset {
	char *name;
	char *planeset;
};

/*
 * Known presets.  The list is terminated by the entry whose name is NULL.
 */
struct planeset planesets[] = {
	{ "ascii",		"" },
	{ "ctext",		"\\e-A" },
	{ "latin1",		"\\e-A" },
	{ "iso8859-1",		"\\e-A" },
	{ "latin2",		"\\e-B" },
	{ "iso8859-2",		"\\e-B" },
	{ "latin3",		"\\e-C" },
	{ "iso8859-3",		"\\e-C" },
	{ "latin4",		"\\e-D" },
	{ "iso8859-4",		"\\e-D" },
	{ "cyrillic",		"\\e-L" },
	{ "iso8859-5",		"\\e-L" },
	{ "arabic",		"\\e-G" },
	{ "iso8859-6",		"\\e-G" },
	{ "greek",		"\\e-F" },
	{ "iso8859-7",		"\\e-F" },
	{ "hebrew",		"\\e-H" },
	{ "iso8859-8",		"\\e-H" },
	{ "latin5",		"\\e-M" },
	{ "iso8859-9",		"\\e-M" },
	{ "latin6",		"\\e-V" },
	{ "iso8859-10",		"\\e-V" },
	{ "thai",		"\\e-T" },
	{ "iso8859-11",		"\\e-T" },
	{ "latin7",		"\\e-Y" },
	{ "iso8859-13",		"\\e-Y" },
	{ "latin8",		"\\e-_" },
	{ "iso8859-14",		"\\e-_" },
	{ "latin9",		"\\e-b" },
	{ "iso8859-15",		"\\e-b" },
	{ "latin10",		"\\e-f" },
	{ "iso8859-16",		"\\e-f" },
	{ "jisx0201",		"\\e(J\\e)I" },
	{ "japanese",		"\\e$)B\\e*I\\e$+D" },
	{ "ujis",		"\\e$)B\\e*I\\e$+D" },
	{ "euc",		"\\e$)B\\e*I\\e$+D" },
	{ "euc-jisx0213",	"\\e$)O\\e*I\\e$+P" },
	{ NULL,			"" }
};
1855
set_planeset(name)1856 int set_planeset(name)
1857 register char *name;
1858 {
1859 register struct planeset *p;
1860 MULBUF *mp;
1861 int ret;
1862 int i;
1863
1864 if (name == NULL) {
1865 return -1;
1866 }
1867 for (p = planesets; p->name != NULL; p++) {
1868 if (strcasecmp(name, p->name) == 0) {
1869 name = p->planeset;
1870 break;
1871 }
1872 }
1873 mp = new_multibuf();
1874 init_priority(mp);
1875 while (*name) {
1876 if (*name == '\\' &&
1877 (*(name + 1) == 'e' || *(name + 1) == 'E')) {
1878 ++mp->lastpos;
1879 INBUF(mp) = '\033';
1880 ret = check_escape_sequence(mp);
1881 name += 2;
1882 } else {
1883 ++mp->lastpos;
1884 INBUF(mp) = *name++;
1885 ret = check_escape_sequence(mp);
1886 }
1887 if (ret < 0 || mp->intindex > 0) {
1888 free(mp);
1889 return -1;
1890 }
1891 }
1892 def_gs[0] = mp->ms->gs[0];
1893 def_gs[1] = mp->ms->gs[1];
1894 def_gs[2] = mp->ms->gs[2];
1895 def_gs[3] = mp->ms->gs[3];
1896 free(mp);
1897 return 0;
1898 }
1899
init_def_scs_es(scs,input,inputr,out)1900 void init_def_scs_es(scs, input, inputr, out)
1901 SETCHARSET scs;
1902 ENCSET input;
1903 ENCSET inputr;
1904 ENCSET out;
1905 {
1906 def_scs = scs;
1907 def_input = input;
1908 def_inputr = inputr;
1909 output = out;
1910
1911 if (inputr & ESUTF8)
1912 make_unicode_map(scs, output & ESUTF8);
1913 }
1914
init_def_priority(pri)1915 void init_def_priority(pri)
1916 J_PRIORITY pri;
1917 {
1918 #if JAPANESE
1919 assert(pri == PUJIS || pri == PSJIS || pri == PUTF8);
1920 def_priority = pri;
1921 #endif
1922 }
1923
init_priority(mp)1924 void init_priority(mp)
1925 MULBUF *mp;
1926 {
1927 #if JAPANESE
1928 if ((mp->io.inputr & (ESSJIS | ESCP932)) && (mp->io.inputr & ESUJIS))
1929 mp->priority = def_priority;
1930 else if (mp->io.inputr & ESUTF8)
1931 mp->priority = PUTF8;
1932 else if (mp->io.inputr & ESUJIS)
1933 mp->priority = PUJIS;
1934 else if (mp->io.inputr & (ESSJIS | ESCP932))
1935 mp->priority = PSJIS;
1936 else
1937 mp->priority = PNONE;
1938 mp->sequence_counter = 0;
1939 #endif
1940 }
1941
get_priority(mp)1942 J_PRIORITY get_priority(mp)
1943 MULBUF *mp;
1944 {
1945 #if JAPANESE
1946 return (mp->priority);
1947 #else
1948 return (PNONE);
1949 #endif
1950 }
1951
set_priority(mp,pri)1952 void set_priority(mp, pri)
1953 MULBUF *mp;
1954 J_PRIORITY pri;
1955 {
1956 #if JAPANESE
1957 assert(pri == PSJIS || pri == PUJIS || pri == PUTF8 || pri == PNONE);
1958 mp->priority = pri;
1959 #endif
1960 }
1961
set_utfwidth(u)1962 void set_utfwidth(u)
1963 UWidth u;
1964 {
1965 assert(u >= UWIDTH_NONE && u <= UWIDTH_ALL);
1966 utfwidth = u;
1967 }
1968
new_multibuf()1969 MULBUF *new_multibuf()
1970 {
1971 MULBUF *mp = (MULBUF*) ecalloc(1, sizeof(MULBUF));
1972 mp->io.scs = def_scs;
1973 mp->io.input = def_input;
1974 mp->io.inputr = def_inputr;
1975 mp->orig_io_right = def_inputr;
1976 mp->rotation_io_right = 0;
1977 mp->eseq = NOESC;
1978 mp->ms = (struct m_status*) ecalloc(1, sizeof(struct m_status));
1979 init_multibuf(mp);
1980 return (mp);
1981 }
1982
clear_multibuf(mp)1983 void clear_multibuf(mp)
1984 MULBUF *mp;
1985 {
1986 mp->lastpos = M_NULL_POS;
1987 mp->startpos = 0;
1988 mp->laststartpos = 0;
1989 mp->lastsg = WRONGPLANE;
1990 mp->intindex = 0;
1991 }
1992
init_ms(ms)1993 static void init_ms(ms)
1994 struct m_status *ms;
1995 {
1996 ms->gs[0] = def_gs[0];
1997 ms->gs[1] = def_gs[1];
1998 ms->gs[2] = def_gs[2];
1999 ms->gs[3] = def_gs[3];
2000 ms->gl = 0;
2001 ms->gr = 1;
2002 ms->sg = WRONGPLANE;
2003 ms->irr = 0;
2004 }
2005
init_multibuf(mp)2006 void init_multibuf(mp)
2007 MULBUF *mp;
2008 {
2009 mp->cs = ASCII;
2010 init_ms(mp->ms);
2011 if (mp->eseq != NOESC) {
2012 mp->eseq = NOESC;
2013 }
2014 fix_status_for_escape_sequence(mp);
2015 #if JAPANESE
2016 mp->sequence_counter = 0;
2017 #endif
2018 mp->icharset = ASCII;
2019 clear_multibuf(mp);
2020 }
2021
2022 /*
2023 * Buffering characters untile get a guarantee that it is right sequence.
2024 */
check_new_buffered_byte(mp)2025 static void check_new_buffered_byte(mp)
2026 MULBUF *mp;
2027 {
2028 m_position last_startpos = mp->startpos;
2029
2030 if (mp->io.input & (ESJIS83 | ESISO7 | ESISO8)) {
2031 if (check_escape_sequence(mp) == 0) {
2032 return; /* going process well */
2033 }
2034 }
2035
2036 /* it is not a escape sequence, try to use it as character */
2037 internalize(mp);
2038
2039 /*
2040 * If a character was detected in internalize(),
2041 * clean sg since single shift affect only one character.
2042 */
2043 if (last_startpos != mp->startpos) {
2044 mp->lastsg = mp->ms->sg;
2045 if (mp->ms->sg != WRONGPLANE) {
2046 mp->ms->sg = WRONGPLANE;
2047 fix_status_for_escape_sequence(mp);
2048 }
2049 }
2050 }
2051
2052 /*
2053 * Re-parse all buffered data.
2054 *
2055 * This routine is called when we find a problem in buffered data.
2056 * We firstly take out the first byte of buffered data before we call
2057 * this function. This routine parse all rest of buffered data again.
2058 */
multi_reparse(mp)2059 static void multi_reparse(mp)
2060 MULBUF *mp;
2061 {
2062 m_position to;
2063
2064 /*
2065 * We found something wrong and going to move first byte.
2066 * So, we clear single-shifted character set because it will
2067 * shift only this one byte being makred wrong.
2068 */
2069 if (mp->ms->sg != WRONGPLANE) {
2070 mp->ms->sg = WRONGPLANE;
2071 fix_status_for_escape_sequence(mp);
2072 }
2073
2074 #if JAPANESE
2075 /*
2076 * Quick japanese code hack.
2077 * Check whether character is SJIS KANA or not.
2078 * If it is SJIS KANA, it means our prediction was failed.
2079 * Now going to fall back to SJIS KANA mode.
2080 */
2081 if ((mp->priority == PSJIS || (mp->io.inputr & (ESSJIS | ESCP932))) &&
2082 CSISWRONG(mp->multics[mp->intindex - 1]) &&
2083 ISSJISKANA(mp->multiint[mp->intindex - 1])) {
2084 mp->cs = JISX0201KANA;
2085 mp->priority = PSJIS;
2086 mp->icharset = SJIS;
2087 mp->multiint[mp->intindex - 1] &= 0x7f;
2088 mp->multics[mp->intindex - 1] = mp->cs;
2089 }
2090 #endif
2091
2092 /*
2093 * Retry to parse rest of buffered data.
2094 */
2095 to = mp->lastpos;
2096 for (mp->lastpos = mp->startpos; mp->lastpos <= to; mp->lastpos++) {
2097 check_new_buffered_byte(mp);
2098 }
2099 mp->lastpos = to;
2100 }
2101
2102 #if LESS
multi_find_cs(mp,pos)2103 void multi_find_cs(mp, pos)
2104 MULBUF* mp;
2105 m_position pos;
2106 {
2107 int c;
2108 m_position lpos = pos;
2109
2110 if (ch_seek(pos) == 0) {
2111 /*
2112 * Back up to the beginning of the line.
2113 */
2114 while ((c = ch_back_get()) != '\n' && c != EOI) ;
2115 if (c == '\n') {
2116 (void)ch_forw_get();
2117 }
2118
2119 lpos = ch_tell();
2120
2121 if (lpos != pos) {
2122 while (lpos < pos) {
2123 c = ch_forw_get();
2124 assert(c != EOI && c != '\n');
2125 multi_parse(mp, c, NULL_POSITION, NULL, NULL);
2126 lpos++;
2127 }
2128 ch_seek(pos);
2129 }
2130 }
2131 }
2132 #endif
2133
/*
 * Set DEBUG to 1 to compile in the position trace printed by
 * multi_start_buffering(); the `debug' variable then switches the trace
 * on (nonzero) or off at run time.
 */
#define DEBUG 0
#if DEBUG
int debug = 1;
#endif
2138
2139 /*
2140 * Manage m_status data structure to maintain ISO-2022 status of input stream.
2141 */
multi_start_buffering(mp,pos)2142 void multi_start_buffering(mp, pos)
2143 MULBUF *mp;
2144 m_position pos;
2145 {
2146 /* buffer must be empty */
2147 assert(mp->lastpos < mp->startpos);
2148
2149 /* initialize m_status if it is necessary */
2150 if (pos == mp->lastpos + 2 || pos == mp->laststartpos) {
2151 /*
2152 * pos == mp->lastpos+2 if this line is started after \n.
2153 * pos == mp->laststartpos if this line is started by a non-fit
2154 * character.
2155 */
2156 /* restore backed up sg */
2157 if (mp->ms->sg != mp->lastsg) {
2158 mp->ms->sg = mp->lastsg;
2159 fix_status_for_escape_sequence(mp);
2160 }
2161 /* adjust pointers */
2162 mp->startpos = pos;
2163 mp->lastpos = pos - 1;
2164 } else {
2165 /*
2166 * pos == somewhere else if this function is called after jump_loc().
2167 */
2168 #if DEBUG
2169 if (debug) {
2170 fprintf(stderr, "%qd, %qd, %qd, %qd\n", pos, mp->lastpos,
2171 mp->startpos, mp->laststartpos);
2172 fprintf(stderr, "oct %qo, %qo, %qo, %qo\n", pos, mp->lastpos,
2173 mp->startpos, mp->laststartpos);
2174 }
2175 #endif
2176 init_multibuf(mp);
2177 #if LESS
2178 multi_find_cs(mp, pos);
2179 clear_multibuf(mp);
2180 #endif
2181
2182 /* adjust pointers */
2183 mp->startpos = pos;
2184 mp->lastpos = pos - 1;
2185 mp->laststartpos = pos;
2186 }
2187 }
2188
2189 /*
2190 * Buffering characters untile get a guarantee that it is right sequence.
2191 */
multi_parse(mp,c,pos,mbd,mpos)2192 void multi_parse(mp, c, pos, mbd, mpos)
2193 MULBUF* mp;
2194 int c;
2195 m_position pos;
2196 M_BUFDATA* mbd;
2197 POSITION* mpos;
2198 {
2199 if (c < 0) {
2200 if (mpos != NULL) {
2201 *mpos = mp->startpos;
2202 }
2203
2204 /*
2205 * output pending unicode character
2206 */
2207 flush_pending_ucs(mp);
2208
2209 /*
2210 * Force to flush all buffering characters.
2211 */
2212 if (mp->eseq != NOESC) {
2213 mp->eseq = NOESC;
2214 fix_status_for_escape_sequence(mp);
2215 }
2216 while (mp->startpos <= mp->lastpos) {
2217 wrongcs1(mp);
2218 multi_reparse(mp);
2219 }
2220
2221 if (mbd != NULL) {
2222 mbd->cbuf = mp->multiint;
2223 mbd->csbuf = mp->multics;
2224 mbd->byte = mp->intindex;
2225 }
2226 mp->intindex = 0;
2227 } else {
2228 if (pos != NULL_POSITION) {
2229 assert(pos == mp->lastpos + 1);
2230 mp->lastpos = pos;
2231 } else {
2232 mp->lastpos++;
2233 }
2234 INBUF(mp) = c;
2235
2236 mp->laststartpos = mp->startpos;
2237 if (mpos != NULL) {
2238 *mpos = mp->startpos;
2239 }
2240
2241 /*
2242 * Put it into buffer and parse it.
2243 */
2244 check_new_buffered_byte(mp);
2245
2246 if (mbd != NULL) {
2247 mbd->cbuf = mp->multiint;
2248 mbd->csbuf = mp->multics;
2249 mbd->byte = mp->intindex;
2250 }
2251 mp->intindex = 0;
2252 }
2253 }
2254
/*
 * Flush buffered data.
 *
 * Calls multi_parse() with a negative character, which forces out every
 * byte that is still buffered.  MBD and MPOS are passed through to
 * multi_parse() and are filled in by it when they are not NULL.
 */
void multi_flush(mp, mbd, mpos)
MULBUF* mp;
M_BUFDATA* mbd;
POSITION* mpos;
{
	multi_parse(mp, -1, NULL_POSITION, mbd, mpos);
}
2265
/*
 * Discard buffered data.
 *
 * Same forced flush as multi_flush(), but with NULL for both output
 * pointers, so the flushed bytes are not reported to the caller.
 */
void multi_discard(mp)
MULBUF* mp;
{
	multi_parse(mp, -1, NULL_POSITION, NULL, NULL);
}
2274
set_codesets(mp,input,inputr)2275 void set_codesets(mp, input, inputr)
2276 MULBUF *mp;
2277 ENCSET input;
2278 ENCSET inputr;
2279 {
2280 mp->io.input = input;
2281 mp->io.inputr = inputr;
2282 if (inputr & ESUTF8)
2283 make_unicode_map(mp->io.scs, output & ESUTF8);
2284 }
2285
2286 /*
2287 * Return string representation about multi bytes character
2288 * which was buffered.
2289 */
get_icharset_string(mp)2290 char *get_icharset_string(mp)
2291 MULBUF *mp;
2292 {
2293 static char buf[10];
2294
2295 switch (mp->icharset)
2296 {
2297 #if JAPANESE
2298 /*
2299 * Code set
2300 */
2301 case SJIS: return ("SJIS");
2302 case SJIS2000: return ("SJIS-2000");
2303 case SJIS2004: return ("SJIS-2004");
2304 case CP932: return ("CP932");
2305 case UJIS: return ("UJIS");
2306 case UJIS2000: return ("UJIS-2000");
2307 case UJIS2004: return ("UJIS-2004");
2308 case UTF8Z: return ("UTF-8");
2309 case UTF8: return ("UTF-8");
2310 case UTF8W: return ("UTF-8");
2311 #endif
2312 /*
2313 * Character set
2314 */
2315 case ASCII: return ("ASCII");
2316 case JISX0201KANA: return ("JIS-KANA");
2317 case JISX0201ROMAN: return ("JIS-ROMAN");
2318 case LATIN1: return ("LATIN1");
2319 case LATIN2: return ("LATIN2");
2320 case LATIN3: return ("LATIN3");
2321 case LATIN4: return ("LATIN4");
2322 case GREEK: return ("GREEK");
2323 case ARABIC: return ("ARABIC");
2324 case HEBREW: return ("HEBREW");
2325 case CYRILLIC: return ("CYRILLIC");
2326 case LATIN5: return ("LATIN5");
2327 case THAI: return ("THAI");
2328 case LATIN6: return ("LATIN6");
2329 case LATIN7: return ("LATIN7");
2330 case LATIN8: return ("LATIN8");
2331 case LATIN9: return ("LATIN9");
2332 case LATIN10: return ("LATIN10");
2333 case JISX0208_78KANJI: return ("KANJI:1978");
2334 case GB2312: return ("GB2312");
2335 case JISX0208KANJI: return ("KANJI:1983");
2336 case JISX0208_90KANJI: return ("KANJI:1990");
2337 case KSC5601: return ("KSC5601");
2338 case JISX0212KANJISUP: return ("JIS-KANJISP");
2339 case JISX0213KANJI1: return ("X0213:2000-1");
2340 case JISX0213KANJI2: return ("X0213:2000-2");
2341 case JISX02132004KANJI1:return ("X0213:2004-1");
2342 }
2343 switch (CS2TYPE(mp->icharset))
2344 {
2345 case TYPE_94_CHARSET:
2346 strcpy(buf, "94( )");
2347 buf[3] = CS2FT(mp->icharset);
2348 break;
2349 case TYPE_96_CHARSET:
2350 strcpy(buf, "96( )");
2351 buf[3] = CS2FT(mp->icharset);
2352 break;
2353 case TYPE_94N_CHARSET:
2354 strcpy(buf, "94N( )");
2355 buf[4] = CS2FT(mp->icharset);
2356 break;
2357 case TYPE_96N_CHARSET:
2358 strcpy(buf, "96N( )");
2359 buf[4] = CS2FT(mp->icharset);
2360 break;
2361 default:
2362 assert(0);
2363 }
2364 if (CS2IRR(mp->icharset) > 0)
2365 {
2366 char num[3];
2367 sprintf(num, "%d", CS2IRR(mp->icharset));
2368 strcat(buf, num);
2369 }
2370 return (buf);
2371 }
2372
2373 static int old_gl_output_charset = ASCII; /* Last displayed character set */
2374 static int old_gr_output_charset = WRONGCS;
2375 static int old_shift = 0;
2376
make_escape_sequence(charset)2377 static unsigned char *make_escape_sequence(charset)
2378 int charset;
2379 {
2380 static unsigned char p[9];
2381 int len;
2382
2383 if (CSISWRONG(charset))
2384 {
2385 charset = ASCII;
2386 }
2387
2388 if (old_gl_output_charset != charset
2389 || old_gr_output_charset != charset) {
2390 p[0] = '\033';
2391 len = 1;
2392 if ((output & (ESISO7 | ESISO8)) && CS2IRR(charset) > 0)
2393 {
2394 p[len] = '&';
2395 p[len + 1] = IRR2CODE(CS2IRR(charset));
2396 p[len + 2] = '\033';
2397 len += 3;
2398 }
2399 /*
2400 * Call 94 or 94N character set to G0/GL plane.
2401 * Call 96 or 96N character set to G1/GR plane.
2402 */
2403 switch (CS2TYPE(charset))
2404 {
2405 case TYPE_94_CHARSET:
2406 p[len] = '(';
2407 p[len + 1] = CS2FT(charset);
2408 len += 2;
2409 old_gl_output_charset = charset;
2410 break;
2411 case TYPE_94N_CHARSET:
2412 switch (CS2FT(charset))
2413 {
2414 case '@':
2415 case 'A':
2416 case 'B':
2417 p[len] = '$';
2418 p[len + 1] = CS2FT(charset);
2419 len += 2;
2420 break;
2421 default:
2422 p[len] = '$';
2423 p[len + 1] = '(';
2424 p[len + 2] = CS2FT(charset);
2425 len += 3;
2426 break;
2427 }
2428 old_gl_output_charset = charset;
2429 break;
2430 case TYPE_96_CHARSET:
2431 p[len] = '-';
2432 p[len + 1] = CS2FT(charset);
2433 len += 2;
2434 old_gr_output_charset = charset;
2435 break;
2436 case TYPE_96N_CHARSET:
2437 p[len] = '$';
2438 p[len + 1] = '-';
2439 p[len + 2] = CS2FT(charset);
2440 len += 3;
2441 old_gr_output_charset = charset;
2442 break;
2443 }
2444 }
2445 /*
2446 * If output is not ESISO8, use SO and SI to call G1 to GL.
2447 * Otherwise, we use GR directly, so no need to call G1
2448 * since G1 is called GR already.
2449 */
2450 if (!(output & ESISO8))
2451 {
2452 switch (CS2TYPE(charset))
2453 {
2454 case TYPE_94_CHARSET:
2455 case TYPE_94N_CHARSET:
2456 if (old_shift) {
2457 p[len] = '\017';
2458 len++;
2459 old_shift = 0;
2460 }
2461 break;
2462 case TYPE_96_CHARSET:
2463 case TYPE_96N_CHARSET:
2464 if (!old_shift) {
2465 p[len] = '\016';
2466 len++;
2467 old_shift = 1;
2468 }
2469 break;
2470 }
2471 }
2472 p[len] = '\0';
2473 return (p);
2474 }
2475
2476 static char cvbuffer[32];
2477 static int cvindex = 0;
2478 static char *nullcvbuffer = "";
2479
convert_to_iso(c,cs)2480 static char *convert_to_iso(c, cs)
2481 int c;
2482 int cs;
2483 {
2484 register unsigned char *p;
2485 static char buffer2[2];
2486
2487 if ((output & ESISO8) && c != 0 &&
2488 (CS2TYPE(cs) == TYPE_96_CHARSET ||
2489 CS2TYPE(cs) == TYPE_96N_CHARSET))
2490 c |= 0x80;
2491
2492 buffer2[0] = c;
2493 buffer2[1] = '\0';
2494
2495 cs = CS2CHARSET(cs);
2496 if (cs == CP932)
2497 {
2498 /* not supported */
2499 cvindex = 0;
2500 return (nullcvbuffer);
2501 } else if (cs == UTF8 || cs == UTF8W || cs == UTF8Z)
2502 {
2503 /* not supported */
2504 cvindex = 0;
2505 return (nullcvbuffer);
2506 }
2507 if (CSISREST(cs))
2508 {
2509 return (buffer2);
2510 }
2511 if (CSISWRONG(cs))
2512 {
2513 cs = ASCII;
2514 }
2515
2516 if (c & 0x80) {
2517 if (cs == old_gr_output_charset) {
2518 return (buffer2);
2519 }
2520 } else {
2521 if (cs == old_gl_output_charset && old_shift == 0) {
2522 return (buffer2);
2523 } else if (cs == old_gr_output_charset && old_shift == 1) {
2524 return (buffer2);
2525 }
2526 }
2527
2528 p = make_escape_sequence(cs);
2529 strcpy(cvbuffer, p);
2530 strcat(cvbuffer, buffer2);
2531 return (cvbuffer);
2532 }
2533
convert_to_jis(c,cs)2534 static char *convert_to_jis(c, cs)
2535 int c;
2536 int cs;
2537 {
2538 register unsigned char *p;
2539 static char buffer2[3];
2540
2541 if (c == 0)
2542 {
2543 cvindex = 0;
2544 return (nullcvbuffer);
2545 }
2546
2547 buffer2[cvindex++] = c;
2548 buffer2[cvindex] = '\0';
2549
2550 if (CSISWRONG(cs))
2551 {
2552 cs = ASCII;
2553 }
2554
2555 cs = CS2CHARSET(cs);
2556
2557 if (cs == ASCII || cs == JISX0201ROMAN)
2558 {
2559 assert(cvindex == 1);
2560 cvindex = 0;
2561 } else if (cs == JISX0201KANA)
2562 {
2563 assert(cvindex == 1);
2564 cvindex = 0;
2565 } else if (cs == JISX0208_78KANJI)
2566 {
2567 if (cvindex == 1)
2568 return (nullcvbuffer);
2569 assert(cvindex == 2);
2570 jis78to90(buffer2);
2571 cs = JISX0208_90KANJI;
2572 cvindex = 0;
2573 } else if (cs == JISX0208KANJI || cs == JISX0208_90KANJI)
2574 {
2575 if (cvindex == 1)
2576 return (nullcvbuffer);
2577 assert(cvindex == 2);
2578 cvindex = 0;
2579 } else if (cs == JISX0213KANJI1)
2580 {
2581 if (cvindex == 1)
2582 return (nullcvbuffer);
2583 assert(cvindex == 2);
2584 cvindex = 0;
2585 cs = JISX0208KANJI;
2586 } else if (cs == JISX02132004KANJI1)
2587 {
2588 if (cvindex == 1)
2589 return (nullcvbuffer);
2590 assert(cvindex == 2);
2591 cvindex = 0;
2592 cs = JISX0208KANJI;
2593 } else if (cs == CP932)
2594 {
2595 /* not supported */
2596 cvindex = 0;
2597 return (nullcvbuffer);
2598 } else if (cs == UTF8 || cs == UTF8W || cs == UTF8Z)
2599 {
2600 /* not supported */
2601 cvindex = 0;
2602 return (nullcvbuffer);
2603 } else
2604 {
2605 assert(0);
2606 cvindex = 0;
2607 }
2608
2609 if (cs == old_gl_output_charset)
2610 {
2611 return (buffer2);
2612 }
2613 else
2614 {
2615 p = make_escape_sequence(cs);
2616 strcpy(cvbuffer, p);
2617 strcat(cvbuffer, buffer2);
2618 return (cvbuffer);
2619 }
2620 }
2621
2622 #if JAPANESE
convert_to_ujis(c,cs)2623 static char *convert_to_ujis(c, cs)
2624 int c;
2625 int cs;
2626 {
2627 if (c == 0)
2628 {
2629 cvindex = 0;
2630 return (nullcvbuffer);
2631 }
2632
2633 cvbuffer[cvindex++] = c;
2634 cvbuffer[cvindex] = '\0';
2635
2636 if (CSISWRONG(cs))
2637 {
2638 cs = ASCII;
2639 }
2640
2641 cs = CS2CHARSET(cs);
2642 if (cs == ASCII || cs == JISX0201ROMAN)
2643 {
2644 assert(cvindex == 1);
2645 cvindex = 0;
2646 return (cvbuffer);
2647 } else if (cs == JISX0201KANA)
2648 {
2649 assert(cvindex == 1);
2650 cvbuffer[2] = '\0';
2651 cvbuffer[1] = cvbuffer[0] | 0x80;
2652 cvbuffer[0] = 0x8e;
2653 cvindex = 0;
2654 return (cvbuffer);
2655 } else if (cs == JISX0208_78KANJI || cs == JISX0208KANJI ||
2656 cs == JISX0208_90KANJI || cs == JISX0213KANJI1 ||
2657 cs == JISX02132004KANJI1)
2658 {
2659 if (cvindex == 1)
2660 return (nullcvbuffer);
2661 assert(cvindex == 2);
2662 if (cs == JISX0208_78KANJI)
2663 jis78to90(cvbuffer);
2664 cvbuffer[0] |= 0x80;
2665 cvbuffer[1] |= 0x80;
2666 cvindex = 0;
2667 return (cvbuffer);
2668 } else if (cs == JISX0212KANJISUP || cs == JISX0213KANJI2)
2669 {
2670 if (cvindex == 1)
2671 return (nullcvbuffer);
2672 assert(cvindex == 2);
2673 cvbuffer[2] = cvbuffer[1] | 0x80;
2674 cvbuffer[1] = cvbuffer[0] | 0x80;
2675 cvbuffer[0] = 0x8f;
2676 cvbuffer[3] = '\0';
2677 cvindex = 0;
2678 return (cvbuffer);
2679 } else if (cs == CP932)
2680 {
2681 /* not supported */
2682 cvindex = 0;
2683 return (nullcvbuffer);
2684 } else if (cs == UTF8 || cs == UTF8W || cs == UTF8Z)
2685 {
2686 /* not supported */
2687 cvindex = 0;
2688 return (nullcvbuffer);
2689 }
2690 assert(0);
2691 cvindex = 0;
2692 return (cvbuffer);
2693 }
2694
convert_to_sjis(c,cs)2695 static char *convert_to_sjis(c, cs)
2696 int c;
2697 int cs;
2698 {
2699 if (c == 0)
2700 {
2701 cvindex = 0;
2702 return (nullcvbuffer);
2703 }
2704
2705 cvbuffer[cvindex++] = c;
2706 cvbuffer[cvindex] = '\0';
2707
2708 if (CSISWRONG(cs))
2709 {
2710 cs = ASCII;
2711 }
2712
2713 cs = CS2CHARSET(cs);
2714
2715 if (cs == ASCII || cs == JISX0201ROMAN)
2716 {
2717 assert(cvindex == 1);
2718 cvindex = 0;
2719 return (cvbuffer);
2720 } else if (cs == JISX0201KANA)
2721 {
2722 assert(cvindex == 1);
2723 cvbuffer[0] |= 0x80;
2724 cvindex = 0;
2725 return (cvbuffer);
2726 } else if (cs == JISX0208_78KANJI || cs == JISX0208KANJI ||
2727 cs == JISX0208_90KANJI || cs == JISX0213KANJI1 ||
2728 cs == JISX02132004KANJI1)
2729 {
2730 register int c1, c2, c3;
2731 static unsigned char table_sjis[] = {
2732 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2733 0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
2734 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
2735 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
2736 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
2737 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
2738 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
2739 };
2740
2741 if (cvindex == 1)
2742 return (nullcvbuffer);
2743 assert(cvindex == 2);
2744 if (cs == JISX0208_78KANJI)
2745 jis78to90(cvbuffer);
2746 c3 = cvbuffer[0] & 0x7f;
2747 c1 = c3 & 1;
2748 c2 = (cvbuffer[1] & 0x7f) + (c1 ? 0x40 - 0x21 : 0x9e - 0x21);
2749 c1 = table_sjis[c3 / 2 + c1];
2750 cvbuffer[0] = c1;
2751 cvbuffer[1] = c2 + (c2 >= 0x7f ? 1 : 0);
2752 cvindex = 0;
2753 return (cvbuffer);
2754 } else if (cs == JISX0213KANJI2)
2755 {
2756 register int c1, c2, c3;
2757 if (cvindex == 1)
2758 return (nullcvbuffer);
2759 assert(cvindex == 2);
2760 c3 = cvbuffer[0] & 0x7f;
2761 c1 = c3 & 1;
2762 c2 = (cvbuffer[1] & 0x7f) +
2763 (c1 ? 0x40 - 0x21 : 0x9e - 0x21);
2764 if (c3 <= 0x25) {
2765 /* Map 1, 3, 4, and 5-KU */
2766 /* Note: 2-KU is rejected already. */
2767 c1 = (c3 - 0x21) / 2 + 0xf0;
2768 } else if (c3 == 0x28) {
2769 /* Map 8-KU */
2770 c1 = 0xf0;
2771 } else if (c3 <= 0x2f) {
2772 /* Map 12, 13, 14, and 15-KU */
2773 c1 = (c3 - 0x2b) / 2 + 0xf2;
2774 } else {
2775 /* Map 78-94 KU. */
2776 /* Note: 16-77 KU is rejected already. */
2777 c1 = (c3 - 0x6d) / 2 + 0xf4;
2778 }
2779 cvbuffer[0] = c1;
2780 cvbuffer[1] = c2 + (c2 >= 0x7f ? 1 : 0);
2781 cvindex = 0;
2782 return (cvbuffer);
2783 } else if (cs == CP932)
2784 {
2785 /* not supported */
2786 cvindex = 0;
2787 return (nullcvbuffer);
2788 } else if (cs == UTF8 || cs == UTF8W || cs == UTF8Z)
2789 {
2790 /* not supported */
2791 cvindex = 0;
2792 return (nullcvbuffer);
2793 }
2794 assert(0);
2795 cvindex = 0;
2796 return (cvbuffer);
2797 }
2798
convert_to_cp932(c,cs)2799 static char *convert_to_cp932(c, cs)
2800 int c;
2801 int cs;
2802 {
2803 if (c == 0)
2804 {
2805 cvindex = 0;
2806 return (nullcvbuffer);
2807 }
2808
2809 cvbuffer[cvindex++] = c;
2810 cvbuffer[cvindex] = '\0';
2811
2812 if (CSISWRONG(cs))
2813 {
2814 cs = ASCII;
2815 }
2816
2817 cs = CS2CHARSET(cs);
2818
2819 if (cs == ASCII || cs == JISX0201ROMAN)
2820 {
2821 assert(cvindex == 1);
2822 cvindex = 0;
2823 return (cvbuffer);
2824 } else if (cs == JISX0201KANA)
2825 {
2826 assert(cvindex == 1);
2827 cvbuffer[0] |= 0x80;
2828 cvindex = 0;
2829 return (cvbuffer);
2830 } else if (cs == JISX0208_78KANJI || cs == JISX0208KANJI ||
2831 cs == JISX0208_90KANJI)
2832 {
2833 int i = cvbuffer[0] & 0x7f;
2834 int j = cvbuffer[1] & 0x7f;
2835
2836 if (cvindex == 1)
2837 return (nullcvbuffer);
2838 assert(cvindex == 2);
2839
2840 cvbuffer[0] = (i - 0x21) / 2 + ((i <= 0x5e) ? 0x81 : 0xc1);
2841 cvbuffer[1] = j + ((i & 1) ?((j <= 0x5f) ?0x1f :0x20) :0x7e);
2842 cvindex = 0;
2843 return (cvbuffer);
2844 } else if (cs == CP932)
2845 {
2846 int i = cvbuffer[0] & 0x7f;
2847 int j = cvbuffer[1] & 0x7f;
2848
2849 if (cvindex == 1)
2850 return (nullcvbuffer);
2851 assert(cvindex == 2);
2852
2853 if (i >= 0x30 && i <= 0x74) i += 0x4f;
2854 cvbuffer[0] = (i - 0x21) / 2 + ((i <= 0x5e) ? 0x81 : 0xc1);
2855 cvbuffer[1] = j + ((i & 1) ?((j <= 0x5f) ?0x1f :0x20) :0x7e);
2856 cvindex = 0;
2857 return (cvbuffer);
2858 } else if (cs == UTF8 || cs == UTF8W || cs == UTF8Z)
2859 {
2860 /* not supported */
2861 cvindex = 0;
2862 return (nullcvbuffer);
2863 }
2864 assert(0);
2865 cvindex = 0;
2866 return (cvbuffer);
2867 }
2868
2869 #endif
2870
/*
 * Encode code point `c' into `buf' as a NUL-terminated UTF-8 sequence
 * and return `buf'.
 *
 * Sequences of one to six bytes are produced (the original 31-bit form
 * of UTF-8), so `buf' must have room for 7 bytes.  A negative `c' yields
 * the empty string.
 */
static char *convUTF8(buf, c)
	char *buf;
	int c;
{
	int nbytes;	/* total length of the sequence */
	int tag;	/* fixed high bits of the first byte */
	int mask;	/* payload bits carried by the first byte */
	int k;

	buf[0] = '\0';
	if (c < 0)
		return (buf);

	if (c < 0x80) {
		nbytes = 1; tag = 0x00; mask = 0x7f;
	} else if (c < 0x800) {
		nbytes = 2; tag = 0xc0; mask = 0x1f;
	} else if (c < 0x10000) {
		nbytes = 3; tag = 0xe0; mask = 0x0f;
	} else if (c < 0x200000) {
		nbytes = 4; tag = 0xf0; mask = 0x07;
	} else if (c < 0x4000000) {
		nbytes = 5; tag = 0xf8; mask = 0x03;
	} else {
		nbytes = 6; tag = 0xfc; mask = 0x01;
	}

	/* Fill the continuation bytes from the end, six bits at a time. */
	for (k = nbytes - 1; k > 0; k--) {
		buf[k] = (c & 0x3f) | 0x80;
		c >>= 6;
	}
	buf[0] = (c & mask) | tag;
	buf[nbytes] = '\0';

	return (buf);
}
2916
/*
 * Look for the two-byte character formed by cvbuffer[0] and `c' in the
 * jisx0213_comb table (JAPANESE builds, JIS X 0213 plane 1 only).
 *
 * On a match, cvbuffer is overwritten with the UTF-8 form of the entry's
 * u1, followed by that of u2 when u2 is positive, and 1 is returned.
 * Otherwise cvbuffer is left alone and 0 is returned.
 */
static int
need_combining_utf8(c, cs)
	int c;
	int cs;
{
#if JAPANESE
	struct st_ucs_combining *entry;
	int first;
	int second;

	if (cs != JISX0213KANJI1 && cs != JISX02132004KANJI1)
		return 0;

	first = cvbuffer[0] & 0x7f;
	second = c & 0x7f;
	/* The table ends at the first entry whose c1 is zero. */
	for (entry = jisx0213_comb; entry->c1; ++entry) {
		if (entry->c1 + 0x20 != first)
			continue;
		if (entry->c2 + 0x20 != second)
			continue;
		convUTF8(cvbuffer, entry->u1);
		if (entry->u2 > 0)
			convUTF8(cvbuffer + strlen(cvbuffer), entry->u2);
		return 1;
	}
#endif

	return 0;
}
2939
2940
convert_to_utf8(c,cs)2941 static char *convert_to_utf8(c, cs)
2942 int c;
2943 int cs;
2944 {
2945 if (c == 0)
2946 {
2947 cvindex = 0;
2948 return (nullcvbuffer);
2949 }
2950
2951 cvbuffer[cvindex++] = c;
2952 cvbuffer[cvindex] = '\0';
2953
2954 if (CSISWRONG(cs))
2955 {
2956 cs = ASCII;
2957 }
2958
2959 cs = CS2CHARSET(cs);
2960 if (cs == ASCII)
2961 {
2962 assert(cvindex == 1);
2963 cvindex = 0;
2964 return (cvbuffer);
2965 } else if (cs == UTF8 || cs == UTF8W || cs == UTF8Z)
2966 {
2967 if (ISUTF8_HEAD(c)) {
2968 assert(cvindex == 1);
2969 return (nullcvbuffer);
2970 } else if (ISUTF8_REST(c)) {
2971 int head = cvbuffer[0];
2972 if ((head & 0xe0) == 0xc0) {
2973 assert(cvindex == 2);
2974 cvindex = 0;
2975 return (cvbuffer);
2976 } else if ((head & 0xf0) == 0xe0) {
2977 if (cvindex <= 2)
2978 return (nullcvbuffer);
2979 assert(cvindex == 3);
2980 cvindex = 0;
2981 return (cvbuffer);
2982 } else if ((head & 0xf8) == 0xf0) {
2983 if (cvindex <= 3)
2984 return (nullcvbuffer);
2985 assert(cvindex == 4);
2986 cvindex = 0;
2987 return (cvbuffer);
2988 } else if ((head & 0xfc) == 0xf8) {
2989 if (cvindex <= 4)
2990 return (nullcvbuffer);
2991 assert(cvindex == 5);
2992 cvindex = 0;
2993 return (cvbuffer);
2994 } else if ((head & 0xfe) == 0xfc) {
2995 if (cvindex <= 5)
2996 return (nullcvbuffer);
2997 assert(cvindex == 6);
2998 cvindex = 0;
2999 return (cvbuffer);
3000 }
3001 assert(0);
3002 }
3003 } else if (cs == JISX0201ROMAN)
3004 {
3005 assert(cvindex == 1);
3006 cvindex = 0;
3007 return convUTF8(cvbuffer, ucode_latin[UMAP_JISX0201][c]);
3008 } else if (cs == JISX0201KANA)
3009 {
3010 assert(cvindex == 1);
3011 cvindex = 0;
3012 return convUTF8(cvbuffer, ucode_latin[UMAP_JISX0201][c | 0x80]);
3013 } else if (cs == JISX0208_78KANJI || cs == JISX0208KANJI ||
3014 cs == JISX0208_90KANJI || cs == JISX0213KANJI1 ||
3015 cs == JISX02132004KANJI1)
3016 {
3017 int num;
3018 if (cvindex == 1)
3019 return (nullcvbuffer);
3020 assert(cvindex == 2);
3021 if (need_combining_utf8(c, cs)) {
3022 cvindex = 0;
3023 return (cvbuffer);
3024 }
3025 if (cs == JISX0208_78KANJI)
3026 jis78to90(cvbuffer);
3027 num = (cvbuffer[0] - 0x21) * 94 + (cvbuffer[1] - 0x21);
3028 if (num < 0 || num > U_kanji) {
3029 cvindex = 0;
3030 return (nullcvbuffer);
3031 }
3032 cvindex = 0;
3033 return convUTF8(cvbuffer, ucode_kanji1[num]);
3034 } else if (cs == JISX0212KANJISUP || cs == JISX0213KANJI2)
3035 {
3036 int num;
3037 if (cvindex == 1)
3038 return (nullcvbuffer);
3039 assert(cvindex == 2);
3040 num = (cvbuffer[0] - 0x21) * 94 + (cvbuffer[1] - 0x21);
3041 if (num < 0 || num > U_kanji) {
3042 cvindex = 0;
3043 return (nullcvbuffer);
3044 }
3045 cvindex = 0;
3046 return convUTF8(cvbuffer, ucode_kanji2[num]);
3047 } else if (cs == CP932)
3048 {
3049 int num;
3050 if (cvindex == 1)
3051 return (nullcvbuffer);
3052 assert(cvindex == 2);
3053 num = (cvbuffer[0] - 0x21) * 94 + (cvbuffer[1] - 0x21);
3054 if (num < 0 || num > U_kanji) {
3055 cvindex = 0;
3056 return (nullcvbuffer);
3057 }
3058 cvindex = 0;
3059 return convUTF8(cvbuffer, ucode_cp932[num]);
3060 } else {
3061 int i;
3062 for (i = UMAP_ISO8859_1; i <= UMAP_ISO8859_16; ++ i) {
3063 if (cs == iso8859_list[i]) {
3064 assert(cvindex == 1);
3065 cvindex = 0;
3066 return convUTF8(cvbuffer, ucode_latin[i][c | 0x80]);
3067 }
3068 }
3069 }
3070 cvindex = 0;
3071 return (cvbuffer);
3072 }
3073
outchar(c,cs)3074 char *outchar(c, cs)
3075 int c;
3076 CHARSET cs;
3077 {
3078 if (c < 0)
3079 {
3080 c = 0;
3081 cs = ASCII;
3082 }
3083
3084 if (output & (ESISO7 | ESISO8))
3085 return (convert_to_iso(c, cs));
3086 if (output & ESJIS83)
3087 return (convert_to_jis(c, cs));
3088 #if JAPANESE
3089 if (output & ESUJIS)
3090 return (convert_to_ujis(c, cs));
3091 if (output & ESSJIS)
3092 return (convert_to_sjis(c, cs));
3093 if (output & ESCP932)
3094 return (convert_to_cp932(c, cs));
3095 #endif
3096 if (output & ESUTF8)
3097 return (convert_to_utf8(c, cs));
3098 cvbuffer[0] = c;
3099 cvbuffer[1] = '\0';
3100 return (cvbuffer);
3101 }
3102
outbuf(p,cs)3103 char *outbuf(p, cs)
3104 unsigned char *p;
3105 CHARSET cs;
3106 {
3107 static char buffer[1024];
3108 char *s;
3109 int i = 0;
3110
3111 while (*p != '\0')
3112 {
3113 s = outchar(*p++, cs);
3114 while (*s != '\0')
3115 buffer[i++] = *s++;
3116 assert(i < (int)sizeof(buffer));
3117 }
3118 buffer[i] = '\0';
3119 return (buffer);
3120 }
3121
mwidth(c,cs)3122 int mwidth(c, cs)
3123 int c;
3124 CHARSET cs;
3125 {
3126 if (CSISREST(cs))
3127 return (0);
3128
3129 switch (cs)
3130 {
3131 case UTF8Z:
3132 return 0;
3133 case UTF8:
3134 return 1;
3135 case UTF8W:
3136 return 2;
3137 }
3138
3139 switch (CS2TYPE(cs))
3140 {
3141 case TYPE_94_CHARSET:
3142 case TYPE_96_CHARSET:
3143 return (1);
3144 case TYPE_94N_CHARSET:
3145 case TYPE_96N_CHARSET:
3146 return (2);
3147 default:
3148 assert(0);
3149 return (0);
3150 }
3151 }
3152
rotate_right_codeset(mp)3153 char *rotate_right_codeset(mp)
3154 MULBUF *mp;
3155 {
3156 char *p = NULL;
3157
3158 mp->rotation_io_right++;
3159 mp->rotation_io_right %= 8;
3160 switch (mp->rotation_io_right) {
3161 case 0: p = "original"; mp->io.inputr = mp->orig_io_right; break;
3162 case 1: p = "utf-8"; mp->io.inputr = ESUTF8;
3163 make_unicode_map(mp->io.scs, output & ESUTF8); break;
3164 case 2: p = "ujis"; mp->io.inputr = ESUJIS; break;
3165 case 3: p = "sjis"; mp->io.inputr = ESSJIS; break;
3166 case 4: p = "cp932"; mp->io.inputr = ESCP932; break;
3167 case 5: p = "iso8"; mp->io.inputr = ESISO8; break;
3168 case 6: p = "noconv"; mp->io.inputr = ESNOCONV; break;
3169 case 7: p = "none"; mp->io.inputr = ESNONE; break;
3170 default: assert(0); break;
3171 }
3172 init_priority(mp);
3173 return (p);
3174 }
3175
3176 #endif
3177
strlen_cs(str,cs)3178 int strlen_cs(str, cs)
3179 char* str;
3180 CHARSET* cs;
3181 {
3182 int i = 0;
3183 if (cs == NULL)
3184 return strlen(str);
3185 while (*str != NULCH || !CSISNULLCS(*cs)) {
3186 str++;
3187 cs++;
3188 i++;
3189 }
3190 return i;
3191 }
3192
chlen_cs(chstr,cs)3193 int chlen_cs(chstr, cs)
3194 char* chstr;
3195 CHARSET* cs;
3196 {
3197 int i;
3198 if (cs == NULL)
3199 {
3200 if (chstr == NULL || *chstr == NULCH)
3201 return 0;
3202 else
3203 return 1;
3204 }
3205 if (*chstr == NULCH && CSISNULLCS(*cs))
3206 return 0;
3207 i = 0;
3208 do {
3209 i++;
3210 cs++;
3211 } while (CSISREST(*cs));
3212 return i;
3213 }
3214
strdup_cs(str,cs,csout)3215 char* strdup_cs(str, cs, csout)
3216 char* str;
3217 CHARSET* cs;
3218 CHARSET** csout;
3219 {
3220 int len = strlen_cs(str, cs);
3221 char* save_str = (char *)ecalloc(len + 1, 1);
3222 CHARSET* save_cs = (CHARSET *)ecalloc(len + 1, sizeof(CHARSET));
3223 memcpy(save_str, str, sizeof(char) * (len + 1));
3224 if (cs)
3225 memcpy(save_cs, cs, sizeof(CHARSET) * (len + 1));
3226 else {
3227 cs = save_cs;
3228 while (--len >= 0)
3229 *cs++ = ASCII;
3230 *cs = NULLCS;
3231 }
3232 *csout = save_cs;
3233 return save_str;
3234 }
3235