1 /*
2  * Copyright (c) 1994-2005  Kazushi (Jam) Marukawa
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice in the documentation and/or other materials provided with
12  *    the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
20  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
21  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
23  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
24  * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 
/*
 * Routines to manipulate a buffer that holds a string of multi-byte
 * characters.  They detect the character set of an input string, convert
 * it to internal codes, and convert those codes to other encodings for
 * display.
 */
33 
34 #include "defines.h"
35 #include "less.h"
36 
37 #include <stdio.h>
38 #include <assert.h>
39 
40 #if STDC_HEADERS
41 #include <stdlib.h>
42 #include <string.h>
43 #endif
44 
45 #if JAPANESE
46 #include "kanji_map.h"
47 #include "unicode_map.h"
48 #endif
49 #include "unicode_type.h"
50 
/* NOTE(review): presumably marks this file as built inside less -- confirm
 * where LESS is tested. */
#define LESS 1

/*
 * Helpers declared here without prototypes, so arguments are not checked
 * at the call sites.
 *
 * TODO: remove the callers of control_char(), change_control_char()
 * and ecalloc().
 */
extern int control_char();
extern void change_control_char();
extern void *ecalloc();
57 
58 
59 #if ISO
60 
/*
 * Forward declarations of file-local helpers that are used before their
 * definitions.
 */
static void multi_reparse();
static int check_ft();
63 
64 
65 #if JAPANESE
66 
/*
 * When non-zero, input that cannot be converted is replaced by a visible
 * wrong-character mark instead of being passed through byte by byte
 * (see wrongucs() and wrongchar()).
 */
int markwrongchar = 1;


/*
 * Byte-range predicates used for character set detection.
 * Arguments may be evaluated more than once.
 */

/* 7-bit JIS graphic byte */
#define ISJIS(c)                ((c) >= 0x21 && (c) <= 0x7e)

/* UJIS: graphic byte, single-shift bytes, and complete sequences */
#define ISUJIS(c)               ((c) >= 0xa1 && (c) <= 0xfe)
#define ISUJISSS(c)             ((c) == 0x8e || (c) == 0x8f)
#define ISUJISKANJI(c1,c2)      (ISUJIS(c1) && ISUJIS(c2))
#define ISUJISKANJI1(c)         (ISUJIS(c))
#define ISUJISKANA(c1,c2)       ((c1) == 0x8e && ISUJIS(c2))
#define ISUJISKANA1(c)          ((c) == 0x8e)
#define ISUJISKANJISUP(c1,c2,c3) \
        ((c1) == 0x8f && ISUJIS(c2) && ISUJIS(c3))
#define ISUJISKANJISUP1(c)      ((c) == 0x8f)

/* SJIS: lead byte, lead/trail pair, and one byte kana */
#define ISSJISKANJI1(c)         (((c) >= 0x81 && (c) <= 0x9f) || \
                                 ((c) >= 0xe0 && (c) <= 0xfc))
#define ISSJISKANJI(c1,c2)      (ISSJISKANJI1(c1) && \
                                 ((c2) >= 0x40 && (c2) <= 0xfc && \
                                  (c2) != 0x7f))
#define ISSJISKANA(c)           ((c) >= 0xa1 && (c) <= 0xdf)

/* UTF-8: head byte, continuation byte, and 1..6 byte sequences */
#define ISUTF8_HEAD(c)          ((c) >= 0xc0 && (c) < 0xfe)
#define ISUTF8_REST(c)          (((c) & 0xc0) == 0x80)
#define ISUTF8_1(c)             ((c) <= 0x7f)
#define ISUTF8_2(c1,c2) \
        (((c1) & 0xe0) == 0xc0 && ISUTF8_REST(c2))
#define ISUTF8_3(c1,c2,c3) \
        (((c1) & 0xf0) == 0xe0 && ISUTF8_REST(c2) && ISUTF8_REST(c3))
#define ISUTF8_4(c1,c2,c3,c4) \
        (((c1) & 0xf8) == 0xf0 && ISUTF8_REST(c2) && ISUTF8_REST(c3) && \
         ISUTF8_REST(c4))
#define ISUTF8_5(c1,c2,c3,c4,c5) \
        (((c1) & 0xfc) == 0xf8 && ISUTF8_REST(c2) && ISUTF8_REST(c3) && \
         ISUTF8_REST(c4) && ISUTF8_REST(c5))
#define ISUTF8_6(c1,c2,c3,c4,c5,c6) \
        (((c1) & 0xfe) == 0xfc && ISUTF8_REST(c2) && ISUTF8_REST(c3) && \
         ISUTF8_REST(c4) && ISUTF8_REST(c5) && ISUTF8_REST(c6))

/*
 * Assemble the code value carried by a 2..6 byte UTF-8 sequence from the
 * payload bits of each byte.
 */
#define UTF8_2(c0, c1) \
        ((((c0) & 0x1f) << 6) | ((c1) & 0x3f))
#define UTF8_3(c0, c1, c2) \
        ((((c0) & 0x0f) << 12) | (((c1) & 0x3f) << 6) | ((c2) & 0x3f))
#define UTF8_4(c0, c1, c2, c3) \
        ((((c0) & 0x07) << 18) | (((c1) & 0x3f) << 12) | \
         (((c2) & 0x3f) << 6) | ((c3) & 0x3f))
#define UTF8_5(c0, c1, c2, c3, c4) \
        ((((c0) & 0x03) << 24) | (((c1) & 0x3f) << 18) | \
         (((c2) & 0x3f) << 12) | (((c3) & 0x3f) << 6) | ((c4) & 0x3f))
#define UTF8_6(c0, c1, c2, c3, c4, c5) \
        ((((c0) & 0x01) << 30) | (((c1) & 0x3f) << 24) | \
         (((c2) & 0x3f) << 18) | (((c3) & 0x3f) << 12) | \
         (((c4) & 0x3f) << 6) | ((c5) & 0x3f))
123 #endif
124 
125 
/*
 * Definitions for understanding escape sequences.
 *
 * less understands the following escape sequences (bytes are written as
 * column/row pairs, e.g. 2/4 is '$'):
 *  ESC 2/4 2/8,2/9,2/10,2/11,2/13,2/14,2/15 F
 *  ESC 2/4 4/0,4/1,4/2
 *  ESC 2/6 F
 *  ESC 2/8,2/9,2/10,2/11,2/13,2/14,2/15 F
 *  ESC 2/12 F		This is used in MULE.  less supports it as input.
 *  0/14,0/15
 *  ESC 4/14,4/15,6/14,6/15,7/12,7/13,7/14
 *  8/14,8/15
 *
 * Each enumerator names the sequence prefix shown in its comment.  The
 * order of the enumerators fixes their numeric values.
 */
enum escape_sequence {
    NOESC,          /* none */
    ESC_,           /* ^[ */
    ESC_2_4,        /* ^[$ */
    ESC_2_4_8,      /* ^[$( */
    ESC_2_4_9,      /* ^[$) */
    ESC_2_4_10,     /* ^[$* */
    ESC_2_4_11,     /* ^[$+ */
    ESC_2_4_13,     /* ^[$- */
    ESC_2_4_14,     /* ^[$. */
    ESC_2_4_15,     /* ^[$/ */
    ESC_2_6,        /* ^[& */
    ESC_2_8,        /* ^[( */
    ESC_2_9,        /* ^[) */
    ESC_2_10,       /* ^[* */
    ESC_2_11,       /* ^[+ */
    ESC_2_12,       /* ^[, */
    ESC_2_13,       /* ^[- */
    ESC_2_14,       /* ^[. */
    ESC_2_15,       /* ^[/ */
    ESC_5_11        /* ^[[ */
};
150 
151 
152 static SETCHARSET def_scs = SCSASCII | SCSOTHERISO;
153 static ENCSET def_input = ESISO7;	/* Default character set of left plane */
154 static ENCSET def_inputr = ESISO8;	/* Default character set of right plane */
155 static int def_gs[4] = {
156     ASCII,				/* Default g0 plane status */
157     WRONGCS,				/* Default g1 plane status */
158     WRONGCS,				/* Default g2 plane status */
159     WRONGCS				/* Default g3 plane status */
160 };
161 
162 static ENCSET output = ESISO8;		/* Character set for output */
163 #if JAPANESE
164 static J_PRIORITY def_priority = PUTF8;	/* Which code was given priority. */
165 #endif
166 
167 static UWidth utfwidth = UWIDTH_NORMAL;	/* default UTF-8 Width */
168 
169 typedef POSITION m_position;
170 #define M_NULL_POS	((POSITION)(-1))
171 
/*
 * Character set information.
 *
 * Holds the current character sets together with the other information
 * needed to keep track of ISO-2022 escape sequences.
 */
struct m_status {
    /* Graphic sets */
    int gs[4];          /* current g0..g3 plane sets */
    /* gl, gr and sg each refer to one of the 4 planes */
    int gl;             /* current gl plane */
    int gr;             /* current gr plane */
    int sg;             /* current single-shifted plane, or WRONGPLANE */

    int irr;            /* identify revised registration number */
};

/* Value of a plane selector that refers to no plane. */
#define WRONGPLANE              (-1)

/* True when selector `plane' (gl, gr or sg) of mp->ms refers to a plane. */
#define ISVALIDPLANE(mp,plane)  ((mp)->ms->plane != WRONGPLANE)

/*
 * Character set that applies to byte c: the single-shifted plane when one
 * is set, otherwise gr for bytes with the high bit set and gl for the rest.
 */
#define FINDCS(mp,c) \
        ((mp)->ms->gs[ISVALIDPLANE((mp), sg) ? (mp)->ms->sg : \
                      (((c) & 0x80) ? (mp)->ms->gr : (mp)->ms->gl)])

/* Character set currently designated to the plane named by `plane'. */
#define PLANE2CS(mp,plane)      ((mp)->ms->gs[(mp)->ms->plane])
193 
194 struct multibuf {
195     struct {
196 	SETCHARSET scs;
197 	ENCSET input;
198 	ENCSET inputr;
199     } io;
200 
201     ENCSET orig_io_right;
202     int rotation_io_right;
203 
204     enum escape_sequence eseq;
205     /*
206      * Variables to control of escape sequences as output.
207      */
208     int cs;			/* Current character set */
209     struct m_status* ms;
210 #if JAPANESE
211     J_PRIORITY priority;	/* Which code was given priority. */
212     int sequence_counter;	/* Special counter for detect UJIS KANJI. */
213 #endif
214 
215     CHARSET icharset;		/* Last non ASCII character set of input */
216 
217     /*
218      * Small buffers to hold all parsing bytes of multi-byte characters.
219      *
220      * multi_parse() function receive a sequence of byte and buffer it.
221      * Each time multi_parse() recognize full data sequence to represent
222      * one character, it converts the data into internal data and returns
223      * converted data.
224      *
225      * Caller must buffer it somewhere and output it using outbuf() of
226      * outchar().  Those output functions() converts internal data into
227      * appropriate data stream for choosen output device.
228      *
229      * As internal data, we use char[] and CHARSET[] to keep byte and
230      * additional information, respectively.  We choose ISO-2022 style
231      * data format as our internal data format because it is most easy
232      * to work with.  It has completely separated planes for each
233      * character set.  This helps code conversion and others alot.
234      * For example, we don't need to work to separate Chinese and
235      * Japanese because they are separated from the beginning in ISO-2022
236      * although UTF-8 uses only single plane with all CJK character sets.
237      */
238     /*
239      * Buffer for input/parsing
240      */
241     m_position lastpos;		/* position of last byte */
242     m_position startpos;	/* position of first byte buffered */
243     unsigned char inbuf[80];
244     m_position laststartpos;	/* position of first byte buffered last time */
245     int lastsg;			/* last single-shifted plane (ms->sg) */
246     /*
247      * Buffer for internalized/converted data
248      */
249     unsigned char multiint[80];	/* Byte data */
250     CHARSET multics[80];	/* Character set data (no UJIS/SJIS/UTF */
251 				/* because all of them are converted into */
252 				/* internal data format) */
253     int intindex;		/* Index of multiint */
254 };
255 
/*
 * Accessors for the input buffer, which is indexed by stream position
 * modulo its size.
 *
 * INBUFI(mp,i)  byte stored for position i
 * INBUF(mp)     byte at lastpos
 * INBUFn(mp)    byte at startpos + n
 */
#define INBUFI(mp,i)    ((mp)->inbuf[(i)%sizeof((mp)->inbuf)])
#define INBUF(mp)       INBUFI((mp), (mp)->lastpos)
#define INBUF0(mp)      INBUFI((mp), (mp)->startpos)
#define INBUF1(mp)      INBUFI((mp), (mp)->startpos+1)
#define INBUF2(mp)      INBUFI((mp), (mp)->startpos+2)
#define INBUF3(mp)      INBUFI((mp), (mp)->startpos+3)
#define INBUF4(mp)      INBUFI((mp), (mp)->startpos+4)
#define INBUF5(mp)      INBUFI((mp), (mp)->startpos+5)
264 
unicode_type(c)265 static int unicode_type(c)
266 int c;
267 {
268     if (c < 0) {
269 	return 0;
270     } else if (c < 0x20000) {
271 	return utype_map[c];
272     } else if (c < 0x40000) {
273 	return (UTYPE_EXIST | UTYPE_WIDE);
274     } else if (c < 0xe0000) {
275 	return 0;
276     } else if (c < 0xe0080) {
277 	return (UTYPE_EXIST | UTYPE_FORMAT);
278     } else if (c < 0xe01f0) {
279 	return (UTYPE_EXIST | UTYPE_NSP_MODIFIER);
280     } else if (c >= 0xf0000 && c <= 0xffffd) {
281 	return (UTYPE_EXIST | UTYPE_AMBIGUOUS);
282     } else if (c >= 0x100000 && c <= 0x10fffd) {
283 	return (UTYPE_EXIST | UTYPE_AMBIGUOUS);
284     } else {
285 	return 0;
286     }
287 }
288 
289 static int
get_utfwidth(uc)290 get_utfwidth(uc)
291 int uc;
292 {
293     int utype = unicode_type(uc);
294 
295     if (!(utype * UTYPE_EXIST))
296 	return WRONGUCS_H;
297     if (utype & UTYPE_CONTROL)
298 	return WRONGUCS_H;
299     if (utype & (UTYPE_NSP_MODIFIER | UTYPE_FORMAT | UTYPE_SEPARATOR))
300 	return UTF8Z;
301 
302     switch (utfwidth) {
303     case UWIDTH_NONE:
304 	break;
305     case UWIDTH_NORMAL:
306 	if (utype & UTYPE_WIDE) return UTF8W;
307 	break;
308     case UWIDTH_CJK:
309 	if (utype & (UTYPE_WIDE | UTYPE_AMBIGUOUS)) return UTF8W;
310 	break;
311     case UWIDTH_JA:
312 	if (utype & (UTYPE_WIDE | UTYPE_AMBIGUOUS | UTYPE_JA)) return UTF8W;
313 	break;
314     case UWIDTH_ALMOST:
315 	if (uc >= 0x80) return UTF8W;
316 	break;
317     case UWIDTH_ALL:
318 	return UTF8W;
319     }
320 
321     return UTF8;
322 }
323 
code_length(mp,cs)324 static int code_length(mp, cs)
325 MULBUF* mp;
326 CHARSET cs;
327 {
328 #if JAPANESE
329     unsigned char c;
330 #endif
331 
332     if (CSISWRONG(cs))
333 	return 1;
334 
335 #if JAPANESE
336     switch (CS2CHARSET(cs)) {
337     case UTF8:
338     case UTF8Z:
339     case UTF8W:
340 	c = INBUF0(mp);
341 	if (c < 0xC0) return 1;
342 	if (c < 0xe0) return 2;
343 	if (c < 0xf0) return 3;
344 	if (c < 0xf8) return 4;
345 	if (c < 0xfc) return 5;
346 	if (c < 0xfe) return 6;
347 	return 1;
348     case UJIS:
349     case UJIS2000:
350     case UJIS2004:
351 	c = INBUF0(mp);
352 	if (ISUJISKANJI1(c)) return 2;
353 	if (ISUJISKANA1(c)) return 2;
354 	if (ISUJISKANJISUP1(c)) return 3;
355 	return 1;
356     case SJIS:
357     case SJIS2000:
358     case SJIS2004:
359 	c = INBUF0(mp);
360 	if (ISSJISKANJI1(c)) return 2;
361 	if (ISSJISKANA(c)) return 1;
362 	return 1;
363     }
364 #endif
365 
366     switch (CS2TYPE(cs))
367     {
368     case TYPE_94_CHARSET:
369     case TYPE_96_CHARSET:
370 	return 1;
371     case TYPE_94N_CHARSET:
372     case TYPE_96N_CHARSET:
373 	switch (CS2FT(cs) & 0x70)
374 	{
375 	case 0x30: return 2;	/* for private use */
376 	case 0x40:
377 	case 0x50: return 2;
378 	case 0x60: return 3;
379 	case 0x70: return 4;	/* or more bytes */
380 	}
381     }
382     assert(0);
383     return (0);
384 }
385 
386 /*
387  * Convert first byte of buffered data as one byte ASCII data
388  * without any conversion.
389  */
noconv1(mp)390 static void noconv1(mp)
391 MULBUF *mp;
392 {
393     mp->multiint[mp->intindex] = INBUF0(mp);
394     mp->multics[mp->intindex] = ASCII;
395     mp->intindex++;
396     mp->startpos++;
397 }
398 
399 /*
400  * Convert first byte of buffered data as one byte WRONGCS data
401  * without any conversion.
402  */
wrongcs1(mp)403 static void wrongcs1(mp)
404 MULBUF *mp;
405 {
406     mp->multiint[mp->intindex] = INBUF0(mp);
407     mp->multics[mp->intindex] = WRONGCS;
408     mp->intindex++;
409     mp->startpos++;
410 }
411 
412 /*
413  * Write a wrongmark on out buffer.
414  */
put_wrongmark(mp)415 static void put_wrongmark(mp)
416 MULBUF *mp;
417 {
418     mp->multiint[mp->intindex + 0] = '"';
419     mp->multiint[mp->intindex + 1] = '.';
420     mp->multics[mp->intindex + 0] = JISX0208KANJI;
421     mp->multics[mp->intindex + 1] = REST_MASK | JISX0208KANJI;
422     mp->intindex += 2;
423     /* flush buffer */
424     mp->startpos = mp->lastpos + 1;
425 }
426 
427 /*
428  * Write WRONGUCS characters
429  */
wrongucs(mp,uc)430 static void wrongucs(mp, uc)
431 MULBUF *mp;
432 int uc;
433 {
434     if (markwrongchar) {
435 	put_wrongmark(mp);
436 	return;
437     }
438 
439     if (uc < 0x80) {
440 	wrongcs1(mp);
441     } else if (uc < 0x800) {
442 	mp->multiint[mp->intindex] = INBUF0(mp) & 0x9f;
443 	mp->multics[mp->intindex] = WRONGUCS_H;
444 	mp->multiint[mp->intindex + 1] = INBUF1(mp);
445 	mp->multics[mp->intindex + 1] = WRONGUCS_T | REST_MASK;
446 	mp->intindex += 2;
447     } else if (uc < 0x10000) {
448 	mp->multiint[mp->intindex] = INBUF0(mp) & 0x8f;
449 	mp->multics[mp->intindex] = WRONGUCS_H;
450 	mp->multiint[mp->intindex + 1] = INBUF1(mp);
451 	mp->multics[mp->intindex + 1] = WRONGUCS_M | REST_MASK;
452 	mp->multiint[mp->intindex + 2] = INBUF2(mp);
453 	mp->multics[mp->intindex + 2] = WRONGUCS_T | REST_MASK;
454 	mp->intindex += 3;
455     } else if (uc < 0x200000) {
456 	mp->multiint[mp->intindex] = INBUF0(mp) & 0x87;
457 	mp->multics[mp->intindex] = WRONGUCS_H;
458 	mp->multiint[mp->intindex + 1] = INBUF1(mp);
459 	mp->multics[mp->intindex + 1] = WRONGUCS_M | REST_MASK;
460 	mp->multiint[mp->intindex + 2] = INBUF2(mp);
461 	mp->multics[mp->intindex + 2] = WRONGUCS_M | REST_MASK;
462 	mp->multiint[mp->intindex + 3] = INBUF3(mp);
463 	mp->multics[mp->intindex + 3] = WRONGUCS_T | REST_MASK;
464 	mp->intindex += 4;
465     } else if (uc < 0x4000000) {
466 	mp->multiint[mp->intindex] = INBUF0(mp) & 0x83;
467 	mp->multics[mp->intindex] = WRONGUCS_H;
468 	mp->multiint[mp->intindex + 1] = INBUF1(mp);
469 	mp->multics[mp->intindex + 1] = WRONGUCS_M | REST_MASK;
470 	mp->multiint[mp->intindex + 2] = INBUF2(mp);
471 	mp->multics[mp->intindex + 2] = WRONGUCS_M | REST_MASK;
472 	mp->multiint[mp->intindex + 3] = INBUF3(mp);
473 	mp->multics[mp->intindex + 3] = WRONGUCS_M | REST_MASK;
474 	mp->multiint[mp->intindex + 4] = INBUF4(mp);
475 	mp->multics[mp->intindex + 4] = WRONGUCS_T | REST_MASK;
476 	mp->intindex += 5;
477     } else {
478 	mp->multiint[mp->intindex] = INBUF0(mp) & 0x81;
479 	mp->multics[mp->intindex] = WRONGUCS_H;
480 	mp->multiint[mp->intindex + 1] = INBUF1(mp);
481 	mp->multics[mp->intindex + 1] = WRONGUCS_M | REST_MASK;
482 	mp->multiint[mp->intindex + 2] = INBUF2(mp);
483 	mp->multics[mp->intindex + 2] = WRONGUCS_M | REST_MASK;
484 	mp->multiint[mp->intindex + 3] = INBUF3(mp);
485 	mp->multics[mp->intindex + 3] = WRONGUCS_M | REST_MASK;
486 	mp->multiint[mp->intindex + 4] = INBUF3(mp);
487 	mp->multics[mp->intindex + 4] = WRONGUCS_M | REST_MASK;
488 	mp->multiint[mp->intindex + 5] = INBUF5(mp);
489 	mp->multics[mp->intindex + 5] = WRONGUCS_T | REST_MASK;
490 	mp->intindex += 5;
491     }
492 
493     /* flush buffer */
494     mp->startpos = mp->lastpos + 1;
495 }
496 
497 /*
498  * Convert first several bytes of buffered data.
499  *
500  *  If less is in marking mode, it erase several bytes of data (depend on
501  * the current character set) and write "?" mark on output buffer.
502  *  If less is not in marking mode, it calls wrongcs1().
503  */
wrongchar(mp)504 static void wrongchar(mp)
505 MULBUF *mp;
506 {
507     if (markwrongchar) {
508 	switch (CS2CHARSET(mp->multics[mp->intindex])) {
509 	case JISX0201KANA:
510 	case JISX0201ROMAN:
511 	case LATIN1:
512 	case LATIN2:
513 	case LATIN3:
514 	case LATIN4:
515 	case GREEK:
516 	case ARABIC:
517 	case HEBREW:
518 	case CYRILLIC:
519 	case LATIN5:
520 	    /* Should I use one byte character, like '?' or '_'? */
521 	    put_wrongmark(mp);
522 	    break;
523 	case JISX0208_78KANJI:
524 	case JISX0208KANJI:
525 	case JISX0208_90KANJI:
526 	case JISX0212KANJISUP:
527 	case JISX0213KANJI1:
528 	case JISX0213KANJI2:
529 	case JISX02132004KANJI1:
530 	case UJIS:
531 	case UJIS2000:
532 	case UJIS2004:
533 	case SJIS:
534 	case SJIS2000:
535 	case SJIS2004:
536 	case UTF8Z:
537 	case UTF8:
538 	case UTF8W:
539 	    put_wrongmark(mp);
540 	    break;
541 	case GB2312:
542 	case KSC5601:
543 	default:
544 	    put_wrongmark(mp);
545 	    break;
546 	}
547     } else {
548 	while (mp->startpos <= mp->lastpos) {
549 	    wrongcs1(mp);
550 	}
551     }
552 }
553 
554 /*
555  * Internalize input stream.
556  * We recognized input data as using ISO coding set.
557  */
internalize_iso(mp)558 static void internalize_iso(mp)
559 MULBUF *mp;
560 {
561     register int i;
562     m_position pos;
563     m_position to;
564     int intindex;
565     int dummy;
566 
567     /*
568      * If character set points empty character set, reject buffered data.
569      */
570     if (CSISWRONG(mp->cs)) {
571 	wrongcs1(mp);
572 	return;
573     }
574 
575     /*
576      * If character set points 94 or 94x94 character set, reject
577      * DEL and SPACE codes in buffered data.
578      */
579     if (CS2TYPE(mp->cs) == TYPE_94_CHARSET ||
580 	CS2TYPE(mp->cs) == TYPE_94N_CHARSET) {
581 	unsigned char c = INBUF(mp);
582 	if ((c & 0x7f) == 0x7f) {
583 	    if (mp->lastpos - mp->startpos + 1 == 1) {
584 		wrongcs1(mp);
585 	    } else {
586 		wrongcs1(mp);
587 		multi_reparse(mp);
588 	    }
589 	    return;
590 	} else if ((c & 0x7f) == 0x20) {
591 	    /*
592 	     * A 0x20 (SPACE) code is wrong, but I treat it as
593 	     * a SPACE.
594 	     */
595 	    if (mp->lastpos - mp->startpos + 1 == 1) {
596 		noconv1(mp);
597 	    } else {
598 		wrongcs1(mp);
599 		multi_reparse(mp);
600 	    }
601 	    return;
602 	}
603     }
604 
605     /*
606      * Otherwise, keep buffering.
607      */
608     pos = mp->startpos;
609     to = pos + code_length(mp, mp->cs) - 1;
610     if (mp->lastpos < to) {
611 	return;		/* Not enough, so go back to fetch next data. */
612     }
613 
614     /*
615      * We buffered enough data for one character of multi byte characters.
616      * Therefore, start to convert this buffered data into a first character.
617      */
618     intindex = mp->intindex;
619     mp->multiint[intindex] = INBUFI(mp, pos) & 0x7f;
620     mp->multics[intindex] = mp->cs;
621     intindex++;
622     for (pos++; pos <= to; pos++) {
623 	mp->multiint[intindex] = INBUFI(mp, pos) & 0x7f;
624 	mp->multics[intindex] = REST_MASK | mp->cs;
625 	intindex++;
626     }
627 
628     /*
629      *  codeset JIS X 0208:1990 validation
630      */
631     if (mp->cs == JISX0208_90KANJI && !(mp->io.scs & SCSJISX0208_1990)) {
632 	wrongchar(mp);
633 	return;
634     }
635 
636     /*
637      * Check newly converted code.  If it is not valid code,
638      * less may mark it as not valid code.
639      */
640     if (chisvalid_cs(mp->io.scs,
641 		     &mp->multiint[mp->intindex],
642 		     &mp->multics[mp->intindex])) {
643 	mp->icharset = mp->cs;
644 	mp->intindex = intindex;
645 	mp->startpos = pos;
646     } else {
647 	/*
648 	 * less ignore the undefined codes
649 	 */
650 	wrongchar(mp);
651     }
652 }
653 
654 #if JAPANESE
655 /*
656  * Internalize input stream encoded by UJIS encoding scheme.
657  *
658  * Return 1 if input is recognized well.
659  * Return 0 if input is rejected.
660  */
internalize_ujis(mp)661 static int internalize_ujis(mp)
662 MULBUF *mp;
663 {
664     if (mp->lastpos - mp->startpos + 1 == 1) {
665 	/* do nothing.  return 1 to get next byte */
666 	return 1;
667     } else if (mp->lastpos - mp->startpos + 1 == 2) {
668 	int c0 = INBUF0(mp);
669 	int c1 = INBUF1(mp);
670 	if (ISUJISKANA(c0, c1)) {
671 	    mp->cs = JISX0201KANA;
672 	    mp->icharset = UJIS;
673 	    mp->multiint[mp->intindex] = c1 & 0x7f;
674 	    mp->multics[mp->intindex] = mp->cs;
675 	    mp->intindex += 1;
676 	    mp->startpos = mp->lastpos + 1;
677 	    return 1;
678 	} else if (ISUJISKANJI(c0, c1)) {
679 	    int ch = checkKANJI(mp->io.scs,
680 				SCSJISX0208_1983 | SCSJISX0208_1990
681 				| SCSJISX0213_2000 | SCSJISX0213_2004,
682 				c0, c1);
683 	    if (ch == 0) {
684 		/* undefined.  less ignore them */
685 		wrongchar(mp);
686 		return 1;
687 	    }
688 	    mp->icharset = UJIS;
689 	    mp->cs = (ch >> 16) & 0x7fff;
690 	    mp->multiint[mp->intindex] = (ch >> 8) & 0x7f;
691 	    mp->multics[mp->intindex] = mp->cs;
692 	    mp->multiint[mp->intindex + 1] = ch & 0x7f;
693 	    mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
694 	    mp->intindex += 2;
695 	    mp->startpos = mp->lastpos + 1;
696 	    return 1;
697 	} else if (ISUJISKANJISUP(c0, c1, 0xa1)) {
698 	    /* do nothing.  return 1 to get next byte */
699 	    mp->multics[mp->intindex] = UJIS;
700 	    return 1;
701 	}
702     } else if (mp->lastpos - mp->startpos + 1 == 3) {
703 	int c0 = INBUF0(mp);
704 	int c1 = INBUF1(mp);
705 	int c2 = INBUF2(mp);
706 	if (ISUJISKANJISUP(c0, c1, c2)) {
707 	    int ch = checkKANJI(mp->io.scs,
708 				SCSJISX0213_2ND | SCSJISX0212_1990,
709 				c1, c2);
710 	    if (ch == 0) {
711 		/* undefined.  less ignore them */
712 		wrongchar(mp);
713 		return 1;
714 	    }
715 	    mp->icharset = UJIS;
716 	    mp->cs = (ch >> 16) & 0x7fff;
717 	    mp->multiint[mp->intindex] = (ch >> 8) & 0x7f;
718 	    mp->multics[mp->intindex] = mp->cs;
719 	    mp->multiint[mp->intindex + 1] = ch & 0x7f;
720 	    mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
721 	    mp->intindex += 2;
722 	    mp->startpos = mp->lastpos + 1;
723 	    return 1;
724 	}
725     }
726     /* return 0 because this data sequence is not matched to UJIS */
727     return 0;
728 }
729 
730 /*
731  * Internalize input stream encoded by SJIS encoding scheme.
732  *
733  * Return 1 if input is recognized well.
734  * Return 0 if input is rejected.
735  */
internalize_sjis(mp)736 static int internalize_sjis(mp)
737 MULBUF *mp;
738 {
739     if (mp->lastpos - mp->startpos + 1 == 1) {
740 	int c0 = INBUF(mp);
741 	if (ISSJISKANA(c0)) {
742 	    mp->cs = JISX0201KANA;
743 	    mp->icharset = SJIS;
744 	    mp->multiint[mp->intindex] = c0 & 0x7f;
745 	    mp->multics[mp->intindex] = mp->cs;
746 	    mp->intindex += 1;
747 	    mp->startpos = mp->lastpos + 1;
748 	    return 1;
749 	} else {
750 	    /* do nothing.  return 1 to get next byte */
751 	    return 1;
752 	}
753     } else if (mp->lastpos - mp->startpos + 1 == 2) {
754 	int c0 = INBUF0(mp);
755 	int c1 = INBUF1(mp);
756 	if (ISSJISKANJI(c0, c1)) {
757 	    int ktype;
758 
759 	    if (c0 < 0xf0) {
760 		/* JIS X 0213:2000 plane 1 or JIS X 0208:1997 */
761 		if (c0 <= 0x9f) c0 = (c0-0x81)*2 + 0x21;
762 		else            c0 = (c0-0xc1)*2 + 0x21;
763 		if (c1 <= 0x7e)      c1 -= 0x1f;
764 		else if (c1 <= 0x9e) c1 -= 0x20;
765 		else                 c1 -= 0x7e, c0 += 1;
766 
767 		int ch = checkKANJI(mp->io.scs,
768 				    SCSJISX0208_1983 | SCSJISX0208_1990
769 				    | SCSJISX0213_2000 | SCSJISX0213_2004,
770 				    c0, c1);
771 		if (ch == 0) {
772 		    /* undefined.  less ignore them */
773 		    wrongchar(mp);
774 		    return 1;
775 		}
776 		mp->icharset = SJIS;
777 		mp->cs = (ch >> 16) & 0x7fff;
778 		mp->multiint[mp->intindex] = (ch >> 8) & 0x7f;
779 		mp->multics[mp->intindex] = mp->cs;
780 		mp->multiint[mp->intindex + 1] = ch & 0x7f;
781 		mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
782 		mp->intindex += 2;
783 		mp->startpos = mp->lastpos + 1;
784 		return 1;
785 	    } else {
786 		/* JIS X 0213:2000 plane 2 */
787 		if (c0 == 0xf0)
788 		    if (c1 <= 0x9e)  c0 = 0x21;
789 		    else             c0 = 0x27;
790 		else if (c0 == 0xf1) c0 = 0x23;
791 		else if (c0 == 0xf2)
792 		    if (c1 <= 0x9e)  c0 = 0x25;
793 		    else             c0 = 0x2b;
794 		else if (c0 == 0xf3) c0 = 0x2d;
795 		else if (c0 == 0xf4)
796 		    if (c1 <= 0x9e)  c0 = 0x2f;
797 		    else             c0 = 0x6d;
798 		else                 c0 = (c0 - 0xf5) * 2 + 0x6f;
799 		if (c1 <= 0x7e)      c1 -= 0x1f;
800 		else if (c1 <= 0x9e) c1 -= 0x20;
801 		else                 c1 -= 0x7e, c0 += 1;
802 
803 		int ch = checkKANJI(mp->io.scs, SCSJISX0213_2ND, c0, c1);
804 		if (ch == 0) {
805 		    /* undefined.  less ignore them */
806 		    wrongchar(mp);
807 		    return 1;
808 		}
809 		mp->icharset = SJIS;
810 		mp->cs = (ch >> 16) & 0x7fff;
811 		mp->multiint[mp->intindex] = (ch >> 8) & 0x7f;
812 		mp->multics[mp->intindex] = mp->cs;
813 		mp->multiint[mp->intindex + 1] = ch & 0x7f;
814 		mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
815 		mp->intindex += 2;
816 		mp->startpos = mp->lastpos + 1;
817 		return 1;
818 	    }
819 	    /* data are recognized as kanji or wrong data, so return 1 */
820 	    return 1;
821 	}
822     }
823     /* return 0 because this data sequence is not matched to SJIS */
824     return 0;
825 }
826 
827 /*
828  * Internalize input stream encoded by CP932 encoding scheme.
829  *
830  * Return 1 if input is recognized well.
831  * Return 0 if input is rejected.
832  */
internalize_cp932(mp)833 static int internalize_cp932(mp)
834 MULBUF *mp;
835 {
836     if (mp->lastpos - mp->startpos + 1 == 1) {
837 	int c0 = INBUF(mp);
838 	if (ISSJISKANA(c0)) {
839 	    mp->cs = JISX0201KANA;
840 	    mp->icharset = CP932;
841 	    mp->multiint[mp->intindex] = c0 & 0x7f;
842 	    mp->multics[mp->intindex] = mp->cs;
843 	    mp->intindex += 1;
844 	    mp->startpos = mp->lastpos + 1;
845 	    return 1;
846 	} else {
847 	    /* do nothing.  return 1 to get next byte */
848 	    return 1;
849 	}
850     } else if (mp->lastpos - mp->startpos + 1 == 2) {
851 	int c0 = INBUF0(mp);
852 	int c1 = INBUF1(mp);
853 	if (ISSJISKANJI(c0, c1)) {
854 	    int ofs;
855 
856 	    if (c0 <= 0x9f) c0 = (c0-0x81)*2 + 0x21;
857 	    else            c0 = (c0-0xc1)*2 + 0x21;
858 	    if (c1 <= 0x7e)      c1 -= 0x1f;
859 	    else if (c1 <= 0x9e) c1 -= 0x20;
860 	    else                 c1 -= 0x7e, c0 += 1;
861 
862 	    ofs = (c0 - 0x21) * 94 + (c1 - 0x21);
863 	    if ((c0 < 0x30 && c0 != 0x2d
864 		 && ucode_kanji1[ofs] == ucode_cp932[ofs])
865 		|| (c0 >= 0x30 && c0 <= 0x74)) {
866 		int ch = checkKANJI(mp->io.scs,
867 				    SCSJISX0208_1983 | SCSJISX0208_1990,
868 				    c0, c1);
869 		if (ch == 0) {
870 		    /* undefined.  less ignore them */
871 		    wrongchar(mp);
872 		    return 1;
873 		}
874 		mp->icharset = CP932;
875 		mp->cs = (ch >> 16) & 0x7fff;
876 		mp->multiint[mp->intindex] = (ch >> 8) & 0x7f;
877 		mp->multics[mp->intindex] = mp->cs;
878 		mp->multiint[mp->intindex + 1] = ch & 0x7f;
879 		mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
880 		mp->intindex += 2;
881 		mp->startpos = mp->lastpos + 1;
882 		return 1;
883 	    } else {
884 		if (c0 > 0x7e) c0 -= 0x4f;
885 		int ch = checkKANJI(mp->io.scs, SCSCP932EX, c0, c1);
886 		if (ch == 0) {
887 		    /* undefined.  less ignore them */
888 		    wrongchar(mp);
889 		    return 1;
890 		}
891 		mp->icharset = CP932;
892 		mp->cs = (ch >> 16) & 0x7fff;
893 		mp->multiint[mp->intindex] = (ch >> 8) & 0x7f;
894 		mp->multics[mp->intindex] = mp->cs;
895 		mp->multiint[mp->intindex + 1] = ch & 0x7f;
896 		mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
897 		mp->intindex += 2;
898 		mp->startpos = mp->lastpos + 1;
899 		return 1;
900 	    }
901 	    /* data are recognized as kanji or wrong data, so return 1 */
902 	    return 1;
903 	}
904     }
905     /* return 0 because this data sequence is not matched to CP932 */
906     return 0;
907 }
908 
909 /*
910  * Internalize UTF-8 character to traditional Codeset
911  *
912  * Return 1 if input has convetred well.
913  * Return 0 if input has failed.
914  */
ucs2codeset(mp,uc)915 static int ucs2codeset(mp, uc)
916 MULBUF *mp;
917 int uc;
918 {
919     int plane = (uc & 0x7ffff0000) >> 16;
920     int code = uc & 0xffff;
921     int umap;
922     int dummy;
923     int cs;
924     int cc;
925     int intindex;
926 
927     /*
928      *  lookup unicode table
929      */
930     if (plane == 0)
931 	umap = unicode0_map[code];
932     else if (plane == 2)
933 	umap = unicode2_map[code];
934     else
935 	return 0;
936     if (umap == U_error)
937 	return 0;
938 
939     /*
940      *  check codeset
941      */
942     cs = UMAP_CS(umap);
943 
944     /*
945      * buffering
946      */
947     cc = UMAP_CHAR(umap);
948     switch (CS2TYPE(cs))
949     {
950     case TYPE_94_CHARSET:
951     case TYPE_96_CHARSET:
952 	mp->icharset = UTF8;
953 	mp->multiint[mp->intindex] = cc & 0x7f;
954 	mp->multics[mp->intindex] = cs;
955 	mp->intindex += 1;
956 	return 1;
957     case TYPE_94N_CHARSET:
958     case TYPE_96N_CHARSET:
959 	mp->icharset = UTF8;
960 	mp->multiint[mp->intindex] = (cc / 94) + 0x21;
961 	mp->multics[mp->intindex] = cs;
962 	mp->multiint[mp->intindex + 1] = (cc % 94) + 0x21;
963 	mp->multics[mp->intindex + 1] = REST_MASK | cs;
964 	mp->intindex += 2;
965 	return 1;
966     }
967 
968     return 0;
969 }
970 
/*
 * Pairs of UCS codes that together form one JIS X 0213 character.
 *
 *   c1, c2  code of the combined character; ucs2codeset_combind() stores
 *           them as c1 + 0x20 and c2 + 0x20 in JISX0213KANJI1
 *   u1      first UCS code of the pair
 *   u2      second UCS code of the pair
 *
 * The entry whose c1 is 0 terminates the table.
 */
static struct st_ucs_combining {
    int c1;
    int c2;
    int u1;
    int u2;
} jisx0213_comb[] = {
    /* c1  c2      u1      u2 */
    {  4, 87, 0x304b, 0x309a },
    {  4, 88, 0x304d, 0x309a },
    {  4, 89, 0x304f, 0x309a },
    {  4, 90, 0x3051, 0x309a },
    {  4, 91, 0x3053, 0x309a },

    {  5, 87, 0x30ab, 0x309a },
    {  5, 88, 0x30ad, 0x309a },
    {  5, 89, 0x30af, 0x309a },
    {  5, 90, 0x30b1, 0x309a },
    {  5, 91, 0x30b3, 0x309a },
    {  5, 92, 0x30bb, 0x309a },
    {  5, 93, 0x30c4, 0x309a },
    {  5, 94, 0x30c8, 0x309a },

    {  6, 88, 0x31f7, 0x309a },

    { 11, 36, 0x00e6, 0x0300 },
    { 11, 40, 0x0254, 0x0300 },
    { 11, 41, 0x0254, 0x0301 },
    { 11, 42, 0x028c, 0x0300 },
    { 11, 43, 0x028c, 0x0301 },
    { 11, 44, 0x0259, 0x0300 },
    { 11, 45, 0x0259, 0x0301 },
    { 11, 46, 0x025a, 0x0300 },
    { 11, 47, 0x025a, 0x0301 },
    { 11, 69, 0x02e9, 0x02e5 },
    { 11, 70, 0x02e5, 0x02e9 },

    {  0,  0,      0,      0 }  /* terminator */
};
1004 
1005 static int pending_ucs = 0;
1006 static int
flush_pending_ucs(mp)1007 flush_pending_ucs(mp)
1008 MULBUF *mp;
1009 {
1010     if (pending_ucs) {
1011 	ucs2codeset(mp, pending_ucs);
1012 	pending_ucs = 0;
1013 	return 1;
1014     }
1015     return 0;
1016 }
1017 
1018 static int
ucs2codeset_combind(mp,uc)1019 ucs2codeset_combind(mp, uc)
1020 MULBUF *mp;
1021 int uc;
1022 {
1023     struct st_ucs_combining *p;
1024 
1025     if (uc < 0) {
1026 	pending_ucs = 0;
1027 	return 1;
1028     }
1029 
1030     if (uc == 0) {
1031 	if (pending_ucs)
1032 	    ucs2codeset(mp, pending_ucs);
1033 	pending_ucs = 0;
1034 	return 1;
1035     }
1036 
1037     if (!(mp->io.scs & (SCSJISX0213_2000 | SCSJISX0213_2004)))
1038 	return ucs2codeset(mp, uc);
1039 
1040     if (pending_ucs) {
1041 	for (p = jisx0213_comb; p->c1; ++ p) {
1042 	    if (p->u1 == pending_ucs && p->u2 == uc) {
1043 		mp->icharset = UTF8;
1044 		mp->cs = JISX0213KANJI1;
1045 		mp->multiint[mp->intindex] = p->c1 + 0x20;
1046 		mp->multics[mp->intindex] = JISX0213KANJI1;
1047 		mp->multiint[mp->intindex + 1] = p->c2 + 0x20;
1048 		mp->multics[mp->intindex + 1] = REST_MASK | JISX0213KANJI1;
1049 		mp->intindex += 2;
1050 		mp->startpos = mp->lastpos + 1;
1051 		pending_ucs = 0;
1052 		return 1;
1053 	    }
1054 	}
1055 	ucs2codeset(mp, pending_ucs);
1056 	pending_ucs = 0;
1057     }
1058 
1059     for (p = jisx0213_comb; p->c1; ++ p) {
1060 	if (p->u1 == uc) {
1061 	    mp->startpos = mp->lastpos + 1;
1062 	    pending_ucs = uc;
1063 	    return 1;
1064 	}
1065     }
1066 
1067     mp->startpos = mp->lastpos + 1;
1068     return ucs2codeset(mp, uc);
1069 }
1070 
1071 /*
1072  * Internalize input stream encoded by UTF8 encoding scheme.
1073  *
1074  * Return 1 if input is recognized well.
1075  * Return 0 if input is rejected.
1076  */
internalize_utf8(mp)1077 static int internalize_utf8(mp)
1078 MULBUF *mp;
1079 {
1080     int uc;
1081     int cs;
1082 
1083     if (mp->lastpos - mp->startpos + 1 == 1) {
1084 	/* do nothing.  return 1 to get next byte */
1085 	return 1;
1086     } else if (mp->lastpos - mp->startpos + 1 == 2) {
1087 	int c0 = INBUF0(mp);
1088 	int c1 = INBUF1(mp);
1089 	if (ISUTF8_2(c0, c1)) {
1090 	    uc = UTF8_2(c0, c1);
1091 	    if (ucs2codeset_combind(mp, uc))
1092 		return 1;
1093 	    if (!(mp->io.scs & SCSUTF8)) {
1094 		wrongucs(mp, uc);
1095 		return 1;
1096 	    }
1097 	    cs = get_utfwidth(uc);
1098 	    if (CSISWRONG(cs)) {
1099 		wrongucs(mp, uc);
1100 		return 1;
1101 	    }
1102 	    mp->cs = cs;
1103 	    mp->icharset = UTF8;
1104 	    mp->multiint[mp->intindex] = c0;
1105 	    mp->multics[mp->intindex] = mp->cs;
1106 	    mp->multiint[mp->intindex + 1] = c1;
1107 	    mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
1108 	    mp->intindex += 2;
1109 	    mp->startpos = mp->lastpos + 1;
1110 	    return 1;
1111 	} else if (ISUTF8_HEAD(c0) && ISUTF8_REST(c1)) {
1112 	    /* do nothing.  return 1 to get next byte */
1113 	    return 1;
1114 	}
1115     } else if (mp->lastpos - mp->startpos + 1 == 3) {
1116 	int c0 = INBUF0(mp);
1117 	int c1 = INBUF1(mp);
1118 	int c2 = INBUF2(mp);
1119 	if (ISUTF8_3(c0, c1, c2)) {
1120 	    uc = UTF8_3(c0, c1, c2);
1121 	    if (ucs2codeset_combind(mp, uc))
1122 		return 1;
1123 	    if (!(mp->io.scs & SCSUTF8)) {
1124 		wrongucs(mp, uc);
1125 		return 1;
1126 	    }
1127 	    cs = get_utfwidth(uc);
1128 	    if (CSISWRONG(cs)) {
1129 		wrongucs(mp, uc);
1130 		return 1;
1131 	    }
1132 	    mp->cs = cs;
1133 	    mp->icharset = UTF8;
1134 	    mp->multiint[mp->intindex] = c0;
1135 	    mp->multics[mp->intindex] = mp->cs;
1136 	    mp->multiint[mp->intindex + 1] = c1;
1137 	    mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
1138 	    mp->multiint[mp->intindex + 2] = c2;
1139 	    mp->multics[mp->intindex + 2] = REST_MASK | mp->cs;
1140 	    mp->intindex += 3;
1141 	    mp->startpos = mp->lastpos + 1;
1142 	    return 1;
1143 	} else if (ISUTF8_HEAD(c0) && ISUTF8_REST(c1) && ISUTF8_REST(c2)) {
1144 	    /* do nothing.  return 1 to get next byte */
1145 	    return 1;
1146 	}
1147     } else if (mp->lastpos - mp->startpos + 1 == 4) {
1148 	int c0 = INBUF0(mp);
1149 	int c1 = INBUF1(mp);
1150 	int c2 = INBUF2(mp);
1151 	int c3 = INBUF3(mp);
1152 	if (ISUTF8_4(c0, c1, c2, c3)) {
1153 	    uc = UTF8_4(c0, c1, c2, c3);
1154 	    if (ucs2codeset_combind(mp, uc))
1155 		return 1;
1156 	    if (!(mp->io.scs & SCSUTF8)) {
1157 		wrongucs(mp, uc);
1158 		return 1;
1159 	    }
1160 	    cs = get_utfwidth(uc);
1161 	    if (CSISWRONG(cs)) {
1162 		wrongucs(mp, uc);
1163 		return 1;
1164 	    }
1165 	    mp->cs = cs;
1166 	    mp->icharset = UTF8;
1167 	    mp->multiint[mp->intindex] = c0;
1168 	    mp->multics[mp->intindex] = mp->cs;
1169 	    mp->multiint[mp->intindex + 1] = c1;
1170 	    mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
1171 	    mp->multiint[mp->intindex + 2] = c2;
1172 	    mp->multics[mp->intindex + 2] = REST_MASK | mp->cs;
1173 	    mp->multiint[mp->intindex + 3] = c3;
1174 	    mp->multics[mp->intindex + 3] = REST_MASK | mp->cs;
1175 	    mp->intindex += 4;
1176 	    mp->startpos = mp->lastpos + 1;
1177 	    return 1;
1178 	} else if (ISUTF8_HEAD(c0) && ISUTF8_REST(c1) && ISUTF8_REST(c2) &&
1179 		   ISUTF8_REST(c3)) {
1180 	    /* do nothing.  return 1 to get next byte */
1181 	    return 1;
1182 	}
1183     } else if (mp->lastpos - mp->startpos + 1 == 5) {
1184 	int c0 = INBUF0(mp);
1185 	int c1 = INBUF1(mp);
1186 	int c2 = INBUF2(mp);
1187 	int c3 = INBUF3(mp);
1188 	int c4 = INBUF4(mp);
1189 	if (ISUTF8_5(c0, c1, c2, c3, c4)) {
1190 	    uc = UTF8_5(c0, c1, c2, c3, c4);
1191 	    if (ucs2codeset_combind(mp, uc))
1192 		return 1;
1193 	    if (!(mp->io.scs & SCSUTF8)) {
1194 		wrongucs(mp, uc);
1195 		return 1;
1196 	    }
1197 	    cs = get_utfwidth(uc);
1198 	    if (CSISWRONG(cs)) {
1199 		wrongucs(mp, uc);
1200 		return 1;
1201 	    }
1202 	    mp->cs = cs;
1203 	    mp->icharset = UTF8;
1204 	    mp->multiint[mp->intindex] = c0;
1205 	    mp->multics[mp->intindex] = mp->cs;
1206 	    mp->multiint[mp->intindex + 1] = c1;
1207 	    mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
1208 	    mp->multiint[mp->intindex + 2] = c2;
1209 	    mp->multics[mp->intindex + 2] = REST_MASK | mp->cs;
1210 	    mp->multiint[mp->intindex + 3] = c3;
1211 	    mp->multics[mp->intindex + 3] = REST_MASK | mp->cs;
1212 	    mp->multiint[mp->intindex + 4] = c4;
1213 	    mp->multics[mp->intindex + 4] = REST_MASK | mp->cs;
1214 	    mp->intindex += 5;
1215 	    mp->startpos = mp->lastpos + 1;
1216 	    return 1;
1217 	} else if (ISUTF8_HEAD(c0) && ISUTF8_REST(c1) && ISUTF8_REST(c2) &&
1218 		   ISUTF8_REST(c3) && ISUTF8_REST(c4)) {
1219 	    /* do nothing.  return 1 to get next byte */
1220 	    return 1;
1221 	}
1222     } else if (mp->lastpos - mp->startpos + 1 == 6) {
1223 	int c0 = INBUF0(mp);
1224 	int c1 = INBUF1(mp);
1225 	int c2 = INBUF2(mp);
1226 	int c3 = INBUF3(mp);
1227 	int c4 = INBUF4(mp);
1228 	int c5 = INBUF5(mp);
1229 	if (ISUTF8_6(c0, c1, c2, c3, c4, c5)) {
1230 	    uc = UTF8_6(c0, c1, c2, c3, c4, c5);
1231 	    if (ucs2codeset_combind(mp, uc))
1232 		return 1;
1233 	    if (!(mp->io.scs & SCSUTF8)) {
1234 		wrongucs(mp, uc);
1235 		return 1;
1236 	    }
1237 	    cs = get_utfwidth(uc);
1238 	    if (CSISWRONG(cs)) {
1239 		wrongucs(mp, uc);
1240 		return 1;
1241 	    }
1242 	    mp->cs = cs;
1243 	    mp->icharset = UTF8;
1244 	    mp->multiint[mp->intindex] = c0;
1245 	    mp->multics[mp->intindex] = mp->cs;
1246 	    mp->multiint[mp->intindex + 1] = c1;
1247 	    mp->multics[mp->intindex + 1] = REST_MASK | mp->cs;
1248 	    mp->multiint[mp->intindex + 2] = c2;
1249 	    mp->multics[mp->intindex + 2] = REST_MASK | mp->cs;
1250 	    mp->multiint[mp->intindex + 3] = c3;
1251 	    mp->multics[mp->intindex + 3] = REST_MASK | mp->cs;
1252 	    mp->multiint[mp->intindex + 4] = c4;
1253 	    mp->multics[mp->intindex + 4] = REST_MASK | mp->cs;
1254 	    mp->multiint[mp->intindex + 5] = c5;
1255 	    mp->multics[mp->intindex + 5] = REST_MASK | mp->cs;
1256 	    mp->intindex += 6;
1257 	    mp->startpos = mp->lastpos + 1;
1258 	    return 1;
1259 	}
1260     }
1261     /* return 0 because this data sequence is not matched to UTF8 */
1262     return 0;
1263 }
1264 
1265 #endif
1266 
internalize(mp)1267 static void internalize(mp)
1268 MULBUF *mp;
1269 {
1270     int c = INBUF(mp);
1271 
1272     if (mp->lastpos - mp->startpos + 1 == 1) {
1273 	if ((c <= 0x7f && mp->io.input == ESNOCONV) ||
1274 	    (c >= 0x80 && mp->io.inputr == ESNOCONV)) {
1275 #if JAPANESE
1276 	    mp->sequence_counter = 0;
1277 	    flush_pending_ucs(mp);
1278 #endif
1279 	    if (control_char(c)) {
1280 		    wrongcs1(mp);
1281 	    } else {
1282 		    noconv1(mp);
1283 	    }
1284 	    return;
1285 	} else if (c >= 0x80 && mp->io.inputr == ESNONE) {
1286 #if JAPANESE
1287 	    mp->sequence_counter = 0;
1288 	    flush_pending_ucs(mp);
1289 #endif
1290 	    wrongcs1(mp);
1291 	    return;
1292 	}
1293 
1294 	mp->cs = ASCII;
1295 	if (c < 0x20) {
1296 #if JAPANESE
1297 	    mp->sequence_counter = 0;
1298 	    flush_pending_ucs(mp);
1299 #endif
1300 	    wrongcs1(mp);
1301 	    return;
1302 	} else if (c <= 0x7f ||
1303 		   ((mp->io.inputr & ESISO8)
1304 		    && (0xa0 <= c && c <= 0xff)
1305 		    && (mp->ms->sg != WRONGPLANE
1306 			|| !CSISWRONG(mp->ms->gs[mp->ms->gr])))) {
1307 #if JAPANESE
1308 	    mp->sequence_counter = 0;
1309 	    flush_pending_ucs(mp);
1310 #endif
1311 	    /*
1312 	     * Decide current character set.
1313 	     */
1314 	    mp->cs = FINDCS(mp, c);
1315 
1316 	    /*
1317 	     * Check cs that fit for output code set.
1318 	     */
1319 	    /* JIS cannot output JISX0212, JISX0213_2, or ISO2022 */
1320 	    if ((output == ESJIS83) &&
1321 		mp->cs != ASCII &&
1322 		mp->cs != JISX0201KANA &&
1323 		mp->cs != JISX0201ROMAN &&
1324 		mp->cs != JISX0208_78KANJI &&
1325 		mp->cs != JISX0208KANJI &&
1326 		mp->cs != JISX0208_90KANJI &&
1327 		mp->cs != JISX0213KANJI1 &&
1328 		mp->cs != JISX02132004KANJI1) {
1329 		wrongcs1(mp);
1330 		multi_reparse(mp);
1331 		return;
1332 	    }
1333 
1334 	    /* UJIS cannot output regular ISO2022 except JIS */
1335 	    if ((output == ESUJIS) &&
1336 		mp->cs != ASCII &&
1337 		mp->cs != JISX0201KANA &&
1338 		mp->cs != JISX0201ROMAN &&
1339 		mp->cs != JISX0208_78KANJI &&
1340 		mp->cs != JISX0208KANJI &&
1341 		mp->cs != JISX0208_90KANJI &&
1342 		mp->cs != JISX0212KANJISUP &&
1343 		mp->cs != JISX0213KANJI1 &&
1344 		mp->cs != JISX0213KANJI2 &&
1345 		mp->cs != JISX02132004KANJI1) {
1346 		wrongcs1(mp);
1347 		multi_reparse(mp);
1348 		return;
1349 	    }
1350 
1351 	    /* SJIS cannot output JISX0212 or ISO2022 */
1352 	    if ((output == ESSJIS) &&
1353 		mp->cs != ASCII &&
1354 		mp->cs != JISX0201KANA &&
1355 		mp->cs != JISX0201ROMAN &&
1356 		mp->cs != JISX0208_78KANJI &&
1357 		mp->cs != JISX0208KANJI &&
1358 		mp->cs != JISX0208_90KANJI &&
1359 		mp->cs != JISX0213KANJI1 &&
1360 		mp->cs != JISX0213KANJI2 &&
1361 		mp->cs != JISX02132004KANJI1) {
1362 		wrongcs1(mp);
1363 		multi_reparse(mp);
1364 		return;
1365 	    }
1366 
1367 	    /* CP932 cannot output regular ISO2022 except JIS */
1368 	    if ((output == ESCP932) &&
1369 		mp->cs != ASCII &&
1370 		mp->cs != JISX0201KANA &&
1371 		mp->cs != JISX0201ROMAN &&
1372 		mp->cs != JISX0208_78KANJI &&
1373 		mp->cs != JISX0208KANJI &&
1374 		mp->cs != JISX0208_90KANJI &&
1375 		mp->cs != JISX0212KANJISUP &&
1376 		mp->cs != JISX0213KANJI1 &&
1377 		mp->cs != JISX0213KANJI2 &&
1378 		mp->cs != JISX02132004KANJI1 &&
1379 		mp->cs != CP932) {
1380 		wrongcs1(mp);
1381 		multi_reparse(mp);
1382 		return;
1383 	    }
1384 
1385 	    if (mp->cs != ASCII)
1386 		mp->icharset = mp->cs;
1387 
1388 	    internalize_iso(mp);
1389 	    return;
1390 	} else if (control_char(c)) {
1391 #if JAPANESE
1392 	    mp->sequence_counter = 0;
1393 	    flush_pending_ucs(mp);
1394 #endif
1395 	    wrongcs1(mp);
1396 	    return;
1397 	}
1398 #if JAPANESE
1399 	if (mp->priority == PSJIS && ISSJISKANA(c)) {
1400 	    if (mp->io.inputr & ESUJIS) {
1401 		mp->sequence_counter++;
1402 		if (mp->sequence_counter % 2 == 1 &&
1403 		    INBUF0(mp) != 0xa4) /* ???? */
1404 		{
1405 		    mp->sequence_counter = 0;
1406 		}
1407 		if (mp->sequence_counter >= 6)
1408 		    /*
1409 		     * It looks like a sequence of UJIS
1410 		     * hiragana.  Thus we give priority
1411 		     * to not PSJIS.
1412 		     */
1413 		    mp->priority = PUJIS;
1414 	    }
1415 	    flush_pending_ucs(mp);
1416 	    if (mp->io.inputr & ESCP932)
1417 		internalize_cp932(mp);
1418 	    else if (mp->io.inputr & ESSJIS)
1419 		internalize_sjis(mp);
1420 	    return;
1421 	} else if (mp->io.inputr & (ESUJIS | ESSJIS | ESUTF8 | ESCP932)) {
1422 	    mp->sequence_counter = 0;
1423 	    return;
1424 	}
1425 	mp->sequence_counter = 0;
1426 #endif
1427 	wrongcs1(mp);
1428 	return;
1429     }
1430 
1431 #if JAPANESE
1432     assert(mp->sequence_counter == 0);
1433 #endif
1434     if (c < 0x20) {
1435 	flush_pending_ucs(mp);
1436 	wrongcs1(mp);
1437 	multi_reparse(mp);
1438 	return;
1439     } else if (mp->cs != ASCII &&
1440 	       (c <= 0x7f ||
1441 		((mp->io.inputr & ESISO8)
1442 		 && (0xa0 <= c && c <= 0xff)
1443 		 && (mp->ms->sg != WRONGPLANE
1444 		     || !CSISWRONG(mp->ms->gs[mp->ms->gr]))))) {
1445 	flush_pending_ucs(mp);
1446 	if (mp->cs != FINDCS(mp, c)) {
1447 	    wrongcs1(mp);
1448 	    multi_reparse(mp);
1449 	} else {
1450 	    internalize_iso(mp);
1451 	}
1452 	return;
1453     } else if (control_char(c)) {
1454 	flush_pending_ucs(mp);
1455 	wrongcs1(mp);
1456 	multi_reparse(mp);
1457 	return;
1458     }
1459 #if JAPANESE
1460     if (mp->lastpos - mp->startpos + 1 == 2) {
1461 	if (mp->priority == PSJIS) {
1462 	    if (mp->io.inputr & ESCP932) {
1463 		if (internalize_cp932(mp))
1464 		    return;
1465 	    } else {
1466 		if (internalize_sjis(mp)) {
1467 		    return;
1468 		}
1469 	    }
1470 	} else if (mp->priority == PUJIS) {
1471 	    if (internalize_ujis(mp)) {
1472 		return;
1473 	    }
1474 	} else if (mp->priority == PUTF8) {
1475 	    if (internalize_utf8(mp)) {
1476 		return;
1477 	    }
1478 	    flush_pending_ucs(mp);
1479 	}
1480 
1481 	if (mp->io.inputr & ESUTF8) {
1482 	    if (internalize_utf8(mp)) {
1483 		mp->priority = PUTF8;
1484 		return;
1485 	    }
1486 	    flush_pending_ucs(mp);
1487 	}
1488 	if (mp->io.inputr & ESUJIS) {
1489 	    if (internalize_ujis(mp)) {
1490 		mp->priority = PUJIS;
1491 		return;
1492 	    }
1493 	}
1494 	if (mp->io.inputr & ESSJIS) {
1495 	    flush_pending_ucs(mp);
1496 	    if (internalize_sjis(mp)) {
1497 		mp->priority = PSJIS;
1498 		return;
1499 	    }
1500 	}
1501 	if (mp->io.inputr & ESCP932) {
1502 	    flush_pending_ucs(mp);
1503 	    if (internalize_cp932(mp)) {
1504 		mp->priority = PSJIS;
1505 		return;
1506 	    }
1507 	}
1508     } else if (mp->lastpos - mp->startpos + 1 == 3) {
1509 	if (mp->priority == PUJIS) {
1510 	    if (internalize_ujis(mp)) {
1511 		return;
1512 	    }
1513 	} else if (mp->priority == PUTF8) {
1514 	    if (internalize_utf8(mp)) {
1515 		return;
1516 	    }
1517 	    flush_pending_ucs(mp);
1518 	}
1519 
1520 	if (mp->io.inputr & ESUTF8) {
1521 	    if (internalize_utf8(mp)) {
1522 		mp->priority = PUTF8;
1523 		return;
1524 	    }
1525 	    flush_pending_ucs(mp);
1526 	}
1527 	if (mp->io.inputr & ESUJIS) {
1528 	    if (internalize_ujis(mp)) {
1529 		mp->priority = PUJIS;
1530 		return;
1531 	    }
1532 	}
1533     } else if (mp->lastpos - mp->startpos + 1 <= 6) {
1534 	if (mp->io.inputr & ESUTF8) {
1535 	    if (internalize_utf8(mp)) {
1536 		mp->priority = PUTF8;
1537 		return;
1538 	    }
1539 	    flush_pending_ucs(mp);
1540 	}
1541     }
1542 #endif
1543     wrongcs1(mp);
1544     multi_reparse(mp);
1545 }
1546 
1547 /*
1548  * Check routines
1549  */
check_ft(mp,c,type,plane)1550 static int check_ft(mp, c, type, plane)
1551 MULBUF *mp;
1552 register int c;
1553 int type;
1554 int *plane;
1555 {
1556     if (type == TYPE_94_CHARSET) {
1557 	switch (c) {
1558 	case 'B': /* ASCII */
1559 	    goto ok;
1560 	case 'I': /* JIS X 0201 right half (Katakana) */
1561 	case 'J': /* JIS X 0201 left half (Roman) */
1562 	    if (mp->io.scs & SCSJISX0201_1976) goto ok;
1563 	}
1564     } else if (type == TYPE_94N_CHARSET) {
1565 	switch (c) {
1566 	case '@': /* JIS C 6226-1978 */
1567 	    if (mp->io.scs & SCSJISC6226_1978) goto ok;
1568 	    break;
1569 	case 'B': /* JIS X 0208-1983, JIS X 0208:1990, or JIS X 0208:1997 */
1570 	    if (mp->io.scs & (SCSJISX0208_1983 | SCSJISX0208_1990)) goto ok;
1571 	    break;
1572 	case 'D': /* JIS X 0212:1990 */
1573 	    if (mp->io.scs & SCSJISX0212_1990) goto ok;
1574 	    break;
1575 	case 'O': /* JIS X 0213:2000 plane 1 */
1576 	    if (mp->io.scs & SCSJISX0213_2000) goto ok;
1577 	    break;
1578 	case 'P': /* JIS X 0213:2000 plane 2 or JIS X 0213:2004 plane 2 */
1579 	    if (mp->io.scs & (SCSJISX0213_2000 | SCSJISX0213_2004)) goto ok;
1580 	    break;
1581 	case 'Q': /* JIS X 0213:2004 plane 1 */
1582 	    if (mp->io.scs & SCSJISX0213_2004) goto ok;
1583 	    break;
1584 	}
1585     }
1586     if ((mp->io.scs & SCSOTHERISO) && 0x30 <= c && c <= 0x7e) {
1587 	/* accepting all other ISO, so OK */
1588 	goto ok;
1589     }
1590     return (-1);
1591 ok:
1592     *plane = (mp->ms->irr ? IRR2CS(mp->ms->irr) : 0) | TYPE2CS(type) | FT2CS(c);
1593     mp->ms->irr = 0;
1594     mp->eseq = NOESC;
1595     return (0);
1596 }
1597 
check_irr(mp,c)1598 static int check_irr(mp, c)
1599 MULBUF *mp;
1600 register int c;
1601 {
1602     if (0x40 <= c && c <= 0x7e) {
1603 	mp->ms->irr = CODE2IRR(c);
1604 	mp->eseq = NOESC;
1605 	return (0);
1606     }
1607     return (-1);
1608 }
1609 
fix_status_for_escape_sequence(mp)1610 static void fix_status_for_escape_sequence(mp)
1611 MULBUF *mp;
1612 {
1613     if (mp->eseq == NOESC) {
1614 	switch (CS2TYPE(ISVALIDPLANE(mp, sg) ? PLANE2CS(mp, sg) :
1615 					       PLANE2CS(mp, gl))) {
1616 	case TYPE_96_CHARSET:
1617 	case TYPE_96N_CHARSET:
1618 	    change_control_char(0177, 0);
1619 	    break;
1620 	case TYPE_94_CHARSET:
1621 	case TYPE_94N_CHARSET:
1622 	    change_control_char(0177, 1);
1623 	    break;
1624 	}
1625 	switch (CS2TYPE(ISVALIDPLANE(mp, sg) ? PLANE2CS(mp, sg) :
1626 					       PLANE2CS(mp, gr))) {
1627 	case TYPE_96_CHARSET:
1628 	case TYPE_96N_CHARSET:
1629 	    change_control_char(0377, 0);
1630 	    break;
1631 	case TYPE_94_CHARSET:
1632 	case TYPE_94N_CHARSET:
1633 	    change_control_char(0377, 1);
1634 	    break;
1635 	}
1636     }
1637 }
1638 
check_escape_sequence(mp)1639 static int check_escape_sequence(mp)
1640 MULBUF *mp;
1641 {
1642     int c = INBUF(mp);
1643 
1644     switch (mp->eseq) {
1645     case ESC_:
1646 	switch (c) {
1647 	case '$': mp->eseq = ESC_2_4; break;
1648 	case '&': mp->eseq = ESC_2_6; break;
1649 	case '(': mp->eseq = ESC_2_8; break;
1650 	case ')': mp->eseq = ESC_2_9; break;
1651 	case '*': mp->eseq = ESC_2_10; break;
1652 	case '+': mp->eseq = ESC_2_11; break;
1653 	case ',': mp->eseq = ESC_2_12; break;
1654 	case '-': mp->eseq = ESC_2_13; break;
1655 	case '.': mp->eseq = ESC_2_14; break;
1656 	case '/': mp->eseq = ESC_2_15; break;
1657 	case 'N': mp->ms->sg = 2; mp->eseq = NOESC; /*SS2*/break;
1658 	case 'O': mp->ms->sg = 3; mp->eseq = NOESC; /*SS3*/break;
1659 	case 'n': mp->ms->gl = 2; mp->eseq = NOESC; break;
1660 	case 'o': mp->ms->gl = 3; mp->eseq = NOESC; break;
1661 	case '|': if (!(mp->io.inputr & ESISO8)) goto wrong;
1662 		  mp->ms->gr = 3; mp->eseq = NOESC; break;
1663 	case '}': if (!(mp->io.inputr & ESISO8)) goto wrong;
1664 		  mp->ms->gr = 2; mp->eseq = NOESC; break;
1665 	case '~': if (!(mp->io.inputr & ESISO8)) goto wrong;
1666 		  mp->ms->gr = 1; mp->eseq = NOESC; break;
1667 	case '[': mp->eseq = ESC_5_11; break;
1668 	default:  goto wrong;
1669 	}
1670 	break;
1671     case ESC_2_4:
1672 	switch (c) {
1673 	case '(': mp->eseq = ESC_2_4_8; break;
1674 	case ')': mp->eseq = ESC_2_4_9; break;
1675 	case '*': mp->eseq = ESC_2_4_10; break;
1676 	case '+': mp->eseq = ESC_2_4_11; break;
1677 	case '-': mp->eseq = ESC_2_4_13; break;
1678 	case '.': mp->eseq = ESC_2_4_14; break;
1679 	case '/': mp->eseq = ESC_2_4_15; break;
1680 	case '@':
1681 	case 'A':
1682 	case 'B': if (check_ft(mp, c, TYPE_94N_CHARSET, &(mp->ms->gs[0])) == 0)
1683 			break;
1684 	default:  goto wrong;
1685 	}
1686 	break;
1687     case ESC_2_6:
1688 	if (check_irr(mp, c) == 0) break;
1689 	goto wrong;
1690     case ESC_2_8:
1691 	if (check_ft(mp, c, TYPE_94_CHARSET, &(mp->ms->gs[0])) == 0) break;
1692 	goto wrong;
1693     case ESC_2_9:
1694 	if (check_ft(mp, c, TYPE_94_CHARSET, &(mp->ms->gs[1])) == 0) break;
1695 	goto wrong;
1696     case ESC_2_10:
1697 	if (check_ft(mp, c, TYPE_94_CHARSET, &(mp->ms->gs[2])) == 0) break;
1698 	goto wrong;
1699     case ESC_2_11:
1700 	if (check_ft(mp, c, TYPE_94_CHARSET, &(mp->ms->gs[3])) == 0) break;
1701 	goto wrong;
1702     case ESC_2_12:
1703 	if (check_ft(mp, c, TYPE_96_CHARSET, &(mp->ms->gs[0])) == 0) break;
1704 	goto wrong;
1705     case ESC_2_13:
1706 	if (check_ft(mp, c, TYPE_96_CHARSET, &(mp->ms->gs[1])) == 0) break;
1707 	goto wrong;
1708     case ESC_2_14:
1709 	if (check_ft(mp, c, TYPE_96_CHARSET, &(mp->ms->gs[2])) == 0) break;
1710 	goto wrong;
1711     case ESC_2_15:
1712 	if (check_ft(mp, c, TYPE_96_CHARSET, &(mp->ms->gs[3])) == 0) break;
1713 	goto wrong;
1714     case ESC_2_4_8:
1715 	if (check_ft(mp, c, TYPE_94N_CHARSET, &(mp->ms->gs[0])) == 0) break;
1716 	goto wrong;
1717     case ESC_2_4_9:
1718 	if (check_ft(mp, c, TYPE_94N_CHARSET, &(mp->ms->gs[1])) == 0) break;
1719 	goto wrong;
1720     case ESC_2_4_10:
1721 	if (check_ft(mp, c, TYPE_94N_CHARSET, &(mp->ms->gs[2])) == 0) break;
1722 	goto wrong;
1723     case ESC_2_4_11:
1724 	if (check_ft(mp, c, TYPE_94N_CHARSET, &(mp->ms->gs[3])) == 0) break;
1725 	goto wrong;
1726     case ESC_2_4_13:
1727 	if (check_ft(mp, c, TYPE_96N_CHARSET, &(mp->ms->gs[1])) == 0) break;
1728 	goto wrong;
1729     case ESC_2_4_14:
1730 	if (check_ft(mp, c, TYPE_96N_CHARSET, &(mp->ms->gs[2])) == 0) break;
1731 	goto wrong;
1732     case ESC_2_4_15:
1733 	if (check_ft(mp, c, TYPE_96N_CHARSET, &(mp->ms->gs[3])) == 0) break;
1734 	goto wrong;
1735     case ESC_5_11:
1736 	if (mp->lastpos - 20 > mp->startpos) /* ESC sequence to long */
1737 	    goto wrong;
1738 	if (is_ansi_end(c))
1739 	    goto disp_esc;
1740 	if (!is_ansi_middle(c))
1741 	    goto wrong;
1742 	break;
1743     case NOESC:
1744 	/*
1745 	 * This sequence is wrong if we buffered some data.
1746 	 */
1747 	if (mp->lastpos > mp->startpos) {
1748 	    switch (c) {
1749 	    case 0033:
1750 	    case 0016:
1751 	    case 0017:
1752 	    case 0031: goto wrong;
1753 	    default:   goto wrongone;
1754 	    }
1755 	}
1756 	/*
1757 	 * Nothing is buffered.  So, check this sequence.
1758 	 */
1759 	switch (c) {
1760 	case 0033: mp->eseq = ESC_; break;
1761 	case 0016: mp->ms->gl = 1; mp->eseq = NOESC; break;
1762 	case 0017: mp->ms->gl = 0; mp->eseq = NOESC; break;
1763 	case 0031: mp->ms->sg = 2; mp->eseq = NOESC; /*SS2*/ break;
1764 	case 0216:
1765 	    if (!(mp->io.inputr & ESISO8) || CSISWRONG(mp->ms->gs[2]))
1766 		goto wrongone;
1767 	    mp->ms->sg = 2;
1768 	    mp->eseq = NOESC; /*SS2*/
1769 	    break;
1770 	case 0217:
1771 	    if (!(mp->io.inputr & ESISO8) || CSISWRONG(mp->ms->gs[3]))
1772 		goto wrongone;
1773 	    mp->ms->sg = 3;
1774 	    mp->eseq = NOESC; /*SS3*/
1775 	    break;
1776 	default:   goto wrongone;
1777 	}
1778 	break;
1779     default:
1780 	assert(0);
1781     }
1782     if (mp->eseq == NOESC) {
1783 	fix_status_for_escape_sequence(mp);
1784 	mp->startpos = mp->lastpos + 1;
1785 	return (0);
1786     }
1787     return (0);
1788 disp_esc:
1789     if (mp->eseq != NOESC) {
1790 	mp->eseq = NOESC;
1791 	fix_status_for_escape_sequence(mp);
1792     }
1793     wrongcs1(mp);
1794     multi_reparse(mp);
1795     return (0);
1796 wrong:
1797     if (mp->eseq != NOESC) {
1798 	mp->eseq = NOESC;
1799 	fix_status_for_escape_sequence(mp);
1800     }
1801     mp->multiint[mp->intindex] = INBUF0(mp);
1802     mp->multics[mp->intindex] = WRONG_ESC;
1803     mp->intindex++;
1804     mp->startpos++;
1805     multi_reparse(mp);
1806     return (0);
1807 wrongone:
1808     assert(mp->eseq == NOESC);
1809     return (-1);
1810 }
1811 
/*
 * Names accepted by set_planeset() and the designation string each one
 * stands for.  In a designation string the two characters "\e" (or
 * "\E") stand for the escape character.  The entry with a NULL name
 * terminates the table.
 */
struct planeset {
    char *name;
    char *planeset;
} planesets[] = {
    { "ascii",        "" },

    /* ISO 8859 names: one set designated with "\e-" */
    { "ctext",        "\\e-A" },
    { "latin1",       "\\e-A" },
    { "iso8859-1",    "\\e-A" },
    { "latin2",       "\\e-B" },
    { "iso8859-2",    "\\e-B" },
    { "latin3",       "\\e-C" },
    { "iso8859-3",    "\\e-C" },
    { "latin4",       "\\e-D" },
    { "iso8859-4",    "\\e-D" },
    { "cyrillic",     "\\e-L" },
    { "iso8859-5",    "\\e-L" },
    { "arabic",       "\\e-G" },
    { "iso8859-6",    "\\e-G" },
    { "greek",        "\\e-F" },
    { "iso8859-7",    "\\e-F" },
    { "hebrew",       "\\e-H" },
    { "iso8859-8",    "\\e-H" },
    { "latin5",       "\\e-M" },
    { "iso8859-9",    "\\e-M" },
    { "latin6",       "\\e-V" },
    { "iso8859-10",   "\\e-V" },
    { "thai",         "\\e-T" },
    { "iso8859-11",   "\\e-T" },
    { "latin7",       "\\e-Y" },
    { "iso8859-13",   "\\e-Y" },
    { "latin8",       "\\e-_" },
    { "iso8859-14",   "\\e-_" },
    { "latin9",       "\\e-b" },
    { "iso8859-15",   "\\e-b" },
    { "latin10",      "\\e-f" },
    { "iso8859-16",   "\\e-f" },

    /* Japanese names: several sets designated at once */
    { "jisx0201",     "\\e(J\\e)I" },
    { "japanese",     "\\e$)B\\e*I\\e$+D" },
    { "ujis",         "\\e$)B\\e*I\\e$+D" },
    { "euc",          "\\e$)B\\e*I\\e$+D" },
    { "euc-jisx0213", "\\e$)O\\e*I\\e$+P" },

    /* terminator */
    { NULL,           "" }
};
1855 
set_planeset(name)1856 int set_planeset(name)
1857 register char *name;
1858 {
1859     register struct planeset *p;
1860     MULBUF *mp;
1861     int ret;
1862     int i;
1863 
1864     if (name == NULL) {
1865 	return -1;
1866     }
1867     for (p = planesets; p->name != NULL; p++) {
1868 	if (strcasecmp(name, p->name) == 0) {
1869 	    name = p->planeset;
1870 	    break;
1871 	}
1872     }
1873     mp = new_multibuf();
1874     init_priority(mp);
1875     while (*name) {
1876 	if (*name == '\\' &&
1877 	    (*(name + 1) == 'e' || *(name + 1) == 'E')) {
1878 	    ++mp->lastpos;
1879 	    INBUF(mp) = '\033';
1880 	    ret = check_escape_sequence(mp);
1881 	    name += 2;
1882 	} else {
1883 	    ++mp->lastpos;
1884 	    INBUF(mp) = *name++;
1885 	    ret = check_escape_sequence(mp);
1886 	}
1887 	if (ret < 0 || mp->intindex > 0) {
1888 	    free(mp);
1889 	    return -1;
1890 	}
1891     }
1892     def_gs[0] = mp->ms->gs[0];
1893     def_gs[1] = mp->ms->gs[1];
1894     def_gs[2] = mp->ms->gs[2];
1895     def_gs[3] = mp->ms->gs[3];
1896     free(mp);
1897     return 0;
1898 }
1899 
init_def_scs_es(scs,input,inputr,out)1900 void init_def_scs_es(scs, input, inputr, out)
1901 SETCHARSET scs;
1902 ENCSET input;
1903 ENCSET inputr;
1904 ENCSET out;
1905 {
1906     def_scs = scs;
1907     def_input = input;
1908     def_inputr = inputr;
1909     output = out;
1910 
1911     if (inputr & ESUTF8)
1912 	    make_unicode_map(scs, output & ESUTF8);
1913 }
1914 
init_def_priority(pri)1915 void init_def_priority(pri)
1916 J_PRIORITY pri;
1917 {
1918 #if JAPANESE
1919     assert(pri == PUJIS || pri == PSJIS || pri == PUTF8);
1920     def_priority = pri;
1921 #endif
1922 }
1923 
init_priority(mp)1924 void init_priority(mp)
1925 MULBUF *mp;
1926 {
1927 #if JAPANESE
1928     if ((mp->io.inputr & (ESSJIS | ESCP932)) && (mp->io.inputr & ESUJIS))
1929 	mp->priority = def_priority;
1930     else if (mp->io.inputr & ESUTF8)
1931 	mp->priority = PUTF8;
1932     else if (mp->io.inputr & ESUJIS)
1933 	mp->priority = PUJIS;
1934     else if (mp->io.inputr & (ESSJIS | ESCP932))
1935 	mp->priority = PSJIS;
1936     else
1937 	mp->priority = PNONE;
1938     mp->sequence_counter = 0;
1939 #endif
1940 }
1941 
get_priority(mp)1942 J_PRIORITY get_priority(mp)
1943 MULBUF *mp;
1944 {
1945 #if JAPANESE
1946     return (mp->priority);
1947 #else
1948     return (PNONE);
1949 #endif
1950 }
1951 
set_priority(mp,pri)1952 void set_priority(mp, pri)
1953 MULBUF *mp;
1954 J_PRIORITY pri;
1955 {
1956 #if JAPANESE
1957     assert(pri == PSJIS || pri == PUJIS || pri == PUTF8 || pri == PNONE);
1958     mp->priority = pri;
1959 #endif
1960 }
1961 
set_utfwidth(u)1962 void set_utfwidth(u)
1963 UWidth u;
1964 {
1965     assert(u >= UWIDTH_NONE && u <= UWIDTH_ALL);
1966     utfwidth = u;
1967 }
1968 
new_multibuf()1969 MULBUF *new_multibuf()
1970 {
1971     MULBUF *mp = (MULBUF*) ecalloc(1, sizeof(MULBUF));
1972     mp->io.scs = def_scs;
1973     mp->io.input = def_input;
1974     mp->io.inputr = def_inputr;
1975     mp->orig_io_right = def_inputr;
1976     mp->rotation_io_right = 0;
1977     mp->eseq = NOESC;
1978     mp->ms = (struct m_status*) ecalloc(1, sizeof(struct m_status));
1979     init_multibuf(mp);
1980     return (mp);
1981 }
1982 
clear_multibuf(mp)1983 void clear_multibuf(mp)
1984 MULBUF *mp;
1985 {
1986     mp->lastpos = M_NULL_POS;
1987     mp->startpos = 0;
1988     mp->laststartpos = 0;
1989     mp->lastsg = WRONGPLANE;
1990     mp->intindex = 0;
1991 }
1992 
init_ms(ms)1993 static void init_ms(ms)
1994 struct m_status *ms;
1995 {
1996     ms->gs[0] = def_gs[0];
1997     ms->gs[1] = def_gs[1];
1998     ms->gs[2] = def_gs[2];
1999     ms->gs[3] = def_gs[3];
2000     ms->gl = 0;
2001     ms->gr = 1;
2002     ms->sg = WRONGPLANE;
2003     ms->irr = 0;
2004 }
2005 
init_multibuf(mp)2006 void init_multibuf(mp)
2007 MULBUF *mp;
2008 {
2009     mp->cs = ASCII;
2010     init_ms(mp->ms);
2011     if (mp->eseq != NOESC) {
2012 	mp->eseq = NOESC;
2013     }
2014     fix_status_for_escape_sequence(mp);
2015 #if JAPANESE
2016     mp->sequence_counter = 0;
2017 #endif
2018     mp->icharset = ASCII;
2019     clear_multibuf(mp);
2020 }
2021 
2022 /*
2023  * Buffering characters untile get a guarantee that it is right sequence.
2024  */
check_new_buffered_byte(mp)2025 static void check_new_buffered_byte(mp)
2026 MULBUF *mp;
2027 {
2028     m_position last_startpos = mp->startpos;
2029 
2030     if (mp->io.input & (ESJIS83 | ESISO7 | ESISO8)) {
2031 	if (check_escape_sequence(mp) == 0) {
2032 	    return;		/* going process well */
2033 	}
2034     }
2035 
2036     /* it is not a escape sequence, try to use it as character */
2037     internalize(mp);
2038 
2039     /*
2040      * If a character was detected in internalize(),
2041      * clean sg since single shift affect only one character.
2042      */
2043     if (last_startpos != mp->startpos) {
2044 	mp->lastsg = mp->ms->sg;
2045 	if (mp->ms->sg != WRONGPLANE) {
2046 	    mp->ms->sg = WRONGPLANE;
2047 	    fix_status_for_escape_sequence(mp);
2048 	}
2049     }
2050 }
2051 
2052 /*
2053  * Re-parse all buffered data.
2054  *
2055  * This routine is called when we find a problem in buffered data.
2056  * We firstly take out the first byte of buffered data before we call
2057  * this function.  This routine parse all rest of buffered data again.
2058  */
multi_reparse(mp)2059 static void multi_reparse(mp)
2060 MULBUF *mp;
2061 {
2062     m_position to;
2063 
2064     /*
2065      * We found something wrong and going to move first byte.
2066      * So, we clear single-shifted character set because it will
2067      * shift only this one byte being makred wrong.
2068      */
2069     if (mp->ms->sg != WRONGPLANE) {
2070 	mp->ms->sg = WRONGPLANE;
2071 	fix_status_for_escape_sequence(mp);
2072     }
2073 
2074 #if JAPANESE
2075     /*
2076      * Quick japanese code hack.
2077      * Check whether character is SJIS KANA or not.
2078      * If it is SJIS KANA, it means our prediction was failed.
2079      * Now going to fall back to SJIS KANA mode.
2080      */
2081     if ((mp->priority == PSJIS || (mp->io.inputr & (ESSJIS | ESCP932))) &&
2082 	CSISWRONG(mp->multics[mp->intindex - 1]) &&
2083 	ISSJISKANA(mp->multiint[mp->intindex - 1])) {
2084 	mp->cs = JISX0201KANA;
2085 	mp->priority = PSJIS;
2086 	mp->icharset = SJIS;
2087 	mp->multiint[mp->intindex - 1] &= 0x7f;
2088 	mp->multics[mp->intindex - 1] = mp->cs;
2089     }
2090 #endif
2091 
2092     /*
2093      * Retry to parse rest of buffered data.
2094      */
2095     to = mp->lastpos;
2096     for (mp->lastpos = mp->startpos; mp->lastpos <= to; mp->lastpos++) {
2097 	check_new_buffered_byte(mp);
2098     }
2099     mp->lastpos = to;
2100 }
2101 
2102 #if LESS
multi_find_cs(mp,pos)2103 void multi_find_cs(mp, pos)
2104 MULBUF* mp;
2105 m_position pos;
2106 {
2107     int c;
2108     m_position lpos = pos;
2109 
2110     if (ch_seek(pos) == 0) {
2111 	/*
2112 	 * Back up to the beginning of the line.
2113 	 */
2114 	while ((c = ch_back_get()) != '\n' && c != EOI) ;
2115 	if (c == '\n') {
2116 	    (void)ch_forw_get();
2117 	}
2118 
2119 	lpos = ch_tell();
2120 
2121 	if (lpos != pos) {
2122 	    while (lpos < pos) {
2123 		c = ch_forw_get();
2124 		assert(c != EOI && c != '\n');
2125 		multi_parse(mp, c, NULL_POSITION, NULL, NULL);
2126 		lpos++;
2127 	    }
2128 	    ch_seek(pos);
2129 	}
2130     }
2131 }
2132 #endif
2133 
2134 #define DEBUG 0
2135 #if DEBUG
2136 int debug = 1;
2137 #endif
2138 
2139 /*
2140  * Manage m_status data structure to maintain ISO-2022 status of input stream.
2141  */
multi_start_buffering(mp,pos)2142 void multi_start_buffering(mp, pos)
2143 MULBUF *mp;
2144 m_position pos;
2145 {
2146     /* buffer must be empty */
2147     assert(mp->lastpos < mp->startpos);
2148 
2149     /* initialize m_status if it is necessary */
2150     if (pos == mp->lastpos + 2 || pos == mp->laststartpos) {
2151 	/*
2152 	 * pos == mp->lastpos+2 if this line is started after \n.
2153 	 * pos == mp->laststartpos if this line is started by a non-fit
2154 	 * character.
2155 	 */
2156 	/* restore backed up sg */
2157 	if (mp->ms->sg != mp->lastsg) {
2158 	    mp->ms->sg = mp->lastsg;
2159 	    fix_status_for_escape_sequence(mp);
2160 	}
2161 	/* adjust pointers */
2162 	mp->startpos = pos;
2163 	mp->lastpos = pos - 1;
2164     } else {
2165 	/*
2166 	 * pos == somewhere else if this function is called after jump_loc().
2167 	 */
2168 #if DEBUG
2169 	if (debug) {
2170 	    fprintf(stderr, "%qd, %qd, %qd, %qd\n", pos, mp->lastpos,
2171 		mp->startpos, mp->laststartpos);
2172 	    fprintf(stderr, "oct %qo, %qo, %qo, %qo\n", pos, mp->lastpos,
2173 		mp->startpos, mp->laststartpos);
2174 	}
2175 #endif
2176 	init_multibuf(mp);
2177 #if LESS
2178 	multi_find_cs(mp, pos);
2179 	clear_multibuf(mp);
2180 #endif
2181 
2182 	/* adjust pointers */
2183 	mp->startpos = pos;
2184 	mp->lastpos = pos - 1;
2185 	mp->laststartpos = pos;
2186     }
2187 }
2188 
2189 /*
2190  * Buffering characters untile get a guarantee that it is right sequence.
2191  */
multi_parse(mp,c,pos,mbd,mpos)2192 void multi_parse(mp, c, pos, mbd, mpos)
2193 MULBUF* mp;
2194 int c;
2195 m_position pos;
2196 M_BUFDATA* mbd;
2197 POSITION* mpos;
2198 {
2199     if (c < 0) {
2200 	if (mpos != NULL) {
2201 	    *mpos = mp->startpos;
2202 	}
2203 
2204 	/*
2205 	 * output pending unicode character
2206 	 */
2207 	flush_pending_ucs(mp);
2208 
2209 	/*
2210 	 * Force to flush all buffering characters.
2211 	 */
2212 	if (mp->eseq != NOESC) {
2213 	    mp->eseq = NOESC;
2214 	    fix_status_for_escape_sequence(mp);
2215 	}
2216 	while (mp->startpos <= mp->lastpos) {
2217 	    wrongcs1(mp);
2218 	    multi_reparse(mp);
2219 	}
2220 
2221 	if (mbd != NULL) {
2222 	    mbd->cbuf = mp->multiint;
2223 	    mbd->csbuf = mp->multics;
2224 	    mbd->byte = mp->intindex;
2225 	}
2226 	mp->intindex = 0;
2227     } else {
2228 	if (pos != NULL_POSITION) {
2229 	    assert(pos == mp->lastpos + 1);
2230 	    mp->lastpos = pos;
2231 	} else {
2232 	    mp->lastpos++;
2233 	}
2234 	INBUF(mp) = c;
2235 
2236 	mp->laststartpos = mp->startpos;
2237 	if (mpos != NULL) {
2238 	    *mpos = mp->startpos;
2239 	}
2240 
2241 	/*
2242 	 * Put it into buffer and parse it.
2243 	 */
2244 	check_new_buffered_byte(mp);
2245 
2246 	if (mbd != NULL) {
2247 	    mbd->cbuf = mp->multiint;
2248 	    mbd->csbuf = mp->multics;
2249 	    mbd->byte = mp->intindex;
2250 	}
2251 	mp->intindex = 0;
2252     }
2253 }
2254 
2255 /*
2256  * Flush buffered data.
2257  */
multi_flush(mp,mbd,mpos)2258 void multi_flush(mp, mbd, mpos)
2259 MULBUF* mp;
2260 M_BUFDATA* mbd;
2261 POSITION* mpos;
2262 {
2263     multi_parse(mp, -1, NULL_POSITION, mbd, mpos);
2264 }
2265 
2266 /*
2267  * Discard buffered data.
2268  */
multi_discard(mp)2269 void multi_discard(mp)
2270 MULBUF* mp;
2271 {
2272     multi_parse(mp, -1, NULL_POSITION, NULL, NULL);
2273 }
2274 
set_codesets(mp,input,inputr)2275 void set_codesets(mp, input, inputr)
2276 MULBUF *mp;
2277 ENCSET input;
2278 ENCSET inputr;
2279 {
2280     mp->io.input = input;
2281     mp->io.inputr = inputr;
2282     if (inputr & ESUTF8)
2283 	    make_unicode_map(mp->io.scs, output & ESUTF8);
2284 }
2285 
2286 /*
2287  * Return string representation about multi bytes character
2288  * which was buffered.
2289  */
get_icharset_string(mp)2290 char *get_icharset_string(mp)
2291 MULBUF *mp;
2292 {
2293 	static char buf[10];
2294 
2295 	switch (mp->icharset)
2296 	{
2297 #if JAPANESE
2298 	/*
2299 	 * Code set
2300 	 */
2301 	case SJIS:		return ("SJIS");
2302 	case SJIS2000:		return ("SJIS-2000");
2303 	case SJIS2004:		return ("SJIS-2004");
2304 	case CP932:             return ("CP932");
2305 	case UJIS:		return ("UJIS");
2306 	case UJIS2000:		return ("UJIS-2000");
2307 	case UJIS2004:		return ("UJIS-2004");
2308 	case UTF8Z:             return ("UTF-8");
2309 	case UTF8:              return ("UTF-8");
2310 	case UTF8W:             return ("UTF-8");
2311 #endif
2312 	/*
2313 	 * Character set
2314 	 */
2315 	case ASCII:		return ("ASCII");
2316 	case JISX0201KANA:	return ("JIS-KANA");
2317 	case JISX0201ROMAN:	return ("JIS-ROMAN");
2318 	case LATIN1:		return ("LATIN1");
2319 	case LATIN2:		return ("LATIN2");
2320 	case LATIN3:		return ("LATIN3");
2321 	case LATIN4:		return ("LATIN4");
2322 	case GREEK:		return ("GREEK");
2323 	case ARABIC:		return ("ARABIC");
2324 	case HEBREW:		return ("HEBREW");
2325 	case CYRILLIC:		return ("CYRILLIC");
2326 	case LATIN5:		return ("LATIN5");
2327 	case THAI:		return ("THAI");
2328 	case LATIN6:		return ("LATIN6");
2329 	case LATIN7:		return ("LATIN7");
2330 	case LATIN8:		return ("LATIN8");
2331 	case LATIN9:		return ("LATIN9");
2332 	case LATIN10:		return ("LATIN10");
2333 	case JISX0208_78KANJI:	return ("KANJI:1978");
2334 	case GB2312:		return ("GB2312");
2335 	case JISX0208KANJI:	return ("KANJI:1983");
2336 	case JISX0208_90KANJI:	return ("KANJI:1990");
2337 	case KSC5601:		return ("KSC5601");
2338 	case JISX0212KANJISUP:	return ("JIS-KANJISP");
2339 	case JISX0213KANJI1:	return ("X0213:2000-1");
2340 	case JISX0213KANJI2:	return ("X0213:2000-2");
2341 	case JISX02132004KANJI1:return ("X0213:2004-1");
2342 	}
2343 	switch (CS2TYPE(mp->icharset))
2344 	{
2345 	case TYPE_94_CHARSET:
2346 		strcpy(buf, "94( )");
2347 		buf[3] = CS2FT(mp->icharset);
2348 		break;
2349 	case TYPE_96_CHARSET:
2350 		strcpy(buf, "96( )");
2351 		buf[3] = CS2FT(mp->icharset);
2352 		break;
2353 	case TYPE_94N_CHARSET:
2354 		strcpy(buf, "94N( )");
2355 		buf[4] = CS2FT(mp->icharset);
2356 		break;
2357 	case TYPE_96N_CHARSET:
2358 		strcpy(buf, "96N( )");
2359 		buf[4] = CS2FT(mp->icharset);
2360 		break;
2361 	default:
2362 		assert(0);
2363 	}
2364 	if (CS2IRR(mp->icharset) > 0)
2365 	{
2366 		char num[3];
2367 		sprintf(num, "%d", CS2IRR(mp->icharset));
2368 		strcat(buf, num);
2369 	}
2370 	return (buf);
2371 }
2372 
2373 static int old_gl_output_charset = ASCII; /* Last displayed character set */
2374 static int old_gr_output_charset = WRONGCS;
2375 static int old_shift = 0;
2376 
make_escape_sequence(charset)2377 static unsigned char *make_escape_sequence(charset)
2378 int charset;
2379 {
2380 	static unsigned char p[9];
2381 	int len;
2382 
2383 	if (CSISWRONG(charset))
2384 	{
2385 		charset = ASCII;
2386 	}
2387 
2388 	if (old_gl_output_charset != charset
2389 	    || old_gr_output_charset != charset) {
2390 		p[0] = '\033';
2391 		len = 1;
2392 		if ((output & (ESISO7 | ESISO8)) && CS2IRR(charset) > 0)
2393 		{
2394 			p[len] = '&';
2395 			p[len + 1] = IRR2CODE(CS2IRR(charset));
2396 			p[len + 2] = '\033';
2397 			len += 3;
2398 		}
2399 		/*
2400 		 * Call 94 or 94N character set to G0/GL plane.
2401 		 * Call 96 or 96N character set to G1/GR plane.
2402 		 */
2403 		switch (CS2TYPE(charset))
2404 		{
2405 		case TYPE_94_CHARSET:
2406 			p[len] = '(';
2407 			p[len + 1] = CS2FT(charset);
2408 			len += 2;
2409 			old_gl_output_charset = charset;
2410 			break;
2411 		case TYPE_94N_CHARSET:
2412 			switch (CS2FT(charset))
2413 			{
2414 			case '@':
2415 			case 'A':
2416 			case 'B':
2417 				p[len] = '$';
2418 				p[len + 1] = CS2FT(charset);
2419 				len += 2;
2420 				break;
2421 			default:
2422 				p[len] = '$';
2423 				p[len + 1] = '(';
2424 				p[len + 2] = CS2FT(charset);
2425 				len += 3;
2426 				break;
2427 			}
2428 			old_gl_output_charset = charset;
2429 			break;
2430 		case TYPE_96_CHARSET:
2431 			p[len] = '-';
2432 			p[len + 1] = CS2FT(charset);
2433 			len += 2;
2434 			old_gr_output_charset = charset;
2435 			break;
2436 		case TYPE_96N_CHARSET:
2437 			p[len] = '$';
2438 			p[len + 1] = '-';
2439 			p[len + 2] = CS2FT(charset);
2440 			len += 3;
2441 			old_gr_output_charset = charset;
2442 			break;
2443 		}
2444 	}
2445 	/*
2446 	 * If output is not ESISO8, use SO and SI to call G1 to GL.
2447 	 * Otherwise, we use GR directly, so no need to call G1
2448 	 * since G1 is called GR already.
2449 	 */
2450 	if (!(output & ESISO8))
2451 	{
2452 		switch (CS2TYPE(charset))
2453 		{
2454 		case TYPE_94_CHARSET:
2455 		case TYPE_94N_CHARSET:
2456 			if (old_shift) {
2457 				p[len] = '\017';
2458 				len++;
2459 				old_shift = 0;
2460 			}
2461 			break;
2462 		case TYPE_96_CHARSET:
2463 		case TYPE_96N_CHARSET:
2464 			if (!old_shift) {
2465 				p[len] = '\016';
2466 				len++;
2467 				old_shift = 1;
2468 			}
2469 			break;
2470 		}
2471 	}
2472 	p[len] = '\0';
2473 	return (p);
2474 }
2475 
2476 static char cvbuffer[32];
2477 static int cvindex = 0;
2478 static char *nullcvbuffer = "";
2479 
convert_to_iso(c,cs)2480 static char *convert_to_iso(c, cs)
2481 int c;
2482 int cs;
2483 {
2484 	register unsigned char *p;
2485 	static char buffer2[2];
2486 
2487 	if ((output & ESISO8) && c != 0 &&
2488 	    (CS2TYPE(cs) == TYPE_96_CHARSET ||
2489 	     CS2TYPE(cs) == TYPE_96N_CHARSET))
2490 		c |= 0x80;
2491 
2492 	buffer2[0] = c;
2493 	buffer2[1] = '\0';
2494 
2495 	cs = CS2CHARSET(cs);
2496 	if (cs == CP932)
2497 	{
2498 		/* not supported */
2499 		cvindex = 0;
2500 		return (nullcvbuffer);
2501 	} else if (cs == UTF8 || cs == UTF8W  || cs == UTF8Z)
2502 	{
2503 		/* not supported */
2504 		cvindex = 0;
2505 		return (nullcvbuffer);
2506 	}
2507 	if (CSISREST(cs))
2508 	{
2509 		return (buffer2);
2510 	}
2511 	if (CSISWRONG(cs))
2512 	{
2513 		cs = ASCII;
2514 	}
2515 
2516 	if (c & 0x80) {
2517 	    if (cs == old_gr_output_charset) {
2518 		return (buffer2);
2519 	    }
2520 	} else {
2521 	    if (cs == old_gl_output_charset && old_shift == 0) {
2522 		return (buffer2);
2523 	    } else if (cs == old_gr_output_charset && old_shift == 1) {
2524 		return (buffer2);
2525 	    }
2526 	}
2527 
2528 	p = make_escape_sequence(cs);
2529 	strcpy(cvbuffer, p);
2530 	strcat(cvbuffer, buffer2);
2531 	return (cvbuffer);
2532 }
2533 
convert_to_jis(c,cs)2534 static char *convert_to_jis(c, cs)
2535 int c;
2536 int cs;
2537 {
2538 	register unsigned char *p;
2539 	static char buffer2[3];
2540 
2541 	if (c == 0)
2542 	{
2543 		cvindex = 0;
2544 		return (nullcvbuffer);
2545 	}
2546 
2547 	buffer2[cvindex++] = c;
2548 	buffer2[cvindex] = '\0';
2549 
2550 	if (CSISWRONG(cs))
2551 	{
2552 		cs = ASCII;
2553 	}
2554 
2555 	cs = CS2CHARSET(cs);
2556 
2557 	if (cs == ASCII || cs == JISX0201ROMAN)
2558 	{
2559 		assert(cvindex == 1);
2560 		cvindex = 0;
2561 	} else if (cs == JISX0201KANA)
2562 	{
2563 		assert(cvindex == 1);
2564 		cvindex = 0;
2565 	} else if (cs == JISX0208_78KANJI)
2566 	{
2567 		if (cvindex == 1)
2568 			return (nullcvbuffer);
2569 		assert(cvindex == 2);
2570 		jis78to90(buffer2);
2571 		cs = JISX0208_90KANJI;
2572 		cvindex = 0;
2573 	} else if (cs == JISX0208KANJI || cs == JISX0208_90KANJI)
2574 	{
2575 		if (cvindex == 1)
2576 			return (nullcvbuffer);
2577 		assert(cvindex == 2);
2578 		cvindex = 0;
2579 	} else if (cs == JISX0213KANJI1)
2580 	{
2581 		if (cvindex == 1)
2582 			return (nullcvbuffer);
2583 		assert(cvindex == 2);
2584 		cvindex = 0;
2585 		cs = JISX0208KANJI;
2586 	} else if (cs == JISX02132004KANJI1)
2587 	{
2588 		if (cvindex == 1)
2589 			return (nullcvbuffer);
2590 		assert(cvindex == 2);
2591 		cvindex = 0;
2592 		cs = JISX0208KANJI;
2593 	} else if (cs == CP932)
2594 	{
2595 		/* not supported */
2596 		cvindex = 0;
2597 		return (nullcvbuffer);
2598 	} else if (cs == UTF8 || cs == UTF8W  || cs == UTF8Z)
2599 	{
2600 		/* not supported */
2601 		cvindex = 0;
2602 		return (nullcvbuffer);
2603 	} else
2604 	{
2605 		assert(0);
2606 		cvindex = 0;
2607 	}
2608 
2609 	if (cs == old_gl_output_charset)
2610 	{
2611 		return (buffer2);
2612 	}
2613 	else
2614 	{
2615 		p = make_escape_sequence(cs);
2616 		strcpy(cvbuffer, p);
2617 		strcat(cvbuffer, buffer2);
2618 		return (cvbuffer);
2619 	}
2620 }
2621 
2622 #if JAPANESE
convert_to_ujis(c,cs)2623 static char *convert_to_ujis(c, cs)
2624 int c;
2625 int cs;
2626 {
2627 	if (c == 0)
2628 	{
2629 		cvindex = 0;
2630 		return (nullcvbuffer);
2631 	}
2632 
2633 	cvbuffer[cvindex++] = c;
2634 	cvbuffer[cvindex] = '\0';
2635 
2636 	if (CSISWRONG(cs))
2637 	{
2638 		cs = ASCII;
2639 	}
2640 
2641 	cs = CS2CHARSET(cs);
2642 	if (cs == ASCII || cs == JISX0201ROMAN)
2643 	{
2644 		assert(cvindex == 1);
2645 		cvindex = 0;
2646 		return (cvbuffer);
2647 	} else if (cs == JISX0201KANA)
2648 	{
2649 		assert(cvindex == 1);
2650 		cvbuffer[2] = '\0';
2651 		cvbuffer[1] = cvbuffer[0] | 0x80;
2652 		cvbuffer[0] = 0x8e;
2653 		cvindex = 0;
2654 		return (cvbuffer);
2655 	} else if (cs == JISX0208_78KANJI || cs == JISX0208KANJI ||
2656 		   cs == JISX0208_90KANJI || cs == JISX0213KANJI1 ||
2657 		   cs == JISX02132004KANJI1)
2658 	{
2659 		if (cvindex == 1)
2660 			return (nullcvbuffer);
2661 		assert(cvindex == 2);
2662 		if (cs == JISX0208_78KANJI)
2663 			jis78to90(cvbuffer);
2664 		cvbuffer[0] |= 0x80;
2665 		cvbuffer[1] |= 0x80;
2666 		cvindex = 0;
2667 		return (cvbuffer);
2668 	} else if (cs == JISX0212KANJISUP || cs == JISX0213KANJI2)
2669 	{
2670 		if (cvindex == 1)
2671 			return (nullcvbuffer);
2672 		assert(cvindex == 2);
2673 		cvbuffer[2] = cvbuffer[1] | 0x80;
2674 		cvbuffer[1] = cvbuffer[0] | 0x80;
2675 		cvbuffer[0] = 0x8f;
2676 		cvbuffer[3] = '\0';
2677 		cvindex = 0;
2678 		return (cvbuffer);
2679 	} else if (cs == CP932)
2680 	{
2681 		/* not supported */
2682 		cvindex = 0;
2683 		return (nullcvbuffer);
2684 	} else if (cs == UTF8 || cs == UTF8W  || cs == UTF8Z)
2685 	{
2686 		/* not supported */
2687 		cvindex = 0;
2688 		return (nullcvbuffer);
2689 	}
2690 	assert(0);
2691 	cvindex = 0;
2692 	return (cvbuffer);
2693 }
2694 
convert_to_sjis(c,cs)2695 static char *convert_to_sjis(c, cs)
2696 int c;
2697 int cs;
2698 {
2699 	if (c == 0)
2700 	{
2701 		cvindex = 0;
2702 		return (nullcvbuffer);
2703 	}
2704 
2705 	cvbuffer[cvindex++] = c;
2706 	cvbuffer[cvindex] = '\0';
2707 
2708 	if (CSISWRONG(cs))
2709 	{
2710 		cs = ASCII;
2711 	}
2712 
2713 	cs = CS2CHARSET(cs);
2714 
2715 	if (cs == ASCII || cs == JISX0201ROMAN)
2716 	{
2717 		assert(cvindex == 1);
2718 		cvindex = 0;
2719 		return (cvbuffer);
2720 	} else if (cs == JISX0201KANA)
2721 	{
2722 		assert(cvindex == 1);
2723 		cvbuffer[0] |= 0x80;
2724 		cvindex = 0;
2725 		return (cvbuffer);
2726 	} else if (cs == JISX0208_78KANJI || cs == JISX0208KANJI ||
2727 		   cs == JISX0208_90KANJI || cs == JISX0213KANJI1 ||
2728 		   cs == JISX02132004KANJI1)
2729 	{
2730 		register int c1, c2, c3;
2731 		static unsigned char table_sjis[] = {
2732 			0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2733 			   0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
2734 			0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
2735 			0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
2736 			0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
2737 			0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
2738 			0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
2739 		};
2740 
2741 		if (cvindex == 1)
2742 			return (nullcvbuffer);
2743 		assert(cvindex == 2);
2744 		if (cs == JISX0208_78KANJI)
2745 			jis78to90(cvbuffer);
2746 		c3 = cvbuffer[0] & 0x7f;
2747 		c1 = c3 & 1;
2748 		c2 = (cvbuffer[1] & 0x7f) + (c1 ? 0x40 - 0x21 : 0x9e - 0x21);
2749 		c1 = table_sjis[c3 / 2 + c1];
2750 		cvbuffer[0] = c1;
2751 		cvbuffer[1] = c2 + (c2 >= 0x7f ? 1 : 0);
2752 		cvindex = 0;
2753 		return (cvbuffer);
2754 	} else if (cs == JISX0213KANJI2)
2755 	{
2756 		register int c1, c2, c3;
2757 		if (cvindex == 1)
2758 			return (nullcvbuffer);
2759 		assert(cvindex == 2);
2760 		c3 = cvbuffer[0] & 0x7f;
2761 		c1 = c3 & 1;
2762 		c2 = (cvbuffer[1] & 0x7f) +
2763 		     (c1 ? 0x40 - 0x21 : 0x9e - 0x21);
2764 		if (c3 <= 0x25) {
2765 			/* Map 1, 3, 4, and 5-KU */
2766 			/* Note: 2-KU is rejected already. */
2767 			c1 = (c3 - 0x21) / 2 + 0xf0;
2768 		} else if (c3 == 0x28) {
2769 			/* Map 8-KU */
2770 			c1 = 0xf0;
2771 		} else if (c3 <= 0x2f) {
2772 			/* Map 12, 13, 14, and 15-KU */
2773 			c1 = (c3 - 0x2b) / 2 + 0xf2;
2774 		} else {
2775 			/* Map 78-94 KU. */
2776 			/* Note: 16-77 KU is rejected already. */
2777 			c1 = (c3 - 0x6d) / 2 + 0xf4;
2778 		}
2779 		cvbuffer[0] = c1;
2780 		cvbuffer[1] = c2 + (c2 >= 0x7f ? 1 : 0);
2781 		cvindex = 0;
2782 		return (cvbuffer);
2783 	} else if (cs == CP932)
2784 	{
2785 		/* not supported */
2786 		cvindex = 0;
2787 		return (nullcvbuffer);
2788 	} else if (cs == UTF8 || cs == UTF8W  || cs == UTF8Z)
2789 	{
2790 		/* not supported */
2791 		cvindex = 0;
2792 		return (nullcvbuffer);
2793 	}
2794 	assert(0);
2795 	cvindex = 0;
2796 	return (cvbuffer);
2797 }
2798 
convert_to_cp932(c,cs)2799 static char *convert_to_cp932(c, cs)
2800 int c;
2801 int cs;
2802 {
2803 	if (c == 0)
2804 	{
2805 		cvindex = 0;
2806 		return (nullcvbuffer);
2807 	}
2808 
2809 	cvbuffer[cvindex++] = c;
2810 	cvbuffer[cvindex] = '\0';
2811 
2812 	if (CSISWRONG(cs))
2813 	{
2814 		cs = ASCII;
2815 	}
2816 
2817 	cs = CS2CHARSET(cs);
2818 
2819 	if (cs == ASCII || cs == JISX0201ROMAN)
2820 	{
2821 		assert(cvindex == 1);
2822 		cvindex = 0;
2823 		return (cvbuffer);
2824 	} else if (cs == JISX0201KANA)
2825 	{
2826 		assert(cvindex == 1);
2827 		cvbuffer[0] |= 0x80;
2828 		cvindex = 0;
2829 		return (cvbuffer);
2830 	} else if (cs == JISX0208_78KANJI || cs == JISX0208KANJI ||
2831 		   cs == JISX0208_90KANJI)
2832 	{
2833 		int i = cvbuffer[0] & 0x7f;
2834 		int j = cvbuffer[1] & 0x7f;
2835 
2836                 if (cvindex == 1)
2837 			return (nullcvbuffer);
2838                 assert(cvindex == 2);
2839 
2840 		cvbuffer[0] = (i - 0x21) / 2 + ((i <= 0x5e) ? 0x81 : 0xc1);
2841 		cvbuffer[1] = j + ((i & 1) ?((j <= 0x5f) ?0x1f :0x20) :0x7e);
2842 		cvindex = 0;
2843 		return (cvbuffer);
2844 	} else if (cs == CP932)
2845 	{
2846 		int i = cvbuffer[0] & 0x7f;
2847 		int j = cvbuffer[1] & 0x7f;
2848 
2849                 if (cvindex == 1)
2850 			return (nullcvbuffer);
2851                 assert(cvindex == 2);
2852 
2853 		if (i >= 0x30 && i <= 0x74) i += 0x4f;
2854 		cvbuffer[0] = (i - 0x21) / 2 + ((i <= 0x5e) ? 0x81 : 0xc1);
2855 		cvbuffer[1] = j + ((i & 1) ?((j <= 0x5f) ?0x1f :0x20) :0x7e);
2856 		cvindex = 0;
2857 		return (cvbuffer);
2858 	} else if (cs == UTF8 || cs == UTF8W  || cs == UTF8Z)
2859 	{
2860 		/* not supported */
2861 		cvindex = 0;
2862 		return (nullcvbuffer);
2863 	}
2864 	assert(0);
2865 	cvindex = 0;
2866 	return (cvbuffer);
2867 }
2868 
2869 #endif
2870 
/*
 * Write the value c into buf as a NUL terminated UTF-8 style byte
 * sequence of one to six bytes and return buf.  A negative c produces
 * an empty string.  buf needs room for up to seven bytes.
 */
static char *convUTF8(buf, c)
char *buf;
int c;
{
    int ntrail;         /* continuation bytes after the lead byte */
    int lead;
    int i;

    buf[0] = '\0';
    if (c < 0)
        return (buf);

    /* Choose the sequence length and build the lead byte. */
    if (c < 0x80) {
        ntrail = 0;
        lead = c;
    } else if (c < 0x800) {
        ntrail = 1;
        lead = ((c >> 6) & 0x1f) | 0xc0;
    } else if (c < 0x10000) {
        ntrail = 2;
        lead = ((c >> 12) & 0x0f) | 0xe0;
    } else if (c < 0x200000) {
        ntrail = 3;
        lead = ((c >> 18) & 0x07) | 0xf0;
    } else if (c < 0x4000000) {
        ntrail = 4;
        lead = ((c >> 24) & 0x03) | 0xf8;
    } else {
        ntrail = 5;
        lead = ((c >> 30) & 0x01) | 0xfc;
    }

    buf[0] = lead;
    /* Each continuation byte carries six bits, highest bits first. */
    for (i = 1; i <= ntrail; i++)
        buf[i] = ((c >> (6 * (ntrail - i))) & 0x3f) | 0x80;
    buf[ntrail + 1] = '\0';

    return (buf);
}
2916 
/*
 * For the character sets JISX0213KANJI1 and JISX02132004KANJI1, look up
 * the pair (first byte already in cvbuffer[0], second byte c) in
 * jisx0213_comb.  On a match cvbuffer is replaced by the UTF-8 text of
 * u1, followed by that of u2 when u2 is positive, and 1 is returned.
 * Otherwise cvbuffer is left alone and 0 is returned.
 */
static int
need_combining_utf8(c, cs)
int c;
int cs;
{
#if JAPANESE
    struct st_ucs_combining *entry;

    if (cs != JISX0213KANJI1 && cs != JISX02132004KANJI1)
	return 0;

    /* the table ends with an entry whose c1 is zero */
    entry = jisx0213_comb;
    while (entry->c1) {
	if (entry->c1 + 0x20 == (cvbuffer[0] & 0x7f) &&
	    entry->c2 + 0x20 == (c & 0x7f)) {
	    convUTF8(cvbuffer, entry->u1);
	    if (entry->u2 > 0)
		convUTF8(cvbuffer + strlen(cvbuffer), entry->u2);
	    return 1;
	}
	++entry;
    }
#endif

    return 0;
}
2939 
2940 
convert_to_utf8(c,cs)2941 static char *convert_to_utf8(c, cs)
2942 int c;
2943 int cs;
2944 {
2945 	if (c == 0)
2946 	{
2947 		cvindex = 0;
2948 		return (nullcvbuffer);
2949 	}
2950 
2951 	cvbuffer[cvindex++] = c;
2952 	cvbuffer[cvindex] = '\0';
2953 
2954 	if (CSISWRONG(cs))
2955 	{
2956 		cs = ASCII;
2957 	}
2958 
2959 	cs = CS2CHARSET(cs);
2960 	if (cs == ASCII)
2961 	{
2962 	    assert(cvindex == 1);
2963 	    cvindex = 0;
2964 	    return (cvbuffer);
2965 	} else if (cs == UTF8 || cs == UTF8W  || cs == UTF8Z)
2966 	{
2967 	    if (ISUTF8_HEAD(c)) {
2968 		assert(cvindex == 1);
2969 		return (nullcvbuffer);
2970 	    } else if (ISUTF8_REST(c)) {
2971 		int head = cvbuffer[0];
2972 		if ((head & 0xe0) == 0xc0) {
2973 		    assert(cvindex == 2);
2974 		    cvindex = 0;
2975 		    return (cvbuffer);
2976 		} else if ((head & 0xf0) == 0xe0) {
2977 		    if (cvindex <= 2)
2978 			return (nullcvbuffer);
2979 		    assert(cvindex == 3);
2980 		    cvindex = 0;
2981 		    return (cvbuffer);
2982 		} else if ((head & 0xf8) == 0xf0) {
2983 		    if (cvindex <= 3)
2984 			return (nullcvbuffer);
2985 		    assert(cvindex == 4);
2986 		    cvindex = 0;
2987 		    return (cvbuffer);
2988 		} else if ((head & 0xfc) == 0xf8) {
2989 		    if (cvindex <= 4)
2990 			return (nullcvbuffer);
2991 		    assert(cvindex == 5);
2992 		    cvindex = 0;
2993 		    return (cvbuffer);
2994 		} else if ((head & 0xfe) == 0xfc) {
2995 		    if (cvindex <= 5)
2996 			return (nullcvbuffer);
2997 		    assert(cvindex == 6);
2998 		    cvindex = 0;
2999 		    return (cvbuffer);
3000 		}
3001 		assert(0);
3002 	    }
3003 	} else if (cs == JISX0201ROMAN)
3004 	{
3005 	    assert(cvindex == 1);
3006 	    cvindex = 0;
3007 	    return convUTF8(cvbuffer, ucode_latin[UMAP_JISX0201][c]);
3008 	} else if (cs == JISX0201KANA)
3009 	{
3010 	    assert(cvindex == 1);
3011 	    cvindex = 0;
3012 	    return convUTF8(cvbuffer, ucode_latin[UMAP_JISX0201][c | 0x80]);
3013 	} else if (cs == JISX0208_78KANJI || cs == JISX0208KANJI ||
3014                    cs == JISX0208_90KANJI || cs == JISX0213KANJI1 ||
3015                    cs == JISX02132004KANJI1)
3016 	{
3017 	    int num;
3018 	    if (cvindex == 1)
3019 		return (nullcvbuffer);
3020 	    assert(cvindex == 2);
3021 	    if (need_combining_utf8(c, cs)) {
3022 		cvindex = 0;
3023 		return (cvbuffer);
3024 	    }
3025 	    if (cs == JISX0208_78KANJI)
3026 		jis78to90(cvbuffer);
3027 	    num = (cvbuffer[0] - 0x21) * 94 + (cvbuffer[1] - 0x21);
3028 	    if (num < 0 || num > U_kanji) {
3029 		cvindex = 0;
3030 		return (nullcvbuffer);
3031 	    }
3032 	    cvindex = 0;
3033 	    return convUTF8(cvbuffer, ucode_kanji1[num]);
3034         } else if (cs == JISX0212KANJISUP || cs == JISX0213KANJI2)
3035         {
3036 	    int num;
3037 	    if (cvindex == 1)
3038 		return (nullcvbuffer);
3039 	    assert(cvindex == 2);
3040 	    num = (cvbuffer[0] - 0x21) * 94 + (cvbuffer[1] - 0x21);
3041 	    if (num < 0 || num > U_kanji) {
3042 		cvindex = 0;
3043 		return (nullcvbuffer);
3044 	    }
3045 	    cvindex = 0;
3046 	    return convUTF8(cvbuffer, ucode_kanji2[num]);
3047 	} else if (cs == CP932)
3048 	{
3049 	    int num;
3050 	    if (cvindex == 1)
3051 		return (nullcvbuffer);
3052 	    assert(cvindex == 2);
3053 	    num = (cvbuffer[0] - 0x21) * 94 + (cvbuffer[1] - 0x21);
3054 	    if (num < 0 || num > U_kanji) {
3055 		cvindex = 0;
3056 		return (nullcvbuffer);
3057 	    }
3058 	    cvindex = 0;
3059 	    return convUTF8(cvbuffer, ucode_cp932[num]);
3060 	} else {
3061 	    int i;
3062 	    for (i = UMAP_ISO8859_1; i <= UMAP_ISO8859_16; ++ i) {
3063 		if (cs == iso8859_list[i]) {
3064 		    assert(cvindex == 1);
3065 		    cvindex = 0;
3066 		    return convUTF8(cvbuffer, ucode_latin[i][c | 0x80]);
3067 		}
3068 	    }
3069 	}
3070 	cvindex = 0;
3071 	return (cvbuffer);
3072 }
3073 
outchar(c,cs)3074 char *outchar(c, cs)
3075 int c;
3076 CHARSET cs;
3077 {
3078 	if (c < 0)
3079 	{
3080 		c = 0;
3081 		cs = ASCII;
3082 	}
3083 
3084 	if (output & (ESISO7 | ESISO8))
3085 		return (convert_to_iso(c, cs));
3086 	if (output & ESJIS83)
3087 		return (convert_to_jis(c, cs));
3088 #if JAPANESE
3089 	if (output & ESUJIS)
3090 		return (convert_to_ujis(c, cs));
3091 	if (output & ESSJIS)
3092 		return (convert_to_sjis(c, cs));
3093 	if (output & ESCP932)
3094 		return (convert_to_cp932(c, cs));
3095 #endif
3096 	if (output & ESUTF8)
3097 		return (convert_to_utf8(c, cs));
3098 	cvbuffer[0] = c;
3099 	cvbuffer[1] = '\0';
3100 	return (cvbuffer);
3101 }
3102 
outbuf(p,cs)3103 char *outbuf(p, cs)
3104 unsigned char *p;
3105 CHARSET cs;
3106 {
3107 	static char buffer[1024];
3108 	char *s;
3109 	int i = 0;
3110 
3111 	while (*p != '\0')
3112 	{
3113 		s = outchar(*p++, cs);
3114 		while (*s != '\0')
3115 			buffer[i++] = *s++;
3116 		assert(i < (int)sizeof(buffer));
3117 	}
3118 	buffer[i] = '\0';
3119 	return (buffer);
3120 }
3121 
mwidth(c,cs)3122 int mwidth(c, cs)
3123 int c;
3124 CHARSET cs;
3125 {
3126 	if (CSISREST(cs))
3127 		return (0);
3128 
3129 	switch (cs)
3130 	{
3131 	case UTF8Z:
3132 	    return 0;
3133 	case UTF8:
3134 	    return 1;
3135 	case UTF8W:
3136 	    return 2;
3137 	}
3138 
3139 	switch (CS2TYPE(cs))
3140 	{
3141 	case TYPE_94_CHARSET:
3142 	case TYPE_96_CHARSET:
3143 		return (1);
3144 	case TYPE_94N_CHARSET:
3145 	case TYPE_96N_CHARSET:
3146 		return (2);
3147 	default:
3148 		assert(0);
3149 		return (0);
3150 	}
3151 }
3152 
rotate_right_codeset(mp)3153 char *rotate_right_codeset(mp)
3154 MULBUF *mp;
3155 {
3156 	char *p = NULL;
3157 
3158 	mp->rotation_io_right++;
3159 	mp->rotation_io_right %= 8;
3160 	switch (mp->rotation_io_right) {
3161 	case 0: p = "original"; mp->io.inputr = mp->orig_io_right; break;
3162 	case 1: p = "utf-8"; mp->io.inputr = ESUTF8;
3163 		make_unicode_map(mp->io.scs, output & ESUTF8); break;
3164 	case 2: p = "ujis"; mp->io.inputr = ESUJIS; break;
3165 	case 3: p = "sjis"; mp->io.inputr = ESSJIS; break;
3166 	case 4: p = "cp932"; mp->io.inputr = ESCP932; break;
3167 	case 5: p = "iso8"; mp->io.inputr = ESISO8; break;
3168 	case 6: p = "noconv"; mp->io.inputr = ESNOCONV; break;
3169 	case 7: p = "none"; mp->io.inputr = ESNONE; break;
3170 	default: assert(0); break;
3171 	}
3172 	init_priority(mp);
3173 	return (p);
3174 }
3175 
3176 #endif
3177 
strlen_cs(str,cs)3178 int strlen_cs(str, cs)
3179 char* str;
3180 CHARSET* cs;
3181 {
3182 	int i = 0;
3183 	if (cs == NULL)
3184 		return strlen(str);
3185 	while (*str != NULCH || !CSISNULLCS(*cs)) {
3186 		str++;
3187 		cs++;
3188 		i++;
3189 	}
3190 	return i;
3191 }
3192 
chlen_cs(chstr,cs)3193 int chlen_cs(chstr, cs)
3194 char* chstr;
3195 CHARSET* cs;
3196 {
3197 	int i;
3198 	if (cs == NULL)
3199 	{
3200 		if (chstr == NULL || *chstr == NULCH)
3201 			return 0;
3202 		else
3203 			return 1;
3204 	}
3205 	if (*chstr == NULCH && CSISNULLCS(*cs))
3206 		return 0;
3207 	i = 0;
3208 	do {
3209 		i++;
3210 		cs++;
3211 	} while (CSISREST(*cs));
3212 	return i;
3213 }
3214 
strdup_cs(str,cs,csout)3215 char* strdup_cs(str, cs, csout)
3216 char* str;
3217 CHARSET* cs;
3218 CHARSET** csout;
3219 {
3220 	int len = strlen_cs(str, cs);
3221 	char* save_str = (char *)ecalloc(len + 1, 1);
3222 	CHARSET* save_cs = (CHARSET *)ecalloc(len + 1, sizeof(CHARSET));
3223 	memcpy(save_str, str, sizeof(char) * (len + 1));
3224 	if (cs)
3225 		memcpy(save_cs, cs, sizeof(CHARSET) * (len + 1));
3226 	else {
3227 		cs = save_cs;
3228 		while (--len >= 0)
3229 			*cs++ = ASCII;
3230 		*cs = NULLCS;
3231 	}
3232 	*csout = save_cs;
3233 	return save_str;
3234 }
3235