1 /*
2  * ctext.c -- Compound Text <-> Japanese Wide Character String converter
3  */
4 
5 /******************************************************************************
6 
* Designation
8 		1byte			multi-byte
9 		94char      96char	94char       96char
10 	-------------------------------------------------------
11 	G0  :	ESC ( F	 |  -none-	ESC $ ( F  |  -none-
12 	G1  :	ESC ) F  |  ESC - F	ESC $ ) F  |  ESC $ - F
13 
* Final character F
15 	1byte
16 	    94chars
17 		B	ASCII
18 		I	JIS KANA
19 		J	JIS-ROMAN
20 	    96chars
21 		A	8859/1 right half
22 		B	8859/2 right half
23 		C	8859/3 right half
24 		D	8859/4 right half
25 		F	8859/7 right half
26 		G	8859/6 right half
27 		H	8859/8 right half
28 		M	8859/9 (DIS) right half
29 	multi-byte
30 	    94chars ^ 2
31 		A	GB Hanzi
32 		B	JIS Kanji 1983
33 		C	KS Hangul/Hanja
34 
35 -------------------------------------------------------------------------------
Specification of COMPOUND_TEXT (Compound Text Encoding Version 1 -- MIT X Consortium Standard)
* Only G0 and G1 are used.  G2 and G3 are not used.
* G0 is invoked into GL and G1 into GR, and this cannot be changed.
  In other words, Locking Shift and Single Shift are not used.
* As the initial state, ISO Latin-1 is designated to G0/G1.
* ESC-$-F is not used to designate a multibyte character set to G0;
  ESC-$-(-F is used instead.
* The usable final characters are those listed above.
* The only C0 characters that may be used are NL, TAB and ESC.
* The only C1 character that may be used is CSI.
* Sequences for the text drawing direction are included:
	left to right
	right to left
	return to the original direction
50 ******************************************************************************/
51 
52 /*
53  * Copyright (c) 1989  Software Research Associates, Inc.
54  *
55  * Permission to use, copy, modify, and distribute this software and its
56  * documentation for any purpose and without fee is hereby granted, provided
57  * that the above copyright notice appear in all copies and that both that
58  * copyright notice and this permission notice appear in supporting
59  * documentation, and that the name of Software Research Associates not be
60  * used in advertising or publicity pertaining to distribution of the
61  * software without specific, written prior permission.  Software Research
62  * Associates makes no representations about the suitability of this software
63  * for any purpose.  It is provided "as is" without express or implied
64  * warranty.
65  *
66  * Author:  Makoto Ishisone, Software Research Associates, Inc., Japan
67  *		ishisone@sra.co.jp
68  */
69 
70 #ifndef lint
71 static char *rcsid = "$Id: ctext.c,v 2.6 1999/03/10 08:55:15 ishisone Exp $";
72 #endif
73 
/* Wide character type used by the converters in this file. */
typedef unsigned short	wchar;

/*
 * Fallback definition for a translation unit that includes no standard
 * header.  It is guarded so that it never redefines the NULL already
 * supplied by <stddef.h>, <stdio.h>, <string.h> and friends.
 */
#ifndef NULL
#define NULL	0
#endif

/*
 * Flag bits OR-ed with the final character of a designation sequence to
 * describe the character set currently held in G0 or G1.
 */
#define CS96	0x100	/* 96chars CS */
#define MBCS	0x200	/* Multibyte CS */
80 
81 /* convJWStoCT -- Japanese Wide Character String -> COMPOUND_TEXT */
82 int
convJWStoCT(wstr,xstr,jisroman)83 convJWStoCT(wstr, xstr, jisroman)
84 register wchar *wstr;
85 register unsigned char *xstr;
86 int jisroman;	/* true $B$J$i$P(B G0 $B$N%-%c%i%/%?%;%C%H$H$7$F(B JIS ROMAN $B$r!"(B
87 		 * false $B$J$i$P(B ASCII $B$r;HMQ$9$k(B
88 		 */
89 /* Wide Character string wstr $B$r(B COMPOUND_TEXT xstr $B$KJQ49$7!"(B
90  * $BJQ498e$N%P%$%H?t$rJV$9(B($B:G8e$N(B null byte $B$O4^$^$J$$(B)$B!#$b$7(B xstr $B$,(B
91  * NULL $B$J$i$PJQ49$O$;$:!"J8;z?t$N$_$rJV$9!#(B
92  */
93 {
94 	register int	c;
95 	register int	g0, g1;
96 	register int	n = 0;
97 	int		g0cs;
98 
99 	g0cs = jisroman ? 'J' : 'B';
100 
101 	g0 = 'B';
102 	g1 = CS96|'A';
103 
104 	/*
105 	 * G0, G1 $B$O<!$N$h$&$K;H$$J,$1$k(B
106 	 *  G0: ASCII / JIS-ROMAN
107 	 *  G1: $B4A;z(B / $B$+$J(B
108 	 */
109 
110 	while (c = *wstr++) {
111 		switch (c & 0x8080) {
112 		case 0:		/* ASCII or C0 or DEL */
113 			if (g0 != g0cs) {
114 				if (xstr) {
115 					*xstr++ = '\033';
116 					*xstr++ = '(';
117 					*xstr++ = g0cs;
118 				}
119 				n += 3;
120 				g0 = g0cs;
121 				/*
122 				 * We have to invalidate G1 here,
123 				 * which is unnecessary if Xlib
124 				 * implementation is sane.
125 				 */
126 				g1 = g0cs;
127 			}
128 			/*
129 			 * Of course it isn't necessary to disignate
130 			 * ASCII to G0 before a control character, but
131 			 * someone reported certain version of Xlib needs
132 			 * this. sigh.
133 			 */
134 			if (c < ' ' || c == 0x7f) {
135 				/* C0 or DEL */
136 				if (c == '\t' || c == '\n') {
137 					if (xstr) *xstr++ = c;
138 					n++;
139 				}
140 				break;
141 			}
142 			if (xstr) *xstr++ = c & 0x7f;
143 			n++;
144 			break;
145 		case 0x80:	/* $B$+$J(B or C1 */
146 			if (0x80 <= c && c <= 0x9f) break;
147 			if (g1 != 'I') {
148 				if (xstr) {
149 					*xstr++ = '\033';
150 					*xstr++ = ')';
151 					*xstr++ = 'I';
152 				}
153 				n += 3;
154 				g1 = 'I';
155 				g0 = 'I';	/* invalidate G0. see below */
156 			}
157 			if (xstr) *xstr++ = c & 0xff;
158 			n++;
159 			break;
160 		case 0x8080:	/* $B4A;z(B */
161 			if (g1 != (MBCS|'B')) {
162 				if (xstr) {
163 					*xstr++ = '\033';
164 					*xstr++ = '$';
165 					*xstr++ = ')';
166 					*xstr++ = 'B';
167 				}
168 				n += 4;
169 				g1 = MBCS|'B';
170 				/*
171 				 * We have to invalidate G0 here,
172 				 * which is unnecessary if Xlib
173 				 * implementation is sane.
174 				 */
175 				g0 = MBCS|'B';
176 			}
177 			if (xstr) {
178 				*xstr++ = (c >> 8) & 0xff;
179 				*xstr++ = c & 0xff;
180 			}
181 			n += 2;
182 			break;
183 		default:
184 			/* $BL5;k$9$k(B */
185 			break;
186 		}
187 	}
188 	if (xstr) *xstr = '\0';
189 	return n;
190 }
191 
/*
 * getesc -- scan the part of an escape sequence that follows the ESC octet.
 *
 * str points just past the ESC and len is the number of octets available
 * there.  The sequence must consist of zero or more intermediate characters
 * (02/00 .. 02/15) followed by one final character (03/00 .. 07/14).
 *
 * Returns a pointer to the octet following the final character, or NULL
 * when the input ends before the final character or the octet found in its
 * place is outside the valid range.
 */
static unsigned char *
getesc(str, len)
unsigned char *str;
int len;
{
	register int	ch;

	/* Step over the intermediate characters. */
	for (; len > 0; str++, len--) {
		ch = *str;
		if (!(0x20 <= ch && ch <= 0x2f))
			break;
	}

	/* One more octet has to be left for the final character. */
	if (len <= 0)
		return (unsigned char *)NULL;

	ch = *str;
	if (ch < 0x30 || ch > 0x7e)
		return (unsigned char *)NULL;

	return str + 1;
}
215 
/*
 * getcsi -- scan the part of a CSI sequence that follows the CSI octet.
 *
 * str points just past the CSI and len is the number of octets available
 * there.  The sequence must consist of zero or more parameter characters
 * (03/00 .. 03/15), then zero or more intermediate characters
 * (02/00 .. 02/15), then one final character (04/00 .. 07/14).
 *
 * Returns a pointer to the octet following the final character, or NULL
 * when the input ends before the final character or the octet found in its
 * place is outside the valid range.
 */
static unsigned char *
getcsi(str, len)
unsigned char *str;
int len;
{
	register int	ch;

	/* Step over the parameter characters. */
	for (; len > 0; str++, len--) {
		ch = *str;
		if (!(0x30 <= ch && ch <= 0x3f))
			break;
	}

	/* Step over the intermediate characters. */
	for (; len > 0; str++, len--) {
		ch = *str;
		if (!(0x20 <= ch && ch <= 0x2f))
			break;
	}

	/* One more octet has to be left for the final character. */
	if (len <= 0)
		return (unsigned char *)NULL;

	ch = *str;
	if (ch < 0x40 || ch > 0x7e)
		return (unsigned char *)NULL;

	return str + 1;
}
246 
247 /* convCTtoJWS -- COMPOUND_TEXT -> Japanese Wide Character String */
248 int
convCTtoJWS(xstr,len,wstr)249 convCTtoJWS(xstr, len, wstr)
250 register unsigned char *xstr;
251 int len;
252 wchar *wstr;
253 /* COMPOUND_TEXT xstr $B$r(B Wide Character string wstr $B$KJQ49$7!"(B
254  * $BJQ498e$NJ8;z?t$rJV$9(B($B:G8e$N(B null $BJ8;z$O4^$^$J$$(B)$B!#$b$7(B wstr $B$,(B
255  * NULL $B$J$i$PJQ49$O$;$:!"J8;z?t$N$_$rJV$9!#(B
256  */
257 {
258 	register int	c;
259 	int	nskip;
260 	int	n = 0;
261 	int	g0, g1, gs;
262 	unsigned char	*xstr1;
263 
264 	/*
265 	 * Compound Text $BCf$K$O(B null octet $B$,4^$^$l$k2DG=@-$,$"$k(B
266 	 * $B$=$3$GJ8;zNs$ND9$5(B len $B$r0z?t$G;XDj$G$-$k$h$&$K$7$F$"$k$N$@$,!"(B
267 	 * 0 $B$"$k$$$OIi$N;~$K$O(B (null octet $B$O$J$$$b$N$H$7$F(B) strlen() $B$G(B
268 	 * $BD9$5$rD4$Y$k(B
269 	 */
270 	if (len <= 0) {
271 		len = strlen((char *)xstr);
272 	}
273 
274 	/* $B=i4|>uBV$O!"(BISO 8859/1 $B$,(B G0/G1 $B$KF~$C$F$$$k(B */
275 	g0 = 'B';	/* ASCII -> G0 */
276 	g1 = CS96|'A';	/* Latin/1 right hand part -> G1 */
277 
278 	while (len-- > 0) {
279 		switch (c = *xstr++) {
280 		case '\n':	/* NEWLINE */
281 		case '\t':	/* TAB */
282 			if (wstr) *wstr++ = c;
283 			n++;
284 			break;
285 		case 0x9b:	/* CSI */
286 			/*
287 			 * CSI $B$N0lHL7A$O(B
288 			 *	CSI {P} {I} F
289 			 * $B%Q%i%a%?(B P $B$O(B 03/00 $B$+$i(B 03/15$B!"(B
290 			 * $BCf4VJ8;z(B I $B$O(B 02/00 $B$+$i(B 02/15$B!"(B
291 			 * $B=*C<J8;z(B F $B$O(B 04/00 $B$+$i(B 07/14 $B$NHO0O(B
292 			 */
293 			/*
294 			 * $B8=:_Dj5A$5$l$F$$$k$N$O(B directionality $B$@$1$G!"(B
295 			 * $B$=$l$O(B
296 			 *	CSI-1-]		begin left-to-right text
297 			 *	CSI-2-]		begin right-to-left text
298 			 *	CSI-]		end of string
299 			 * $B$G$"$k(B
300 			 * $B$,$H$j$"$($::#$O$3$l$rL5;k$9$k$N$G!"(BCSI $B$N(B
301 			 * $B%7!<%1%s%9$O$9$Y$FL5;k!"$H$$$&$3$H$K$J$k(B
302 			 */
303 			xstr1 = getcsi(xstr, len);
304 			if (xstr1 == NULL)
305 				return -1;
306 			len -= xstr1 - xstr;
307 			xstr = xstr1;
308 			break;
309 		case '\033':	/* ESC */
310 			/*
311 			 * $B%(%9%1!<%W%7!<%1%s%9$N0lHL7A$O(B
312 			 *	ESC {I} F
313 			 * $BCf4VJ8;z(B I $B$O(B 02/00 $B$+$i(B 02/15 $B$G!"(B
314 			 * $B=*C<J8;z(B F $B$O(B 03/00 $B$+$i(B 07/14 $B$NHO0O(B
315 			 */
316 			/*
317 			 * $B8=:_Dj5A$5$l$F$$$k$N$O!"(B
318 			 *   $B%9%?%s%@!<%I%-%c%i%/%?%;%C%H(B
319 			 *	ESC-(-F
320 			 *	ESC-$-(-F
321 			 *	ESC-)-F
322 			 *	ESC---F
323 			 *	ESC-$-)-F
324 			 *   $B%N%s%9%?%s%@!<%I%-%c%i%/%?%;%C%H(B
325 			 *	ESC-%-/-[0123]
326 			 * $B%9%?%s%@!<%I$J%-%c%i%/%?%;%C%H$O@5$7$/2r<a(B
327 			 * $B$7$J$/$F$O$J$i$J$$$7!"%N%s%9%?%s%@!<%I$J$b$N$O(B
328 			 * $BL5;k$9$k$1$l$I$b%G!<%?$r%9%-%C%W$9$kI,MW$,$"$k(B
329 			 */
330 			xstr1 = getesc(xstr, len);
331 			if (xstr1 == NULL)
332 				return -1;
333 			len -= xstr1 - xstr;
334 			switch (xstr1 - xstr) {
335 			case 2:		/* ESC - I - F */
336 				switch (*xstr++) {
337 				case '(':	/* 94chars CS -> G0 */
338 					g0 = *xstr;
339 					break;
340 				case ')':	/* 94chars CS -> G1 */
341 					g1 = *xstr;
342 					break;
343 				case '-':	/* 96chars CS -> G1 */
344 					g1 = *xstr | CS96;
345 					break;
346 				default:	/* ignore */
347 					break;
348 				}
349 				break;
350 			case 3:		/* ESC - I - I - F */
351 				switch (*xstr++) {
352 				case '$':
353 					switch (*xstr++) {
354 					case '(':	/* 94chars MBCS -> G0 */
355 						g0 = *xstr | MBCS;
356 						break;
357 					case ')':	/* 94chars MBCS -> G1 */
358 						g1 = *xstr | MBCS;
359 						break;
360 					case '-':	/* 96chars MBCS -> G1 */
361 						g1 = *xstr | CS96 | MBCS;
362 						break;
363 					default:	/* ignore */
364 						break;
365 					}
366 					break;
367 				case '%':
368 					if (*xstr++ != '/') {
369 						/* unknown sequence */
370 						break;
371 					}
372 					/*
373 					 * $B%W%i%$%Y!<%H%(%s%3!<%G%#%s%0(B
374 					 * $B40A4$KL5;k$9$k(B
375 					 * $B$?$@$7$=$N$"$H$KB3$/%G!<%?$r(B
376 					 * $B%9%-%C%W$9$kI,MW$,$"$k(B
377 					 *	ESC-%-/-F-M-L
378 					 */
379 					len -= 2;
380 					if (len < 0)
381 						return -1;
382 					nskip = (*xstr1 & 0x7f) * 128 +
383 					    (*(xstr1 + 1) & 0x7f);
384 					if ((len -= nskip) < 0)
385 						return -1;
386 					xstr1 += nskip + 2;
387 					break;
388 				default:
389 					break;
390 				}
391 				break;
392 			default:
393 				break;
394 			}
395 			xstr = xstr1;
396 			break;
397 		default:
398 			if (!(c & 0x60)) {
399 				/*
400 				 * NL/TAB/ESC/CSI $B0J30$N(B C0 or C1
401 				 * $B$3$l$OL@$i$+$K%(%i!<(B
402 				 */
403 				return -1;
404 			}
405 			gs = (c & 0x80) ? g1 : g0;
406 			c &= 0x7f;
407 			if (gs & MBCS) {
408 				switch (gs & 0x70) {
409 				case 0x70:	/* 4byte/char */
410 					if (--len < 0) return -1;
411 					c = (c << 8) | (*xstr++ & 0x7f);
412 				case 0x60:	/* 3byte/char */
413 					if (--len < 0) return -1;
414 					c = (c << 8) | (*xstr++ & 0x7f);
415 				case 0x50:	/* 2byte/char */
416 				case 0x40:	/* 2byte/char */
417 					if (--len < 0) return -1;
418 					c = (c << 8) | (*xstr++ & 0x7f);
419 					break;
420 				default:
421 					return -1;
422 				}
423 			}
424 			if (wstr) {
425 				switch (gs) {
426 				case 'B':
427 				case 'J':
428 					*wstr++ = c;
429 					n++;
430 					break;
431 				case 'I':
432 					*wstr++ = 0x80 | c;
433 					n++;
434 					break;
435 				case MBCS|'B':
436 					*wstr++ = 0x8080 | c;
437 					n++;
438 					break;
439 				}
440 			} else {
441 				switch (gs) {
442 				case 'B':
443 				case 'J':
444 				case 'I':
445 					n++;
446 					break;
447 				case MBCS|'B':
448 					n++;
449 					break;
450 				}
451 			}
452 			break;
453 		}
454 	}
455 	if (wstr) *wstr = 0;
456 	return n;
457 }
458