1 /*
2 * jconv.c - alternative japanese code conversion routines
3 *
4 * Copyright (c) 2000-2020 Shiro Kawai <shiro@acm.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the authors nor the names of its contributors
18 * may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /* Some iconv() implementations don't support japanese character encodings,
35 * or have problems handling them. This code provides an alternative way
36 * to convert these encodings.
37 */
38
39 /* This file handles conversion among UTF8, Shift-JIS, EUC_JP, and ISO2022JP.
40 * Shift-JIS and EUC_JP are based on JIS X 0213:2000. ISO2022JP partially
41 * handles ISO2022-JP-3 as well.
42 */
43
44 #include <ctype.h>
45 #include "charconv.h"
46 #include "jconv_tab.h"
47 #include "latin_tab.h"
48
/* Leave the enclosing routine with INPUT_NOT_ENOUGH when fewer than N
   octets are available.  Relies on a variable named `inroom' being
   visible where the macro is expanded. */
#define INCHK(n)                                            \
    do {                                                    \
        if ((int)inroom < (n)) return INPUT_NOT_ENOUGH;     \
    } while (0)

/* Leave the enclosing routine with OUTPUT_NOT_ENOUGH when fewer than N
   octets can be written.  Relies on a variable named `outroom' being
   visible where the macro is expanded. */
#define OUTCHK(n)                                           \
    do {                                                    \
        if ((int)outroom < (n)) return OUTPUT_NOT_ENOUGH;   \
    } while (0)

/* True if N is a (negative) error code rather than an octet count. */
#define ERRP(n) ((n) < 0)

#define INTERMEDIATE_BUF_SIZE 6
55
56 /* Fill outptr with substitution character. Can return jconv error code. */
/* Write the replacement sequence held in CINFO to OUTPTR.
 *
 * On success, stores the number of octets written in *OUTCHARS and
 * returns that same number.  Returns NO_OUTPUT_CHAR when CINFO has no
 * replacement sequence (replaceSize is zero), or OUTPUT_NOT_ENOUGH when
 * OUTROOM is smaller than the replacement sequence; in either case
 * nothing is written.
 */
static inline int do_subst(ScmConvInfo *cinfo,
                           char *outptr,
                           ScmSize outroom,
                           ScmSize *outchars)
{
    if (cinfo->replaceSize == 0) return NO_OUTPUT_CHAR;
    OUTCHK(cinfo->replaceSize);

    int k = 0;
    while (k < cinfo->replaceSize) {
        outptr[k] = cinfo->replaceSeq[k];
        k++;
    }
    *outchars = cinfo->replaceSize;
    return cinfo->replaceSize;
}
72
/* Emit the substitution sequence through do_subst(); if that fails,
   return its error code from the enclosing routine.  Relies on
   `cinfo', `outptr', `outroom' and `outchars' being visible where the
   macro is expanded. */
#define DO_SUBST                                                        \
    do {                                                                \
        int subst_result_ = do_subst(cinfo, outptr, outroom, outchars); \
        if (subst_result_ < 0) return subst_result_;                    \
    } while (0)
78
79 /******************************************************************
80 *
81 * Single-unit handling routines
82 *
 * This section defines routines that convert a single input unit
 * to a single output unit, optionally affecting the state.
85 * A unit is usually a character, but sometimes one input character
86 * may be mapped to more than one output characters, or a sequence of
87 * input characters is mapped to one output character.
88 *
89 * The routine returns the number of input octets consumed, and
90 * sets the number of output octets emitted in *outchars.
 * If an erroneous condition occurs, it returns one of the following
 * error codes, and does not update *outchars.
93 *
94 * ILLEGAL_SEQUENCE - Input contains illegal sequence.
95 * INPUT_NOT_ENOUGH - Input sequence ends prematurely.
96 * OUTPUT_NOT_ENOUGH - Output buffer is too small.
97 * NO_OUTPUT_CHAR - Input unit can't be represented in output CES.
98 *
99 *****************************************************************/
100
101 /*=================================================================
102 * EUC-JP
103 */
104
105 /* EUC_JISX0213 -> Shift_JIS
106 *
 * Mapping anomalies
108 *
109 * 0x80--0xa0 except 0x8e and 0x8f : C1 region.
110 * Doesn't have corresponding SJIS bytes,
111 * so mapped to substitution char.
112 * 0xff : reserved byte. mapped to substitution char.
113 *
114 * Conversion scheme
115 * 0x00-0x7f : corresponding ASCII range.
116 * 0x80--0x8d : substitution char.
117 * 0x8e : leading byte of JISX 0201 kana
118 * 0x8f : leading byte of JISX 0212 or JISX 0213 plane 2
119 * 0x90--0xa0 : substitution char.
120 * 0xa1--0xfe : first byte (e1) of JISX 0213 plane 1
121 * 0xff : substitution char
122 *
 * For a double- or triple-byte character, each subsequent byte has to be
 * in the range between 0xa1 and 0xfe inclusive. If not, the character is
 * replaced with the substitution character.
126 *
127 * If the first byte is in the range of 0xa1--0xfe, two bytes (e1, e2)
128 * is mapped to SJIS (s1, s2) by:
129 *
130 * s1 = (e1 - 0xa0 + 0x101)/2 if 0xa1 <= e1 <= 0xde
131 * (e1 - 0xa0 + 0x181)/2 if 0xdf <= e1 <= 0xfe
132 * s2 = (e2 - 0xa0 + 0x3f) if odd?(e1) && 0xa1 <= e2 <= 0xdf
133 * (e2 - 0xa0 + 0x40) if odd?(e1) && 0xe0 <= e2 <= 0xfe
134 * (e2 - 0xa0 + 0x9e) if even?(e1)
135 *
136 * If the first byte is 0x8f, the second byte (e1) and the third byte
137 * (e2) is mapped to SJIS (s1, s2) by:
138 * if (0xee <= e1 <= 0xfe) s1 = (e1 - 0xa0 + 0x19b)/2
139 * otherwise, follow the table:
140 * e1 == 0xa1 or 0xa8 => s1 = 0xf0
141 * e1 == 0xa3 or 0xa4 => s1 = 0xf1
142 * e1 == 0xa5 or 0xac => s1 = 0xf2
143 * e1 == 0xae or 0xad => s1 = 0xf3
144 * e1 == 0xaf => s1 = 0xf4
145 * If e1 is other value, it is JISX0212; we use substitution char.
146 * s2 is mapped with the same rule above.
147 */
148
/* Shift_JIS second byte for an EUC row byte E1 and column byte E2
 * (both expected in 0xa1..0xfe).  Odd rows land in 0x40..0x9e, skipping
 * 0x7f; even rows land in 0x9f..0xfc.
 */
static inline unsigned char eucj_sjis_trail(unsigned char e1, unsigned char e2)
{
    if (e1%2 == 0) return e2 - 0xa0 + 0x9e;
    if (e2 <= 0xdf) return e2 - 0xa0 + 0x3f;
    return e2 - 0xa0 + 0x40;
}

/* Convert one EUC_JISX0213 unit at INPTR to Shift_JIS.
 *
 * Returns the number of input octets consumed and stores the number of
 * output octets in *OUTCHARS.  May return INPUT_NOT_ENOUGH,
 * OUTPUT_NOT_ENOUGH, or whatever error DO_SUBST propagates.
 *
 * Input that has no Shift_JIS counterpart (C1 bytes, 0xa0, 0xff,
 * out-of-range trailing bytes, JIS X 0212 rows) is replaced by the
 * substitution sequence.
 */
static ScmSize eucj_sjis(ScmConvInfo *cinfo SCM_UNUSED,
                         const char *inptr, ScmSize inroom,
                         char *outptr, ScmSize outroom,
                         ScmSize *outchars)
{
    unsigned char e1 = inptr[0];
    if (e1 <= 0x7f) {
        outptr[0] = e1;
        *outchars = 1;
        return 1;
    }
    if (e1 >= 0xa1 && e1 <= 0xfe) {
        /* double byte char (JISX 0213 plane 1) */
        unsigned char s1;
        INCHK(2);
        unsigned char e2 = inptr[1];
        if (e2 < 0xa1 || e2 == 0xff) {
            DO_SUBST;
            return 2;
        }
        OUTCHK(2);
        if (e1 <= 0xde) s1 = (e1 - 0xa0 + 0x101)/2;
        else            s1 = (e1 - 0xa0 + 0x181)/2;
        outptr[0] = s1;
        outptr[1] = eucj_sjis_trail(e1, e2);
        *outchars = 2;
        return 2;
    }
    if (e1 == 0x8e) {
        /* JISX 0201 kana */
        INCHK(2);
        unsigned char e2 = inptr[1];
        if (e2 < 0xa1 || e2 == 0xff) {
            DO_SUBST;
        } else {
            outptr[0] = e2;
            *outchars = 1;
        }
        return 2;
    }
    if (e1 == 0x8f) {
        /* triple byte char */
        unsigned char s1;
        /* SJIS lead byte for plane 2 rows 0xa1..0xaf, indexed by
           (row - 0xa1).  Zero marks rows that belong to JIS X 0212. */
        static const unsigned char cvt[] = { 0xf0, 0, 0xf1, 0xf1, 0xf2, 0, 0, 0xf0, 0, 0, 0, 0xf2, 0xf3, 0xf3, 0xf4 };

        INCHK(3);
        OUTCHK(2);
        e1 = inptr[1];
        unsigned char e2 = inptr[2];
        if (e1 < 0xa1 || e1 == 0xff || e2 < 0xa1 || e2 == 0xff) {
            DO_SUBST;
            return 3;
        }
        if (e1 >= 0xee) {
            s1 = (e1 - 0xa0 + 0x19b)/2;
        } else if (e1 >= 0xb0) {
            /* rows 0xb0..0xed are JIS X 0212 only */
            DO_SUBST;
            return 3;
        } else {
            s1 = cvt[e1-0xa1];
            if (s1 == 0) {
                DO_SUBST;
                return 3;
            }
        }
        outptr[0] = s1;
        /* Same second-byte rule as plane 1; in particular e2 == 0xdf
           must map to 0x7e, not to the invalid trail byte 0x7f. */
        outptr[1] = eucj_sjis_trail(e1, e2);
        *outchars = 2;
        return 3;
    }
    /* no corresponding char */
    DO_SUBST;
    return 1;
}
235
236 /* [EUC_JP -> UTF8 conversion]
237 *
238 * Conversion strategy:
239 * If euc0 is in ASCII range, or C1 range except 0x8e or 0x8f, map it as is.
240 * If euc0 is 0x8e, use JISX0201-KANA table.
241 * If euc0 is 0x8f, use JISX0213 plane 2 table.
242 * If euc0 is in [0xa1-0xfe], use JISX0213 plane1 table.
243 * If euc0 is 0xa0 or 0xff, return ILLEGAL_SEQUENCE.
244 *
245 * JISX0213 plane2 table is consisted by a 2-level tree. The first-level
246 * returns an index to the second-level table by (euc1 - 0xa1). Only the
247 * range of JISX0213 defined region is converted; JISX0212 region will be
248 * mapped to the substitution char.
249 */
250
251 #include "eucj2ucs.c"
252
253 /* UTF8 utility. Similar stuff is included in gauche/char_utf_8.h
254 if the native encoding is UTF8, but not otherwise.
255 So I include them here as well. */
256
/* Store the UTF-8 encoding of UCS at CP.  One to six octets are written
   depending on the magnitude of UCS; the caller must provide enough
   room.  Nothing is returned. */
void jconv_ucs4_to_utf8(unsigned int ucs, char *cp)
{
    int ntrail;             /* number of continuation octets */
    unsigned int marker;    /* length-marker bits of the lead octet */
    unsigned int payload;   /* mask of data bits kept in the lead octet */

    if (ucs < 0x80) {
        *cp = ucs;
        return;
    }

    if (ucs < 0x800)          { ntrail = 1; marker = 0xc0; payload = 0x1f; }
    else if (ucs < 0x10000)   { ntrail = 2; marker = 0xe0; payload = 0x0f; }
    else if (ucs < 0x200000)  { ntrail = 3; marker = 0xf0; payload = 0x07; }
    else if (ucs < 0x4000000) { ntrail = 4; marker = 0xf8; payload = 0x03; }
    else                      { ntrail = 5; marker = 0xfc; payload = 0x01; }

    *cp++ = ((ucs >> (6*ntrail)) & payload) | marker;
    for (int shift = 6*(ntrail-1); shift >= 0; shift -= 6) {
        *cp++ = ((ucs >> shift) & 0x3f) | 0x80;
    }
}
292
293 /* Returns # of input chars, or negative error code on error */
/* Decode one UTF-8 sequence starting at CP, with SIZE octets available,
 * and store the code point in *UCS.
 *
 * Returns the number of octets consumed (1..6), or a negative error:
 *   INPUT_NOT_ENOUGH  - the lead octet announces more octets than SIZE.
 *   ILLEGAL_SEQUENCE  - the lead octet is a continuation octet or
 *                       0xfe/0xff, or the sequence is overlong (the
 *                       value would fit in a shorter encoding).
 * *UCS is left untouched on error.  Continuation octets are masked but
 * not otherwise validated.
 */
int jconv_utf8_to_ucs4(const char *cp, ScmSize size, ScmChar *ucs)
{
    u_char u0 = cp[0];
    if (u0 < 0x80) {
        *ucs = u0;
        return 1;
    } else if (u0 < 0xc0) {
        return ILLEGAL_SEQUENCE;
    } else if (u0 < 0xe0) {
        if (size < 2) return INPUT_NOT_ENOUGH;
        u_char u1 = cp[1];
        ScmChar ch = ((u0 & 0x1f) << 6) | (u1 & 0x3f);
        if (ch < 0x80) return ILLEGAL_SEQUENCE;
        *ucs = ch;
        return 2;
    } else if (u0 < 0xf0) {
        if (size < 3) return INPUT_NOT_ENOUGH;
        u_char u1 = cp[1], u2 = cp[2];
        ScmChar ch = ((u0 & 0x0f) << 12) | ((u1 & 0x3f) << 6) | (u2 & 0x3f);
        if (ch < 0x800) return ILLEGAL_SEQUENCE;
        *ucs = ch;
        return 3;
    } else if (u0 < 0xf8) {
        if (size < 4) return INPUT_NOT_ENOUGH;
        u_char u1 = cp[1], u2 = cp[2], u3 = cp[3];
        ScmChar ch = ((u0 & 0x07) << 18) | ((u1 & 0x3f) << 12)
            | ((u2 & 0x3f) << 6) | (u3 & 0x3f);
        if (ch < 0x10000) return ILLEGAL_SEQUENCE;
        *ucs = ch;
        return 4;
    } else if (u0 < 0xfc) {
        if (size < 5) return INPUT_NOT_ENOUGH;
        u_char u1 = cp[1], u2 = cp[2], u3 = cp[3], u4 = cp[4];
        ScmChar ch = ((u0 & 0x03) << 24) | ((u1 & 0x3f) << 18)
            | ((u2 & 0x3f) << 12) | ((u3 & 0x3f) << 6) | (u4 & 0x3f);
        /* The 5-octet form starts at 0x200000, where the 4-octet form
           ends; a larger bound would reject every 5-octet sequence,
           since the form carries only 26 bits. */
        if (ch < 0x200000) return ILLEGAL_SEQUENCE;
        *ucs = ch;
        return 5;
    } else if (u0 < 0xfe) {
        if (size < 6) return INPUT_NOT_ENOUGH;
        u_char u1 = cp[1], u2 = cp[2], u3 = cp[3], u4 = cp[4], u5 = cp[5];
        ScmChar ch = ((u0 & 0x01) << 30) | ((u1 & 0x3f) << 24)
            | ((u2 & 0x3f) << 18) | ((u3 & 0x3f) << 12)
            | ((u4 & 0x3f) << 6) | (u5 & 0x3f);
        /* Reject overlong 6-octet forms, as for the shorter lengths. */
        if (ch < 0x4000000) return ILLEGAL_SEQUENCE;
        *ucs = ch;
        return 6;
    } else {
        return ILLEGAL_SEQUENCE;
    }
}
344
345 /* Given 'encoded' ucs, emit utf8. 'Encoded' ucs is the entry of the
346 conversion table. If ucs >= 0x100000, it is composed by two UCS2
347 character. Otherwise, it is one UCS4 character. */
/* Emit UTF-8 for a conversion-table entry UCS.
 *
 * An entry below 0x100000 is a single UCS4 character.  A larger entry
 * packs two UCS2 characters: the first in the upper 16 bits and the
 * second in the lower 16 bits; both are emitted in that order.
 *
 * Returns INCHARS on success, with the number of octets written stored
 * in *OUTCHARS.  Returns OUTPUT_NOT_ENOUGH, writing nothing, if OUTROOM
 * cannot hold the whole output.
 */
static inline ScmSize eucj_utf8_emit_utf(unsigned int ucs, ScmSize inchars,
                                         char *outptr, ScmSize outroom,
                                         ScmSize *outchars)
{
    if (ucs < 0x100000) {
        int outreq = UCS2UTF_NBYTES(ucs);
        OUTCHK(outreq);
        jconv_ucs4_to_utf8(ucs, outptr);
        *outchars = outreq;
    } else {
        /* we need two UCS characters */
        unsigned int ucs0 = (ucs >> 16) & 0xffff;
        /* Full 16-bit mask: the second character may lie above U+0FFF
           (e.g. U+309A, the combining semi-voiced sound mark). */
        unsigned int ucs1 = ucs & 0xffff;
        int outreq0 = UCS2UTF_NBYTES(ucs0);
        int outreq1 = UCS2UTF_NBYTES(ucs1);
        OUTCHK(outreq0+outreq1);
        jconv_ucs4_to_utf8(ucs0, outptr);
        jconv_ucs4_to_utf8(ucs1, outptr+outreq0);
        *outchars = outreq0+outreq1;
    }
    return inchars;
}
370
/* Convert one EUC_JP unit at INPTR to UTF-8.
 *
 * Returns the number of input octets consumed and stores the number of
 * output octets in *OUTCHARS.  Returns INPUT_NOT_ENOUGH when the unit
 * is truncated, ILLEGAL_SEQUENCE when a trailing byte is out of range,
 * or an error propagated from the emitter or from DO_SUBST.
 * Table entries of zero, plane 2 rows with a negative index, and the
 * lead bytes 0xa0 and 0xff produce the substitution sequence.
 */
static ScmSize eucj_utf8(ScmConvInfo *cinfo SCM_UNUSED,
                         const char *inptr, ScmSize inroom,
                         char *outptr, ScmSize outroom, ScmSize *outchars)
{
    const unsigned char lead = (unsigned char)inptr[0];

    if (lead == 0x8e) {
        /* JIS X 0201 KANA */
        INCHK(2);
        const unsigned char kana = (unsigned char)inptr[1];
        if (kana < 0xa1 || kana > 0xdf) return ILLEGAL_SEQUENCE;
        return eucj_utf8_emit_utf(0xff61 + (kana - 0xa1), 2,
                                  outptr, outroom, outchars);
    }

    if (lead == 0x8f) {
        /* JIS X 0213 plane 2 */
        INCHK(3);
        const unsigned char row = (unsigned char)inptr[1];
        const unsigned char col = (unsigned char)inptr[2];
        if (!(row >= 0xa1 && row <= 0xfe && col >= 0xa1 && col <= 0xfe)) {
            return ILLEGAL_SEQUENCE;
        }
        /* A negative first-level index means the row has no table. */
        int index = euc_jisx0213_2_index[row - 0xa1];
        unsigned int ucs = 0;
        if (index >= 0) ucs = euc_jisx0213_2_to_ucs2[index][col - 0xa1];
        if (ucs == 0) {
            DO_SUBST;
            return 3;
        }
        return eucj_utf8_emit_utf(ucs, 3, outptr, outroom, outchars);
    }

    if (lead < 0xa0) {
        /* ASCII or C1 region: copied through unchanged */
        outptr[0] = lead;
        *outchars = 1;
        return 1;
    }

    if (lead == 0xa0 || lead == 0xff) {
        DO_SUBST;
        return 1;
    }

    /* JIS X 0213 plane 1 */
    INCHK(2);
    const unsigned char col = (unsigned char)inptr[1];
    if (col < 0xa1 || col > 0xfe) return ILLEGAL_SEQUENCE;
    unsigned int ucs = euc_jisx0213_1_to_ucs2[lead - 0xa1][col - 0xa1];
    if (ucs == 0) {
        DO_SUBST;
        return 2;
    }
    return eucj_utf8_emit_utf(ucs, 2, outptr, outroom, outchars);
}
430
431 /* EUC_JP -> ISO8859-1 */
/* Convert one EUC_JP unit to ISO8859-1 by way of UTF-8.
 *
 * The unit is first converted with eucj_utf8() into a local buffer and
 * then decoded with jconv_utf8_to_ucs4().  A code point below 0x100 is
 * written as a single octet; anything else is substituted.  Errors from
 * either step are returned unchanged.  On success the return value is
 * the input octet count reported by eucj_utf8().
 */
static ScmSize eucj_lat1(ScmConvInfo *cinfo,
                         const char *inptr, ScmSize inroom,
                         char *outptr, ScmSize outroom, ScmSize *outchars)
{
    char utf8buf[6];
    ScmSize utf8len;
    ScmChar ucs;

    const ScmSize consumed = eucj_utf8(cinfo, inptr, inroom,
                                       utf8buf, sizeof utf8buf, &utf8len);
    if (consumed < 0) return consumed;

    const ScmSize decoded = jconv_utf8_to_ucs4(utf8buf, utf8len, &ucs);
    if (decoded < 0) return decoded;

    if (ucs >= 0x100) {
        DO_SUBST;
        return consumed;
    }
    *outptr = ucs;
    *outchars = 1;
    return consumed;
}
451
452 /* EUC_JP -> ISO2022JP(-3)
453 *
454 * For now, I follow the strategy of iso2022jp-3-compatible behavior.
455 */
456
457 /* ensure the current state is newstate. returns # of output chars.
458 may return OUTPUT_NOT_ENOUGH. */
/* Make NEWSTATE the current output state of CINFO.
 *
 * OUTBYTES is the number of octets the caller intends to write after
 * the state switch; room for them is checked here as well.  If the
 * state already matches, nothing is written and 0 is returned.
 * Otherwise the escape sequence for NEWSTATE is copied to OUTPTR,
 * cinfo->ostate is updated, and the escape length is returned.
 * Returns OUTPUT_NOT_ENOUGH, leaving the state untouched, when OUTROOM
 * cannot hold the escape sequence plus OUTBYTES.
 */
static ScmSize jis_ensure_state(ScmConvInfo *cinfo, int newstate,
                                ScmSize outbytes,
                                char *outptr, ScmSize outroom)
{
    if (newstate == cinfo->ostate) {
        OUTCHK(outbytes);
        return 0;
    }

    const char *seq = NULL;
    ScmSize seqlen = 0;
    switch (newstate) {
    case JIS_ASCII:
        seq = "\033(B";
        seqlen = 3;
        break;
    case JIS_KANA:
        seq = "\033(I";
        seqlen = 3;
        break;
    case JIS_0213_1:
        seq = "\033$B";
        seqlen = 3;
        break;
    case JIS_0213_2:
        seq = "\033$(P";
        seqlen = 4;
        break;
    case JIS_0212:
        seq = "\033$(D";
        seqlen = 4;
        break;
    default:
        /* Can't be here */
        Scm_Panic("something wrong in jis_ensure_state: implementation error?");
        return 0;               /* dummy */
    }

    OUTCHK(seqlen + outbytes);
    memcpy(outptr, seq, seqlen);
    cinfo->ostate = newstate;
    return seqlen;
}
491
/* Convert one EUC_JP unit at INPTR to ISO2022JP(-3).
 *
 * An escape sequence is emitted first whenever the character set of the
 * unit differs from the current output state (see jis_ensure_state()).
 * Returns the number of input octets consumed and stores the number of
 * output octets, escape sequence included, in *OUTCHARS.
 * Returns INPUT_NOT_ENOUGH for a truncated unit, OUTPUT_NOT_ENOUGH when
 * the output does not fit, and ILLEGAL_SEQUENCE for a lead or trailing
 * byte outside the accepted ranges.
 */
static ScmSize eucj_jis(ScmConvInfo *cinfo, const char *inptr, ScmSize inroom,
                        char *outptr, ScmSize outroom, ScmSize *outchars)
{
    unsigned char e0 = inptr[0];
    if (e0 < 0x80) {
        ScmSize outoffset = jis_ensure_state(cinfo, JIS_ASCII, 1, outptr, outroom);
        if (ERRP(outoffset)) return outoffset;
        outptr[outoffset] = e0;
        *outchars = outoffset+1;
        return 1;
    } else if (e0 == 0x8e) {
        /* JIS X 0201 kana */
        INCHK(2);
        unsigned char e1 = inptr[1];
        if (e1 > 0xa0 && e1 < 0xff) {
            ScmSize outoffset = jis_ensure_state(cinfo, JIS_KANA, 1, outptr, outroom);
            if (ERRP(outoffset)) return outoffset;
            outptr[outoffset] = e1 - 0x80;
            *outchars = outoffset+1;
            return 2;
        }
    } else if (e0 == 0x8f) {
        /* G3: JIS X 0213 plane 2 or JIS X 0212, decided by the row byte */
        INCHK(3);
        e0 = inptr[1];
        unsigned char e1 = inptr[2];
        if (e0 > 0xa0 && e0 < 0xff && e1 > 0xa0 && e1 < 0xff) {
            int newstate = JIS_0212;
            switch (e0) {
            case 0xa1:; case 0xa3:; case 0xa4:; case 0xa5:;
            case 0xa8:; case 0xac:; case 0xad:; case 0xae:; case 0xaf:;
                newstate = JIS_0213_2; break;
            default:
                if (e0 >= 0xee) newstate = JIS_0213_2;
            }
            ScmSize outoffset = jis_ensure_state(cinfo, newstate, 2, outptr, outroom);
            /* A negative value is an error code; it must not be used as
               an index into outptr. */
            if (ERRP(outoffset)) return outoffset;
            outptr[outoffset] = e0 - 0x80;
            outptr[outoffset+1] = e1 - 0x80;
            /* two octets follow the escape sequence */
            *outchars = outoffset+2;
            return 3;
        }
    } else if (e0 > 0xa0 && e0 < 0xff) {
        /* JIS X 0213 plane 1 */
        INCHK(2);
        unsigned char e1 = inptr[1];
        if (e1 > 0xa0 && e1 < 0xff) {
            ScmSize outoffset = jis_ensure_state(cinfo, JIS_0213_1, 2, outptr, outroom);
            if (ERRP(outoffset)) return outoffset;
            outptr[outoffset] = e0 - 0x80;
            outptr[outoffset+1] = e1 - 0x80;
            *outchars = outoffset+2;
            return 2;
        }
    }
    return ILLEGAL_SEQUENCE;
}
545
546
547 /* EUC-JP -> ASCII */
/* Convert one EUC_JP unit to ASCII.
 *
 * An octet up to 0x7f is copied as is.  Every other unit is replaced
 * by the substitution sequence, after checking that the whole unit is
 * present in the input: three octets when led by 0x8f, two when led by
 * 0x8e or by 0xa1..0xfe, one otherwise.
 * Returns the number of input octets consumed, INPUT_NOT_ENOUGH, or an
 * error propagated from DO_SUBST.
 */
static ScmSize eucj_ascii(ScmConvInfo *cinfo,
                          const char *inptr, ScmSize inroom,
                          char *outptr, ScmSize outroom,
                          ScmSize *outchars)
{
    const unsigned char lead = inptr[0];
    if (lead <= 0x7f) {
        outptr[0] = lead;
        *outchars = 1;
        return 1;
    }

    int unitlen;
    if (lead == 0x8f) {
        unitlen = 3;                /* triple byte char */
    } else if (lead == 0x8e || (lead >= 0xa1 && lead <= 0xfe)) {
        unitlen = 2;                /* kana, or JISX 0213 plane 1 */
    } else {
        unitlen = 1;
    }

    if (unitlen > 1) {
        INCHK(unitlen);
    }
    DO_SUBST;
    return unitlen;
}
578
579 /*=================================================================
580 * Shift JIS
581 */
582
583 /* Shift_JISX0213 -> EUC-JP
584 *
 * Mapping anomalies
586 *
587 * 0x5c, 0x7e : Shift_JISX0213 mapping table maps 0x5c to U+00A5
588 * (YEN SIGN) and 0x7e to U+203E (OVERLINE). But mapping so
589 * breaks the program code written in Shift JIS. I map them
590 * to the corresponding ASCII chars.
591 * 0xfd, 0xfe, 0xff : These are reserved bytes. Apple uses these
592 * bytes for vendor extension:
593 * 0xfd - U+00A9 COPYRIGHT SIGN |EUC A9A6 |JISX0213
594 * 0xfe - U+2122 TRADE MARK SIGN |EUC 8FA2EF|JISX0212
595 * 0xff - U+2026 HORIZONTAL ELLIPSIS|EUC A1C4 |JISX0208
596 * This is a one-direction mapping.
597 * 0x80, 0xa0 : These are reserved bytes. Replaced to the
598 * one-byte substitution character of destination encoding.
599 *
600 * Conversion scheme
601 * 0x00-0x7f : corresponding ASCII range.
602 * 0x80 : substitution character
603 * 0x81 -- 0x9f : first byte (s1) of double byte range for JIS X 0213 m=1
604 * 0xa0 : substitution character
605 * 0xa1 -- 0xdf : JISX 0201 kana = s1-0x80
606 * 0xe0 -- 0xef : first byte (s1) of double byte range for JIS X 0213 m=1
607 * 0xf0 -- 0xfc : first byte (s1) of double byte range for JIS X 0213 m=2
608 * 0xfd : U+00A9, EUC A9A6, JISX0213 (1, 0x09, 0x06)
609 * 0xfe : U+2122, EUC 8FA2EF, JISX0212
610 * 0xff : U+2026, EUC A1C4, JISX0208 (1, 0x01, 0x24)
611 *
612 * For double-byte character, second byte s2 must be in the range of
613 * 0x40 <= s2 <= 0x7e or 0x80 <= s2 <= 0xfc. Otherwise, double-byte
614 * substitution character is used.
615 *
616 * two bytes (s1, s2) maps to JIS X 0213 (m, k, t) by
617 * m = 1 if s1 <= 0xef, 2 otherwise
 *    k = (s1-0x80)*2 - ((s2 < 0x9f)? 1 : 0)  if s1 <= 0x9f
 *        (s1-0xc0)*2 - ((s2 < 0x9f)? 1 : 0)  if 0xe0 <= s1 <= 0xef
 *        (s1-0xcd)*2 - ((s2 < 0x9f)? 1 : 0)  if s1 >= 0xf5
 *        otherwise, use the following table
 *           s1   k (s2<0x9f, s2>=0x9f)
 *          0xf0   (0x01, 0x08)
 *          0xf1   (0x03, 0x04)
 *          0xf2   (0x05, 0x0c)
 *          0xf3   (0x0d, 0x0e)
 *          0xf4   (0x0f, 0x4e)
628 * t = s2-0x3f if s2 < 0x7f
629 * s2-0x40 if s2 < 0x9f
630 * s2-0x9e otherwise
631 *
 * JIS X 0213 to EUC-JP is a straightforward conversion.
633 */
634
/* Convert one Shift_JISX0213 unit at INPTR to EUC-JP.
 *
 * Returns the number of input octets consumed and stores the number of
 * output octets in *OUTCHARS.  May return INPUT_NOT_ENOUGH,
 * OUTPUT_NOT_ENOUGH, or an error propagated from DO_SUBST.
 *
 * A double-byte unit whose second byte is outside 0x40..0x7e and
 * 0x80..0xfc, and the reserved single bytes 0x80 and 0xa0, are replaced
 * by the substitution sequence.  0xfd, 0xfe and 0xff are mapped one-way
 * to fixed EUC sequences.
 */
static ScmSize sjis_eucj(ScmConvInfo *cinfo SCM_UNUSED,
                         const char *inptr, ScmSize inroom,
                         char *outptr, ScmSize outroom,
                         ScmSize *outchars)
{
    /* EUC row byte for the irregularly placed plane 2 rows, indexed by
       (s1-0xf0)*2 + (0 if s2 < 0x9f, 1 otherwise).  This is the inverse
       of the table in eucj_sjis(): odd rows take the lower half of the
       second-byte range, even rows the upper half. */
    static const unsigned char cvt[] = { 0xa1, 0xa8, 0xa3, 0xa4, 0xa5, 0xac, 0xad, 0xae, 0xaf, 0xee };

    unsigned char s1 = inptr[0];
    if (s1 <= 0x7f) {
        *outptr = s1;
        *outchars = 1;
        return 1;
    }
    if ((s1 > 0x80 && s1 < 0xa0) || (s1 >= 0xe0 && s1 <= 0xfc)) {
        /* Double byte char */
        unsigned char m, e1, e2;
        INCHK(2);
        unsigned char s2 = inptr[1];
        /* 0x7f is not a valid second byte; without this test it would
           silently alias the character encoded with s2 == 0x7e. */
        if (s2 < 0x40 || s2 == 0x7f || s2 > 0xfc) {
            DO_SUBST;
            return 2;
        }

        if (s1 <= 0x9f) {
            OUTCHK(2);
            m = 1;
            e1 = (s1-0x80)*2 + 0xa0 - ((s2 < 0x9f)? 1 : 0);
        } else if (s1 <= 0xef) {
            OUTCHK(2);
            m = 1;
            e1 = (s1-0xc0)*2 + 0xa0 - ((s2 < 0x9f)? 1 : 0);
        } else if (s1 >= 0xf5) {
            OUTCHK(3);
            m = 2;
            e1 = (s1-0xf5)*2 + 0x50 + 0xa0 - ((s2 < 0x9f)? 1 : 0);
        } else {
            /* 0xf0 <= s1 <= 0xf4: rows are not contiguous, use table */
            OUTCHK(3);
            m = 2;
            e1 = cvt[(s1-0xf0)*2+((s2 < 0x9f)? 0 : 1)];
        }

        if (s2 < 0x7f) {
            e2 = s2 - 0x3f + 0xa0;
        } else if (s2 < 0x9f) {
            e2 = s2 - 0x40 + 0xa0;
        } else {
            e2 = s2 - 0x9e + 0xa0;
        }
        if (m == 1) {
            outptr[0] = e1;
            outptr[1] = e2;
            *outchars = 2;
        } else {
            /* plane 2 goes to G3, introduced by 0x8f */
            outptr[0] = 0x8f;
            outptr[1] = e1;
            outptr[2] = e2;
            *outchars = 3;
        }
        return 2;
    }
    if (s1 >= 0xa1 && s1 <= 0xdf) {
        /* JISX0201 KANA */
        OUTCHK(2);
        outptr[0] = 0x8e;
        outptr[1] = s1;
        *outchars = 2;
        return 1;
    }
    if (s1 == 0xfd) {
        /* copyright mark */
        OUTCHK(2);
        outptr[0] = 0xa9;
        outptr[1] = 0xa6;
        *outchars = 2;
        return 1;
    }
    if (s1 == 0xfe) {
        /* trademark sign.  this is not in JISX0213, but in JISX0212. */
        OUTCHK(3);
        outptr[0] = 0x8f;
        outptr[1] = 0xa2;
        outptr[2] = 0xef;
        *outchars = 3;
        return 1;
    }
    if (s1 == 0xff) {
        /* horizontal ellipsis. */
        OUTCHK(2);
        outptr[0] = 0xa1;
        outptr[1] = 0xc4;
        *outchars = 2;
        return 1;
    }

    /* s1 == 0x80 or 0xa0: a single reserved byte.  Only that one byte
       has been examined, so only one input octet is consumed. */
    DO_SUBST;
    return 1;
}
733
734 /* SJIS -> ASCII */
735
/* Convert one Shift_JIS unit to ASCII.
 *
 * An octet up to 0x7f is copied as is.  Anything else is replaced by
 * the substitution sequence: a double-byte unit (lead byte 0x81..0x9f
 * or 0xe0..0xfc) consumes two input octets, any other byte consumes
 * one.  DO_SUBST stores the output length in *OUTCHARS.
 * Returns the number of input octets consumed, INPUT_NOT_ENOUGH, or an
 * error propagated from DO_SUBST.
 */
static ScmSize sjis_ascii(ScmConvInfo *cinfo,
                          const char *inptr, ScmSize inroom,
                          char *outptr, ScmSize outroom,
                          ScmSize *outchars)
{
    unsigned char s1 = inptr[0];
    if (s1 <= 0x7f) {
        outptr[0] = s1;
        *outchars = 1;
        return 1;
    }
    /* Same lead-byte range as sjis_eucj(); 0xfc is a valid lead byte. */
    if ((s1 > 0x80 && s1 < 0xa0) || (s1 >= 0xe0 && s1 <= 0xfc)) {
        INCHK(2);
        DO_SUBST;
        return 2;
    }
    DO_SUBST;
    return 1;
}
759
760 /*=================================================================
761 * UTF8
762 */
763
764 /* Conversion between UTF8 and EUC_JP is based on the table found at
765 * http://isweb11.infoseek.co.jp/computer/wakaba/table/jis-note.ja.html
766 *
767 * There are some characters in JISX0213 that can't be represented
768 * in a single Unicode character, but can be with a combining character.
769 * In such case, EUC_JP to UTF8 conversion uses combining character,
770 * but UTF8 to EUC_JP conversion translates the combining character into
 * another character. For example, a single JISX0213 hiragana 'nga'
 * (hiragana "ka" with han-dakuon mark) translates to Unicode
773 * U+304B+309A (HIRAGANA LETTER KA + COMBINING KATAKANA-HIRAGANA SEMI-VOICED
774 * SOUND MARK). When this sequence is converted to EUC_JP again, it
775 * becomes EUCJ 0xA4AB + 0xA1AC. This is an implementation limitation,
776 * and should be removed in later release.
777 */
778
779 /* [UTF8 -> EUC_JP conversion]
780 *
781 * EUC-JP has the corresponding characters to the wide range of
782 * UCS characters.
783 *
784 * UCS4 character # of EUC_JP characters
785 * ---------------------------------------
786 * U+0000+0xxx 564
787 * U+0000+1xxx 6
788 * U+0000+2xxx 321
789 * U+0000+3xxx 422
790 * U+0000+4xxx 347
791 * U+0000+5xxx 1951
792 * U+0000+6xxx 2047
793 * U+0000+7xxx 1868
794 * U+0000+8xxx 1769
795 * U+0000+9xxx 1583
796 * U+0000+fxxx 241
797 * U+0002+xxxx 302
798 *
799 * It is so wide and so sparse that naive lookup table implementation from
800 * UCS to EUC can be space-wasting. I use hierarchical table with some
 * ad-hoc heuristics. Since the hierarchical table is used, I directly
 * translate UTF8 to EUC_JP, without converting it to UCS4.
803 *
804 * Strategy outline: say input consists of bytes named u0, u1, ....
805 *
806 * u0 <= 0x7f : ASCII range
807 * u0 in [0xc2-0xd1] : UTF8 uses 2 bytes. Some mappings within this range
808 * is either very regular or very small, and they are
809 * hardcoded. Other mappings uses table lookup.
810 * u0 == 0xe1 : UTF8 uses 3 bytes. There are only 6 characters in this
811 * range, and it is hardcoded.
812 * u0 in [0xe2-0xe9, 0xef] : Large number of characters are in this range.
813 * Two-level table of 64 entries each is used to dispatch the
814 * characters.
815 * u0 == 0xf0 : UTF8 uses 4 bytes. u1 is in [0xa0-0xaa]. u2 and u3 is
816 * used for dispatch table of 64 entries each.
817 *
818 * The final table entry is unsigned short. 0x0000 means no corresponding
819 * character is defined in EUC_JP. >=0x8000 is the EUC_JP character itself.
820 * < 0x8000 means the character is in G3 plane; 0x8f should be preceded,
821 * and 0x8000 must be added to the value.
822 */
823
824 #include "ucs2eucj.c"
825
826 /* Emit given euc char */
/* Write the EUC character denoted by table entry EUC to OUTPTR.
 *
 * An entry of 0x8000 or above is written as its two octets.  A smaller
 * entry is written as three octets: 0x8f, the high octet plus 0x80,
 * then the low octet.
 * Returns INCHARS with the octet count stored in *OUTCHARS, or
 * OUTPUT_NOT_ENOUGH if OUTROOM is too small.
 */
static inline ScmSize utf2euc_emit_euc(unsigned short euc,
                                       ScmSize inchars,
                                       char *outptr,
                                       ScmSize outroom,
                                       ScmSize *outchars)
{
    const unsigned char hi = euc >> 8;
    const unsigned char lo = euc & 0xff;

    if (euc >= 0x8000) {
        OUTCHK(2);
        outptr[0] = hi;
        outptr[1] = lo;
        *outchars = 2;
        return inchars;
    }

    OUTCHK(3);
    outptr[0] = 0x8f;
    outptr[1] = hi + 0x80;
    outptr[2] = lo;
    *outchars = 3;
    return inchars;
}
847
848 /* handle 2-byte UTF8 sequence. 0xc0 <= u0 <= 0xdf */
/* Convert a 2-octet UTF8 sequence whose lead octet is U0 to EUC_JP.
 *
 * Returns INPUT_NOT_ENOUGH if the second octet is missing and
 * ILLEGAL_SEQUENCE if it is not in 0x80..0xbf.  Otherwise two input
 * octets are consumed: the mapped EUC character is emitted if one
 * exists, else the substitution sequence.
 */
static inline ScmSize utf2euc_2(ScmConvInfo *cinfo SCM_UNUSED, unsigned char u0,
                                const char *inptr, ScmSize inroom,
                                char *outptr, ScmSize outroom,
                                ScmSize *outchars)
{
    INCHK(2);
    const unsigned char trail = (unsigned char)inptr[1];
    if (trail < 0x80 || trail >= 0xc0) return ILLEGAL_SEQUENCE;

    unsigned short euc = 0;             /* 0 means "no mapping" */
    const unsigned short *table = NULL;

    switch (u0) {
    /* lead octets that map only a single character */
    case 0xc6:                  /* U+0193 -> euc ABA9 */
        if (trail == 0x93) euc = 0xaba9;
        break;
    case 0xcd:                  /* U+0361 -> euc ABD2 */
        if (trail == 0xa1) euc = 0xabd2;
        break;
    /* lead octets served by a lookup table */
    case 0xc2: table = utf2euc_c2; break;
    case 0xc3: table = utf2euc_c3; break;
    case 0xc4: table = utf2euc_c4; break;
    case 0xc5: table = utf2euc_c5; break;
    case 0xc7: table = utf2euc_c7; break;
    case 0xc9: table = utf2euc_c9; break;
    case 0xca: table = utf2euc_ca; break;
    case 0xcb: table = utf2euc_cb; break;
    case 0xcc: table = utf2euc_cc; break;
    case 0xce: table = utf2euc_ce; break;
    case 0xcf: table = utf2euc_cf; break;
    case 0xd0: table = utf2euc_d0; break;
    case 0xd1: table = utf2euc_d1; break;
    default:
        break;
    }
    if (table != NULL) euc = table[trail-0x80];

    if (euc == 0) {
        DO_SUBST;
        return 2;
    }
    return utf2euc_emit_euc(euc, 2, outptr, outroom, outchars);
}
895
896 /* handle 3-byte UTF8 sequence. 0xe0 <= u0 <= 0xef */
/* Convert a 3-octet UTF8 sequence whose lead octet is U0 to EUC_JP.
 *
 * Returns INPUT_NOT_ENOUGH if fewer than three octets are available and
 * ILLEGAL_SEQUENCE if either trailing octet is not in 0x80..0xbf.
 * Otherwise three input octets are consumed: the mapped EUC character
 * is emitted if one exists, else the substitution sequence.
 */
static inline ScmSize utf2euc_3(ScmConvInfo *cinfo SCM_UNUSED, unsigned char u0,
                                const char *inptr, ScmSize inroom,
                                char *outptr, ScmSize outroom,
                                ScmSize *outchars)
{
    const unsigned char *tab1 = NULL;
    const unsigned short (*tab2)[64] = NULL;

    INCHK(3);
    unsigned char u1 = (unsigned char)inptr[1];
    unsigned char u2 = (unsigned char)inptr[2];
    /* Both octets index 64-entry tables as (u - 0x80); reject anything
       that is not a continuation octet, as utf2euc_2() does. */
    if (u1 < 0x80 || u1 >= 0xc0 || u2 < 0x80 || u2 >= 0xc0) {
        return ILLEGAL_SEQUENCE;
    }

    switch (u0) {
    case 0xe1: /* special case : there's only 6 chars */
        {
            unsigned short euc = 0;
            if (u1 == 0xb8) {
                if      (u2 == 0xbe) euc = 0xa8f2;
                else if (u2 == 0xbf) euc = 0xa8f3;
            } else if (u1 == 0xbd) {
                if      (u2 == 0xb0) euc = 0xabc6;
                else if (u2 == 0xb1) euc = 0xabc7;
                else if (u2 == 0xb2) euc = 0xabd0;
                else if (u2 == 0xb3) euc = 0xabd1;
            }
            /* Zero means "no mapping"; it must fall through to the
               substitution below rather than be emitted as a character. */
            if (euc != 0) {
                return utf2euc_emit_euc(euc, 3, outptr, outroom, outchars);
            }
            break;
        }
    case 0xe2: tab1 = utf2euc_e2; tab2 = utf2euc_e2_xx; break;
    case 0xe3: tab1 = utf2euc_e3; tab2 = utf2euc_e3_xx; break;
    case 0xe4: tab1 = utf2euc_e4; tab2 = utf2euc_e4_xx; break;
    case 0xe5: tab1 = utf2euc_e5; tab2 = utf2euc_e5_xx; break;
    case 0xe6: tab1 = utf2euc_e6; tab2 = utf2euc_e6_xx; break;
    case 0xe7: tab1 = utf2euc_e7; tab2 = utf2euc_e7_xx; break;
    case 0xe8: tab1 = utf2euc_e8; tab2 = utf2euc_e8_xx; break;
    case 0xe9: tab1 = utf2euc_e9; tab2 = utf2euc_e9_xx; break;
    case 0xef: tab1 = utf2euc_ef; tab2 = utf2euc_ef_xx; break;
    default:
        break;
    }
    if (tab1 != NULL) {
        /* first level yields a 1-based index into the second level;
           0 means no second-level table for this u1 */
        unsigned char ind = tab1[u1-0x80];
        if (ind != 0) {
            unsigned short euc = tab2[ind-1][u2-0x80];
            if (euc != 0) {
                return utf2euc_emit_euc(euc, 3, outptr, outroom, outchars);
            }
        }
    }
    DO_SUBST;
    return 3;
}
948
949 /* handle 4-byte UTF8 sequence. u0 == 0xf0, 0xa0 <= u1 <= 0xaa */
/* Convert a 4-octet UTF8 sequence whose lead octet is U0 to EUC_JP.
 *
 * Returns INPUT_NOT_ENOUGH if fewer than four octets are available.
 * Otherwise four input octets are consumed.  Only U0 == 0xf0 with a
 * second octet in 0xa0..0xaa can map to a character: the per-octet
 * table is a zero-terminated list of (key, euc) pairs, the key being
 * the third and fourth octets combined.  Everything else yields the
 * substitution sequence.
 */
static inline ScmSize utf2euc_4(ScmConvInfo *cinfo SCM_UNUSED, unsigned char u0,
                                const char *inptr, ScmSize inroom,
                                char *outptr, ScmSize outroom,
                                ScmSize *outchars)
{
    INCHK(4);

    if (u0 == 0xf0) {
        const unsigned short *pairs = NULL;

        switch ((unsigned char)inptr[1]) {
        case 0xa0: pairs = utf2euc_f0_a0; break;
        case 0xa1: pairs = utf2euc_f0_a1; break;
        case 0xa2: pairs = utf2euc_f0_a2; break;
        case 0xa3: pairs = utf2euc_f0_a3; break;
        case 0xa4: pairs = utf2euc_f0_a4; break;
        case 0xa5: pairs = utf2euc_f0_a5; break;
        case 0xa6: pairs = utf2euc_f0_a6; break;
        case 0xa7: pairs = utf2euc_f0_a7; break;
        case 0xa8: pairs = utf2euc_f0_a8; break;
        case 0xa9: pairs = utf2euc_f0_a9; break;
        case 0xaa: pairs = utf2euc_f0_aa; break;
        default:
            break;
        }

        if (pairs != NULL) {
            const unsigned char hi = (unsigned char)inptr[2];
            const unsigned char lo = (unsigned char)inptr[3];
            const unsigned short key = hi*256 + lo;

            for (const unsigned short *p = pairs; p[0]; p += 2) {
                if (p[0] == key && p[1] != 0) {
                    return utf2euc_emit_euc(p[1], 4, outptr, outroom, outchars);
                }
            }
        }
    }

    DO_SUBST;
    return 4;
}
995
996 /* Body of UTF8 -> EUC_JP conversion */
utf8_eucj(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)997 static ScmSize utf8_eucj(ScmConvInfo *cinfo,
998 const char *inptr, ScmSize inroom,
999 char *outptr, ScmSize outroom,
1000 ScmSize *outchars)
1001 {
1002 unsigned char u0 = (unsigned char)inptr[0];
1003
1004 if (u0 <= 0x7f) {
1005 *outptr = u0;
1006 *outchars = 1;
1007 return 1;
1008 }
1009 if (u0 <= 0xbf) {
1010 /* invalid UTF8 sequence */
1011 return ILLEGAL_SEQUENCE;
1012 }
1013 if (u0 <= 0xdf) {
1014 /* 2-byte UTF8 sequence */
1015 return utf2euc_2(cinfo, u0, inptr, inroom, outptr, outroom, outchars);
1016 }
1017 if (u0 <= 0xef) {
1018 /* 3-byte UTF8 sequence */
1019 return utf2euc_3(cinfo, u0, inptr, inroom, outptr, outroom, outchars);
1020 }
1021 if (u0 <= 0xf7) {
1022 /* 4-byte UTF8 sequence */
1023 return utf2euc_4(cinfo, u0, inptr, inroom, outptr, outroom, outchars);
1024 }
1025 if (u0 <= 0xfb) {
1026 /* 5-byte UTF8 sequence */
1027 INCHK(5);
1028 DO_SUBST;
1029 return 5;
1030 }
1031 if (u0 <= 0xfd) {
1032 /* 6-byte UTF8 sequence */
1033 INCHK(6);
1034 DO_SUBST;
1035 return 6;
1036 }
1037 return ILLEGAL_SEQUENCE;
1038 }
1039
1040 /* UTF8 -> UTF16 */
utf8_utf16(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)1041 static ScmSize utf8_utf16(ScmConvInfo *cinfo,
1042 const char *inptr, ScmSize inroom,
1043 char *outptr, ScmSize outroom,
1044 ScmSize *outchars)
1045 {
1046 ScmSize reqsize = 0;
1047 int ostate = cinfo->ostate;
1048 int need_bom = FALSE;
1049 ScmChar ch;
1050
1051 if (ostate == UTF_DEFAULT) {
1052 reqsize += 2;
1053 need_bom = TRUE;
1054 ostate = UTF_BE;
1055 }
1056 int r = jconv_utf8_to_ucs4(inptr, inroom, &ch);
1057 if (r < 0) return r;
1058 if (ch < 0x10000) reqsize += 2;
1059 else reqsize += 4;
1060
1061 OUTCHK(reqsize);
1062 if (need_bom) {
1063 if (ostate == UTF_BE) {
1064 outptr[0] = 0xfe;
1065 outptr[1] = 0xff;
1066 } else {
1067 outptr[1] = 0xfe;
1068 outptr[0] = 0xff;
1069 }
1070 outptr += 2;
1071 }
1072 if (ch < 0x10000) {
1073 char u[2];
1074 u[0] = (ch >> 8) & 0xff;
1075 u[1] = ch & 0xff;
1076 if (ostate == UTF_BE) {
1077 outptr[0] = u[0];
1078 outptr[1] = u[1];
1079 } else {
1080 outptr[1] = u[0];
1081 outptr[0] = u[1];
1082 }
1083 } else {
1084 ch -= 0x10000;
1085 char u[2];
1086 u[0] = 0xd8 + ((ch >> 18) & 0x03);
1087 u[1] = (ch >> 10) & 0xff;
1088 if (ostate == UTF_BE) {
1089 outptr[0] = u[0];
1090 outptr[1] = u[1];
1091 } else {
1092 outptr[1] = u[0];
1093 outptr[0] = u[1];
1094 }
1095 u[0] = 0xdc + ((ch >> 8) & 0x03);
1096 u[1] = ch & 0xff;
1097 if (ostate == UTF_BE) {
1098 outptr[2] = u[0];
1099 outptr[3] = u[1];
1100 } else {
1101 outptr[3] = u[0];
1102 outptr[2] = u[1];
1103 }
1104 }
1105 cinfo->ostate = ostate;
1106 *outchars = reqsize;
1107 return r;
1108 }
1109
1110 /* UTF8 -> UTF32 */
utf8_utf32(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)1111 static ScmSize utf8_utf32(ScmConvInfo *cinfo,
1112 const char *inptr, ScmSize inroom,
1113 char *outptr, ScmSize outroom,
1114 ScmSize *outchars)
1115 {
1116 ScmSize reqsize = 0;
1117 int ostate = cinfo->ostate;
1118 int need_bom = FALSE;
1119 ScmChar ch;
1120
1121 if (ostate == UTF_DEFAULT) {
1122 reqsize += 4;
1123 need_bom = TRUE;
1124 ostate = UTF_BE;
1125 }
1126 int r = jconv_utf8_to_ucs4(inptr, inroom, &ch);
1127 if (r < 0) return r;
1128 reqsize += 4;
1129
1130 OUTCHK(reqsize);
1131 if (need_bom) {
1132 if (ostate == UTF_BE) {
1133 outptr[0] = 0;
1134 outptr[1] = 0;
1135 outptr[2] = 0xfe;
1136 outptr[3] = 0xff;
1137 } else {
1138 outptr[3] = 0;
1139 outptr[2] = 0;
1140 outptr[1] = 0xfe;
1141 outptr[0] = 0xff;
1142 }
1143 outptr += 4;
1144 }
1145 if (ostate == UTF_BE) {
1146 outptr[0] = (ch >> 24) & 0xff;
1147 outptr[1] = (ch >> 16) & 0xff;
1148 outptr[2] = (ch >> 8) & 0xff;
1149 outptr[3] = ch & 0xff;
1150 } else {
1151 outptr[3] = (ch >> 24) & 0xff;
1152 outptr[2] = (ch >> 16) & 0xff;
1153 outptr[1] = (ch >> 8) & 0xff;
1154 outptr[0] = ch & 0xff;
1155 }
1156 cinfo->ostate = ostate;
1157 *outchars = reqsize;
1158 return r;
1159 }
1160
1161 /* UTF8 -> Latin1 */
utf8_lat1(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)1162 static ScmSize utf8_lat1(ScmConvInfo *cinfo,
1163 const char *inptr, ScmSize inroom,
1164 char *outptr, ScmSize outroom,
1165 ScmSize *outchars)
1166 {
1167 ScmChar ch;
1168 int r = jconv_utf8_to_ucs4(inptr, inroom, &ch);
1169 if (r < 0) return r;
1170 if (ch < 0x100) {
1171 *outptr = ch;
1172 *outchars = 1;
1173 } else {
1174 DO_SUBST;
1175 }
1176 return r;
1177 }
1178
1179 /* UTF8 -> ASCII */
utf8_ascii(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)1180 static ScmSize utf8_ascii(ScmConvInfo *cinfo,
1181 const char *inptr, ScmSize inroom,
1182 char *outptr, ScmSize outroom,
1183 ScmSize *outchars)
1184 {
1185 ScmChar ch;
1186 int r = jconv_utf8_to_ucs4(inptr, inroom, &ch);
1187 if (r < 0) return r;
1188 if (ch < 0x80) {
1189 *outptr = ch;
1190 *outchars = 1;
1191 } else {
1192 DO_SUBST;
1193 }
1194 return r;
1195 }
1196
1197 /*=================================================================
1198 * UTF16
1199 */
1200
/* For now, we first convert it to utf8, because we already have tables
   that directly support utf8.  Theoretically though, having a ucs4 to
   jis table would speed it up. */
1204
/* UTF16 -> UTF8
 * Reads one character (a single 16-bit unit or a surrogate pair) from
 * inptr and writes its UTF8 encoding to outptr.
 * While cinfo->istate is UTF_DEFAULT, a leading BOM is consumed and
 * selects the byte order; without a BOM, big endian is assumed.
 * An unpaired surrogate is replaced via DO_SUBST and exactly that one
 * 16-bit unit is consumed, so the caller always makes progress.
 * Returns the number of input bytes consumed (BOM included), or the
 * error code produced by INCHK / OUTCHK / DO_SUBST.  cinfo->istate is
 * updated only on a non-error return.
 */
static ScmSize utf16_utf8(ScmConvInfo *cinfo,
                          const char *inptr, ScmSize inroom,
                          char *outptr, ScmSize outroom,
                          ScmSize *outchars)
{
    INCHK(2);
    int istate = cinfo->istate;
    ScmSize inread = 0;
    if (istate == UTF_DEFAULT) {
        if ((u_char)inptr[0] == 0xfe && (u_char)inptr[1] == 0xff) {
            inptr += 2;
            inroom -= 2;
            inread += 2;
            INCHK(2);
            istate = UTF_BE;
        } else if ((u_char)inptr[0] == 0xff && (u_char)inptr[1] == 0xfe) {
            inptr += 2;
            inroom -= 2;
            inread += 2;
            INCHK(2);
            istate = UTF_LE;
        } else {
            /* Arbitrary choice */
            istate = UTF_BE;
        }
    }

    /* u[0] is the high-order byte of the unit, u[1] the low-order one. */
    u_char u[2];
    if (istate == UTF_BE) {
        u[0] = inptr[0];
        u[1] = inptr[1];
    } else {
        u[0] = inptr[1];
        u[1] = inptr[0];
    }

    ScmChar ch;

    /* Surrogates are identified by the top six bits of the high-order
       byte (0xd8..0xdb: high, 0xdc..0xdf: low), hence the 0xfc mask.
       A looser mask would also catch U+F800..U+FFFF. */
    if ((u[0] & 0xfc) == 0xd8) {
        /* High surrogate; a low surrogate must follow. */
        inptr += 2;
        inroom -= 2;
        INCHK(2);
        u_char v[2];
        if (istate == UTF_BE) {
            v[0] = inptr[0];
            v[1] = inptr[1];
        } else {
            v[0] = inptr[1];
            v[1] = inptr[0];
        }
        if ((v[0] & 0xfc) == 0xdc) {
            ch = (((u[0] & 0x03) << 18)
                  | (u[1] << 10)
                  | ((v[0] & 0x03) << 8)
                  | v[1])
                + 0x10000;
            inread += 4;
        } else {
            /* We only have first half of a surrogate pair.
               We leave the second character in the input, and
               substitute (and consume) the first. */
            DO_SUBST;
            cinfo->istate = istate;
            return inread + 2;
        }
    } else if ((u[0] & 0xfc) == 0xdc) {
        /* Stray second half of a surrogate pair.  Substitute and
           consume it. */
        DO_SUBST;
        cinfo->istate = istate;
        return inread + 2;
    } else {
        inread += 2;
        ch = (u[0] << 8) + u[1];
    }

    int outreq = UCS2UTF_NBYTES(ch);
    OUTCHK(outreq);
    jconv_ucs4_to_utf8(ch, outptr);
    cinfo->istate = istate;
    *outchars = outreq;
    return inread;
}
1287
1288 /* This handles BOM stuff. It is pretty twisted, for we need to keep the
1289 internal state consistent even when we return an error. */
utf16_utf16(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)1290 static ScmSize utf16_utf16(ScmConvInfo *cinfo,
1291 const char *inptr, ScmSize inroom,
1292 char *outptr, ScmSize outroom,
1293 ScmSize *outchars)
1294 {
1295 ScmSize consumed = 0;
1296 ScmSize emitted = 0;
1297
1298 if (cinfo->istate == UTF_DEFAULT || cinfo->ostate == UTF_DEFAULT) {
1299 /* We come here only at the beginning. */
1300 int istate = 0;
1301
1302 if (cinfo->istate == UTF_DEFAULT) {
1303 INCHK(2);
1304 if ((u_char)inptr[0] == 0xfe && (u_char)inptr[1] == 0xff) {
1305 consumed += 2;
1306 istate = UTF_BE;
1307 inptr += 2;
1308 inroom -= 2;
1309 } else if ((u_char)inptr[0] == 0xff && (u_char)inptr[1] == 0xfe) {
1310 consumed += 2;
1311 istate = UTF_LE;
1312 inptr += 2;
1313 inroom -= 2;
1314 } else {
1315 istate = UTF_BE;
1316 }
1317 }
1318 INCHK(2);
1319 if (cinfo->ostate == UTF_DEFAULT) {
1320 OUTCHK(4);
1321 outptr[0] = 0xfe;
1322 outptr[1] = 0xff;
1323 outptr += 2;
1324 outroom -= 2;
1325 emitted += 2;
1326 cinfo->ostate = UTF_BE;
1327 } else {
1328 OUTCHK(2);
1329 }
1330 cinfo->istate = istate;
1331 } else {
1332 INCHK(2);
1333 OUTCHK(2);
1334 }
1335
1336 char u[2];
1337 if (cinfo->istate == UTF_BE) {
1338 u[0] = inptr[0];
1339 u[1] = inptr[1];
1340 } else {
1341 u[1] = inptr[0];
1342 u[0] = inptr[1];
1343 }
1344 if (cinfo->ostate == UTF_BE) {
1345 outptr[0] = u[0];
1346 outptr[1] = u[1];
1347 } else {
1348 outptr[1] = u[0];
1349 outptr[0] = u[1];
1350 }
1351 *outchars = emitted + 2;
1352 return consumed + 2;
1353 }
1354
1355 /*=================================================================
1356 * UTF32
1357 */
1358
/* UTF32 -> UTF8
 * Reads one 4-byte unit from inptr and writes its UTF8 encoding.
 * While cinfo->istate is UTF_DEFAULT, a leading BOM is consumed and
 * selects the byte order; without a BOM, big endian is assumed.
 * A unit whose value exceeds 0x7fffffff is replaced via DO_SUBST.
 * Returns the number of input bytes consumed (BOM included), or the
 * error code produced by INCHK / OUTCHK / DO_SUBST.  cinfo->istate is
 * updated only on a non-error return.
 */
static ScmSize utf32_utf8(ScmConvInfo *cinfo,
                          const char *inptr, ScmSize inroom,
                          char *outptr, ScmSize outroom,
                          ScmSize *outchars)
{
    INCHK(4);
    int istate = cinfo->istate;
    ScmSize inread = 0;
    if (istate == UTF_DEFAULT) {
        if ((u_char)inptr[0] == 0
            && (u_char)inptr[1] == 0
            && (u_char)inptr[2] == 0xfe
            && (u_char)inptr[3] == 0xff) {
            inptr += 4;
            inroom -= 4;
            inread += 4;
            INCHK(4);
            istate = UTF_BE;
        } else if ((u_char)inptr[0] == 0xff
                   && (u_char)inptr[1] == 0xfe
                   && (u_char)inptr[2] == 0
                   && (u_char)inptr[3] == 0) {
            inptr += 4;
            inroom -= 4;
            inread += 4;
            INCHK(4);
            istate = UTF_LE;
        } else {
            /* Arbitrary choice */
            istate = UTF_BE;
        }
    }

    /* u[0] is the most significant byte. */
    u_char u[4];
    if (istate == UTF_BE) {
        u[0] = inptr[0];
        u[1] = inptr[1];
        u[2] = inptr[2];
        u[3] = inptr[3];
    } else {
        u[0] = inptr[3];
        u[1] = inptr[2];
        u[2] = inptr[1];
        u[3] = inptr[0];
    }
    inread += 4;

    /* Assemble in an unsigned type: shifting a byte >= 0x80 left by 24
       bits as int would overflow. */
    unsigned long code = ((unsigned long)u[0] << 24)
        | ((unsigned long)u[1] << 16)
        | ((unsigned long)u[2] << 8)
        | (unsigned long)u[3];
    if (code > 0x7fffffffUL) {
        /* Out of the range UCS2UTF_NBYTES can size; substitute. */
        DO_SUBST;
        cinfo->istate = istate;
        return inread;
    }
    ScmChar ch = (ScmChar)code;

    int outreq = UCS2UTF_NBYTES(ch);
    OUTCHK(outreq);
    jconv_ucs4_to_utf8(ch, outptr);
    cinfo->istate = istate;
    *outchars = outreq;
    return inread;
}
1415
1416 /* This handles BOM stuff. */
utf32_utf32(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)1417 static ScmSize utf32_utf32(ScmConvInfo *cinfo,
1418 const char *inptr, ScmSize inroom,
1419 char *outptr, ScmSize outroom,
1420 ScmSize *outchars)
1421 {
1422 ScmSize consumed = 0;
1423 ScmSize emitted = 0;
1424
1425 if (cinfo->istate == UTF_DEFAULT || cinfo->ostate == UTF_DEFAULT) {
1426 /* We come here only at the beginning. */
1427 int istate = 0;
1428
1429 if (cinfo->istate == UTF_DEFAULT) {
1430 INCHK(4);
1431 if ((u_char)inptr[0] == 0
1432 && (u_char)inptr[1] == 0
1433 && (u_char)inptr[2] == 0xfe
1434 && (u_char)inptr[3] == 0xff) {
1435 consumed += 4;
1436 istate = UTF_BE;
1437 inptr += 4;
1438 inroom -= 4;
1439 } else if ((u_char)inptr[0] == 0xff
1440 && (u_char)inptr[1] == 0xfe
1441 && (u_char)inptr[2] == 0
1442 && (u_char)inptr[3] == 0) {
1443 consumed += 4;
1444 istate = UTF_LE;
1445 inptr += 4;
1446 inroom -= 4;
1447 } else {
1448 istate = UTF_BE;
1449 }
1450 }
1451 INCHK(4);
1452 if (cinfo->ostate == UTF_DEFAULT) {
1453 OUTCHK(8);
1454 outptr[0] = 0;
1455 outptr[1] = 0;
1456 outptr[2] = 0xfe;
1457 outptr[3] = 0xff;
1458 outptr += 4;
1459 outroom -= 4;
1460 emitted += 4;
1461 cinfo->ostate = UTF_BE;
1462 } else {
1463 OUTCHK(4);
1464 }
1465 cinfo->istate = istate;
1466 } else {
1467 INCHK(4);
1468 OUTCHK(4);
1469 }
1470
1471 char u[4];
1472 if (cinfo->istate == UTF_BE) {
1473 u[0] = inptr[0];
1474 u[1] = inptr[1];
1475 u[2] = inptr[2];
1476 u[3] = inptr[3];
1477 } else {
1478 u[3] = inptr[0];
1479 u[2] = inptr[1];
1480 u[1] = inptr[2];
1481 u[0] = inptr[3];
1482 }
1483 if (cinfo->ostate == UTF_BE) {
1484 outptr[0] = u[0];
1485 outptr[1] = u[1];
1486 outptr[2] = u[2];
1487 outptr[3] = u[3];
1488 } else {
1489 outptr[3] = u[0];
1490 outptr[2] = u[1];
1491 outptr[1] = u[2];
1492 outptr[0] = u[3];
1493 }
1494 *outchars = emitted + 4;
1495 return consumed + 4;
1496 }
1497
1498 /*=================================================================
1499 * ISO2022-JP
1500 */
1501
1502 /* ISO2022-JP{-1(,2),3} -> EUC_JP
1503 * Strategy: accepts as many possibilities as possible.
1504 * The following escape sequence is recognized:
1505 * (See Lunde, CJKV information processing, O'Reilly, pp.155--158)
1506 *
1507 * <ESC> ( B ASCII
1508 * <ESC> ( J JIS-Roman
1509 * <ESC> ( H JIS-Roman (for compatibility)
1510 * <ESC> ( I Half-width katakana (JIS X 0201 kana)
1511 * <ESC> $ @ JIS C 6226-1978 (78JIS)
1512 * <ESC> $ B JIS X 0208-1983 (83JIS)
1513 * <ESC> $ ( D JIS X 0212-1990
1514 * <ESC> $ ( O JIS X 0213:2000 plane 1
1515 * <ESC> $ ( P JIS X 0213:2000 plane 2
1516 * <ESC> & @ <ESC> $ B JIS X 0208-1990, JIS X 0208:1997
1517 * 0x0e JIS7 half-width katakana shift-out
1518 * 0x0f JIS7 half-width katakana shift-in
1519 *
1520 * The state is reset to ASCII whenever newline character is read.
1521 *
1522 * The following escape sequences defined in ISO2022-JP-2 are recognized,
1523 * but all the characters within the sequence will be replaced by '?'.
1524 *
1525 * <ESC> $ A (GB2312-80) unsupported
1526 * <ESC> $ ( C (KS X 1001:1992) unsupported
1527 * <ESC> . A (ISO8859-1:1998) unsupported
1528 * <ESC> . F (ISO8859-7:1998) unsupported
1529 *
1530 * If other escape sequence is seen, the converter returns ILLEGAL_SEQUENCE.
1531 *
1532 * JIS8 kana is allowed.
1533 */
1534
1535 /* deal with escape sequence. escape byte itself is already consumed.
1536 returns # of input bytes consumed by the escape sequence,
1537 or an error code. cinfo->istate is updated accordingly. */
jis_esc(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom)1538 static ScmSize jis_esc(ScmConvInfo *cinfo, const char *inptr, ScmSize inroom)
1539 {
1540 INCHK(2);
1541 unsigned char j1 = inptr[0];
1542 unsigned char j2 = inptr[1];
1543 switch (j1) {
1544 case '(':
1545 switch (j2) {
1546 case 'B': cinfo->istate = JIS_ASCII; break;
1547 case 'J': cinfo->istate = JIS_ROMAN; break;
1548 case 'H': cinfo->istate = JIS_ROMAN; break;
1549 case 'I': cinfo->istate = JIS_KANA; break;
1550 default: return ILLEGAL_SEQUENCE;
1551 }
1552 return 2;
1553 case '$':
1554 switch (j2) {
1555 case '@': cinfo->istate = JIS_78; break;
1556 case 'B': cinfo->istate = JIS_0213_1; break;
1557 case 'A': cinfo->istate = JIS_UNKNOWN; break;
1558 case '(':
1559 {
1560 INCHK(3);
1561 switch (inptr[2]) {
1562 case 'D': cinfo->istate = JIS_0212; break;
1563 case 'O': cinfo->istate = JIS_0213_1; break;
1564 case 'P': cinfo->istate = JIS_0213_2; break;
1565 case 'C': cinfo->istate = JIS_UNKNOWN; break;
1566 default: return ILLEGAL_SEQUENCE;
1567 }
1568 return 3;
1569 break;
1570 }
1571 default: return ILLEGAL_SEQUENCE;
1572 }
1573 return 2;
1574 case '&':
1575 {
1576 INCHK(6);
1577 if (inptr[2] == '@' && inptr[3] == 0x1b && inptr[4] == '$'
1578 && inptr[5] == 'B') {
1579 cinfo->istate = JIS_0213_1;
1580 return 5;
1581 } else {
1582 return ILLEGAL_SEQUENCE;
1583 }
1584 }
1585 case '.':
1586 switch (inptr[2]) {
1587 case 'A':/*fallthrough*/;
1588 case 'F': cinfo->istate = JIS_UNKNOWN; break;
1589 default: return ILLEGAL_SEQUENCE;
1590 }
1591 return 2;
1592 default: return ILLEGAL_SEQUENCE;
1593 }
1594 }
1595
1596 /* main routine for iso2022-jp -> euc_jp */
jis_eucj(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)1597 static ScmSize jis_eucj(ScmConvInfo *cinfo, const char *inptr, ScmSize inroom,
1598 char *outptr, ScmSize outroom, ScmSize *outchars)
1599 {
1600 ScmSize inoffset = 0;
1601
1602 unsigned char j0 = inptr[inoffset];
1603 /* skip escape sequence */
1604 while (j0 == 0x1b) {
1605 inoffset++;
1606 ScmSize r = jis_esc(cinfo, inptr+inoffset, inroom-inoffset);
1607 if (ERRP(r)) return r;
1608 inoffset += r;
1609 if (inoffset >= inroom) {
1610 *outchars = 0;
1611 return inoffset;
1612 }
1613 j0 = inptr[inoffset];
1614 }
1615
1616 if (j0 == '\n' || j0 == '\r') {
1617 cinfo->istate = JIS_ASCII;
1618 outptr[0] = j0;
1619 *outchars = 1;
1620 return 1+inoffset;
1621 } else if (j0 < 0x20) {
1622 outptr[0] = j0;
1623 *outchars = 1;
1624 return 1+inoffset;
1625 } else if (j0 >= 0xa1 && j0 <= 0xdf) {
1626 /* JIS8 kana */
1627 OUTCHK(2);
1628 outptr[0] = 0x8e;
1629 outptr[1] = j0;
1630 *outchars = 2;
1631 return 1+inoffset;
1632 } else {
1633 switch (cinfo->istate) {
1634 case JIS_ROMAN:
1635 /* jis-roman and ascii differs on 0x5c and 0x7e -- for now,
1636 I ignore the difference. */
1637 /* FALLTHROUGH */
1638 case JIS_ASCII:
1639 outptr[0] = j0;
1640 *outchars = 1;
1641 return 1+inoffset;
1642 case JIS_KANA:
1643 OUTCHK(2);
1644 outptr[0] = 0x8e;
1645 outptr[1] = j0 + 0x80;
1646 *outchars = 2;
1647 return 1+inoffset;
1648 case JIS_78:
1649 /* for now, I ignore the difference between JIS78 and JIS83 */
1650 /* FALLTHROUGH */
1651 case JIS_0213_1: {
1652 INCHK(inoffset+2);
1653 OUTCHK(2);
1654 unsigned char j1 = inptr[inoffset+1];
1655 outptr[0] = j0 + 0x80;
1656 outptr[1] = j1 + 0x80;
1657 *outchars = 2;
1658 return 2+inoffset;
1659 }
1660 case JIS_0212:
1661 /* jis x 0212 and jis x 0213 plane 2 are different character sets,
1662 but uses the same conversion scheme. */
1663 /* FALLTHROUGH */
1664 case JIS_0213_2: {
1665 INCHK(inoffset+2);
1666 OUTCHK(3);
1667 unsigned char j1 = inptr[inoffset+1];
1668 outptr[0] = 0x8f;
1669 outptr[1] = j0 + 0x80;
1670 outptr[2] = j1 + 0x80;
1671 *outchars = 3;
1672 return 2+inoffset;
1673 }
1674 case JIS_UNKNOWN:
1675 DO_SUBST;
1676 return 1+inoffset;
1677 default:
1678 /* Can't be here */
1679 Scm_Panic("internal state of ISO2022-JP -> EUC_JP got messed up (%d). Implementation error?", cinfo->istate);
1680 }
1681 }
1682 return ILLEGAL_SEQUENCE;
1683 }
1684
1685 /* reset proc */
jis_reset(ScmConvInfo * cinfo,char * outptr,ScmSize outroom)1686 static ScmSize jis_reset(ScmConvInfo *cinfo, char *outptr, ScmSize outroom)
1687 {
1688 if (outptr == NULL) {
1689 /* just reset */
1690 cinfo->ostate = JIS_ASCII;
1691 return 0;
1692 } else {
1693 if (cinfo->ostate == JIS_ASCII) return 0;
1694 if (outroom < 3) return OUTPUT_NOT_ENOUGH;
1695 outptr[0] = 0x1b;
1696 outptr[1] = '(';
1697 outptr[2] = 'B';
1698 cinfo->ostate = JIS_ASCII;
1699 return 3;
1700 }
1701 }
1702
1703 /*=================================================================
1704 * ISO8859-1
1705 */
1706
/* Latin1 -> UTF8
 * Bytes up to 0x7f are copied; bytes from 0x80 up become a two-byte
 * UTF8 sequence.  Always consumes one input byte.
 */
static ScmSize lat1_utf8(ScmConvInfo *cinfo SCM_UNUSED,
                         const char *inptr,
                         ScmSize inroom SCM_UNUSED,
                         char *outptr,
                         ScmSize outroom,
                         ScmSize *outchars)
{
    const unsigned char c = inptr[0];

    if (c >= 0x80) {
        OUTCHK(2);
        /* top two bits go to the lead byte, low six to the trailer */
        outptr[0] = 0xc0 | (c >> 6);
        outptr[1] = 0x80 | (c & 0x3f);
        *outchars = 2;
        return 1;
    }
    outptr[0] = c;
    *outchars = 1;
    return 1;
}
1726
/* Latin1 -> ASCII
 * Bytes up to 0x7f are copied; anything else is replaced via DO_SUBST.
 * Consumes one input byte.
 */
static ScmSize lat1_ascii(ScmConvInfo *cinfo,
                          const char *inptr,
                          ScmSize inroom SCM_UNUSED,
                          char *outptr,
                          ScmSize outroom,
                          ScmSize *outchars)
{
    const unsigned char c = inptr[0];

    if (c > 0x7f) {
        DO_SUBST;
        return 1;
    }
    outptr[0] = c;
    *outchars = 1;
    return 1;
}
1743
1744 /*=================================================================
1745 * ASCII
1746 */
1747
1748 /* ASCII -> X */
1749
ascii_x(ScmConvInfo * cinfo SCM_UNUSED,const char * inptr,ScmSize inroom SCM_UNUSED,char * outptr,ScmSize outroom SCM_UNUSED,ScmSize * outchars)1750 static ScmSize ascii_x(ScmConvInfo *cinfo SCM_UNUSED,
1751 const char *inptr,
1752 ScmSize inroom SCM_UNUSED,
1753 char *outptr,
1754 ScmSize outroom SCM_UNUSED,
1755 ScmSize *outchars)
1756 {
1757 outptr[0] = inptr[0];
1758 *outchars = 1;
1759 return 1;
1760 }
1761
1762 /*=================================================================
1763 * Placeholder
1764 */
1765
ident(ScmConvInfo * cinfo SCM_UNUSED,const char * inptr SCM_UNUSED,ScmSize inroom SCM_UNUSED,char * outptr SCM_UNUSED,ScmSize outroom SCM_UNUSED,ScmSize * outchars SCM_UNUSED)1766 static ScmSize ident(ScmConvInfo *cinfo SCM_UNUSED,
1767 const char *inptr SCM_UNUSED,
1768 ScmSize inroom SCM_UNUSED,
1769 char *outptr SCM_UNUSED,
1770 ScmSize outroom SCM_UNUSED,
1771 ScmSize *outchars SCM_UNUSED)
1772 {
1773 return 0;
1774 }
1775
1776 /******************************************************************
1777 *
1778 * Actual conversion
1779 *
1780 */
1781
/* map canonical code designator to inconv and outconv.  the order of
   entry must match with the above designators.
   conv_converter[incode][outcode] returns the appropriate combination
   of routines.
   NB: It is tedious to maintain this table; we'll eventually generate
   this from some DSL.
*/
struct conv_converter_rec {
    ScmConvProc *conv;          /* per-character converter.  jconv_open
                                   treats NULL as "no built-in converter"
                                   and `ident' as pass-through. */
    ScmConvReset *reset;        /* reset routine, copied to cinfo->reset;
                                   jconv_reset skips it when NULL */
    int istate;                 /* initial input state */
    int ostate;                 /* initial output state */
};
1795
/* map conversion name to the canonical code */
struct conv_support_rec {
    const char *name;           /* encoding name; compared with
                                   conv_name_match.  conv_name_find stops
                                   at the first entry whose name is NULL. */
    int code;                   /* code returned by conv_name_find; used
                                   as an index into conv_converter */
};
1801
1802 #include "jconv_tab.c"
1803 #include "latin_tab.c"
1804
/* Compare encoding name S against T, ignoring case.  Any '-' or '_' in S
   is skipped, so "EUC-JP" and "euc_jp" both match "eucjp".  Characters
   in T are never skipped.
   Returns nonzero if the names match, zero otherwise. */
static int conv_name_match(const char *s, const char *t)
{
    const char *p = s;
    const char *q = t;

    while (*p && *q) {
        if (*p == '-' || *p == '_') {
            p++;                /* ignore '-' and '_' */
            continue;
        }
        /* tolower() requires a value representable as unsigned char;
           passing a negative plain char is undefined. */
        if (tolower((unsigned char)*p) != tolower((unsigned char)*q)) break;
        p++;
        q++;
    }
    /* A match means both strings were used up together.  After a
       mismatch both *p and *q are still nonzero. */
    return (*p == '\0' && *q == '\0');
}
1819
conv_name_find(const char * name)1820 static int conv_name_find(const char *name)
1821 {
1822 struct conv_support_rec *cvtab = conv_supports;
1823 for (; cvtab->name; cvtab++) {
1824 if (conv_name_match(name, cvtab->name)) {
1825 return cvtab->code;
1826 }
1827 }
1828 return -1;
1829 }
1830
1831 /* Internal conversion handler. */
1832
1833 /* when we can just pass-through input to output */
jconv_ident(ScmConvInfo * cinfo SCM_UNUSED,const char ** iptr,ScmSize * iroom,char ** optr,ScmSize * oroom)1834 static ScmSize jconv_ident(ScmConvInfo *cinfo SCM_UNUSED, const char **iptr,
1835 ScmSize *iroom, char **optr, ScmSize *oroom)
1836 {
1837 ScmSize inroom = *iroom, outroom = *oroom;
1838 #ifdef JCONV_DEBUG
1839 fprintf(stderr, "jconv_ident %s->%s\n", cinfo->fromCode, cinfo->toCode);
1840 #endif
1841 if (inroom <= outroom) {
1842 memcpy(*optr, *iptr, inroom);
1843 *optr += inroom;
1844 *iptr += inroom;
1845 *iroom = 0;
1846 *oroom -= inroom;
1847 return inroom;
1848 } else {
1849 memcpy(*optr, *iptr, outroom);
1850 *optr += outroom;
1851 *iptr += outroom;
1852 *iroom -= outroom;
1853 *oroom = 0;
1854 return OUTPUT_NOT_ENOUGH;
1855 }
1856 }
1857
1858 /* calling conversion routine for each char */
jconv_1tier(ScmConvInfo * cinfo,const char ** iptr,ScmSize * iroom,char ** optr,ScmSize * oroom)1859 static ScmSize jconv_1tier(ScmConvInfo *cinfo, const char **iptr,
1860 ScmSize *iroom, char **optr, ScmSize *oroom)
1861 {
1862 ScmConvProc *cvt = cinfo->convert;
1863 const char *inp = *iptr;
1864 char *outp = *optr;
1865 int inr = (int)*iroom, outr = (int)*oroom;
1866 ScmSize converted = 0;
1867
1868 #ifdef JCONV_DEBUG
1869 fprintf(stderr, "jconv_1tier %s->%s\n", cinfo->fromCode, cinfo->toCode);
1870 #endif
1871 SCM_ASSERT(cvt != NULL);
1872 while (inr > 0 && outr > 0) {
1873 ScmSize outchars;
1874 ScmSize inchars = cvt(cinfo, inp, inr, outp, outr, &outchars);
1875 if (ERRP(inchars)) {
1876 converted = inchars;
1877 break;
1878 } else {
1879 converted += inchars;
1880 inp += inchars;
1881 inr -= (int)inchars;
1882 outp += outchars;
1883 outr -= (int)outchars;
1884 }
1885 }
1886 *iptr = inp;
1887 *iroom = inr;
1888 *optr = outp;
1889 *oroom = outr;
1890 return converted;
1891 }
1892
1893 /* When we delegate conversion to iconv(3) */
1894 #ifdef HAVE_ICONV_H
/* NB: although iconv manages states, we need to keep track of whether
 * we're certainly in the default state (JIS_ASCII) or not (we use
 * JIS_UNKNOWN for the latter).  It's because jconv_iconv_reset will be
 * called twice if there is any reset sequence; the first call should
 * emit the sequence, but the second call shouldn't.
 */
/* Handler that delegates to iconv(3).  Updates *iroom and *oroom from
   what iconv reports and marks the output state as JIS_UNKNOWN.
   Returns iconv's result on success; on failure maps errno to
   INPUT_NOT_ENOUGH (EINVAL), OUTPUT_NOT_ENOUGH (E2BIG), or
   ILLEGAL_SEQUENCE (anything else). */
static ScmSize jconv_iconv(ScmConvInfo *cinfo, const char **iptr, ScmSize *iroom,
                           char **optr, ScmSize *oroom)
{
#ifdef JCONV_DEBUG
    fprintf(stderr, "jconv_iconv %s->%s\n", cinfo->fromCode, cinfo->toCode);
#endif
    size_t inleft = *iroom, outleft = *oroom;
    size_t r = iconv(cinfo->handle, (char **)iptr, &inleft, optr, &outleft);
    *iroom = inleft;
    *oroom = outleft;
    cinfo->ostate = JIS_UNKNOWN;

    if (r != (size_t)-1) return (ScmSize)r;

    switch (errno) {
    case EINVAL: return INPUT_NOT_ENOUGH;
    case E2BIG:  return OUTPUT_NOT_ENOUGH;
    default:     return ILLEGAL_SEQUENCE;
    }
}
1920
1921 /* reset routine for iconv */
jconv_iconv_reset(ScmConvInfo * cinfo,char * optr,ScmSize oroom)1922 static ScmSize jconv_iconv_reset(ScmConvInfo *cinfo, char *optr, ScmSize oroom)
1923 {
1924 ScmSize oroom_prev = oroom;
1925 if (cinfo->ostate == JIS_ASCII) return 0;
1926 size_t or = oroom;
1927 size_t r = iconv(cinfo->handle, NULL, 0, &optr, &or);
1928 if (r == (size_t)-1) {
1929 if (errno == E2BIG) return OUTPUT_NOT_ENOUGH;
1930 Scm_Panic("jconv_iconv_reset: unknown error number %d\n", errno);
1931 }
1932 cinfo->ostate = JIS_ASCII;
1933 return oroom_prev - (ScmSize)or;
1934 }
1935 #endif /*HAVE_ICONV_H*/
1936
1937 /*------------------------------------------------------------------
1938 * JCONV_OPEN
1939 * Returns ScmConvInfo, setting up some fields.
1940 * If no conversion is possible, returns NULL.
1941 */
jconv_open(const char * toCode,const char * fromCode,int useIconv)1942 ScmConvInfo *jconv_open(const char *toCode, const char *fromCode,
1943 int useIconv)
1944 {
1945 ScmConvHandler *handler = NULL;
1946 ScmConvProc *convert = NULL;
1947 ScmConvReset *reset = NULL;
1948 int istate = 0, ostate = 0;
1949 iconv_t handle = (iconv_t)-1;
1950
1951 int incode = conv_name_find(fromCode);
1952 int outcode = conv_name_find(toCode);
1953
1954 if (incode >= 0 && outcode >= 0) {
1955 convert = conv_converter[incode][outcode].conv;
1956 reset = conv_converter[incode][outcode].reset;
1957 istate = conv_converter[incode][outcode].istate;
1958 ostate = conv_converter[incode][outcode].ostate;
1959 }
1960
1961 if (convert == NULL) {
1962 if (useIconv) {
1963 #ifdef HAVE_ICONV_H
1964 /* try iconv */
1965 handle = iconv_open(toCode, fromCode);
1966 if (handle == (iconv_t)-1) return NULL;
1967 handler = jconv_iconv;
1968 reset = jconv_iconv_reset;
1969 #else /*!HAVE_ICONV_H*/
1970 return NULL;
1971 #endif
1972 } else {
1973 return NULL;
1974 }
1975 } else if (convert == ident) {
1976 handler = jconv_ident;
1977 } else {
1978 handler = jconv_1tier;
1979 }
1980
1981 ScmConvInfo *cinfo;
1982 cinfo = SCM_NEW(ScmConvInfo);
1983 cinfo->jconv = handler;
1984 cinfo->convert = convert;
1985 cinfo->reset = reset;
1986 cinfo->handle = handle;
1987 cinfo->toCode = toCode;
1988 cinfo->istate = istate;
1989 cinfo->ostate = ostate;
1990 cinfo->fromCode = fromCode;
1991 /* The replacement settings can be modified by jconv_set_replacement */
1992 cinfo->replacep = FALSE;
1993 cinfo->replaceSize = 0;
1994 cinfo->replaceSeq = NULL;
1995 return cinfo;
1996 }
1997
1998 /*------------------------------------------------------------------
1999 * JCONV_SET_REPLACEMENT
2000 * Setting up replacement sequence according to the toCode.
2001 */
jconv_set_replacement(ScmConvInfo * cinfo)2002 void jconv_set_replacement(ScmConvInfo *cinfo)
2003 {
2004 static ScmObj ces_replacement_proc = SCM_UNDEFINED;
2005 SCM_BIND_PROC(ces_replacement_proc, "%ces-replacement",
2006 Scm_FindModule(SCM_SYMBOL(SCM_INTERN("gauche.charconv")), 0));
2007 ScmObj replacements = Scm_ApplyRec1(ces_replacement_proc,
2008 SCM_MAKE_STR(cinfo->toCode));
2009 ScmSize i = Scm_Length(replacements);
2010 if (i > 0) {
2011 cinfo->replacep = TRUE;
2012 cinfo->replaceSize = i;
2013 char *replaceSeq = SCM_NEW_ATOMIC_ARRAY(char, i);
2014 for (int j = 0; j < i; j++) {
2015 SCM_ASSERT(SCM_PAIRP(replacements));
2016 replaceSeq[j] = SCM_INT_VALUE(SCM_CAR(replacements));
2017 replacements = SCM_CDR(replacements);
2018 }
2019 cinfo->replaceSeq = replaceSeq;
2020 }
2021 }
2022
2023 /*------------------------------------------------------------------
2024 * JCONV_CLOSE
2025 */
jconv_close(ScmConvInfo * cinfo)2026 int jconv_close(ScmConvInfo *cinfo)
2027 {
2028 int r = 0;
2029 #ifdef HAVE_ICONV_H
2030 if (cinfo->handle != (iconv_t)-1) {
2031 r = iconv_close(cinfo->handle);
2032 cinfo->handle = (iconv_t)-1;
2033 }
2034 #endif /*HAVE_ICONV_H*/
2035 return r;
2036 }
2037
2038 /*------------------------------------------------------------------
2039 * JCONV - main conversion routine
2040 */
jconv(ScmConvInfo * cinfo,const char ** inptr,ScmSize * inroom,char ** outptr,ScmSize * outroom)2041 ScmSize jconv(ScmConvInfo *cinfo,
2042 const char **inptr, ScmSize *inroom,
2043 char **outptr, ScmSize *outroom)
2044 {
2045 SCM_ASSERT(cinfo->jconv != NULL);
2046 return cinfo->jconv(cinfo, inptr, inroom, outptr, outroom);
2047 }
2048
2049 /*------------------------------------------------------------------
2050 * JCONV_RESET - reset
2051 */
jconv_reset(ScmConvInfo * cinfo,char * outptr,ScmSize outroom)2052 ScmSize jconv_reset(ScmConvInfo *cinfo, char *outptr, ScmSize outroom)
2053 {
2054 if (cinfo->reset) {
2055 return cinfo->reset(cinfo, outptr, outroom);
2056 } else {
2057 return 0;
2058 }
2059 }
2060