1 /*
2  * jconv.c - alternative japanese code conversion routines
3  *
4  *   Copyright (c) 2000-2020  Shiro Kawai  <shiro@acm.org>
5  *
6  *   Redistribution and use in source and binary forms, with or without
7  *   modification, are permitted provided that the following conditions
8  *   are met:
9  *
10  *   1. Redistributions of source code must retain the above copyright
11  *      notice, this list of conditions and the following disclaimer.
12  *
13  *   2. Redistributions in binary form must reproduce the above copyright
14  *      notice, this list of conditions and the following disclaimer in the
15  *      documentation and/or other materials provided with the distribution.
16  *
17  *   3. Neither the name of the authors nor the names of its contributors
18  *      may be used to endorse or promote products derived from this
19  *      software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 /* Some iconv() implementations don't support japanese character encodings,
35  * or have problems handling them.  This code provides an alternative way
36  * to convert these encodings.
37  */
38 
39 /* This file handles conversion among UTF8, Shift-JIS, EUC_JP, and ISO2022JP.
40  * Shift-JIS and EUC_JP are based on JIS X 0213:2000.  ISO2022JP partially
41  * handles ISO2022-JP-3 as well.
42  */
43 
44 #include <ctype.h>
45 #include "charconv.h"
46 #include "jconv_tab.h"
47 #include "latin_tab.h"
48 
/* Return INPUT_NOT_ENOUGH from the enclosing function unless at least N
 * input octets are available.  Relies on a variable named `inroom' being
 * in scope at the point of use. */
#define INCHK(n)   do{if ((int)inroom < (n)) return INPUT_NOT_ENOUGH;}while(0)
/* Return OUTPUT_NOT_ENOUGH from the enclosing function unless at least N
 * output octets are available.  Relies on a variable named `outroom' being
 * in scope at the point of use. */
#define OUTCHK(n)  do{if ((int)outroom < (n)) return OUTPUT_NOT_ENOUGH;}while(0)

/* True if N is negative, i.e. one of the jconv error codes. */
#define ERRP(n)    ((n) < 0)

/* NOTE(review): not referenced in the visible part of this file; presumably
 * the size of a scratch buffer used by two-step conversions - confirm. */
#define INTERMEDIATE_BUF_SIZE  6
55 
56 /* Fill outptr with substitution character.  Can return jconv error code. */
do_subst(ScmConvInfo * cinfo,char * outptr,ScmSize outroom,ScmSize * outchars)57 static inline int do_subst(ScmConvInfo *cinfo,
58                            char *outptr,
59                            ScmSize outroom,
60                            ScmSize *outchars)
61 {
62     if (cinfo->replaceSize == 0) {
63         return NO_OUTPUT_CHAR;
64     }
65     OUTCHK(cinfo->replaceSize);
66     for (int i = 0; i < cinfo->replaceSize; i++) {
67         outptr[i] = cinfo->replaceSeq[i];
68     }
69     *outchars = cinfo->replaceSize;
70     return cinfo->replaceSize;
71 }
72 
/* Emit the substitution sequence through do_subst().  When do_subst()
 * reports a (negative) error, return that code from the enclosing
 * function; otherwise execution continues after the macro.
 * Relies on `cinfo', `outptr', `outroom' and `outchars' being in scope. */
#define DO_SUBST                                                \
    do {                                                        \
        int i = do_subst(cinfo, outptr, outroom, outchars);     \
        if (i < 0) return i;                                    \
    } while (0)
78 
79 /******************************************************************
80  *
81  *  Single-unit handling routines
82  *
 *  This section defines routines that convert a single input unit
 *  to a single output unit, optionally affecting the state.
 *  A unit is usually a character, but sometimes one input character
 *  may be mapped to more than one output character, or a sequence of
 *  input characters is mapped to one output character.
 *
 *  Each routine returns the number of input octets consumed, and
 *  sets the number of output octets emitted in *outchars.
 *  If an erroneous condition occurs, it returns one of the following
 *  error codes and does not update *outchars.
93  *
94  *  ILLEGAL_SEQUENCE  - Input contains illegal sequence.
95  *  INPUT_NOT_ENOUGH  - Input sequence ends prematurely.
96  *  OUTPUT_NOT_ENOUGH - Output buffer is too small.
97  *  NO_OUTPUT_CHAR    - Input unit can't be represented in output CES.
98  *
99  *****************************************************************/
100 
101 /*=================================================================
102  * EUC-JP
103  */
104 
105 /* EUC_JISX0213 -> Shift_JIS
106  *
 * Mapping anomalies
108  *
109  *   0x80--0xa0 except 0x8e and 0x8f : C1 region.
110  *          Doesn't have corresponding SJIS bytes,
111  *          so mapped to substitution char.
112  *   0xff : reserved byte.  mapped to substitution char.
113  *
114  * Conversion scheme
115  *   0x00-0x7f : corresponding ASCII range.
116  *   0x80--0x8d : substitution char.
117  *   0x8e : leading byte of JISX 0201 kana
118  *   0x8f : leading byte of JISX 0212 or JISX 0213 plane 2
119  *   0x90--0xa0 : substitution char.
120  *   0xa1--0xfe : first byte (e1) of JISX 0213 plane 1
121  *   0xff : substitution char
122  *
 *   For a double- or triple-byte character, each subsequent byte has to
 *   be in the range between 0xa1 and 0xfe inclusive.  If not, the
 *   character is replaced by the substitution character.
126  *
127  *   If the first byte is in the range of 0xa1--0xfe, two bytes (e1, e2)
128  *   is mapped to SJIS (s1, s2) by:
129  *
130  *     s1 = (e1 - 0xa0 + 0x101)/2 if 0xa1 <= e1 <= 0xde
131  *          (e1 - 0xa0 + 0x181)/2 if 0xdf <= e1 <= 0xfe
132  *     s2 = (e2 - 0xa0 + 0x3f) if odd?(e1) && 0xa1 <= e2 <= 0xdf
133  *          (e2 - 0xa0 + 0x40) if odd?(e1) && 0xe0 <= e2 <= 0xfe
134  *          (e2 - 0xa0 + 0x9e) if even?(e1)
135  *
136  *   If the first byte is 0x8f, the second byte (e1) and the third byte
137  *   (e2) is mapped to SJIS (s1, s2) by:
138  *     if (0xee <= e1 <= 0xfe)  s1 = (e1 - 0xa0 + 0x19b)/2
139  *     otherwise, follow the table:
140  *       e1 == 0xa1 or 0xa8  => s1 = 0xf0
141  *       e1 == 0xa3 or 0xa4  => s1 = 0xf1
142  *       e1 == 0xa5 or 0xac  => s1 = 0xf2
143  *       e1 == 0xae or 0xad  => s1 = 0xf3
144  *       e1 == 0xaf          => s1 = 0xf4
145  *     If e1 is other value, it is JISX0212; we use substitution char.
146  *     s2 is mapped with the same rule above.
147  */
148 
/* Convert one EUC-JISX0213 unit at INPTR into Shift_JISX0213.
 *
 * Returns the number of input octets consumed (1, 2 or 3) and stores the
 * number of octets written to OUTPTR in *OUTCHARS.  Input that has no
 * Shift_JIS counterpart (C1 bytes other than 0x8e/0x8f, 0xa0, 0xff,
 * JIS X 0212 rows, out-of-range trailing bytes) is replaced by the
 * substitution sequence.
 *
 * Errors: INPUT_NOT_ENOUGH if the multi-byte unit is truncated,
 * OUTPUT_NOT_ENOUGH if OUTPTR is too small, or whatever do_subst()
 * reports (e.g. NO_OUTPUT_CHAR when no substitution is configured).
 */
static ScmSize eucj_sjis(ScmConvInfo *cinfo SCM_UNUSED,
                         const char *inptr, ScmSize inroom,
                         char *outptr, ScmSize outroom,
                         ScmSize *outchars)
{
    unsigned char e1 = inptr[0];
    if (e1 <= 0x7f) {
        /* ASCII passes through unchanged */
        outptr[0] = e1;
        *outchars = 1;
        return 1;
    }
    if (e1 >= 0xa1 && e1 <= 0xfe) {
        /* double byte char (JISX 0213 plane 1) */
        unsigned char s1, s2;
        INCHK(2);
        unsigned char e2 = inptr[1];
        if (e2 < 0xa1 || e2 == 0xff) {
            DO_SUBST;
            return 2;
        }
        OUTCHK(2);
        if (e1 <= 0xde) s1 = (e1 - 0xa0 + 0x101)/2;
        else            s1 = (e1 - 0xa0 + 0x181)/2;
        if (e1%2 == 0) {
            s2 = e2 - 0xa0 + 0x9e;
        } else {
            /* odd rows skip 0x7f, which is not a valid trail byte */
            if (e2 <= 0xdf) s2 = e2 - 0xa0 + 0x3f;
            else            s2 = e2 - 0xa0 + 0x40;
        }
        outptr[0] = s1;
        outptr[1] = s2;
        *outchars = 2;
        return 2;
    }
    if (e1 == 0x8e) {
        /* JISX 0201 kana: the trailing byte is the Shift_JIS byte itself */
        INCHK(2);
        unsigned char e2 = inptr[1];
        if (e2 < 0xa1 || e2 == 0xff) {
            DO_SUBST;
        } else {
            outptr[0] = e2;
            *outchars = 1;
        }
        return 2;
    }
    if (e1 == 0x8f) {
        /* triple byte char */
        unsigned char s1, s2;
        /* Lead byte for plane-2 rows 0xa1..0xaf, indexed by (row - 0xa1).
           A zero entry marks a JIS X 0212 row with no Shift_JIS mapping. */
        static const unsigned char cvt[] = { 0xf0, 0, 0xf1, 0xf1, 0xf2, 0, 0, 0xf0, 0, 0, 0, 0xf2, 0xf3, 0xf3, 0xf4 };

        INCHK(3);
        OUTCHK(2);
        e1 = inptr[1];
        unsigned char e2 = inptr[2];
        if (e1 < 0xa1 || e1 == 0xff || e2 < 0xa1 || e2 == 0xff) {
            DO_SUBST;
            return 3;
        }
        if (e1 >= 0xee) {
            s1 = (e1 - 0xa0 + 0x19b)/2;
        } else if (e1 >= 0xb0) {
            /* rows 0xb0..0xed belong to JIS X 0212 only */
            DO_SUBST;
            return 3;
        } else {
            s1 = cvt[e1-0xa1];
            if (s1 == 0) {
                DO_SUBST;
                return 3;
            }
        }
        if (e1%2 == 0) {
            s2 = e2 - 0xa0 + 0x9e;
        } else {
            /* Same rule as plane 1: e2 == 0xdf must map to 0x7e, not 0x7f.
               (The boundary test used to be `e2 < 0xdf'.) */
            if (e2 <= 0xdf) s2 = e2 - 0xa0 + 0x3f;
            else            s2 = e2 - 0xa0 + 0x40;
        }
        outptr[0] = s1;
        outptr[1] = s2;
        *outchars = 2;
        return 3;
    }
    /* no corresponding char */
    DO_SUBST;
    return 1;
}
235 
236 /* [EUC_JP -> UTF8 conversion]
237  *
238  * Conversion strategy:
239  *   If euc0 is in ASCII range, or C1 range except 0x8e or 0x8f, map it as is.
240  *   If euc0 is 0x8e, use JISX0201-KANA table.
241  *   If euc0 is 0x8f, use JISX0213 plane 2 table.
242  *   If euc0 is in [0xa1-0xfe], use JISX0213 plane1 table.
 *   If euc0 is 0xa0 or 0xff, emit the substitution character.
244  *
245  * JISX0213 plane2 table is consisted by a 2-level tree.  The first-level
246  * returns an index to the second-level table by (euc1 - 0xa1).  Only the
247  * range of JISX0213 defined region is converted; JISX0212 region will be
248  * mapped to the substitution char.
249  */
250 
251 #include "eucj2ucs.c"
252 
253 /* UTF8 utility.  Similar stuff is included in gauche/char_utf_8.h
254    if the native encoding is UTF8, but not otherwise.
255    So I include them here as well. */
256 
/* Encode the code point UCS as UTF-8 into the buffer at CP.
 *
 * Uses the original 1- to 6-octet UTF-8 scheme, so any 32-bit value is
 * accepted.  The caller must provide enough room (up to 6 octets); no
 * terminator is written and nothing is returned.
 */
void jconv_ucs4_to_utf8(unsigned int ucs, char *cp)
{
    int ntrail;            /* number of continuation octets */
    unsigned int marker;   /* fixed high bits of the leading octet */
    unsigned int payload;  /* mask of data bits kept in the leading octet */

    if (ucs < 0x80) {
        *cp = ucs;
        return;
    }

    if (ucs < 0x800)          { ntrail = 1; marker = 0xc0; payload = 0x1f; }
    else if (ucs < 0x10000)   { ntrail = 2; marker = 0xe0; payload = 0x0f; }
    else if (ucs < 0x200000)  { ntrail = 3; marker = 0xf0; payload = 0x07; }
    else if (ucs < 0x4000000) { ntrail = 4; marker = 0xf8; payload = 0x03; }
    else                      { ntrail = 5; marker = 0xfc; payload = 0x01; }

    /* leading octet carries the topmost bits, then 6 bits per octet */
    cp[0] = ((ucs >> (6*ntrail)) & payload) | marker;
    for (int k = 1; k <= ntrail; k++) {
        cp[k] = ((ucs >> (6*(ntrail - k))) & 0x3f) | 0x80;
    }
}
292 
293 /* Returns # of input chars, or negative error code on error */
jconv_utf8_to_ucs4(const char * cp,ScmSize size,ScmChar * ucs)294 int jconv_utf8_to_ucs4(const char *cp, ScmSize size, ScmChar *ucs)
295 {
296     u_char u0 = cp[0];
297     if (u0 < 0x80) {
298         *ucs = u0;
299         return 1;
300     } else if (u0 < 0xc0) {
301         return ILLEGAL_SEQUENCE;
302     } else if (u0 < 0xe0) {
303         if (size < 2) return INPUT_NOT_ENOUGH;
304         u_char u1 = cp[1];
305         ScmChar ch = ((u0 & 0x1f) << 6) | (u1 & 0x3f);
306         if (ch < 0x80) return ILLEGAL_SEQUENCE;
307         *ucs = ch;
308         return 2;
309     } else if (u0 < 0xf0) {
310         if (size < 3) return INPUT_NOT_ENOUGH;
311         u_char u1 = cp[1], u2 = cp[2];
312         ScmChar ch = ((u0 & 0x0f) << 12) | ((u1 & 0x3f) << 6) | (u2 & 0x3f);
313         if (ch < 0x800) return ILLEGAL_SEQUENCE;
314         *ucs = ch;
315         return 3;
316     } else if (u0 < 0xf8) {
317         if (size < 4) return INPUT_NOT_ENOUGH;
318         u_char u1 = cp[1], u2 = cp[2], u3 = cp[3];
319         ScmChar ch = ((u0 & 0x07) << 18) | ((u1 & 0x3f) << 12)
320             | ((u2 & 0x3f) << 6) | (u3 & 0x3f);
321         if (ch < 0x10000) return ILLEGAL_SEQUENCE;
322         *ucs = ch;
323         return 4;
324     } else if (u0 < 0xfc) {
325         if (size < 5) return INPUT_NOT_ENOUGH;
326         u_char u1 = cp[1], u2 = cp[2], u3 = cp[3], u4 = cp[4];
327         ScmChar ch = ((u0 & 0x03) << 24) | ((u1 & 0x3f) << 18)
328             | ((u2 & 0x3f) << 12) | ((u3 & 0x3f) << 6) | (u4 & 0x3f);
329         if (ch < 0x8000000) return ILLEGAL_SEQUENCE;
330         *ucs = ch;
331         return 5;
332     } else if (u0 < 0xfe) {
333         if (size < 6) return INPUT_NOT_ENOUGH;
334         u_char u1 = cp[1], u2 = cp[2], u3 = cp[3], u4 = cp[4], u5 = cp[5];
335         ScmChar ch = ((u0 & 0x01) << 30) | ((u1 & 0x3f) << 24)
336             | ((u2 & 0x3f) << 18) | ((u3 & 0x3f) << 12)
337             | ((u4 & 0x3f) << 6) | (u5 & 0x3f);
338         *ucs = ch;
339         return 6;
340     } else {
341         return ILLEGAL_SEQUENCE;
342     }
343 }
344 
345 /* Given 'encoded' ucs, emit utf8.  'Encoded' ucs is the entry of the
346    conversion table.  If ucs >= 0x100000, it is composed by two UCS2
347    character.  Otherwise, it is one UCS4 character. */
eucj_utf8_emit_utf(unsigned int ucs,ScmSize inchars,char * outptr,ScmSize outroom,ScmSize * outchars)348 static inline ScmSize eucj_utf8_emit_utf(unsigned int ucs, ScmSize inchars,
349                                          char *outptr, ScmSize outroom,
350                                          ScmSize *outchars)
351 {
352     if (ucs < 0x100000) {
353         int outreq = UCS2UTF_NBYTES(ucs);
354         OUTCHK(outreq);
355         jconv_ucs4_to_utf8(ucs, outptr);
356         *outchars = outreq;
357     } else {
358         /* we need two UCS characters */
359         unsigned int ucs0 = (ucs >> 16) & 0xffff;
360         unsigned int ucs1 = ucs & 0xfff;
361         int outreq0 = UCS2UTF_NBYTES(ucs0);
362         int outreq1 = UCS2UTF_NBYTES(ucs1);
363         OUTCHK(outreq0+outreq1);
364         jconv_ucs4_to_utf8(ucs0, outptr);
365         jconv_ucs4_to_utf8(ucs1, outptr+outreq0);
366         *outchars = outreq0+outreq1;
367     }
368     return inchars;
369 }
370 
/* Convert one EUC-JISX0213 unit at INPTR into UTF-8.
 *
 * Returns the number of input octets consumed and stores the number of
 * octets written to OUTPTR in *OUTCHARS.  Characters without a table
 * entry, and the lead bytes 0xa0 and 0xff, produce the substitution
 * sequence.
 *
 * Errors: ILLEGAL_SEQUENCE when a trailing byte is out of range,
 * INPUT_NOT_ENOUGH / OUTPUT_NOT_ENOUGH when a buffer is too short, or
 * whatever do_subst() reports.
 */
static ScmSize eucj_utf8(ScmConvInfo *cinfo SCM_UNUSED,
                         const char *inptr, ScmSize inroom,
                         char *outptr, ScmSize outroom, ScmSize *outchars)
{
    const unsigned char lead = (unsigned char)inptr[0];

    if (lead == 0x8e) {
        /* JIS X 0201 kana: 0xa1..0xdf map linearly from U+FF61 */
        INCHK(2);
        const unsigned char kana = (unsigned char)inptr[1];
        if (kana < 0xa1 || kana > 0xdf) return ILLEGAL_SEQUENCE;
        unsigned int ucs = 0xff61 + (kana - 0xa1);
        return eucj_utf8_emit_utf(ucs, 2, outptr, outroom, outchars);
    }

    if (lead == 0x8f) {
        /* JIS X 0213 plane 2, looked up through the two-level table */
        INCHK(3);
        const unsigned char row = (unsigned char)inptr[1];
        const unsigned char col = (unsigned char)inptr[2];
        if (row < 0xa1 || row > 0xfe || col < 0xa1 || col > 0xfe) {
            return ILLEGAL_SEQUENCE;
        }
        /* a negative index marks a row with no JIS X 0213 characters */
        int index = euc_jisx0213_2_index[row - 0xa1];
        unsigned int ucs = 0;
        if (index >= 0) {
            ucs = euc_jisx0213_2_to_ucs2[index][col - 0xa1];
        }
        if (ucs == 0) {
            DO_SUBST;
            return 3;
        }
        return eucj_utf8_emit_utf(ucs, 3, outptr, outroom, outchars);
    }

    if (lead < 0xa0) {
        /* ASCII or the rest of the C1 region: copied through unchanged.
           NOTE(review): for 0x80..0x9f this stores a lone byte, which is
           not a well-formed UTF-8 sequence - confirm this is intended. */
        outptr[0] = lead;
        *outchars = 1;
        return 1;
    }

    if (lead == 0xa0 || lead == 0xff) {
        /* reserved lead bytes */
        DO_SUBST;
        return 1;
    }

    /* JIS X 0213 plane 1 (lead in 0xa1..0xfe) */
    INCHK(2);
    const unsigned char trail = (unsigned char)inptr[1];
    if (trail < 0xa1 || trail > 0xfe) return ILLEGAL_SEQUENCE;
    unsigned int ucs = euc_jisx0213_1_to_ucs2[lead - 0xa1][trail - 0xa1];
    if (ucs == 0) {
        DO_SUBST;
        return 2;
    }
    return eucj_utf8_emit_utf(ucs, 2, outptr, outroom, outchars);
}
430 
431 /* EUC_JP -> ISO8859-1 */
eucj_lat1(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)432 static ScmSize eucj_lat1(ScmConvInfo *cinfo,
433                          const char *inptr, ScmSize inroom,
434                          char *outptr, ScmSize outroom, ScmSize *outchars)
435 {
436     char u[6];
437     ScmSize nu;
438     ScmSize r = eucj_utf8(cinfo, inptr, inroom, u, 6, &nu);
439     if (r < 0) return r;
440     ScmChar ch;
441     ScmSize r2 = jconv_utf8_to_ucs4(u, nu, &ch);
442     if (r2 < 0) return r2;
443     if (ch < 0x100) {
444         *outptr = ch;
445         *outchars = 1;
446     } else {
447         DO_SUBST;
448     }
449     return r;
450 }
451 
452 /* EUC_JP -> ISO2022JP(-3)
453  *
454  * For now, I follow the strategy of iso2022jp-3-compatible behavior.
455  */
456 
457 /* ensure the current state is newstate.  returns # of output chars.
458    may return OUTPUT_NOT_ENOUGH. */
jis_ensure_state(ScmConvInfo * cinfo,int newstate,ScmSize outbytes,char * outptr,ScmSize outroom)459 static ScmSize jis_ensure_state(ScmConvInfo *cinfo, int newstate,
460                                 ScmSize outbytes,
461                                 char *outptr, ScmSize outroom)
462 {
463     const char *escseq = NULL;
464     ScmSize esclen = 0;
465 
466     if (cinfo->ostate == newstate) {
467         OUTCHK(outbytes);
468         return 0;
469     }
470     switch (newstate) {
471     case JIS_ASCII:
472         escseq = "\033(B";  esclen = 3; break;
473     case JIS_KANA:
474         escseq = "\033(I";  esclen = 3; break;
475     case JIS_0213_1:
476         escseq = "\033$B";  esclen = 3; break;
477     case JIS_0213_2:
478         escseq = "\033$(P"; esclen = 4; break;
479     case JIS_0212:
480         escseq = "\033$(D"; esclen = 4; break;
481     default:
482         /* Can't be here */
483         Scm_Panic("something wrong in jis_ensure_state: implementation error?");
484         return 0;               /* dummy */
485     }
486     OUTCHK(esclen + outbytes);
487     memcpy(outptr, escseq, esclen);
488     cinfo->ostate = newstate;
489     return esclen;
490 }
491 
/* Convert one EUC-JP unit at INPTR into ISO2022-JP(-3).
 *
 * An escape sequence is emitted first whenever the required character
 * set differs from cinfo->ostate (see jis_ensure_state()).  Returns the
 * number of input octets consumed and stores the number of octets
 * written, escape sequence included, in *OUTCHARS.
 *
 * Errors: INPUT_NOT_ENOUGH if a multi-byte unit is truncated,
 * OUTPUT_NOT_ENOUGH if OUTPTR cannot hold escape sequence plus
 * character, ILLEGAL_SEQUENCE for an invalid lead or trailing byte.
 */
static ScmSize eucj_jis(ScmConvInfo *cinfo, const char *inptr, ScmSize inroom,
                        char *outptr, ScmSize outroom, ScmSize *outchars)
{
    unsigned char e0 = inptr[0];
    if (e0 < 0x80) {
        /* ASCII */
        ScmSize outoffset = jis_ensure_state(cinfo, JIS_ASCII, 1, outptr, outroom);
        if (ERRP(outoffset)) return outoffset;
        outptr[outoffset] = e0;
        *outchars = outoffset+1;
        return 1;
    } else if (e0 == 0x8e) {
        /* JIS X 0201 kana */
        INCHK(2);
        unsigned char e1 = inptr[1];
        if (e1 > 0xa0 && e1 < 0xff) {
            ScmSize outoffset = jis_ensure_state(cinfo, JIS_KANA, 1, outptr, outroom);
            if (ERRP(outoffset)) return outoffset;
            outptr[outoffset] = e1 - 0x80;
            *outchars = outoffset+1;
            return 2;
        }
    } else if (e0 == 0x8f) {
        /* three-byte form: JIS X 0213 plane 2 or JIS X 0212 */
        INCHK(3);
        e0 = inptr[1];
        unsigned char e1 = inptr[2];
        if (e0 > 0xa0 && e0 < 0xff && e1 > 0xa0 && e1 < 0xff) {
            /* These rows, and everything from 0xee up, belong to
               JIS X 0213 plane 2; the remaining rows are JIS X 0212. */
            int newstate = JIS_0212;
            switch (e0) {
            case 0xa1: case 0xa3: case 0xa4: case 0xa5:
            case 0xa8: case 0xac: case 0xad: case 0xae: case 0xaf:
                newstate = JIS_0213_2; break;
            default:
                if (e0 >= 0xee) newstate = JIS_0213_2;
            }
            ScmSize outoffset = jis_ensure_state(cinfo, newstate, 2, outptr, outroom);
            /* A negative result is an error code and must not be used
               as an offset into outptr. */
            if (ERRP(outoffset)) return outoffset;
            outptr[outoffset] = e0 - 0x80;
            outptr[outoffset+1] = e1 - 0x80;
            /* two octets were written after the escape sequence */
            *outchars = outoffset+2;
            return 3;
        }
    } else if (e0 > 0xa0 && e0 < 0xff) {
        /* JIS X 0213 plane 1 */
        INCHK(2);
        unsigned char e1 = inptr[1];
        if (e1 > 0xa0 && e1 < 0xff) {
            ScmSize outoffset = jis_ensure_state(cinfo, JIS_0213_1, 2, outptr, outroom);
            if (ERRP(outoffset)) return outoffset;
            outptr[outoffset] = e0 - 0x80;
            outptr[outoffset+1] = e1 - 0x80;
            *outchars = outoffset+2;
            return 2;
        }
    }
    return ILLEGAL_SEQUENCE;
}
545 
546 
547 /* EUC-JP -> ASCII */
eucj_ascii(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)548 static ScmSize eucj_ascii(ScmConvInfo *cinfo,
549                           const char *inptr, ScmSize inroom,
550                           char *outptr, ScmSize outroom,
551                           ScmSize *outchars)
552 {
553     unsigned char e1 = inptr[0];
554     if (e1 <= 0x7f) {
555         outptr[0] = e1;
556         *outchars = 1;
557         return 1;
558     }
559     if (e1 >= 0xa1 && e1 <= 0xfe) {
560         /* double byte char (JISX 0213 plane 1) */
561         INCHK(2);
562         DO_SUBST;
563         return 2;
564     }
565     if (e1 == 0x8e) {
566         INCHK(2);
567         DO_SUBST;
568         return 2;
569     }
570     if (e1 == 0x8f) {
571         INCHK(3);
572         DO_SUBST;
573         return 3;
574     }
575     DO_SUBST;
576     return 1;
577 }
578 
579 /*=================================================================
580  * Shift JIS
581  */
582 
583 /* Shift_JISX0213 -> EUC-JP
584  *
 * Mapping anomalies
586  *
587  *   0x5c, 0x7e : Shift_JISX0213 mapping table maps 0x5c to U+00A5
588  *       (YEN SIGN) and 0x7e to U+203E (OVERLINE).  But mapping so
589  *       breaks the program code written in Shift JIS.   I map them
590  *       to the corresponding ASCII chars.
591  *   0xfd, 0xfe, 0xff : These are reserved bytes.  Apple uses these
592  *       bytes for vendor extension:
593  *        0xfd - U+00A9 COPYRIGHT SIGN     |EUC A9A6  |JISX0213
594  *        0xfe - U+2122 TRADE MARK SIGN    |EUC 8FA2EF|JISX0212
595  *        0xff - U+2026 HORIZONTAL ELLIPSIS|EUC A1C4  |JISX0208
596  *       This is a one-direction mapping.
597  *   0x80, 0xa0 : These are reserved bytes.  Replaced to the
598  *       one-byte substitution character of destination encoding.
599  *
600  * Conversion scheme
601  *   0x00-0x7f : corresponding ASCII range.
602  *   0x80      : substitution character
603  *   0x81 -- 0x9f : first byte (s1) of double byte range for JIS X 0213 m=1
604  *   0xa0      : substitution character
605  *   0xa1 -- 0xdf : JISX 0201 kana = s1-0x80
606  *   0xe0 -- 0xef : first byte (s1) of double byte range for JIS X 0213 m=1
607  *   0xf0 -- 0xfc : first byte (s1) of double byte range for JIS X 0213 m=2
608  *   0xfd : U+00A9, EUC A9A6, JISX0213 (1, 0x09, 0x06)
609  *   0xfe : U+2122, EUC 8FA2EF, JISX0212
610  *   0xff : U+2026, EUC A1C4, JISX0208 (1, 0x01, 0x24)
611  *
612  *   For double-byte character, second byte s2 must be in the range of
613  *   0x40 <= s2 <= 0x7e or 0x80 <= s2 <= 0xfc.  Otherwise, double-byte
614  *   substitution character is used.
615  *
616  *     two bytes (s1, s2) maps to JIS X 0213 (m, k, t) by
617  *        m = 1 if s1 <= 0xef, 2 otherwise
618  *        k = (s1-0x80)*2 - ((s2 < 0x9f)? 1 : 0)  if s1 <= 0x9f
619  *            (s1-0xc0)*2 - ((s2 < 0x9f)? 1 : 0)  if 0xe0 <= s1 <= 0xef
 *            (s1-0xcd)*2 - ((s2 < 0x9f)? 1 : 0)  if s1 >= 0xf5
 *            otherwise, use the following table
 *               s1   k (s2<0x9f, s2>=0x9f)
 *              0xf0   (0x01, 0x08)
 *              0xf1   (0x03, 0x04)
 *              0xf2   (0x05, 0x0c)
 *              0xf3   (0x0d, 0x0e)
 *              0xf4   (0x0f, 0x4e)
628  *        t = s2-0x3f if s2 < 0x7f
629  *            s2-0x40 if s2 < 0x9f
630  *            s2-0x9e otherwise
631  *
 *     JIS X 0213 to EUC-JP is a straightforward conversion.
633  */
634 
/* Convert one Shift_JISX0213 unit at INPTR into EUC-JISX0213.
 *
 * Returns the number of input octets consumed (1 or 2) and stores the
 * number of octets written to OUTPTR (1, 2 or 3) in *OUTCHARS.
 * The reserved bytes 0x80 and 0xa0, and double-byte units whose second
 * byte is outside 0x40..0xfc, are replaced by the substitution sequence.
 *
 * Errors: INPUT_NOT_ENOUGH if a double-byte unit is truncated,
 * OUTPUT_NOT_ENOUGH if OUTPTR is too small, or whatever do_subst()
 * reports.
 */
static ScmSize sjis_eucj(ScmConvInfo *cinfo SCM_UNUSED,
                         const char *inptr, ScmSize inroom,
                         char *outptr, ScmSize outroom,
                         ScmSize *outchars)
{
    /* EUC row bytes of JIS X 0213 plane 2 for the lead bytes 0xf0..0xf4.
       Each lead byte owns two rows: the odd row is selected by a low
       second byte (s2 < 0x9f), the even row by a high one.  This is the
       inverse of the parity rule used by eucj_sjis(). */
    static const unsigned char cvt[] = {
        0xa1, 0xa8,             /* 0xf0: rows  1 and  8 */
        0xa3, 0xa4,             /* 0xf1: rows  3 and  4 */
        0xa5, 0xac,             /* 0xf2: rows  5 and 12 */
        0xad, 0xae,             /* 0xf3: rows 13 and 14 */
        0xaf, 0xee              /* 0xf4: rows 15 and 78 */
    };

    unsigned char s1 = inptr[0];
    if (s1 <= 0x7f) {
        *outptr = s1;
        *outchars = 1;
        return 1;
    }
    if ((s1 > 0x80 && s1 < 0xa0) || (s1 >= 0xe0 && s1 <= 0xfc)) {
        /* Double byte char */
        unsigned char m, e1, e2;
        INCHK(2);
        unsigned char s2 = inptr[1];
        if (s2 < 0x40 || s2 > 0xfc) {
            DO_SUBST;
            return 2;
        }

        if (s1 <= 0x9f) {
            OUTCHK(2);
            m = 1;
            e1 = (s1-0x80)*2 + 0xa0 - ((s2 < 0x9f)? 1 : 0);
        } else if (s1 <= 0xef) {
            OUTCHK(2);
            m = 1;
            e1 = (s1-0xc0)*2 + 0xa0 - ((s2 < 0x9f)? 1 : 0);
        } else if (s1 >= 0xf5) {
            OUTCHK(3);
            m = 2;
            e1 = (s1-0xf5)*2 + 0x50 + 0xa0 - ((s2 < 0x9f)? 1 : 0);
        } else {
            /* 0xf0..0xf4: irregular rows, table lookup.  A low second
               byte picks the first (odd-row) entry of the pair. */
            OUTCHK(3);
            m = 2;
            e1 = cvt[(s1-0xf0)*2+((s2 < 0x9f)? 0 : 1)];
        }

        if (s2 < 0x7f) {
            e2 = s2 - 0x3f + 0xa0;
        } else if (s2 < 0x9f) {
            e2 = s2 - 0x40 + 0xa0;
        } else {
            e2 = s2 - 0x9e + 0xa0;
        }
        if (m == 1) {
            outptr[0] = e1;
            outptr[1] = e2;
            *outchars = 2;
        } else {
            /* plane 2 is introduced by the single-shift byte 0x8f */
            outptr[0] = 0x8f;
            outptr[1] = e1;
            outptr[2] = e2;
            *outchars = 3;
        }
        return 2;
    }
    if (s1 >= 0xa1 && s1 <= 0xdf) {
        /* JISX0201 KANA */
        OUTCHK(2);
        outptr[0] = 0x8e;
        outptr[1] = s1;
        *outchars = 2;
        return 1;
    }
    if (s1 == 0xfd) {
        /* copyright mark */
        OUTCHK(2);
        outptr[0] = 0xa9;
        outptr[1] = 0xa6;
        *outchars = 2;
        return 1;
    }
    if (s1 == 0xfe) {
        /* trademark sign.  this is not in JISX0213, but in JISX0212. */
        OUTCHK(3);
        outptr[0] = 0x8f;
        outptr[1] = 0xa2;
        outptr[2] = 0xef;
        *outchars = 3;
        return 1;
    }
    if (s1 == 0xff) {
        /* horizontal ellipsis. */
        OUTCHK(2);
        outptr[0] = 0xa1;
        outptr[1] = 0xc4;
        *outchars = 2;
        return 1;
    }

    /* s1 == 0x80 or 0xa0: a single reserved byte.  Only that one byte is
       consumed; the following byte has not been examined (no INCHK). */
    DO_SUBST;
    return 1;
}
733 
734 /* SJIS -> ASCII */
735 
sjis_ascii(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)736 static ScmSize sjis_ascii(ScmConvInfo *cinfo,
737                           const char *inptr, ScmSize inroom,
738                           char *outptr, ScmSize outroom,
739                           ScmSize *outchars)
740 {
741     unsigned char s1 = inptr[0];
742     if (s1 <= 0x7f) {
743         outptr[0] = s1;
744         *outchars = 1;
745         return 1;
746     }
747     if ((s1 > 0x80 && s1 < 0xa0) || (s1 >= 0xe0 && s1 < 0xfc)) {
748         INCHK(2);
749         DO_SUBST;
750         *outchars = cinfo->replaceSize;
751         return 2;
752     }
753     else {
754         DO_SUBST;
755         *outchars = cinfo->replaceSize;
756         return 1;
757     }
758 }
759 
760 /*=================================================================
761  * UTF8
762  */
763 
764 /* Conversion between UTF8 and EUC_JP is based on the table found at
765  * http://isweb11.infoseek.co.jp/computer/wakaba/table/jis-note.ja.html
766  *
767  * There are some characters in JISX0213 that can't be represented
768  * in a single Unicode character, but can be with a combining character.
769  * In such case, EUC_JP to UTF8 conversion uses combining character,
770  * but UTF8 to EUC_JP conversion translates the combining character into
771  * another character.  For example, a single JISX0213 katakana 'nga'
 * (hiragana "ka" with han-dakuon mark) translates to Unicode
773  * U+304B+309A (HIRAGANA LETTER KA + COMBINING KATAKANA-HIRAGANA SEMI-VOICED
774  * SOUND MARK).  When this sequence is converted to EUC_JP again, it
775  * becomes EUCJ 0xA4AB + 0xA1AC.  This is an implementation limitation,
776  * and should be removed in later release.
777  */
778 
779 /* [UTF8 -> EUC_JP conversion]
780  *
781  * EUC-JP has the corresponding characters to the wide range of
782  * UCS characters.
783  *
784  *   UCS4 character   # of EUC_JP characters
785  *   ---------------------------------------
786  *     U+0000+0xxx    564
787  *     U+0000+1xxx      6
788  *     U+0000+2xxx    321
789  *     U+0000+3xxx    422
790  *     U+0000+4xxx    347
791  *     U+0000+5xxx   1951
792  *     U+0000+6xxx   2047
793  *     U+0000+7xxx   1868
794  *     U+0000+8xxx   1769
795  *     U+0000+9xxx   1583
796  *     U+0000+fxxx    241
797  *     U+0002+xxxx    302
798  *
799  * It is so wide and so sparse that naive lookup table implementation from
800  * UCS to EUC can be space-wasting.  I use hierarchical table with some
801  * ad-hoc heuristics.   Since the hierarchical table is used, I directly
802  * translates UTF8 to EUC_JP, without converting it to UCS4.
803  *
804  * Strategy outline: say input consists of bytes named u0, u1, ....
805  *
806  *  u0 <= 0x7f  : ASCII range
807  *  u0 in [0xc2-0xd1] : UTF8 uses 2 bytes.  Some mappings within this range
808  *         is either very regular or very small, and they are
809  *         hardcoded.   Other mappings uses table lookup.
810  *  u0 == 0xe1  : UTF8 uses 3 bytes.  There are only 6 characters in this
811  *         range, and it is hardcoded.
812  *  u0 in [0xe2-0xe9, 0xef] : Large number of characters are in this range.
813  *         Two-level table of 64 entries each is used to dispatch the
814  *         characters.
815  *  u0 == 0xf0  : UTF8 uses 4 bytes.  u1 is in [0xa0-0xaa].  u2 and u3 is
816  *         used for dispatch table of 64 entries each.
817  *
818  * The final table entry is unsigned short.  0x0000 means no corresponding
819  * character is defined in EUC_JP.  >=0x8000 is the EUC_JP character itself.
820  * < 0x8000 means the character is in G3 plane; 0x8f should be preceded,
821  * and 0x8000 must be added to the value.
822  */
823 
824 #include "ucs2eucj.c"
825 
826 /* Emit given euc char */
utf2euc_emit_euc(unsigned short euc,ScmSize inchars,char * outptr,ScmSize outroom,ScmSize * outchars)827 static inline ScmSize utf2euc_emit_euc(unsigned short euc,
828                                        ScmSize inchars,
829                                        char *outptr,
830                                        ScmSize outroom,
831                                        ScmSize *outchars)
832 {
833     if (euc < 0x8000) {
834         OUTCHK(3);
835         outptr[0] = 0x8f;
836         outptr[1] = (euc >> 8) + 0x80;
837         outptr[2] = euc & 0xff;
838         *outchars = 3;
839     } else {
840         OUTCHK(2);
841         outptr[0] = (euc >> 8);
842         outptr[1] = euc & 0xff;
843         *outchars = 2;
844     }
845     return inchars;
846 }
847 
848 /* handle 2-byte UTF8 sequence.  0xc0 <= u0 <= 0xdf */
utf2euc_2(ScmConvInfo * cinfo SCM_UNUSED,unsigned char u0,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)849 static inline ScmSize utf2euc_2(ScmConvInfo *cinfo SCM_UNUSED, unsigned char u0,
850                                 const char *inptr, ScmSize inroom,
851                                 char *outptr, ScmSize outroom,
852                                 ScmSize *outchars)
853 {
854     const unsigned short *etab = NULL;
855 
856     INCHK(2);
857     unsigned char u1 = (unsigned char)inptr[1];
858     if (u1 < 0x80 || u1 >= 0xc0) return ILLEGAL_SEQUENCE;
859 
860     switch (u0) {
861     case 0xc2: etab = utf2euc_c2; break;
862     case 0xc3: etab = utf2euc_c3; break;
863     case 0xc4: etab = utf2euc_c4; break;
864     case 0xc5: etab = utf2euc_c5; break;
865     case 0xc6:
866         if (u1 == 0x93) { /* U+0193 -> euc ABA9 */
867             return utf2euc_emit_euc(0xaba9, 2, outptr, outroom, outchars);
868         } else break;
869     case 0xc7: etab = utf2euc_c7; break;
870     case 0xc9: etab = utf2euc_c9; break;
871     case 0xca: etab = utf2euc_ca; break;
872     case 0xcb: etab = utf2euc_cb; break;
873     case 0xcc: etab = utf2euc_cc; break;
874     case 0xcd:
875         if (u1 == 0xa1) { /* U+0361 -> euc ABD2 */
876             return utf2euc_emit_euc(0xabd2, 2, outptr, outroom, outchars);
877         } else break;
878     case 0xce: etab = utf2euc_ce; break;
879     case 0xcf: etab = utf2euc_cf; break;
880     case 0xd0: etab = utf2euc_d0; break;
881     case 0xd1: etab = utf2euc_d1; break;
882     default:
883         break;
884     }
885     if (etab != NULL) {
886         /* table lookup */
887         unsigned short euc = etab[u1-0x80];
888         if (euc != 0) {
889             return utf2euc_emit_euc(euc, 2, outptr, outroom, outchars);
890         }
891     }
892     DO_SUBST;
893     return 2;
894 }
895 
896 /* handle 3-byte UTF8 sequence.  0xe0 <= u0 <= 0xef */
utf2euc_3(ScmConvInfo * cinfo SCM_UNUSED,unsigned char u0,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)897 static inline ScmSize utf2euc_3(ScmConvInfo *cinfo SCM_UNUSED, unsigned char u0,
898                                 const char *inptr, ScmSize inroom,
899                                 char *outptr, ScmSize outroom,
900                                 ScmSize *outchars)
901 {
902     const unsigned char *tab1 = NULL;
903     const unsigned short (*tab2)[64] = NULL;
904 
905     INCHK(3);
906     unsigned char u1 = (unsigned char)inptr[1];
907     unsigned char u2 = (unsigned char)inptr[2];
908 
909     switch (u0) {
910     case 0xe1: /* special case : there's only 6 chars */
911         {
912             unsigned short euc = 0;
913             if (u1 == 0xb8) {
914                 if (u2 == 0xbe)      euc = 0xa8f2;
915                 else if (u2 == 0xbf) euc = 0xa8f3;
916             } else if (u1 == 0xbd) {
917                 if (u2 == 0xb0)      euc = 0xabc6;
918                 else if (u2 == 0xb1) euc = 0xabc7;
919                 else if (u2 == 0xb2) euc = 0xabd0;
920                 else if (u2 == 0xb3) euc = 0xabd1;
921             }
922             return utf2euc_emit_euc(euc, 3, outptr, outroom, outchars);
923         }
924     case 0xe2: tab1 = utf2euc_e2; tab2 = utf2euc_e2_xx; break;
925     case 0xe3: tab1 = utf2euc_e3; tab2 = utf2euc_e3_xx; break;
926     case 0xe4: tab1 = utf2euc_e4; tab2 = utf2euc_e4_xx; break;
927     case 0xe5: tab1 = utf2euc_e5; tab2 = utf2euc_e5_xx; break;
928     case 0xe6: tab1 = utf2euc_e6; tab2 = utf2euc_e6_xx; break;
929     case 0xe7: tab1 = utf2euc_e7; tab2 = utf2euc_e7_xx; break;
930     case 0xe8: tab1 = utf2euc_e8; tab2 = utf2euc_e8_xx; break;
931     case 0xe9: tab1 = utf2euc_e9; tab2 = utf2euc_e9_xx; break;
932     case 0xef: tab1 = utf2euc_ef; tab2 = utf2euc_ef_xx; break;
933     default:
934         break;
935     }
936     if (tab1 != NULL) {
937         unsigned char ind = tab1[u1-0x80];
938         if (ind != 0) {
939             unsigned short euc = tab2[ind-1][u2-0x80];
940             if (euc != 0) {
941                 return utf2euc_emit_euc(euc, 3, outptr, outroom, outchars);
942             }
943         }
944     }
945     DO_SUBST;
946     return 3;
947 }
948 
949 /* handle 4-byte UTF8 sequence.  u0 == 0xf0, 0xa0 <= u1 <= 0xaa */
utf2euc_4(ScmConvInfo * cinfo SCM_UNUSED,unsigned char u0,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)950 static inline ScmSize utf2euc_4(ScmConvInfo *cinfo SCM_UNUSED, unsigned char u0,
951                                 const char *inptr, ScmSize inroom,
952                                 char *outptr, ScmSize outroom,
953                                 ScmSize *outchars)
954 {
955     const unsigned short *tab = NULL;
956 
957     INCHK(4);
958     if (u0 != 0xf0) {
959         DO_SUBST;
960         return 4;
961     }
962     unsigned char u1 = (unsigned char)inptr[1];
963     unsigned char u2 = (unsigned char)inptr[2];
964     unsigned char u3 = (unsigned char)inptr[3];
965 
966     switch (u1) {
967     case 0xa0: tab = utf2euc_f0_a0; break;
968     case 0xa1: tab = utf2euc_f0_a1; break;
969     case 0xa2: tab = utf2euc_f0_a2; break;
970     case 0xa3: tab = utf2euc_f0_a3; break;
971     case 0xa4: tab = utf2euc_f0_a4; break;
972     case 0xa5: tab = utf2euc_f0_a5; break;
973     case 0xa6: tab = utf2euc_f0_a6; break;
974     case 0xa7: tab = utf2euc_f0_a7; break;
975     case 0xa8: tab = utf2euc_f0_a8; break;
976     case 0xa9: tab = utf2euc_f0_a9; break;
977     case 0xaa: tab = utf2euc_f0_aa; break;
978     default:
979         break;
980     }
981     if (tab != NULL) {
982         unsigned short u2u3 = u2*256 + u3;
983         for (int i=0; tab[i]; i+=2) {
984             if (tab[i] == u2u3) {
985                 unsigned short euc = tab[i+1];
986                 if (euc != 0) {
987                     return utf2euc_emit_euc(euc, 4, outptr, outroom, outchars);
988                 }
989             }
990         }
991     }
992     DO_SUBST;
993     return 4;
994 }
995 
996 /* Body of UTF8 -> EUC_JP conversion */
utf8_eucj(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)997 static ScmSize utf8_eucj(ScmConvInfo *cinfo,
998                         const char *inptr, ScmSize inroom,
999                         char *outptr, ScmSize outroom,
1000                         ScmSize *outchars)
1001 {
1002     unsigned char u0 = (unsigned char)inptr[0];
1003 
1004     if (u0 <= 0x7f) {
1005         *outptr = u0;
1006         *outchars = 1;
1007         return 1;
1008     }
1009     if (u0 <= 0xbf) {
1010         /* invalid UTF8 sequence */
1011         return ILLEGAL_SEQUENCE;
1012     }
1013     if (u0 <= 0xdf) {
1014         /* 2-byte UTF8 sequence */
1015         return utf2euc_2(cinfo, u0, inptr, inroom, outptr, outroom, outchars);
1016     }
1017     if (u0 <= 0xef) {
1018         /* 3-byte UTF8 sequence */
1019         return utf2euc_3(cinfo, u0, inptr, inroom, outptr, outroom, outchars);
1020     }
1021     if (u0 <= 0xf7) {
1022         /* 4-byte UTF8 sequence */
1023         return utf2euc_4(cinfo, u0, inptr, inroom, outptr, outroom, outchars);
1024     }
1025     if (u0 <= 0xfb) {
1026         /* 5-byte UTF8 sequence */
1027         INCHK(5);
1028         DO_SUBST;
1029         return 5;
1030     }
1031     if (u0 <= 0xfd) {
1032         /* 6-byte UTF8 sequence */
1033         INCHK(6);
1034         DO_SUBST;
1035         return 6;
1036     }
1037     return ILLEGAL_SEQUENCE;
1038 }
1039 
1040 /* UTF8 -> UTF16 */
utf8_utf16(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)1041 static ScmSize utf8_utf16(ScmConvInfo *cinfo,
1042                           const char *inptr, ScmSize inroom,
1043                           char *outptr, ScmSize outroom,
1044                           ScmSize *outchars)
1045 {
1046     ScmSize reqsize = 0;
1047     int ostate = cinfo->ostate;
1048     int need_bom = FALSE;
1049     ScmChar ch;
1050 
1051     if (ostate == UTF_DEFAULT) {
1052         reqsize += 2;
1053         need_bom = TRUE;
1054         ostate = UTF_BE;
1055     }
1056     int r = jconv_utf8_to_ucs4(inptr, inroom, &ch);
1057     if (r < 0) return r;
1058     if (ch < 0x10000) reqsize += 2;
1059     else reqsize += 4;
1060 
1061     OUTCHK(reqsize);
1062     if (need_bom) {
1063         if (ostate == UTF_BE) {
1064             outptr[0] = 0xfe;
1065             outptr[1] = 0xff;
1066         } else {
1067             outptr[1] = 0xfe;
1068             outptr[0] = 0xff;
1069         }
1070         outptr += 2;
1071     }
1072     if (ch < 0x10000) {
1073         char u[2];
1074         u[0] = (ch >> 8) & 0xff;
1075         u[1] = ch & 0xff;
1076         if (ostate == UTF_BE) {
1077             outptr[0] = u[0];
1078             outptr[1] = u[1];
1079         } else {
1080             outptr[1] = u[0];
1081             outptr[0] = u[1];
1082         }
1083     } else {
1084         ch -= 0x10000;
1085         char u[2];
1086         u[0] = 0xd8 + ((ch >> 18) & 0x03);
1087         u[1] = (ch >> 10) & 0xff;
1088         if (ostate == UTF_BE) {
1089             outptr[0] = u[0];
1090             outptr[1] = u[1];
1091         } else {
1092             outptr[1] = u[0];
1093             outptr[0] = u[1];
1094         }
1095         u[0] = 0xdc + ((ch >> 8) & 0x03);
1096         u[1] = ch & 0xff;
1097         if (ostate == UTF_BE) {
1098             outptr[2] = u[0];
1099             outptr[3] = u[1];
1100         } else {
1101             outptr[3] = u[0];
1102             outptr[2] = u[1];
1103         }
1104     }
1105     cinfo->ostate = ostate;
1106     *outchars = reqsize;
1107     return r;
1108 }
1109 
1110 /* UTF8 -> UTF32 */
utf8_utf32(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)1111 static ScmSize utf8_utf32(ScmConvInfo *cinfo,
1112                           const char *inptr, ScmSize inroom,
1113                           char *outptr, ScmSize outroom,
1114                           ScmSize *outchars)
1115 {
1116     ScmSize reqsize = 0;
1117     int ostate = cinfo->ostate;
1118     int need_bom = FALSE;
1119     ScmChar ch;
1120 
1121     if (ostate == UTF_DEFAULT) {
1122         reqsize += 4;
1123         need_bom = TRUE;
1124         ostate = UTF_BE;
1125     }
1126     int r = jconv_utf8_to_ucs4(inptr, inroom, &ch);
1127     if (r < 0) return r;
1128     reqsize += 4;
1129 
1130     OUTCHK(reqsize);
1131     if (need_bom) {
1132         if (ostate == UTF_BE) {
1133             outptr[0] = 0;
1134             outptr[1] = 0;
1135             outptr[2] = 0xfe;
1136             outptr[3] = 0xff;
1137         } else {
1138             outptr[3] = 0;
1139             outptr[2] = 0;
1140             outptr[1] = 0xfe;
1141             outptr[0] = 0xff;
1142         }
1143         outptr += 4;
1144     }
1145     if (ostate == UTF_BE) {
1146         outptr[0] = (ch >> 24) & 0xff;
1147         outptr[1] = (ch >> 16) & 0xff;
1148         outptr[2] = (ch >> 8) & 0xff;
1149         outptr[3] = ch & 0xff;
1150     } else {
1151         outptr[3] = (ch >> 24) & 0xff;
1152         outptr[2] = (ch >> 16) & 0xff;
1153         outptr[1] = (ch >> 8) & 0xff;
1154         outptr[0] = ch & 0xff;
1155     }
1156     cinfo->ostate = ostate;
1157     *outchars = reqsize;
1158     return r;
1159 }
1160 
1161 /* UTF8 -> Latin1 */
utf8_lat1(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)1162 static ScmSize utf8_lat1(ScmConvInfo *cinfo,
1163                          const char *inptr, ScmSize inroom,
1164                          char *outptr, ScmSize outroom,
1165                          ScmSize *outchars)
1166 {
1167     ScmChar ch;
1168     int r = jconv_utf8_to_ucs4(inptr, inroom, &ch);
1169     if (r < 0) return r;
1170     if (ch < 0x100) {
1171         *outptr = ch;
1172         *outchars = 1;
1173     } else {
1174         DO_SUBST;
1175     }
1176     return r;
1177 }
1178 
1179 /* UTF8 -> ASCII */
utf8_ascii(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)1180 static ScmSize utf8_ascii(ScmConvInfo *cinfo,
1181                           const char *inptr, ScmSize inroom,
1182                           char *outptr, ScmSize outroom,
1183                           ScmSize *outchars)
1184 {
1185     ScmChar ch;
1186     int r = jconv_utf8_to_ucs4(inptr, inroom, &ch);
1187     if (r < 0) return r;
1188     if (ch < 0x80) {
1189         *outptr = ch;
1190         *outchars = 1;
1191     } else {
1192         DO_SUBST;
1193     }
1194     return r;
1195 }
1196 
1197 /*=================================================================
1198  * UTF16
1199  */
1200 
/* For now, we first convert it to utf8, since we already have tables
   that directly support utf8.  Theoretically though, having a ucs4 to
   jis table would speed it up. */
1204 
/* UTF16 -> UTF8.
 *
 * Converts one character.  When cinfo->istate is UTF_DEFAULT, a leading
 * BOM (FE FF or FF FE) selects the byte order and is consumed together
 * with the character that follows it; without a BOM, big-endian is
 * assumed.  A surrogate pair is combined into a single code point.  An
 * unpaired surrogate is replaced through DO_SUBST, and only that one
 * 16-bit unit is consumed.
 *
 * Returns the number of input bytes consumed (including the BOM, if
 * any); *outchars receives the number of bytes written.  cinfo->istate
 * is stored only on the paths that return a consumed count, so the
 * state stays as it was when an error code is returned.
 */
static ScmSize utf16_utf8(ScmConvInfo *cinfo,
                          const char *inptr, ScmSize inroom,
                          char *outptr, ScmSize outroom,
                          ScmSize *outchars)
{
    INCHK(2);
    int istate = cinfo->istate;
    ScmSize inread = 0;
    if (istate == UTF_DEFAULT) {
        if ((u_char)inptr[0] == 0xfe && (u_char)inptr[1] == 0xff) {
            inptr += 2;
            inroom -= 2;
            inread += 2;
            INCHK(2);
            istate = UTF_BE;
        } else if ((u_char)inptr[0] == 0xff && (u_char)inptr[1] == 0xfe) {
            inptr += 2;
            inroom -= 2;
            inread += 2;
            INCHK(2);
            istate = UTF_LE;
        } else {
            /* Arbitrary choice */
            istate = UTF_BE;
        }
    }

    /* u[0] is the high byte of the unit, u[1] the low byte. */
    u_char u[2];
    if (istate == UTF_BE) {
        u[0] = inptr[0];
        u[1] = inptr[1];
    } else {
        u[0] = inptr[1];
        u[1] = inptr[0];
    }

    ScmChar ch;

    /* Surrogates are recognized by the top six bits of the high byte:
       0xd8-0xdb is a high (first) surrogate, 0xdc-0xdf a low (second)
       one.  The mask must be 0xfc; a looser mask would also catch
       ordinary characters such as U+F900 or U+FF01. */
    if ((u[0] & 0xfc) == 0xd8) {
        /* surrogate */
        inptr += 2;
        inroom -= 2;
        INCHK(2);
        u_char v[2];
        if (istate == UTF_BE) {
            v[0] = inptr[0];
            v[1] = inptr[1];
        } else {
            v[0] = inptr[1];
            v[1] = inptr[0];
        }
        if ((v[0] & 0xfc) == 0xdc) {
            ch = (((u[0] & 0x03) << 18)
                  | (u[1] << 10)
                  | ((v[0] & 0x03) << 8)
                  | v[1])
                + 0x10000;
            inread += 4;
        } else {
            /* We only have first half of a surrogate pair.
               We leave the second character in the input, and try to
               substitute the first. */
            DO_SUBST;
            cinfo->istate = istate;
            return inread + 2;
        }
    } else if ((u[0] & 0xfc) == 0xdc) {
        /* Stray second half of a surrogate pair.  Substitute it and
           consume it, so that the caller makes progress. */
        DO_SUBST;
        cinfo->istate = istate;
        return inread + 2;
    } else {
        inread += 2;
        ch = (u[0] << 8) + u[1];
    }

    int outreq = UCS2UTF_NBYTES(ch);
    OUTCHK(outreq);
    jconv_ucs4_to_utf8(ch, outptr);
    cinfo->istate = istate;
    *outchars = outreq;
    return inread;
}
1287 
1288 /* This handles BOM stuff.   It is pretty twisted, for we need to keep the
1289    internal state consistent even when we return an error. */
utf16_utf16(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)1290 static ScmSize utf16_utf16(ScmConvInfo *cinfo,
1291                            const char *inptr, ScmSize inroom,
1292                            char *outptr, ScmSize outroom,
1293                            ScmSize *outchars)
1294 {
1295     ScmSize consumed = 0;
1296     ScmSize emitted = 0;
1297 
1298     if (cinfo->istate == UTF_DEFAULT || cinfo->ostate == UTF_DEFAULT) {
1299         /* We come here only at the beginning.  */
1300         int istate = 0;
1301 
1302         if (cinfo->istate == UTF_DEFAULT) {
1303             INCHK(2);
1304             if ((u_char)inptr[0] == 0xfe && (u_char)inptr[1] == 0xff) {
1305                 consumed += 2;
1306                 istate = UTF_BE;
1307                 inptr += 2;
1308                 inroom -= 2;
1309             } else if ((u_char)inptr[0] == 0xff && (u_char)inptr[1] == 0xfe) {
1310                 consumed += 2;
1311                 istate = UTF_LE;
1312                 inptr += 2;
1313                 inroom -= 2;
1314             } else {
1315                 istate = UTF_BE;
1316             }
1317         }
1318         INCHK(2);
1319         if (cinfo->ostate == UTF_DEFAULT) {
1320             OUTCHK(4);
1321             outptr[0] = 0xfe;
1322             outptr[1] = 0xff;
1323             outptr += 2;
1324             outroom -= 2;
1325             emitted += 2;
1326             cinfo->ostate = UTF_BE;
1327         } else {
1328             OUTCHK(2);
1329         }
1330         cinfo->istate = istate;
1331     } else {
1332         INCHK(2);
1333         OUTCHK(2);
1334     }
1335 
1336     char u[2];
1337     if (cinfo->istate == UTF_BE) {
1338         u[0] = inptr[0];
1339         u[1] = inptr[1];
1340     } else {
1341         u[1] = inptr[0];
1342         u[0] = inptr[1];
1343     }
1344     if (cinfo->ostate == UTF_BE) {
1345         outptr[0] = u[0];
1346         outptr[1] = u[1];
1347     } else {
1348         outptr[1] = u[0];
1349         outptr[0] = u[1];
1350     }
1351     *outchars = emitted + 2;
1352     return consumed + 2;
1353 }
1354 
1355 /*=================================================================
1356  * UTF32
1357  */
1358 
/* UTF32 -> UTF8.
 *
 * Converts one character.  When cinfo->istate is UTF_DEFAULT, a leading
 * BOM (00 00 FE FF or FF FE 00 00) selects the byte order and is
 * consumed together with the character that follows it; without a BOM,
 * big-endian is assumed.
 *
 * Returns the number of input bytes consumed (including the BOM, if
 * any); *outchars receives the number of bytes written.  cinfo->istate
 * is stored only after the output room check has passed.
 */
static ScmSize utf32_utf8(ScmConvInfo *cinfo,
                          const char *inptr, ScmSize inroom,
                          char *outptr, ScmSize outroom,
                          ScmSize *outchars)
{
    INCHK(4);
    int istate = cinfo->istate;
    ScmSize inread = 0;

    if (istate == UTF_DEFAULT) {
        const u_char b0 = (u_char)inptr[0];
        const u_char b1 = (u_char)inptr[1];
        const u_char b2 = (u_char)inptr[2];
        const u_char b3 = (u_char)inptr[3];
        const int bom_be = (b0 == 0 && b1 == 0 && b2 == 0xfe && b3 == 0xff);
        const int bom_le = (b0 == 0xff && b1 == 0xfe && b2 == 0 && b3 == 0);

        if (bom_be || bom_le) {
            /* skip the BOM; a full unit must follow it */
            inptr += 4;
            inroom -= 4;
            inread = 4;
            INCHK(4);
        }
        /* Without a BOM, big-endian is an arbitrary choice. */
        istate = bom_le ? UTF_LE : UTF_BE;
    }

    /* u[] holds the unit in big-endian order. */
    u_char u[4];
    for (int k = 0; k < 4; k++) {
        u[k] = (istate == UTF_BE) ? inptr[k] : inptr[3-k];
    }
    inread += 4;

    ScmChar ch = (u[0] << 24) | (u[1] << 16) | (u[2] << 8) | u[3];

    int outreq = UCS2UTF_NBYTES(ch);
    OUTCHK(outreq);
    jconv_ucs4_to_utf8(ch, outptr);
    cinfo->istate = istate;
    *outchars = outreq;
    return inread;
}
1415 
1416 /* This handles BOM stuff. */
utf32_utf32(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)1417 static ScmSize utf32_utf32(ScmConvInfo *cinfo,
1418                            const char *inptr, ScmSize inroom,
1419                            char *outptr, ScmSize outroom,
1420                            ScmSize *outchars)
1421 {
1422     ScmSize consumed = 0;
1423     ScmSize emitted = 0;
1424 
1425     if (cinfo->istate == UTF_DEFAULT || cinfo->ostate == UTF_DEFAULT) {
1426         /* We come here only at the beginning.  */
1427         int istate = 0;
1428 
1429         if (cinfo->istate == UTF_DEFAULT) {
1430             INCHK(4);
1431             if ((u_char)inptr[0] == 0
1432                 && (u_char)inptr[1] == 0
1433                 && (u_char)inptr[2] == 0xfe
1434                 && (u_char)inptr[3] == 0xff) {
1435                 consumed += 4;
1436                 istate = UTF_BE;
1437                 inptr += 4;
1438                 inroom -= 4;
1439             } else if ((u_char)inptr[0] == 0xff
1440                        && (u_char)inptr[1] == 0xfe
1441                        && (u_char)inptr[2] == 0
1442                        && (u_char)inptr[3] == 0) {
1443                 consumed += 4;
1444                 istate = UTF_LE;
1445                 inptr += 4;
1446                 inroom -= 4;
1447             } else {
1448                 istate = UTF_BE;
1449             }
1450         }
1451         INCHK(4);
1452         if (cinfo->ostate == UTF_DEFAULT) {
1453             OUTCHK(8);
1454             outptr[0] = 0;
1455             outptr[1] = 0;
1456             outptr[2] = 0xfe;
1457             outptr[3] = 0xff;
1458             outptr += 4;
1459             outroom -= 4;
1460             emitted += 4;
1461             cinfo->ostate = UTF_BE;
1462         } else {
1463             OUTCHK(4);
1464         }
1465         cinfo->istate = istate;
1466     } else {
1467         INCHK(4);
1468         OUTCHK(4);
1469     }
1470 
1471     char u[4];
1472     if (cinfo->istate == UTF_BE) {
1473         u[0] = inptr[0];
1474         u[1] = inptr[1];
1475         u[2] = inptr[2];
1476         u[3] = inptr[3];
1477     } else {
1478         u[3] = inptr[0];
1479         u[2] = inptr[1];
1480         u[1] = inptr[2];
1481         u[0] = inptr[3];
1482     }
1483     if (cinfo->ostate == UTF_BE) {
1484         outptr[0] = u[0];
1485         outptr[1] = u[1];
1486         outptr[2] = u[2];
1487         outptr[3] = u[3];
1488     } else {
1489         outptr[3] = u[0];
1490         outptr[2] = u[1];
1491         outptr[1] = u[2];
1492         outptr[0] = u[3];
1493     }
1494     *outchars = emitted + 4;
1495     return consumed + 4;
1496 }
1497 
1498 /*=================================================================
1499  * ISO2022-JP
1500  */
1501 
1502 /* ISO2022-JP{-1(,2),3} -> EUC_JP
1503  * Strategy: accepts as many possibilities as possible.
1504  * The following escape sequence is recognized:
1505  * (See Lunde, CJKV information processing, O'Reilly, pp.155--158)
1506  *
1507  *  <ESC> ( B     ASCII
1508  *  <ESC> ( J     JIS-Roman
1509  *  <ESC> ( H     JIS-Roman (for compatibility)
1510  *  <ESC> ( I     Half-width katakana (JIS X 0201 kana)
1511  *  <ESC> $ @     JIS C 6226-1978 (78JIS)
1512  *  <ESC> $ B     JIS X 0208-1983 (83JIS)
1513  *  <ESC> $ ( D   JIS X 0212-1990
1514  *  <ESC> $ ( O   JIS X 0213:2000 plane 1
1515  *  <ESC> $ ( P   JIS X 0213:2000 plane 2
1516  *  <ESC> & @ <ESC> $ B   JIS X 0208-1990, JIS X 0208:1997
1517  *  0x0e          JIS7 half-width katakana shift-out
1518  *  0x0f          JIS7 half-width katakana shift-in
1519  *
 * The state is reset to ASCII whenever a newline (LF) or carriage return
 * (CR) character is read.
1521  *
1522  * The following escape sequences defined in ISO2022-JP-2 are recognized,
1523  * but all the characters within the sequence will be replaced by '?'.
1524  *
1525  *  <ESC> $ A     (GB2312-80) unsupported
1526  *  <ESC> $ ( C   (KS X 1001:1992) unsupported
1527  *  <ESC> . A     (ISO8859-1:1998) unsupported
1528  *  <ESC> . F     (ISO8859-7:1998) unsupported
1529  *
 * If any other escape sequence is seen, the converter returns
 * ILLEGAL_SEQUENCE.
1531  *
1532  * JIS8 kana is allowed.
1533  */
1534 
1535 /* deal with escape sequence.  escape byte itself is already consumed.
1536    returns # of input bytes consumed by the escape sequence,
1537    or an error code.  cinfo->istate is updated accordingly. */
jis_esc(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom)1538 static ScmSize jis_esc(ScmConvInfo *cinfo, const char *inptr, ScmSize inroom)
1539 {
1540     INCHK(2);
1541     unsigned char j1 = inptr[0];
1542     unsigned char j2 = inptr[1];
1543     switch (j1) {
1544     case '(':
1545         switch (j2) {
1546         case 'B': cinfo->istate = JIS_ASCII; break;
1547         case 'J': cinfo->istate = JIS_ROMAN; break;
1548         case 'H': cinfo->istate = JIS_ROMAN; break;
1549         case 'I': cinfo->istate = JIS_KANA;  break;
1550         default: return ILLEGAL_SEQUENCE;
1551         }
1552         return 2;
1553     case '$':
1554         switch (j2) {
1555         case '@': cinfo->istate = JIS_78; break;
1556         case 'B': cinfo->istate =  JIS_0213_1; break;
1557         case 'A': cinfo->istate =  JIS_UNKNOWN; break;
1558         case '(':
1559             {
1560                 INCHK(3);
1561                 switch (inptr[2]) {
1562                 case 'D': cinfo->istate = JIS_0212; break;
1563                 case 'O': cinfo->istate = JIS_0213_1; break;
1564                 case 'P': cinfo->istate = JIS_0213_2; break;
1565                 case 'C': cinfo->istate = JIS_UNKNOWN; break;
1566                 default:  return ILLEGAL_SEQUENCE;
1567                 }
1568                 return 3;
1569                 break;
1570             }
1571         default: return ILLEGAL_SEQUENCE;
1572         }
1573         return 2;
1574     case '&':
1575         {
1576             INCHK(6);
1577             if (inptr[2] == '@' && inptr[3] == 0x1b && inptr[4] == '$'
1578                 && inptr[5] == 'B') {
1579                 cinfo->istate = JIS_0213_1;
1580                 return 5;
1581             } else {
1582                 return ILLEGAL_SEQUENCE;
1583             }
1584         }
1585     case '.':
1586         switch (inptr[2]) {
1587         case 'A':/*fallthrough*/;
1588         case 'F':   cinfo->istate = JIS_UNKNOWN; break;
1589         default:    return ILLEGAL_SEQUENCE;
1590         }
1591         return 2;
1592     default: return ILLEGAL_SEQUENCE;
1593     }
1594 }
1595 
1596 /* main routine for iso2022-jp -> euc_jp */
jis_eucj(ScmConvInfo * cinfo,const char * inptr,ScmSize inroom,char * outptr,ScmSize outroom,ScmSize * outchars)1597 static ScmSize jis_eucj(ScmConvInfo *cinfo, const char *inptr, ScmSize inroom,
1598                         char *outptr, ScmSize outroom, ScmSize *outchars)
1599 {
1600     ScmSize inoffset = 0;
1601 
1602     unsigned char j0 = inptr[inoffset];
1603     /* skip escape sequence */
1604     while (j0 == 0x1b) {
1605         inoffset++;
1606         ScmSize r = jis_esc(cinfo, inptr+inoffset, inroom-inoffset);
1607         if (ERRP(r)) return r;
1608         inoffset += r;
1609         if (inoffset >= inroom) {
1610             *outchars = 0;
1611             return inoffset;
1612         }
1613         j0 = inptr[inoffset];
1614     }
1615 
1616     if (j0 == '\n' || j0 == '\r') {
1617         cinfo->istate = JIS_ASCII;
1618         outptr[0] = j0;
1619         *outchars = 1;
1620         return 1+inoffset;
1621     } else if (j0 < 0x20) {
1622         outptr[0] = j0;
1623         *outchars = 1;
1624         return 1+inoffset;
1625     } else if (j0 >= 0xa1 && j0 <= 0xdf) {
1626         /* JIS8 kana */
1627         OUTCHK(2);
1628         outptr[0] = 0x8e;
1629         outptr[1] = j0;
1630         *outchars = 2;
1631         return 1+inoffset;
1632     } else {
1633         switch (cinfo->istate) {
1634         case JIS_ROMAN:
1635             /* jis-roman and ascii differs on 0x5c and 0x7e -- for now,
1636                I ignore the difference. */
1637             /* FALLTHROUGH */
1638         case JIS_ASCII:
1639             outptr[0] = j0;
1640             *outchars = 1;
1641             return 1+inoffset;
1642         case JIS_KANA:
1643             OUTCHK(2);
1644             outptr[0] = 0x8e;
1645             outptr[1] = j0 + 0x80;
1646             *outchars = 2;
1647             return 1+inoffset;
1648         case JIS_78:
1649             /* for now, I ignore the difference between JIS78 and JIS83 */
1650             /* FALLTHROUGH */
1651         case JIS_0213_1: {
1652             INCHK(inoffset+2);
1653             OUTCHK(2);
1654             unsigned char j1 = inptr[inoffset+1];
1655             outptr[0] = j0 + 0x80;
1656             outptr[1] = j1 + 0x80;
1657             *outchars = 2;
1658             return 2+inoffset;
1659         }
1660         case JIS_0212:
1661             /* jis x 0212 and jis x 0213 plane 2 are different character sets,
1662                but uses the same conversion scheme. */
1663             /* FALLTHROUGH */
1664         case JIS_0213_2: {
1665             INCHK(inoffset+2);
1666             OUTCHK(3);
1667             unsigned char j1 = inptr[inoffset+1];
1668             outptr[0] = 0x8f;
1669             outptr[1] = j0 + 0x80;
1670             outptr[2] = j1 + 0x80;
1671             *outchars = 3;
1672             return 2+inoffset;
1673         }
1674         case JIS_UNKNOWN:
1675             DO_SUBST;
1676             return 1+inoffset;
1677         default:
1678             /* Can't be here */
1679             Scm_Panic("internal state of ISO2022-JP -> EUC_JP got messed up (%d).  Implementation error?", cinfo->istate);
1680         }
1681     }
1682     return ILLEGAL_SEQUENCE;
1683 }
1684 
1685 /* reset proc */
jis_reset(ScmConvInfo * cinfo,char * outptr,ScmSize outroom)1686 static ScmSize jis_reset(ScmConvInfo *cinfo, char *outptr, ScmSize outroom)
1687 {
1688     if (outptr == NULL) {
1689         /* just reset */
1690         cinfo->ostate = JIS_ASCII;
1691         return 0;
1692     } else {
1693         if (cinfo->ostate == JIS_ASCII) return 0;
1694         if (outroom < 3) return OUTPUT_NOT_ENOUGH;
1695         outptr[0] = 0x1b;
1696         outptr[1] = '(';
1697         outptr[2] = 'B';
1698         cinfo->ostate = JIS_ASCII;
1699         return 3;
1700     }
1701 }
1702 
1703 /*=================================================================
1704  * ISO8859-1
1705  */
1706 
/* Latin1 -> UTF8.
 *
 * Converts the single byte at inptr.  A byte up to 0x7f is copied as
 * is; a byte from 0x80 up becomes a two-byte UTF8 sequence.  Always
 * consumes one input byte; *outchars receives the number of bytes
 * written.
 */
static ScmSize lat1_utf8(ScmConvInfo *cinfo SCM_UNUSED,
                         const char *inptr,
                         ScmSize inroom SCM_UNUSED,
                         char *outptr,
                         ScmSize outroom,
                         ScmSize *outchars)
{
    const unsigned char c = inptr[0];

    if (c >= 0x80) {
        /* 110000xx 10xxxxxx */
        OUTCHK(2);
        outptr[0] = 0xc0 | (c >> 6);
        outptr[1] = 0x80 | (c & 0x3f);
        *outchars = 2;
        return 1;
    }
    outptr[0] = c;
    *outchars = 1;
    return 1;
}
1726 
/* ISO8859-1 -> ASCII.
 * Reads one input byte.  A byte up to 0x7f is copied unchanged; any other
 * byte is handled by DO_SUBST.  Returns 1 (one input byte consumed)
 * unless DO_SUBST itself returns from this function.
 */
static ScmSize lat1_ascii(ScmConvInfo *cinfo,
                          const char *inptr,
                          ScmSize inroom SCM_UNUSED,
                          char *outptr,
                          ScmSize outroom,
                          ScmSize *outchars)
{
    unsigned char ch = inptr[0];

    if (ch > 0x7f) {
        /* Not representable in ASCII. */
        DO_SUBST;
        return 1;
    }

    outptr[0] = ch;
    *outchars = 1;
    return 1;
}
1743 
1744 /*=================================================================
1745  * ASCII
1746  */
1747 
1748 /* ASCII -> X */
1749 
ascii_x(ScmConvInfo * cinfo SCM_UNUSED,const char * inptr,ScmSize inroom SCM_UNUSED,char * outptr,ScmSize outroom SCM_UNUSED,ScmSize * outchars)1750 static ScmSize ascii_x(ScmConvInfo *cinfo SCM_UNUSED,
1751                        const char *inptr,
1752                        ScmSize inroom SCM_UNUSED,
1753                        char *outptr,
1754                        ScmSize outroom SCM_UNUSED,
1755                        ScmSize *outchars)
1756 {
1757     outptr[0] = inptr[0];
1758     *outchars = 1;
1759     return 1;
1760 }
1761 
1762 /*=================================================================
1763  * Placeholder
1764  */
1765 
ident(ScmConvInfo * cinfo SCM_UNUSED,const char * inptr SCM_UNUSED,ScmSize inroom SCM_UNUSED,char * outptr SCM_UNUSED,ScmSize outroom SCM_UNUSED,ScmSize * outchars SCM_UNUSED)1766 static ScmSize ident(ScmConvInfo *cinfo SCM_UNUSED,
1767                      const char *inptr SCM_UNUSED,
1768                      ScmSize inroom SCM_UNUSED,
1769                      char *outptr SCM_UNUSED,
1770                      ScmSize outroom SCM_UNUSED,
1771                      ScmSize *outchars SCM_UNUSED)
1772 {
1773     return 0;
1774 }
1775 
1776 /******************************************************************
1777  *
1778  * Actual conversion
1779  *
1780  */
1781 
/* Map canonical code designators to inconv and outconv.  The order of
   entries must match the designators above.
   conv_converter[incode][outcode] returns the appropriate combination
   of routines.
   NB: It is tedious to maintain this table; we'll eventually generate
   this from some DSL.
*/
/* One cell of the conv_converter[incode][outcode] table.
   jconv_open() copies these fields into the ScmConvInfo it creates.  When
   conv is NULL, jconv_open() treats the pair as having no built-in
   converter (it tries iconv if allowed, otherwise fails). */
struct conv_converter_rec {
    ScmConvProc *conv;          /* per-character conversion routine, or NULL */
    ScmConvReset *reset;        /* reset routine; may be NULL */
    int istate;                 /* initial input state */
    int ostate;                 /* initial output state */
};
1795 
/* map conversion name to the canonical code */
/* One entry of the name table scanned by conv_name_find().  The scan
   stops at the first entry whose name is NULL. */
struct conv_support_rec {
    const char *name;           /* encoding name compared by conv_name_match() */
    int code;                   /* value returned by conv_name_find() on a match */
};
1801 
1802 #include "jconv_tab.c"
1803 #include "latin_tab.c"
1804 
/* Compare encoding name S against table name T.
 *
 * The comparison is case-insensitive, and every '-' or '_' found in S is
 * skipped (T is compared as is).  S and T must both be exhausted for the
 * names to match.
 *
 * Returns 1 if the names match, 0 otherwise.
 */
static int conv_name_match(const char *s, const char *t)
{
    const char *p = s;
    const char *q = t;

    while (*p && *q) {
        if (*p == '-' || *p == '_') {
            p++;                /* ignore '-' and '_' in S */
            continue;
        }
        /* <ctype.h> functions take an int that must be representable as
           unsigned char (or EOF).  Plain char may be signed, so bytes
           with the high bit set must be cast before the call. */
        if (tolower((unsigned char)*p) != tolower((unsigned char)*q)) break;
        p++;
        q++;
    }
    /* A mismatch leaves both pointers on non-NUL characters; a length
       difference leaves exactly one of them there. */
    return (*p == '\0' && *q == '\0');
}
1819 
conv_name_find(const char * name)1820 static int conv_name_find(const char *name)
1821 {
1822     struct conv_support_rec *cvtab = conv_supports;
1823     for (; cvtab->name; cvtab++) {
1824         if (conv_name_match(name, cvtab->name)) {
1825             return cvtab->code;
1826         }
1827     }
1828     return -1;
1829 }
1830 
1831 /* Internal conversion handler. */
1832 
1833 /* when we can just pass-through input to output */
jconv_ident(ScmConvInfo * cinfo SCM_UNUSED,const char ** iptr,ScmSize * iroom,char ** optr,ScmSize * oroom)1834 static ScmSize jconv_ident(ScmConvInfo *cinfo SCM_UNUSED, const char **iptr,
1835                            ScmSize *iroom, char **optr, ScmSize *oroom)
1836 {
1837     ScmSize inroom = *iroom, outroom = *oroom;
1838 #ifdef JCONV_DEBUG
1839     fprintf(stderr, "jconv_ident %s->%s\n", cinfo->fromCode, cinfo->toCode);
1840 #endif
1841     if (inroom <= outroom) {
1842         memcpy(*optr, *iptr, inroom);
1843         *optr += inroom;
1844         *iptr += inroom;
1845         *iroom = 0;
1846         *oroom -= inroom;
1847         return inroom;
1848     } else {
1849         memcpy(*optr, *iptr, outroom);
1850         *optr += outroom;
1851         *iptr += outroom;
1852         *iroom -= outroom;
1853         *oroom = 0;
1854         return OUTPUT_NOT_ENOUGH;
1855     }
1856 }
1857 
1858 /* calling conversion routine for each char */
jconv_1tier(ScmConvInfo * cinfo,const char ** iptr,ScmSize * iroom,char ** optr,ScmSize * oroom)1859 static ScmSize jconv_1tier(ScmConvInfo *cinfo, const char **iptr,
1860                            ScmSize *iroom, char **optr, ScmSize *oroom)
1861 {
1862     ScmConvProc *cvt = cinfo->convert;
1863     const char *inp = *iptr;
1864     char *outp = *optr;
1865     int inr = (int)*iroom, outr = (int)*oroom;
1866     ScmSize converted = 0;
1867 
1868 #ifdef JCONV_DEBUG
1869     fprintf(stderr, "jconv_1tier %s->%s\n", cinfo->fromCode, cinfo->toCode);
1870 #endif
1871     SCM_ASSERT(cvt != NULL);
1872     while (inr > 0 && outr > 0) {
1873         ScmSize outchars;
1874         ScmSize inchars = cvt(cinfo, inp, inr, outp, outr, &outchars);
1875         if (ERRP(inchars)) {
1876             converted = inchars;
1877             break;
1878         } else {
1879             converted += inchars;
1880             inp += inchars;
1881             inr -= (int)inchars;
1882             outp += outchars;
1883             outr -= (int)outchars;
1884         }
1885     }
1886     *iptr = inp;
1887     *iroom = inr;
1888     *optr = outp;
1889     *oroom = outr;
1890     return converted;
1891 }
1892 
1893 /* When we delegate conversion to iconv(3) */
1894 #ifdef HAVE_ICONV_H
1895 /* NB: although iconv manages states, we need to keep track of whether
1896  * we're sure in default status (JIS_ASCII) or not (we use JIS_UNKNOWN for it).
1897  * It's because jconv_iconv_reset will be called twice if there is any
1898  * reset sequence; the first call should emit the sequence, but the second
1899  * call shouldn't.
1900  */
jconv_iconv(ScmConvInfo * cinfo,const char ** iptr,ScmSize * iroom,char ** optr,ScmSize * oroom)1901 static ScmSize jconv_iconv(ScmConvInfo *cinfo, const char **iptr, ScmSize *iroom,
1902                            char **optr, ScmSize *oroom)
1903 {
1904 #ifdef JCONV_DEBUG
1905     fprintf(stderr, "jconv_iconv %s->%s\n", cinfo->fromCode, cinfo->toCode);
1906 #endif
1907     size_t ir = *iroom, or = *oroom;
1908     size_t r = iconv(cinfo->handle, (char **)iptr, &ir, optr, &or);
1909     *iroom = ir;
1910     *oroom = or;
1911     cinfo->ostate = JIS_UNKNOWN;
1912     if (r == (size_t)-1) {
1913         if (errno == EINVAL) return INPUT_NOT_ENOUGH;
1914         if (errno == E2BIG)  return OUTPUT_NOT_ENOUGH;
1915         return ILLEGAL_SEQUENCE;
1916     } else {
1917         return (ScmSize)r;
1918     }
1919 }
1920 
1921 /* reset routine for iconv */
jconv_iconv_reset(ScmConvInfo * cinfo,char * optr,ScmSize oroom)1922 static ScmSize jconv_iconv_reset(ScmConvInfo *cinfo, char *optr, ScmSize oroom)
1923 {
1924     ScmSize oroom_prev = oroom;
1925     if (cinfo->ostate == JIS_ASCII) return 0;
1926     size_t or = oroom;
1927     size_t r = iconv(cinfo->handle, NULL, 0, &optr, &or);
1928     if (r == (size_t)-1) {
1929         if (errno == E2BIG)  return OUTPUT_NOT_ENOUGH;
1930         Scm_Panic("jconv_iconv_reset: unknown error number %d\n", errno);
1931     }
1932     cinfo->ostate = JIS_ASCII;
1933     return oroom_prev - (ScmSize)or;
1934 }
1935 #endif /*HAVE_ICONV_H*/
1936 
1937 /*------------------------------------------------------------------
1938  * JCONV_OPEN
1939  *  Returns ScmConvInfo, setting up some fields.
1940  *  If no conversion is possible, returns NULL.
1941  */
jconv_open(const char * toCode,const char * fromCode,int useIconv)1942 ScmConvInfo *jconv_open(const char *toCode, const char *fromCode,
1943                         int useIconv)
1944 {
1945     ScmConvHandler *handler = NULL;
1946     ScmConvProc *convert = NULL;
1947     ScmConvReset *reset = NULL;
1948     int istate = 0, ostate = 0;
1949     iconv_t handle = (iconv_t)-1;
1950 
1951     int incode  = conv_name_find(fromCode);
1952     int outcode = conv_name_find(toCode);
1953 
1954     if (incode >= 0 && outcode >= 0) {
1955         convert = conv_converter[incode][outcode].conv;
1956         reset = conv_converter[incode][outcode].reset;
1957         istate = conv_converter[incode][outcode].istate;
1958         ostate = conv_converter[incode][outcode].ostate;
1959     }
1960 
1961     if (convert == NULL) {
1962         if (useIconv) {
1963 #ifdef HAVE_ICONV_H
1964             /* try iconv */
1965             handle = iconv_open(toCode, fromCode);
1966             if (handle == (iconv_t)-1) return NULL;
1967             handler = jconv_iconv;
1968             reset = jconv_iconv_reset;
1969 #else /*!HAVE_ICONV_H*/
1970             return NULL;
1971 #endif
1972         } else {
1973             return NULL;
1974         }
1975     } else if (convert == ident) {
1976         handler = jconv_ident;
1977     } else  {
1978         handler = jconv_1tier;
1979     }
1980 
1981     ScmConvInfo *cinfo;
1982     cinfo = SCM_NEW(ScmConvInfo);
1983     cinfo->jconv = handler;
1984     cinfo->convert = convert;
1985     cinfo->reset = reset;
1986     cinfo->handle = handle;
1987     cinfo->toCode = toCode;
1988     cinfo->istate = istate;
1989     cinfo->ostate = ostate;
1990     cinfo->fromCode = fromCode;
1991     /* The replacement settings can be modified by jconv_set_replacement */
1992     cinfo->replacep = FALSE;
1993     cinfo->replaceSize = 0;
1994     cinfo->replaceSeq = NULL;
1995     return cinfo;
1996 }
1997 
1998 /*------------------------------------------------------------------
1999  * JCONV_SET_REPLACEMENT
2000  *   Setting up replacement sequence according to the toCode.
2001  */
jconv_set_replacement(ScmConvInfo * cinfo)2002 void jconv_set_replacement(ScmConvInfo *cinfo)
2003 {
2004     static ScmObj ces_replacement_proc = SCM_UNDEFINED;
2005     SCM_BIND_PROC(ces_replacement_proc, "%ces-replacement",
2006                   Scm_FindModule(SCM_SYMBOL(SCM_INTERN("gauche.charconv")), 0));
2007     ScmObj replacements = Scm_ApplyRec1(ces_replacement_proc,
2008                                         SCM_MAKE_STR(cinfo->toCode));
2009     ScmSize i = Scm_Length(replacements);
2010     if (i > 0) {
2011         cinfo->replacep = TRUE;
2012         cinfo->replaceSize = i;
2013         char *replaceSeq = SCM_NEW_ATOMIC_ARRAY(char, i);
2014         for (int j = 0; j < i; j++) {
2015             SCM_ASSERT(SCM_PAIRP(replacements));
2016             replaceSeq[j] = SCM_INT_VALUE(SCM_CAR(replacements));
2017             replacements = SCM_CDR(replacements);
2018         }
2019         cinfo->replaceSeq = replaceSeq;
2020     }
2021 }
2022 
2023 /*------------------------------------------------------------------
2024  * JCONV_CLOSE
2025  */
jconv_close(ScmConvInfo * cinfo)2026 int jconv_close(ScmConvInfo *cinfo)
2027 {
2028     int r = 0;
2029 #ifdef HAVE_ICONV_H
2030     if (cinfo->handle != (iconv_t)-1) {
2031         r = iconv_close(cinfo->handle);
2032         cinfo->handle = (iconv_t)-1;
2033     }
2034 #endif /*HAVE_ICONV_H*/
2035     return r;
2036 }
2037 
2038 /*------------------------------------------------------------------
2039  * JCONV - main conversion routine
2040  */
jconv(ScmConvInfo * cinfo,const char ** inptr,ScmSize * inroom,char ** outptr,ScmSize * outroom)2041 ScmSize jconv(ScmConvInfo *cinfo,
2042               const char **inptr, ScmSize *inroom,
2043               char **outptr, ScmSize *outroom)
2044 {
2045     SCM_ASSERT(cinfo->jconv != NULL);
2046     return cinfo->jconv(cinfo, inptr, inroom, outptr, outroom);
2047 }
2048 
2049 /*------------------------------------------------------------------
2050  * JCONV_RESET - reset
2051  */
jconv_reset(ScmConvInfo * cinfo,char * outptr,ScmSize outroom)2052 ScmSize jconv_reset(ScmConvInfo *cinfo, char *outptr, ScmSize outroom)
2053 {
2054     if (cinfo->reset) {
2055         return cinfo->reset(cinfo, outptr, outroom);
2056     } else {
2057         return 0;
2058     }
2059 }
2060