1
2 /* $Id: sjis_imode2.c 5246 2008-01-17 08:47:46Z hio $ */
3
4 #include "Japanese.h"
5 #include <stdio.h>
6
/* C builds get a local bool/true/false.  Any macro definitions of those
 * names that are already in effect are removed first so that the enum
 * below can declare them as ordinary identifiers.  C++ has the names built
 * in, so the whole block is skipped there. */
#ifndef __cplusplus
#undef bool
#undef true
#undef false
/* no trailing comma after the last enumerator: C89 compilers reject it */
typedef enum bool { false, true } bool;
#endif
13
/* Compile-time switches for debug tracing; set to non-zero to enable.
 *   DISP_U2S : traces in the utf8 => sjis direction
 *   DISP_S2U : traces in the sjis => utf8 direction */
#define DISP_U2S 0
#define DISP_S2U 0

/* ECHO_xxx((stream, fmt, ...)) takes a complete, parenthesised fprintf
 * argument list.  ON_xxx(cmd) emits `cmd` verbatim.  Both expand to nothing
 * when the matching DISP_xxx switch is 0. */
#if DISP_U2S
#define ECHO_U2S(arg) fprintf arg
#define ON_U2S(cmd) cmd
#else
#define ECHO_U2S(arg)
#define ON_U2S(cmd)
#endif
#if DISP_S2U
#define ECHO_S2U(arg) fprintf arg
#define ON_S2U(cmd) cmd
#else
#define ECHO_S2U(arg)
#define ON_S2U(cmd)
#endif
31
32 /* ----------------------------------------------------------------------------
33 * SV* sv_utf8 = xs_sjis_imode2_utf8(SV* sv_sjis)
34 * convert sjis(imode2) into utf8.
35 * ------------------------------------------------------------------------- */
36 EXTERN_C
37 SV*
xs_sjis_imode2_utf8(SV * sv_str)38 xs_sjis_imode2_utf8(SV* sv_str)
39 {
40 UJ_UINT8* src;
41 STRLEN len;
42
43 SV_Buf result;
44 const UJ_UINT8* src_end;
45
46 if( sv_str==&PL_sv_undef )
47 {
48 return newSVsv(&PL_sv_undef);
49 }
50 if( SvGMAGICAL(sv_str) )
51 {
52 mg_get(sv_str);
53 }
54 if( !SvOK(sv_str) )
55 {
56 return newSVsv(&PL_sv_undef);
57 }
58
59 src = (UJ_UINT8*)SvPV(sv_str, len);
60 #if DISP_S2U
61 fprintf(stderr,"Unicode::Japanese::(xs)sjis_utf8_imode2\n",len);
62 bin_dump("in ",src,len);
63 #endif
64 SV_Buf_init(&result,len*3/2+4);
65 src_end = src+len;
66
67 while( src<src_end )
68 {
69 const UJ_UINT8* ptr;
70 if( src[0]<0x80 )
71 { /* ASCII */
72 ECHO_U2S((stderr,"ascii: %02x\n",src[0]));
73 if( src[0]=='&' && src+3<src_end && src[1]=='#' )
74 { /* check "&#ddddd;" */
75 int num = 0;
76 UJ_UINT8* ptr = src+2;
77 const UJ_UINT8* ptr_end = ptr+8<src_end ? ptr+8 : src_end;
78 for( ; ptr<ptr_end; ++ptr )
79 {
80 if( *ptr==';' ) break;
81 if( *ptr<'0' || '9'<*ptr ) break;
82 num = num*10 + *ptr-'0';
83 }
84 if( ptr<ptr_end && *ptr==';' && 0xf800<=num && num<=0xf9ff )
85 { /* yes, this is "&#ddddd;" */
86 const UJ_UINT8* emoji = (UJ_UINT8*)&g_ei2u2_table[num&0x1ff];
87 if( emoji[3] )
88 {
89 /*fprintf(stderr,"utf8-len: [%d]\n",4); */
90 SV_Buf_append_mem(&result, emoji, 4);
91 src = ptr+1;
92 continue;
93 }
94 }
95 }
96 SV_Buf_append_ch(&result,*src);
97 ++src;
98 continue;
99 }else if( 0xa1<=src[0] && src[0]<=0xdf )
100 { /* half-width katakana (ja:Ⱦ�ѥ���) */
101 ECHO_U2S((stderr,"kana: %02x\n",src[0]));
102 ptr = (UJ_UINT8*)&g_s2u_table[(src[0]-0xa1)*3];
103 ++src;
104 }else if( src+1<src_end && 0x81<=src[0] && src[0]<=0x9f )
105 { /* a double-byte letter (ja:2�Х���ʸ��) */
106 const UJ_UINT16 sjis = (src[0]<<8)+src[1]; /* ntohs */
107 ECHO_U2S((stderr,"sjis.dbcs#1: %04x\n",sjis));
108 ptr = (UJ_UINT8*)&g_s2u_table[(sjis - 0x8100 + 0x3f)*3];
109 src += 2;
110 }else if( src+1<src_end && ( src[0]==0xf8 || src[0]==0xf9 ) )
111 { /* i-mode emoji */
112 const UJ_UINT32* ptr32;
113 ECHO_S2U((stderr,"code: %02x %02x\n", src[0],src[1]));
114 ptr32 = &g_ei2u2_table[((src[0]&1)<<8)|src[1]];
115 if( ((char*)ptr32)[3]!=0 )
116 {
117 SV_Buf_append_ch4(&result, *ptr32);
118 src += 2;
119 continue;
120 }else if( *ptr32 )
121 {
122 SV_Buf_append_mem(&result, ptr32, strlen((char*)ptr32));
123 src += 2;
124 continue;
125 }else
126 {
127 const UJ_UINT16 sjis = (src[0]<<8)+src[1]; /* ntohs */
128 ECHO_U2S((stderr,"sjis.dbcs#2: %04x\n",sjis));
129 ptr = &g_s2u_table[(sjis- 0xe000 + 0x1f3f)*3];
130 src += 2;
131 }
132 }else if( src+1<src_end && 0xe0<=src[0] && src[0]<=0xfc )
133 { /* a double-byte letter (ja:2�Х���ʸ��) */
134 const UJ_UINT16 sjis = ntohs(*(UJ_UINT16*)src);
135 ECHO_U2S((stderr,"sjis.dbcs#2: %04x\n",sjis));
136 ptr = &g_s2u_table[(sjis- 0xe000 + 0x1f3f)*3];
137 src += 2;
138 }else
139 { /* unknown */
140 /*fprintf(stderr,"unknown: %02x\n",src[0]); */
141 SV_Buf_append_ch(&result,'?');
142 ++src;
143 continue;
144 }
145
146 ECHO_U2S((stderr,"offset: 0x%04x\n",ptr-g_s2u_table));
147 ECHO_U2S((stderr,"utf8-char : %02x %02x %02x\n",ptr[0],ptr[1],ptr[2]));
148 if( ptr[2] )
149 {
150 /*fprintf(stderr,"utf8-len: [%d]\n",3); */
151 SV_Buf_append_mem(&result, ptr, 3);
152 }else if( ptr[1] )
153 {
154 /*fprintf(stderr,"utf8-len: [%d]\n",2); */
155 SV_Buf_append_mem(&result, ptr, 2);
156 }else if( ptr[0] )
157 {
158 /*fprintf(stderr,"utf8-len: [%d]\n",1); */
159 SV_Buf_append_ch(&result,*ptr);
160 }else
161 {
162 SV_Buf_append_ch(&result,'?');
163 }
164 }
165 #if DISP_S2U
166 ON_S2U( bin_dump("out",SV_Buf_getBegin(&result),SV_Buf_getLength(&result)) );
167 #endif
168 SV_Buf_setLength(&result);
169
170 return SV_Buf_getSv(&result);
171 }
172
173
174 /* ---------------------------------------------------------------------------
175 * utf8 ==> imode2
176 * ------------------------------------------------------------------------- */
177 EXTERN_C
178 SV*
xs_utf8_sjis_imode2(SV * sv_str)179 xs_utf8_sjis_imode2(SV* sv_str)
180 {
181 UJ_UINT8* src;
182 STRLEN len;
183 SV_Buf result;
184 const UJ_UINT8* src_end;
185
186 if( sv_str==&PL_sv_undef )
187 {
188 return newSVsv(&PL_sv_undef);
189 }
190 if( SvGMAGICAL(sv_str) )
191 {
192 mg_get(sv_str);
193 }
194 if( !SvOK(sv_str) )
195 {
196 return newSVsv(&PL_sv_undef);
197 }
198 src = (UJ_UINT8*)SvPV(sv_str, len);
199
200 ECHO_U2S((stderr,"Unicode::Japanese::(xs)utf8_sjis_imode1\n"));
201 ON_U2S( bin_dump("in ",src,len) );
202
203 SV_Buf_init(&result,len+4);
204 src_end = src+len;
205
206 while( src<src_end )
207 {
208 UJ_UINT32 ucs;
209 const UJ_UINT8* sjis_ptr;
210
211 if( *src<=0x7f )
212 {
213 /* ascii chars sequence (ja:ASCII�ϤޤȤ���ɲá�) */
214 int len = 1;
215 while( src+len<src_end && src[len]<=0x7f )
216 {
217 ++len;
218 }
219 SV_Buf_append_mem(&result,src,len);
220 src+=len;
221 continue;
222 }
223
224 /* non-ascii */
225 if( 0xe0<=*src && *src<=0xef )
226 { /* 3byte range. mostly enter here. */
227 const int utf8_len = 3;
228 const UJ_UINT32 ucs_min = 0x800;
229 const UJ_UINT32 ucs_max = 0xffff;
230 ECHO_U2S((stderr,"utf8-len: [%d]\n",utf8_len));
231 /* check length */
232 if( src+utf8_len<=src_end )
233 { /* noop */
234 }else
235 { /* no enough sequence */
236 SV_Buf_append_ch(&result,'?');
237 ++src;
238 continue;
239 }
240 /* check follow sequences */
241 if( 0x80<=src[1] && src[1]<=0xbf && 0x80<=src[2] && src[2]<=0xbf )
242 { /* noop */
243 }else
244 {
245 SV_Buf_append_ch(&result,'?');
246 ++src;
247 continue;
248 }
249
250 /* compute code point */
251 ucs = ((src[0] & 0x0F)<<12)|((src[1] & 0x3F)<<6)|(src[2] & 0x3F);
252 src += utf8_len;
253 if( ucs_min<=ucs && ucs<=ucs_max )
254 { /* noop */
255 }else
256 { /* illegal sequence */
257 SV_Buf_append_ch(&result,'?');
258 continue;
259 }
260 /* ok. */
261 }else if( 0xf0<=*src && *src<=0xf7 )
262 {
263 const int utf8_len = 4;
264 const UJ_UINT32 ucs_min = 0x010000;
265 const UJ_UINT32 ucs_max = 0x10ffff;
266 ECHO_U2S((stderr,"utf8-len: [%d]\n",utf8_len));
267 /* check length */
268 if( src+utf8_len<=src_end )
269 { /* noop */
270 }else
271 { /* no enough sequence */
272 SV_Buf_append_ch(&result,'?');
273 ++src;
274 continue;
275 }
276 /* check follow sequences */
277 if( 0x80<=src[1] && src[1]<=0xbf && 0x80<=src[2] && src[2]<=0xbf
278 && 0x80<=src[3] && src[3]<=0xbf )
279 { /* noop */
280 }else
281 {
282 SV_Buf_append_ch(&result,'?');
283 ++src;
284 continue;
285 }
286
287 /* compute code point */
288 ucs = ((src[0] & 0x07)<<18)|((src[1] & 0x3F)<<12)|
289 ((src[2] & 0x3f) << 6)|(src[3] & 0x3F);
290 src += utf8_len;
291 if( ucs_min<=ucs && ucs<=ucs_max )
292 { /* noop */
293 }else
294 { /* illegal sequence */
295 SV_Buf_append_ch(&result,'?');
296 continue;
297 }
298 /* private area: block emoji */
299 if( 0x0f0000<=ucs && ucs<=0x0fffff )
300 {
301 const UJ_UINT16* sjis16;
302 const UJ_UINT8* sjis8;
303 if( ucs<0x0fe000 )
304 { /* unknown area. */
305 SV_Buf_append_ch(&result,'?');
306 continue;
307 }
308 /* imode */
309 sjis16 = &g_eu2i2_table[ucs - 0x0fe000];
310 sjis8 = (UJ_UINT8*)sjis16;
311 if( sjis8[1]!=0 )
312 { /* double-byte char */
313 SV_Buf_append_ch2(&result, *sjis16);
314 }else if( sjis8[0]!=0 )
315 { /* single-byte char, is it exists?? */
316 SV_Buf_append_ch(&result, *sjis8);
317 }else
318 { /* no mapping */
319 SV_Buf_append_ch(&result,'?');
320 }
321 continue;
322 }
323
324 /* > U+10FFFF not supported by UTF-8 (RFC 3629). */
325 if( ucs>0x10FFFF )
326 {
327 SV_Buf_append_ch(&result,'?');
328 continue;
329 }
330 }else if( 0xc0<=*src && *src<=0xdf )
331 {
332 const int utf8_len = 2;
333 const UJ_UINT32 ucs_min = 0x80;
334 const UJ_UINT32 ucs_max = 0x7ff;
335 ECHO_U2S((stderr,"utf8-len: [%d]\n",utf8_len));
336 /* check length */
337 if( src+utf8_len<=src_end )
338 { /* noop */
339 }else
340 { /* no enough sequence */
341 SV_Buf_append_ch(&result,'?');
342 ++src;
343 continue;
344 }
345 /* check follow sequences */
346 if( 0x80<=src[1] && src[1]<=0xbf )
347 { /* noop */
348 }else
349 {
350 SV_Buf_append_ch(&result,'?');
351 ++src;
352 continue;
353 }
354
355 /* compute code point */
356 ucs = ((src[0] & 0x1F)<<6)|(src[1] & 0x3F);
357 src += utf8_len;
358 if( ucs_min<=ucs && ucs<=ucs_max )
359 { /* noop */
360 }else
361 { /* illegal sequence */
362 SV_Buf_append_ch(&result,'?');
363 continue;
364 }
365
366 /* ok. */
367 }else if( 0xf8<=*src && *src<=0xfb )
368 {
369 const int utf8_len = 5;
370 ECHO_U2S((stderr,"utf8-len: [%d]\n",utf8_len));
371 /* check length */
372 if( src+utf8_len<=src_end )
373 { /* noop */
374 }else
375 { /* no enough sequence */
376 SV_Buf_append_ch(&result,'?');
377 ++src;
378 continue;
379 }
380 /* check follow sequences */
381 if( 0x80<=src[1] && src[1]<=0xbf && 0x80<=src[2] && src[2]<=0xbf
382 && 0x80<=src[3] && src[3]<=0xbf && 0x80<=src[4] && src[4]<=0xbf )
383 { /* noop */
384 }else
385 {
386 SV_Buf_append_ch(&result,'?');
387 ++src;
388 continue;
389 }
390
391 /* compute code point */
392 /* > U+10FFFF not supported by UTF-8 (RFC 3629). */
393 src += utf8_len;
394 SV_Buf_append_ch(&result,'?');
395 continue;
396 }else if( 0xfc<=*src && *src<=0xfd )
397 {
398 const int utf8_len = 6;
399 ECHO_U2S((stderr,"utf8-len: [%d]\n",utf8_len));
400 /* check length */
401 if( src+utf8_len<=src_end )
402 { /* noop */
403 }else
404 { /* no enough sequence */
405 SV_Buf_append_ch(&result,'?');
406 ++src;
407 continue;
408 }
409 /* check follow sequences */
410 if( 0x80<=src[1] && src[1]<=0xbf && 0x80<=src[2] && src[2]<=0xbf
411 && 0x80<=src[3] && src[3]<=0xbf && 0x80<=src[4] && src[4]<=0xbf
412 && 0x80<=src[5] && src[5]<=0xbf )
413 { /* noop */
414 }else
415 {
416 SV_Buf_append_ch(&result,'?');
417 ++src;
418 continue;
419 }
420
421 /* compute code point */
422 /* > U+10FFFF not supported by UTF-8 (RFC 3629). */
423 src += utf8_len;
424 SV_Buf_append_ch(&result,'?');
425 continue;
426 }else
427 {
428 SV_Buf_append_ch(&result,'?');
429 ++src;
430 continue;
431 }
432
433 /* ucs => sjis */
434 ECHO_U2S((stderr,"ucs [%04x]\n",ucs));
435 if( ucs<=0x9FFF )
436 {
437 sjis_ptr = g_u2s_table + ucs*2;
438 }else if( 0xF900<=ucs && ucs<=0xFFFF )
439 {
440 sjis_ptr = g_u2s_table + (ucs - 0xF900 + 0xA000)*2;
441 }else if( 0x0FE000<=ucs && ucs<=0x0FFFFF )
442 {
443 sjis_ptr = (UJ_UINT8*)"?"; /* exactly 2byte: "?\0" */
444 }else
445 {
446 sjis_ptr = (UJ_UINT8*)"\0"; /* exactly 2byte: "\0\0" */
447 }
448 if( sjis_ptr[0]!=0 || sjis_ptr[1]!=0 )
449 { /* mapping dest exists. */
450 if( sjis_ptr[1]!=0 )
451 {
452 SV_Buf_append_mem(&result, sjis_ptr, 2);
453 }else
454 {
455 SV_Buf_append_ch(&result,sjis_ptr[0]);
456 }
457 }else if( ucs<=0x7F )
458 {
459 SV_Buf_append_ch(&result,(UJ_UINT8)ucs);
460 }else
461 {
462 SV_Buf_append_ch(&result,'?');
463 }
464 } /* while */
465
466 ON_U2S( bin_dump("out",SV_Buf_getBegin(&result),SV_Buf_getLength(&result)) );
467 SV_Buf_setLength(&result);
468
469 return SV_Buf_getSv(&result);
470 }
471
472 /* ----------------------------------------------------------------------------
473 * End of File.
474 * ------------------------------------------------------------------------- */
475
476