1 /* 2 $Id: Unicode.xs,v 2.6 2009/11/16 14:08:13 dankogai Exp $ 3 */ 4 5 #define PERL_NO_GET_CONTEXT 6 #include "EXTERN.h" 7 #include "perl.h" 8 #include "XSUB.h" 9 #define U8 U8 10 #include "../Encode/encode.h" 11 12 #define FBCHAR 0xFFFd 13 #define BOM_BE 0xFeFF 14 #define BOM16LE 0xFFFe 15 #define BOM32LE 0xFFFe0000 16 #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF ) 17 #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 ) 18 #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF ) 19 #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) ) 20 21 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */ 22 23 /* Avoid wasting too much space in the result buffer */ 24 /* static void */ 25 /* shrink_buffer(SV *result) */ 26 /* { */ 27 /* if (SvLEN(result) > 42 + SvCUR(result)) { */ 28 /* char *buf; */ 29 /* STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */ 30 /* New(0, buf, len, char); */ 31 /* Copy(SvPVX(result), buf, len, char); */ 32 /* Safefree(SvPVX(result)); */ 33 /* SvPV_set(result, buf); */ 34 /* SvLEN_set(result, len); */ 35 /* } */ 36 /* } */ 37 38 #define shrink_buffer(result) { \ 39 if (SvLEN(result) > 42 + SvCUR(result)) { \ 40 char *newpv; \ 41 STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \ 42 New(0, newpv, newlen, char); \ 43 Copy(SvPVX(result), newpv, newlen, char); \ 44 Safefree(SvPVX(result)); \ 45 SvPV_set(result, newpv); \ 46 SvLEN_set(result, newlen); \ 47 } \ 48 } 49 50 static UV 51 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian) 52 { 53 U8 *s = *sp; 54 UV v = 0; 55 if (s+size > e) { 56 croak("Partial character %c",(char) endian); 57 } 58 switch(endian) { 59 case 'N': 60 v = *s++; 61 v = (v << 8) | *s++; 62 case 'n': 63 v = (v << 8) | *s++; 64 v = (v << 8) | *s++; 65 break; 66 case 'V': 67 case 'v': 68 v |= *s++; 69 v |= (*s++ << 8); 70 if (endian == 'v') 71 break; 72 v |= (*s++ << 16); 73 v |= (*s++ << 24); 74 break; 75 default: 76 croak("Unknown endian %c",(char) endian); 77 break; 78 } 79 *sp = s; 80 return v; 81 } 82 83 void 84 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value) 85 { 86 U8 *d = (U8 *) SvPV_nolen(result); 87 88 switch(endian) { 89 case 'v': 90 case 'V': 91 d += SvCUR(result); 92 SvCUR_set(result,SvCUR(result)+size); 93 while (size--) { 94 *d++ = (U8)(value & 0xFF); 95 value >>= 8; 96 } 97 break; 98 case 'n': 99 case 'N': 100 SvCUR_set(result,SvCUR(result)+size); 101 d += SvCUR(result); 102 while (size--) { 103 *--d = (U8)(value & 0xFF); 104 value >>= 8; 105 } 106 break; 107 default: 108 croak("Unknown endian %c",(char) endian); 109 break; 110 } 111 } 112 113 MODULE = Encode::Unicode PACKAGE = Encode::Unicode 114 115 PROTOTYPES: DISABLE 116 117 #define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \ 118 *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef) 119 120 void 121 decode_xs(obj, str, check = 0) 122 SV * obj 123 SV * str 124 IV check 125 CODE: 126 { 127 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); 128 int size = SvIV(attr("size", 4)); 129 int ucs2 = -1; /* only needed in the event of surrogate pairs */ 130 SV *result = newSVpvn("",0); 131 STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */ 132 STRLEN ulen; 133 STRLEN resultbuflen; 134 U8 *resultbuf; 135 U8 *s = (U8 *)SvPVbyte(str,ulen); 136 U8 *e = (U8 *)SvEND(str); 137 /* Optimise for the common case of being called from PerlIOEncode_fill() 138 with a standard length buffer. In this case the result SV's buffer is 139 only used temporarily, so we can afford to allocate the maximum needed 140 and not care about unused space. */ 141 const bool temp_result = (ulen == PERLIO_BUFSIZ); 142 143 ST(0) = sv_2mortal(result); 144 SvUTF8_on(result); 145 146 if (!endian && s+size <= e) { 147 UV bom; 148 endian = (size == 4) ? 'N' : 'n'; 149 bom = enc_unpack(aTHX_ &s,e,size,endian); 150 if (bom != BOM_BE) { 151 if (bom == BOM16LE) { 152 endian = 'v'; 153 } 154 else if (bom == BOM32LE) { 155 endian = 'V'; 156 } 157 else { 158 croak("%"SVf":Unrecognised BOM %"UVxf, 159 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 160 bom); 161 } 162 } 163 #if 1 164 /* Update endian for next sequence */ 165 if (SvTRUE(attr("renewed", 7))) { 166 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); 167 } 168 #endif 169 } 170 171 if (temp_result) { 172 resultbuflen = 1 + ulen/usize * UTF8_MAXLEN; 173 } else { 174 /* Preallocate the buffer to the minimum possible space required. */ 175 resultbuflen = ulen/usize + UTF8_MAXLEN + 1; 176 } 177 resultbuf = (U8 *) SvGROW(result, resultbuflen); 178 179 while (s < e && s+size <= e) { 180 UV ord = enc_unpack(aTHX_ &s,e,size,endian); 181 U8 *d; 182 if (issurrogate(ord)) { 183 if (ucs2 == -1) { 184 ucs2 = SvTRUE(attr("ucs2", 4)); 185 } 186 if (ucs2 || size == 4) { 187 if (check) { 188 croak("%"SVf":no surrogates allowed %"UVxf, 189 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 190 ord); 191 } 192 if (s+size <= e) { 193 /* skip the next one as well */ 194 enc_unpack(aTHX_ &s,e,size,endian); 195 } 196 ord = FBCHAR; 197 } 198 else { 199 UV lo; 200 if (!isHiSurrogate(ord)) { 201 if (check) { 202 croak("%"SVf":Malformed HI surrogate %"UVxf, 203 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 204 ord); 205 } 206 else { 207 ord = FBCHAR; 208 } 209 } 210 else { 211 if (s+size > e) { 212 /* Partial character */ 213 s -= size; /* back up to 1st half */ 214 break; /* And exit loop */ 215 } 216 lo = enc_unpack(aTHX_ &s,e,size,endian); 217 if (!isLoSurrogate(lo)) { 218 if (check) { 219 croak("%"SVf":Malformed LO surrogate %"UVxf, 220 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 221 ord); 222 } 223 else { 224 ord = FBCHAR; 225 } 226 } 227 else { 228 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); 229 } 230 } 231 } 232 } 233 234 if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) { 235 if (check) { 236 croak("%"SVf":Unicode character %"UVxf" is illegal", 237 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 238 ord); 239 } else { 240 ord = FBCHAR; 241 } 242 } 243 244 if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) { 245 /* Do not allocate >8Mb more than the minimum needed. 246 This prevents allocating too much in the rogue case of a large 247 input consisting initially of long sequence uft8-byte unicode 248 chars followed by single utf8-byte chars. */ 249 /* +1 250 fixes Unicode.xs!decode_xs n-byte heap-overflow 251 */ 252 STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */ 253 STRLEN max_alloc = remaining + (8*1024*1024); 254 STRLEN est_alloc = remaining * UTF8_MAXLEN; 255 STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */ 256 (est_alloc > max_alloc ? max_alloc : est_alloc); 257 resultbuf = (U8 *) SvGROW(result, newlen); 258 resultbuflen = SvLEN(result); 259 } 260 261 d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord, 0); 262 SvCUR_set(result, d - (U8 *)SvPVX(result)); 263 } 264 265 if (s < e) { 266 /* unlikely to happen because it's fixed-length -- dankogai */ 267 if (check & ENCODE_WARN_ON_ERR) { 268 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character", 269 *hv_fetch((HV *)SvRV(obj),"Name",4,0)); 270 } 271 } 272 if (check && !(check & ENCODE_LEAVE_SRC)) { 273 if (s < e) { 274 Move(s,SvPVX(str),e-s,U8); 275 SvCUR_set(str,(e-s)); 276 } 277 else { 278 SvCUR_set(str,0); 279 } 280 *SvEND(str) = '\0'; 281 } 282 283 if (!temp_result) 284 shrink_buffer(result); 285 286 XSRETURN(1); 287 } 288 289 void 290 encode_xs(obj, utf8, check = 0) 291 SV * obj 292 SV * utf8 293 IV check 294 CODE: 295 { 296 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); 297 const int size = SvIV(attr("size", 4)); 298 int ucs2 = -1; /* only needed if there is invalid_ucs2 input */ 299 const STRLEN usize = (size > 0 ? size : 1); 300 SV *result = newSVpvn("", 0); 301 STRLEN ulen; 302 U8 *s = (U8 *) SvPVutf8(utf8, ulen); 303 const U8 *e = (U8 *) SvEND(utf8); 304 /* Optimise for the common case of being called from PerlIOEncode_flush() 305 with a standard length buffer. In this case the result SV's buffer is 306 only used temporarily, so we can afford to allocate the maximum needed 307 and not care about unused space. */ 308 const bool temp_result = (ulen == PERLIO_BUFSIZ); 309 310 ST(0) = sv_2mortal(result); 311 312 /* Preallocate the result buffer to the maximum possible size. 313 ie. assume each UTF8 byte is 1 character. 314 Then shrink the result's buffer if necesary at the end. */ 315 SvGROW(result, ((ulen+1) * usize)); 316 317 if (!endian) { 318 endian = (size == 4) ? 'N' : 'n'; 319 enc_pack(aTHX_ result,size,endian,BOM_BE); 320 #if 1 321 /* Update endian for next sequence */ 322 if (SvTRUE(attr("renewed", 7))) { 323 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); 324 } 325 #endif 326 } 327 while (s < e && s+UTF8SKIP(s) <= e) { 328 STRLEN len; 329 UV ord = utf8n_to_uvuni(s, e-s, &len, 0); 330 s += len; 331 if (size != 4 && invalid_ucs2(ord)) { 332 if (!issurrogate(ord)) { 333 if (ucs2 == -1) { 334 ucs2 = SvTRUE(attr("ucs2", 4)); 335 } 336 if (ucs2) { 337 if (check) { 338 croak("%"SVf":code point \"\\x{%"UVxf"}\" too high", 339 *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord); 340 } 341 enc_pack(aTHX_ result,size,endian,FBCHAR); 342 } else { 343 UV hi = ((ord - 0x10000) >> 10) + 0xD800; 344 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; 345 enc_pack(aTHX_ result,size,endian,hi); 346 enc_pack(aTHX_ result,size,endian,lo); 347 } 348 } 349 else { 350 /* not supposed to happen */ 351 enc_pack(aTHX_ result,size,endian,FBCHAR); 352 } 353 } 354 else { 355 enc_pack(aTHX_ result,size,endian,ord); 356 } 357 } 358 if (s < e) { 359 /* UTF-8 partial char happens often on PerlIO. 360 Since this is okay and normal, we do not warn. 361 But this is critical when you choose to LEAVE_SRC 362 in which case we die */ 363 if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) { 364 Perl_croak(aTHX_ "%"SVf":partial character is not allowed " 365 "when CHECK = 0x%" UVuf, 366 *hv_fetch((HV *)SvRV(obj),"Name",4,0), check); 367 } 368 } 369 if (check && !(check & ENCODE_LEAVE_SRC)) { 370 if (s < e) { 371 Move(s,SvPVX(utf8),e-s,U8); 372 SvCUR_set(utf8,(e-s)); 373 } 374 else { 375 SvCUR_set(utf8,0); 376 } 377 *SvEND(utf8) = '\0'; 378 } 379 380 if (!temp_result) 381 shrink_buffer(result); 382 383 SvSETMAGIC(utf8); 384 385 XSRETURN(1); 386 } 387