1 /* 2 $Id: Unicode.xs,v 2.11 2014/04/29 16:25:06 dankogai Exp dankogai $ 3 */ 4 5 #define PERL_NO_GET_CONTEXT 6 #include "EXTERN.h" 7 #include "perl.h" 8 #include "XSUB.h" 9 #include "../Encode/encode.h" 10 11 #define FBCHAR 0xFFFd 12 #define BOM_BE 0xFeFF 13 #define BOM16LE 0xFFFe 14 #define BOM32LE 0xFFFe0000 15 #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF ) 16 #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 ) 17 #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF ) 18 #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) ) 19 20 /* For pre-5.14 source compatibility */ 21 #ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE 22 # define UNICODE_WARN_ILLEGAL_INTERCHANGE 0 23 # define UTF8_DISALLOW_SURROGATE 0 24 # define UTF8_WARN_SURROGATE 0 25 # define UTF8_DISALLOW_FE_FF 0 26 # define UTF8_WARN_FE_FF 0 27 # define UTF8_WARN_NONCHAR 0 28 #endif 29 30 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */ 31 32 /* Avoid wasting too much space in the result buffer */ 33 /* static void */ 34 /* shrink_buffer(SV *result) */ 35 /* { */ 36 /* if (SvLEN(result) > 42 + SvCUR(result)) { */ 37 /* char *buf; */ 38 /* STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */ 39 /* New(0, buf, len, char); */ 40 /* Copy(SvPVX(result), buf, len, char); */ 41 /* Safefree(SvPVX(result)); */ 42 /* SvPV_set(result, buf); */ 43 /* SvLEN_set(result, len); */ 44 /* } */ 45 /* } */ 46 47 #define shrink_buffer(result) { \ 48 if (SvLEN(result) > 42 + SvCUR(result)) { \ 49 char *newpv; \ 50 STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \ 51 New(0, newpv, newlen, char); \ 52 Copy(SvPVX(result), newpv, newlen, char); \ 53 Safefree(SvPVX(result)); \ 54 SvPV_set(result, newpv); \ 55 SvLEN_set(result, newlen); \ 56 } \ 57 } 58 59 static UV 60 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian) 61 { 62 U8 *s = *sp; 63 UV v = 0; 64 if (s+size > e) { 65 croak("Partial character %c",(char) endian); 66 } 67 switch(endian) { 68 case 'N': 69 v = *s++; 70 v = (v << 8) | *s++; 71 case 'n': 72 v = (v << 8) | *s++; 73 v = (v << 8) | *s++; 74 break; 75 case 'V': 76 case 'v': 77 v |= *s++; 78 v |= (*s++ << 8); 79 if (endian == 'v') 80 break; 81 v |= (*s++ << 16); 82 v |= ((UV)*s++ << 24); 83 break; 84 default: 85 croak("Unknown endian %c",(char) endian); 86 break; 87 } 88 *sp = s; 89 return v; 90 } 91 92 void 93 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value) 94 { 95 U8 *d = (U8 *) SvPV_nolen(result); 96 97 switch(endian) { 98 case 'v': 99 case 'V': 100 d += SvCUR(result); 101 SvCUR_set(result,SvCUR(result)+size); 102 while (size--) { 103 *d++ = (U8)(value & 0xFF); 104 value >>= 8; 105 } 106 break; 107 case 'n': 108 case 'N': 109 SvCUR_set(result,SvCUR(result)+size); 110 d += SvCUR(result); 111 while (size--) { 112 *--d = (U8)(value & 0xFF); 113 value >>= 8; 114 } 115 break; 116 default: 117 croak("Unknown endian %c",(char) endian); 118 break; 119 } 120 } 121 122 MODULE = Encode::Unicode PACKAGE = Encode::Unicode 123 124 PROTOTYPES: DISABLE 125 126 #define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \ 127 *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef) 128 129 void 130 decode_xs(obj, str, check = 0) 131 SV * obj 132 SV * str 133 IV check 134 CODE: 135 { 136 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); 137 int size = SvIV(attr("size", 4)); 138 int ucs2 = -1; /* only needed in the event of surrogate pairs */ 139 SV *result = newSVpvn("",0); 140 STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */ 141 STRLEN ulen; 142 STRLEN resultbuflen; 143 U8 *resultbuf; 144 U8 *s = (U8 *)SvPVbyte(str,ulen); 145 U8 *e = (U8 *)SvEND(str); 146 /* Optimise for the common case of being called from PerlIOEncode_fill() 147 with a standard length buffer. In this case the result SV's buffer is 148 only used temporarily, so we can afford to allocate the maximum needed 149 and not care about unused space. */ 150 const bool temp_result = (ulen == PERLIO_BUFSIZ); 151 152 ST(0) = sv_2mortal(result); 153 SvUTF8_on(result); 154 155 if (!endian && s+size <= e) { 156 UV bom; 157 endian = (size == 4) ? 'N' : 'n'; 158 bom = enc_unpack(aTHX_ &s,e,size,endian); 159 if (bom != BOM_BE) { 160 if (bom == BOM16LE) { 161 endian = 'v'; 162 } 163 else if (bom == BOM32LE) { 164 endian = 'V'; 165 } 166 else { 167 croak("%"SVf":Unrecognised BOM %"UVxf, 168 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 169 bom); 170 } 171 } 172 #if 1 173 /* Update endian for next sequence */ 174 if (SvTRUE(attr("renewed", 7))) { 175 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); 176 } 177 #endif 178 } 179 180 if (temp_result) { 181 resultbuflen = 1 + ulen/usize * UTF8_MAXLEN; 182 } else { 183 /* Preallocate the buffer to the minimum possible space required. */ 184 resultbuflen = ulen/usize + UTF8_MAXLEN + 1; 185 } 186 resultbuf = (U8 *) SvGROW(result, resultbuflen); 187 188 while (s < e && s+size <= e) { 189 UV ord = enc_unpack(aTHX_ &s,e,size,endian); 190 U8 *d; 191 if (issurrogate(ord)) { 192 if (ucs2 == -1) { 193 ucs2 = SvTRUE(attr("ucs2", 4)); 194 } 195 if (ucs2 || size == 4) { 196 if (check) { 197 croak("%"SVf":no surrogates allowed %"UVxf, 198 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 199 ord); 200 } 201 ord = FBCHAR; 202 } 203 else { 204 UV lo; 205 if (!isHiSurrogate(ord)) { 206 if (check) { 207 croak("%"SVf":Malformed HI surrogate %"UVxf, 208 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 209 ord); 210 } 211 else { 212 ord = FBCHAR; 213 } 214 } 215 else if (s+size > e) { 216 if (check) { 217 if (check & ENCODE_STOP_AT_PARTIAL) { 218 s -= size; 219 break; 220 } 221 else { 222 croak("%"SVf":Malformed HI surrogate %"UVxf, 223 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 224 ord); 225 } 226 } 227 else { 228 ord = FBCHAR; 229 } 230 } 231 else { 232 lo = enc_unpack(aTHX_ &s,e,size,endian); 233 if (!isLoSurrogate(lo)) { 234 if (check) { 235 croak("%"SVf":Malformed LO surrogate %"UVxf, 236 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 237 ord); 238 } 239 else { 240 s -= size; 241 ord = FBCHAR; 242 } 243 } 244 else { 245 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); 246 } 247 } 248 } 249 } 250 251 if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) { 252 if (check) { 253 croak("%"SVf":Unicode character %"UVxf" is illegal", 254 *hv_fetch((HV *)SvRV(obj),"Name",4,0), 255 ord); 256 } else { 257 ord = FBCHAR; 258 } 259 } 260 261 if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) { 262 /* Do not allocate >8Mb more than the minimum needed. 263 This prevents allocating too much in the rogue case of a large 264 input consisting initially of long sequence uft8-byte unicode 265 chars followed by single utf8-byte chars. */ 266 /* +1 267 fixes Unicode.xs!decode_xs n-byte heap-overflow 268 */ 269 STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */ 270 STRLEN max_alloc = remaining + (8*1024*1024); 271 STRLEN est_alloc = remaining * UTF8_MAXLEN; 272 STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */ 273 (est_alloc > max_alloc ? max_alloc : est_alloc); 274 resultbuf = (U8 *) SvGROW(result, newlen); 275 resultbuflen = SvLEN(result); 276 } 277 278 d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord, 279 UNICODE_WARN_ILLEGAL_INTERCHANGE); 280 SvCUR_set(result, d - (U8 *)SvPVX(result)); 281 } 282 283 if (s < e) { 284 /* unlikely to happen because it's fixed-length -- dankogai */ 285 if (check & ENCODE_WARN_ON_ERR) { 286 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character", 287 *hv_fetch((HV *)SvRV(obj),"Name",4,0)); 288 } 289 } 290 if (check && !(check & ENCODE_LEAVE_SRC)) { 291 if (s < e) { 292 Move(s,SvPVX(str),e-s,U8); 293 SvCUR_set(str,(e-s)); 294 } 295 else { 296 SvCUR_set(str,0); 297 } 298 *SvEND(str) = '\0'; 299 } 300 301 if (!temp_result) shrink_buffer(result); 302 if (SvTAINTED(str)) SvTAINTED_on(result); /* propagate taintedness */ 303 XSRETURN(1); 304 } 305 306 void 307 encode_xs(obj, utf8, check = 0) 308 SV * obj 309 SV * utf8 310 IV check 311 CODE: 312 { 313 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); 314 const int size = SvIV(attr("size", 4)); 315 int ucs2 = -1; /* only needed if there is invalid_ucs2 input */ 316 const STRLEN usize = (size > 0 ? size : 1); 317 SV *result = newSVpvn("", 0); 318 STRLEN ulen; 319 U8 *s = (U8 *) SvPVutf8(utf8, ulen); 320 const U8 *e = (U8 *) SvEND(utf8); 321 /* Optimise for the common case of being called from PerlIOEncode_flush() 322 with a standard length buffer. In this case the result SV's buffer is 323 only used temporarily, so we can afford to allocate the maximum needed 324 and not care about unused space. */ 325 const bool temp_result = (ulen == PERLIO_BUFSIZ); 326 327 ST(0) = sv_2mortal(result); 328 329 /* Preallocate the result buffer to the maximum possible size. 330 ie. assume each UTF8 byte is 1 character. 331 Then shrink the result's buffer if necesary at the end. */ 332 SvGROW(result, ((ulen+1) * usize)); 333 334 if (!endian) { 335 endian = (size == 4) ? 'N' : 'n'; 336 enc_pack(aTHX_ result,size,endian,BOM_BE); 337 #if 1 338 /* Update endian for next sequence */ 339 if (SvTRUE(attr("renewed", 7))) { 340 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); 341 } 342 #endif 343 } 344 while (s < e && s+UTF8SKIP(s) <= e) { 345 STRLEN len; 346 UV ord = utf8n_to_uvuni(s, e-s, &len, (UTF8_DISALLOW_SURROGATE 347 |UTF8_WARN_SURROGATE 348 |UTF8_DISALLOW_FE_FF 349 |UTF8_WARN_FE_FF 350 |UTF8_WARN_NONCHAR)); 351 s += len; 352 if (size != 4 && invalid_ucs2(ord)) { 353 if (!issurrogate(ord)) { 354 if (ucs2 == -1) { 355 ucs2 = SvTRUE(attr("ucs2", 4)); 356 } 357 if (ucs2 || ord > 0x10FFFF) { 358 if (check) { 359 croak("%"SVf":code point \"\\x{%"UVxf"}\" too high", 360 *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord); 361 } 362 enc_pack(aTHX_ result,size,endian,FBCHAR); 363 } else { 364 UV hi = ((ord - 0x10000) >> 10) + 0xD800; 365 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; 366 enc_pack(aTHX_ result,size,endian,hi); 367 enc_pack(aTHX_ result,size,endian,lo); 368 } 369 } 370 else { 371 /* not supposed to happen */ 372 enc_pack(aTHX_ result,size,endian,FBCHAR); 373 } 374 } 375 else { 376 enc_pack(aTHX_ result,size,endian,ord); 377 } 378 } 379 if (s < e) { 380 /* UTF-8 partial char happens often on PerlIO. 381 Since this is okay and normal, we do not warn. 382 But this is critical when you choose to LEAVE_SRC 383 in which case we die */ 384 if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) { 385 Perl_croak(aTHX_ "%"SVf":partial character is not allowed " 386 "when CHECK = 0x%" UVuf, 387 *hv_fetch((HV *)SvRV(obj),"Name",4,0), check); 388 } 389 } 390 if (check && !(check & ENCODE_LEAVE_SRC)) { 391 if (s < e) { 392 Move(s,SvPVX(utf8),e-s,U8); 393 SvCUR_set(utf8,(e-s)); 394 } 395 else { 396 SvCUR_set(utf8,0); 397 } 398 *SvEND(utf8) = '\0'; 399 } 400 401 if (!temp_result) shrink_buffer(result); 402 if (SvTAINTED(utf8)) SvTAINTED_on(result); /* propagate taintedness */ 403 404 SvSETMAGIC(utf8); 405 406 XSRETURN(1); 407 } 408