1 /* 2 $Id: Unicode.xs,v 2.20 2021/07/23 02:26:54 dankogai Exp $ 3 */ 4 5 #define IN_UNICODE_XS 6 7 #define PERL_NO_GET_CONTEXT 8 #include "EXTERN.h" 9 #include "perl.h" 10 #include "XSUB.h" 11 #include "../Encode/encode.h" 12 13 #define FBCHAR 0xFFFd 14 #define BOM_BE 0xFeFF 15 #define BOM16LE 0xFFFe 16 #define BOM32LE 0xFFFe0000 17 #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF ) 18 #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 ) 19 #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF ) 20 #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) ) 21 22 #ifndef SVfARG 23 #define SVfARG(p) ((void*)(p)) 24 #endif 25 26 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */ 27 28 /* Avoid wasting too much space in the result buffer */ 29 /* static void */ 30 /* shrink_buffer(SV *result) */ 31 /* { */ 32 /* if (SvLEN(result) > 42 + SvCUR(result)) { */ 33 /* char *buf; */ 34 /* STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */ 35 /* New(0, buf, len, char); */ 36 /* Copy(SvPVX(result), buf, len, char); */ 37 /* Safefree(SvPVX(result)); */ 38 /* SvPV_set(result, buf); */ 39 /* SvLEN_set(result, len); */ 40 /* } */ 41 /* } */ 42 43 #define shrink_buffer(result) { \ 44 if (SvLEN(result) > 42 + SvCUR(result)) { \ 45 char *newpv; \ 46 STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \ 47 New(0, newpv, newlen, char); \ 48 Copy(SvPVX(result), newpv, newlen, char); \ 49 Safefree(SvPVX(result)); \ 50 SvPV_set(result, newpv); \ 51 SvLEN_set(result, newlen); \ 52 } \ 53 } 54 55 static UV 56 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian) 57 { 58 U8 *s = *sp; 59 UV v = 0; 60 if (s+size > e) { 61 croak("Partial character %c",(char) endian); 62 } 63 switch(endian) { 64 case 'N': 65 v = *s++; 66 v = (v << 8) | *s++; 67 /* FALLTHROUGH */ 68 case 'n': 69 v = (v << 8) | *s++; 70 v = (v << 8) | *s++; 71 break; 72 case 'V': 73 case 'v': 74 v |= *s++; 75 v |= (*s++ << 8); 76 if (endian == 'v') 77 break; 78 v |= (*s++ << 16); 79 v |= ((UV)*s++ << 24); 80 break; 81 default: 82 croak("Unknown endian %c",(char) endian); 83 break; 84 } 85 *sp = s; 86 return v; 87 } 88 89 static void 90 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value) 91 { 92 U8 *d = (U8 *) SvPV_nolen(result); 93 94 switch(endian) { 95 case 'v': 96 case 'V': 97 d += SvCUR(result); 98 SvCUR_set(result,SvCUR(result)+size); 99 while (size--) { 100 *d++ = (U8)(value & 0xFF); 101 value >>= 8; 102 } 103 break; 104 case 'n': 105 case 'N': 106 SvCUR_set(result,SvCUR(result)+size); 107 d += SvCUR(result); 108 while (size--) { 109 *--d = (U8)(value & 0xFF); 110 value >>= 8; 111 } 112 break; 113 default: 114 croak("Unknown endian %c",(char) endian); 115 break; 116 } 117 } 118 119 MODULE = Encode::Unicode PACKAGE = Encode::Unicode 120 121 PROTOTYPES: DISABLE 122 123 #define attr(k) (hv_exists((HV *)SvRV(obj),"" k "",sizeof(k)-1) ? \ 124 *hv_fetch((HV *)SvRV(obj),"" k "",sizeof(k)-1,0) : &PL_sv_undef) 125 126 void 127 decode(obj, str, check = 0) 128 SV * obj 129 SV * str 130 IV check 131 CODE: 132 { 133 SV *name = attr("Name"); 134 SV *sve = attr("endian"); 135 U8 endian = *((U8 *)SvPV_nolen(sve)); 136 SV *svs = attr("size"); 137 int size = SvIV(svs); 138 int ucs2 = -1; /* only needed in the event of surrogate pairs */ 139 SV *result = newSVpvn("",0); 140 STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */ 141 STRLEN ulen; 142 STRLEN resultbuflen; 143 U8 *resultbuf; 144 U8 *s; 145 U8 *e; 146 bool modify = (check && !(check & ENCODE_LEAVE_SRC)); 147 bool temp_result; 148 149 SvGETMAGIC(str); 150 if (!SvOK(str)) 151 XSRETURN_UNDEF; 152 s = modify ? (U8 *)SvPV_force_nomg(str, ulen) : (U8 *)SvPV_nomg(str, ulen); 153 if (SvUTF8(str)) { 154 if (!modify) { 155 SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen)); 156 SvUTF8_on(tmp); 157 if (SvTAINTED(str)) 158 SvTAINTED_on(tmp); 159 str = tmp; 160 s = (U8 *)SvPVX(str); 161 } 162 if (ulen) { 163 if (!utf8_to_bytes(s, &ulen)) 164 croak("Wide character"); 165 SvCUR_set(str, ulen); 166 } 167 SvUTF8_off(str); 168 } 169 e = s+ulen; 170 171 /* Optimise for the common case of being called from PerlIOEncode_fill() 172 with a standard length buffer. In this case the result SV's buffer is 173 only used temporarily, so we can afford to allocate the maximum needed 174 and not care about unused space. */ 175 temp_result = (ulen == PERLIO_BUFSIZ); 176 177 ST(0) = sv_2mortal(result); 178 SvUTF8_on(result); 179 180 if (!endian && s+size <= e) { 181 SV *sv; 182 UV bom; 183 endian = (size == 4) ? 'N' : 'n'; 184 bom = enc_unpack(aTHX_ &s,e,size,endian); 185 if (bom != BOM_BE) { 186 if (bom == BOM16LE) { 187 endian = 'v'; 188 } 189 else if (bom == BOM32LE) { 190 endian = 'V'; 191 } 192 else { 193 /* No BOM found, use big-endian fallback as specified in 194 * RFC2781 and the Unicode Standard version 8.0: 195 * 196 * The UTF-16 encoding scheme may or may not begin with 197 * a BOM. However, when there is no BOM, and in the 198 * absence of a higher-level protocol, the byte order 199 * of the UTF-16 encoding scheme is big-endian. 200 * 201 * If the first two octets of the text is not 0xFE 202 * followed by 0xFF, and is not 0xFF followed by 0xFE, 203 * then the text SHOULD be interpreted as big-endian. 204 */ 205 s -= size; 206 } 207 } 208 #if 1 209 /* Update endian for next sequence */ 210 sv = attr("renewed"); 211 if (SvTRUE(sv)) { 212 (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); 213 } 214 #endif 215 } 216 217 if (temp_result) { 218 resultbuflen = 1 + ulen/usize * UTF8_MAXLEN; 219 } else { 220 /* Preallocate the buffer to the minimum possible space required. */ 221 resultbuflen = ulen/usize + UTF8_MAXLEN + 1; 222 } 223 resultbuf = (U8 *) SvGROW(result, resultbuflen); 224 225 while (s < e && s+size <= e) { 226 UV ord = enc_unpack(aTHX_ &s,e,size,endian); 227 U8 *d; 228 HV *hv = NULL; 229 if (issurrogate(ord)) { 230 if (ucs2 == -1) { 231 SV *sv = attr("ucs2"); 232 ucs2 = SvTRUE(sv); 233 } 234 if (ucs2 || size == 4) { 235 if (check & ENCODE_DIE_ON_ERR) { 236 croak("%" SVf ":no surrogates allowed %" UVxf, 237 SVfARG(name), ord); 238 } 239 if (encode_ckWARN(check, WARN_SURROGATE)) { 240 warner(packWARN(WARN_SURROGATE), 241 "%" SVf ":no surrogates allowed %" UVxf, 242 SVfARG(name), ord); 243 } 244 ord = FBCHAR; 245 } 246 else { 247 UV lo; 248 if (!isHiSurrogate(ord)) { 249 if (check & ENCODE_DIE_ON_ERR) { 250 croak("%" SVf ":Malformed HI surrogate %" UVxf, 251 SVfARG(name), ord); 252 } 253 if (encode_ckWARN(check, WARN_SURROGATE)) { 254 warner(packWARN(WARN_SURROGATE), 255 "%" SVf ":Malformed HI surrogate %" UVxf, 256 SVfARG(name), ord); 257 } 258 ord = FBCHAR; 259 } 260 else if (s+size > e) { 261 if (check & ENCODE_STOP_AT_PARTIAL) { 262 s -= size; 263 break; 264 } 265 if (check & ENCODE_DIE_ON_ERR) { 266 croak("%" SVf ":Malformed HI surrogate %" UVxf, 267 SVfARG(name), ord); 268 } 269 if (encode_ckWARN(check, WARN_SURROGATE)) { 270 warner(packWARN(WARN_SURROGATE), 271 "%" SVf ":Malformed HI surrogate %" UVxf, 272 SVfARG(name), ord); 273 } 274 ord = FBCHAR; 275 } 276 else { 277 lo = enc_unpack(aTHX_ &s,e,size,endian); 278 if (!isLoSurrogate(lo)) { 279 if (check & ENCODE_DIE_ON_ERR) { 280 croak("%" SVf ":Malformed LO surrogate %" UVxf, 281 SVfARG(name), ord); 282 } 283 if (encode_ckWARN(check, WARN_SURROGATE)) { 284 warner(packWARN(WARN_SURROGATE), 285 "%" SVf ":Malformed LO surrogate %" UVxf, 286 SVfARG(name), ord); 287 } 288 s -= size; 289 ord = FBCHAR; 290 } 291 else { 292 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); 293 } 294 } 295 } 296 } 297 298 if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) { 299 if (check & ENCODE_DIE_ON_ERR) { 300 croak("%" SVf ":Unicode character %" UVxf " is illegal", 301 SVfARG(name), ord); 302 } 303 if (encode_ckWARN(check, WARN_NONCHAR)) { 304 warner(packWARN(WARN_NONCHAR), 305 "%" SVf ":Unicode character %" UVxf " is illegal", 306 SVfARG(name), ord); 307 } 308 ord = FBCHAR; 309 } 310 311 if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) { 312 /* Do not allocate >8Mb more than the minimum needed. 313 This prevents allocating too much in the rogue case of a large 314 input consisting initially of long sequence uft8-byte unicode 315 chars followed by single utf8-byte chars. */ 316 /* +1 317 fixes Unicode.xs!decode_xs n-byte heap-overflow 318 */ 319 STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */ 320 STRLEN max_alloc = remaining + (8*1024*1024); 321 STRLEN est_alloc = remaining * UTF8_MAXLEN; 322 STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */ 323 (est_alloc > max_alloc ? max_alloc : est_alloc); 324 resultbuf = (U8 *) SvGROW(result, newlen); 325 resultbuflen = SvLEN(result); 326 } 327 328 d = uvchr_to_utf8_flags_msgs(resultbuf+SvCUR(result), ord, UNICODE_DISALLOW_ILLEGAL_INTERCHANGE | UNICODE_WARN_ILLEGAL_INTERCHANGE, &hv); 329 if (hv) { 330 SV *message = *hv_fetch(hv, "text", 4, 0); 331 U32 categories = SvUVx(*hv_fetch(hv, "warn_categories", 15, 0)); 332 sv_2mortal((SV *)hv); 333 if (check & ENCODE_DIE_ON_ERR) 334 croak("%" SVf, SVfARG(message)); 335 if (encode_ckWARN_packed(check, categories)) 336 warner(categories, "%" SVf, SVfARG(message)); 337 d = uvchr_to_utf8_flags(resultbuf+SvCUR(result), FBCHAR, 0); 338 } 339 340 SvCUR_set(result, d - (U8 *)SvPVX(result)); 341 } 342 343 if (s < e) { 344 /* unlikely to happen because it's fixed-length -- dankogai */ 345 if (check & ENCODE_DIE_ON_ERR) 346 croak("%" SVf ":Partial character", SVfARG(name)); 347 if (encode_ckWARN(check, WARN_UTF8)) { 348 warner(packWARN(WARN_UTF8),"%" SVf ":Partial character", SVfARG(name)); 349 } 350 } 351 if (check && !(check & ENCODE_LEAVE_SRC)) { 352 if (s < e) { 353 Move(s,SvPVX(str),e-s,U8); 354 SvCUR_set(str,(e-s)); 355 } 356 else { 357 SvCUR_set(str,0); 358 } 359 *SvEND(str) = '\0'; 360 SvSETMAGIC(str); 361 } 362 363 if (!temp_result) shrink_buffer(result); 364 365 /* Make sure we have a trailing NUL: */ 366 *SvEND(result) = '\0'; 367 368 if (SvTAINTED(str)) SvTAINTED_on(result); /* propagate taintedness */ 369 XSRETURN(1); 370 } 371 372 void 373 encode(obj, utf8, check = 0) 374 SV * obj 375 SV * utf8 376 IV check 377 CODE: 378 { 379 SV *name = attr("Name"); 380 SV *sve = attr("endian"); 381 U8 endian = *((U8 *)SvPV_nolen(sve)); 382 SV *svs = attr("size"); 383 const int size = SvIV(svs); 384 int ucs2 = -1; /* only needed if there is invalid_ucs2 input */ 385 const STRLEN usize = (size > 0 ? size : 1); 386 SV *result = newSVpvn("", 0); 387 STRLEN ulen; 388 U8 *s; 389 U8 *e; 390 bool modify = (check && !(check & ENCODE_LEAVE_SRC)); 391 bool temp_result; 392 393 SvGETMAGIC(utf8); 394 if (!SvOK(utf8)) 395 XSRETURN_UNDEF; 396 s = modify ? (U8 *)SvPV_force_nomg(utf8, ulen) : (U8 *)SvPV_nomg(utf8, ulen); 397 if (!SvUTF8(utf8)) { 398 if (!modify) { 399 SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen)); 400 if (SvTAINTED(utf8)) 401 SvTAINTED_on(tmp); 402 utf8 = tmp; 403 } 404 sv_utf8_upgrade_nomg(utf8); 405 s = (U8 *)SvPV_nomg(utf8, ulen); 406 } 407 e = s+ulen; 408 409 /* Optimise for the common case of being called from PerlIOEncode_flush() 410 with a standard length buffer. In this case the result SV's buffer is 411 only used temporarily, so we can afford to allocate the maximum needed 412 and not care about unused space. */ 413 temp_result = (ulen == PERLIO_BUFSIZ); 414 415 ST(0) = sv_2mortal(result); 416 417 /* Preallocate the result buffer to the maximum possible size. 418 ie. assume each UTF8 byte is 1 character. 419 Then shrink the result's buffer if necesary at the end. */ 420 SvGROW(result, ((ulen+1) * usize)); 421 422 if (!endian) { 423 SV *sv; 424 endian = (size == 4) ? 'N' : 'n'; 425 enc_pack(aTHX_ result,size,endian,BOM_BE); 426 #if 1 427 /* Update endian for next sequence */ 428 sv = attr("renewed"); 429 if (SvTRUE(sv)) { 430 (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); 431 } 432 #endif 433 } 434 while (s < e && s+UTF8SKIP(s) <= e) { 435 STRLEN len; 436 AV *msgs = NULL; 437 UV ord = utf8n_to_uvchr_msgs(s, e-s, &len, UTF8_DISALLOW_ILLEGAL_INTERCHANGE | UTF8_WARN_ILLEGAL_INTERCHANGE, NULL, &msgs); 438 if (msgs) { 439 SSize_t i; 440 SSize_t len = av_len(msgs)+1; 441 sv_2mortal((SV *)msgs); 442 for (i = 0; i < len; ++i) { 443 SV *sv = *av_fetch(msgs, i, 0); 444 HV *hv = (HV *)SvRV(sv); 445 SV *message = *hv_fetch(hv, "text", 4, 0); 446 U32 categories = SvUVx(*hv_fetch(hv, "warn_categories", 15, 0)); 447 if (check & ENCODE_DIE_ON_ERR) 448 croak("%" SVf, SVfARG(message)); 449 if (encode_ckWARN_packed(check, categories)) 450 warner(categories, "%" SVf, SVfARG(message)); 451 } 452 } 453 if ((size != 4 && invalid_ucs2(ord)) || (ord == 0 && *s != 0)) { 454 if (!issurrogate(ord)) { 455 if (ucs2 == -1) { 456 SV *sv = attr("ucs2"); 457 ucs2 = SvTRUE(sv); 458 } 459 if (ucs2 || ord > 0x10FFFF) { 460 if (check & ENCODE_DIE_ON_ERR) { 461 croak("%" SVf ":code point \"\\x{%" UVxf "}\" too high", 462 SVfARG(name),ord); 463 } 464 if (encode_ckWARN(check, WARN_NON_UNICODE)) { 465 warner(packWARN(WARN_NON_UNICODE), 466 "%" SVf ":code point \"\\x{%" UVxf "}\" too high", 467 SVfARG(name),ord); 468 } 469 enc_pack(aTHX_ result,size,endian,FBCHAR); 470 } else if (ord == 0) { 471 enc_pack(aTHX_ result,size,endian,FBCHAR); 472 } else { 473 UV hi = ((ord - 0x10000) >> 10) + 0xD800; 474 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; 475 enc_pack(aTHX_ result,size,endian,hi); 476 enc_pack(aTHX_ result,size,endian,lo); 477 } 478 } 479 else { 480 /* not supposed to happen */ 481 enc_pack(aTHX_ result,size,endian,FBCHAR); 482 } 483 } 484 else { 485 enc_pack(aTHX_ result,size,endian,ord); 486 } 487 s += len; 488 } 489 if (s < e) { 490 /* UTF-8 partial char happens often on PerlIO. 491 Since this is okay and normal, we do not warn. 492 But this is critical when you choose to LEAVE_SRC 493 in which case we die */ 494 if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) { 495 Perl_croak(aTHX_ "%" SVf ":partial character is not allowed " 496 "when CHECK = 0x%" UVuf, 497 SVfARG(name), check); 498 } 499 } 500 if (check && !(check & ENCODE_LEAVE_SRC)) { 501 if (s < e) { 502 Move(s,SvPVX(utf8),e-s,U8); 503 SvCUR_set(utf8,(e-s)); 504 } 505 else { 506 SvCUR_set(utf8,0); 507 } 508 *SvEND(utf8) = '\0'; 509 SvSETMAGIC(utf8); 510 } 511 512 if (!temp_result) shrink_buffer(result); 513 if (SvTAINTED(utf8)) SvTAINTED_on(result); /* propagate taintedness */ 514 515 XSRETURN(1); 516 } 517