1################################################################################ 2## 3## Version 3.x, Copyright (C) 2004-2013, Marcus Holland-Moritz. 4## Version 2.x, Copyright (C) 2001, Paul Marquess. 5## Version 1.x, Copyright (C) 1999, Kenneth Albanowski. 6## 7## This program is free software; you can redistribute it and/or 8## modify it under the same terms as Perl itself. 9## 10################################################################################ 11 12=provides 13 14__UNDEFINED__ 15my_strnlen 16SvUOK 17utf8_to_uvchr_buf 18 19=dontwarn 20 21_ppport_utf8_to_uvchr_buf_callee 22_ppport_MIN 23 24=implementation 25 26#define _ppport_MIN(a,b) (((a) <= (b)) ? (a) : (b)) 27 28__UNDEFINED__ sv_setuv(sv, uv) \ 29 STMT_START { \ 30 UV TeMpUv = uv; \ 31 if (TeMpUv <= IV_MAX) \ 32 sv_setiv(sv, TeMpUv); \ 33 else \ 34 sv_setnv(sv, (double)TeMpUv); \ 35 } STMT_END 36 37__UNDEFINED__ newSVuv(uv) ((uv) <= IV_MAX ? newSViv((IV)uv) : newSVnv((NV)uv)) 38 39__UNDEFINED__ sv_2uv(sv) ((PL_Sv = (sv)), (UV) (SvNOK(PL_Sv) ? SvNV(PL_Sv) : sv_2nv(PL_Sv))) 40__UNDEFINED__ SvUVX(sv) ((UV)SvIVX(sv)) 41__UNDEFINED__ SvUVXx(sv) SvUVX(sv) 42__UNDEFINED__ SvUV(sv) (SvIOK(sv) ? SvUVX(sv) : sv_2uv(sv)) 43__UNDEFINED__ SvUVx(sv) ((PL_Sv = (sv)), SvUV(PL_Sv)) 44 45/* Hint: sv_uv 46 * Always use the SvUVx() macro instead of sv_uv(). 47 */ 48__UNDEFINED__ sv_uv(sv) SvUVx(sv) 49 50#if !defined(SvUOK) && defined(SvIOK_UV) 51# define SvUOK(sv) SvIOK_UV(sv) 52#endif 53 54__UNDEFINED__ XST_mUV(i,v) (ST(i) = sv_2mortal(newSVuv(v)) ) 55__UNDEFINED__ XSRETURN_UV(v) STMT_START { XST_mUV(0,v); XSRETURN(1); } STMT_END 56 57__UNDEFINED__ PUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); PUSHTARG; } STMT_END 58__UNDEFINED__ XPUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); XPUSHTARG; } STMT_END 59 60#if defined UTF8SKIP 61 62/* Don't use official version because it uses MIN, which may not be available */ 63#undef UTF8_SAFE_SKIP 64 65__UNDEFINED__ UTF8_SAFE_SKIP(s, e) ( \ 66 ((((e) - (s)) <= 0) \ 67 ? 0 \ 68 : _ppport_MIN(((e) - (s)), UTF8SKIP(s)))) 69#endif 70 71#if !defined(my_strnlen) 72#if { NEED my_strnlen } 73 74STRLEN 75my_strnlen(const char *str, Size_t maxlen) 76{ 77 const char *p = str; 78 79 while(maxlen-- && *p) 80 p++; 81 82 return p - str; 83} 84 85#endif 86#endif 87 88#if { VERSION < 5.31.2 } 89 /* Versions prior to this accepted things that are now considered 90 * malformations, and didn't return -1 on error with warnings enabled 91 * */ 92# undef utf8_to_uvchr_buf 93#endif 94 95/* This implementation brings modern, generally more restricted standards to 96 * utf8_to_uvchr_buf. Some of these are security related, and clearly must 97 * be done. But its arguable that the others need not, and hence should not. 98 * The reason they're here is that a module that intends to play with the 99 * latest perls shoud be able to work the same in all releases. An example is 100 * that perl no longer accepts any UV for a code point, but limits them to 101 * IV_MAX or below. This is for future internal use of the larger code points. 102 * If it turns out that some of these changes are breaking code that isn't 103 * intended to work with modern perls, the tighter restrictions could be 104 * relaxed. khw thinks this is unlikely, but has been wrong in the past. */ 105 106#ifndef utf8_to_uvchr_buf 107 /* Choose which underlying implementation to use. At least one must be 108 * present or the perl is too early to handle this function */ 109# if defined(utf8n_to_uvchr) || defined(utf8_to_uv) 110# if defined(utf8n_to_uvchr) /* This is the preferred implementation */ 111# define _ppport_utf8_to_uvchr_buf_callee utf8n_to_uvchr 112# else 113# define _ppport_utf8_to_uvchr_buf_callee utf8_to_uv 114# endif 115 116# endif 117 118#ifdef _ppport_utf8_to_uvchr_buf_callee 119# if { NEED utf8_to_uvchr_buf } 120 121UV 122utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) 123{ 124 UV ret; 125 STRLEN curlen; 126 bool overflows = 0; 127 const U8 *cur_s = s; 128 const bool do_warnings = ckWARN_d(WARN_UTF8); 129 130 if (send > s) { 131 curlen = send - s; 132 } 133 else { 134 assert(0); /* Modern perls die under this circumstance */ 135 curlen = 0; 136 if (! do_warnings) { /* Handle empty here if no warnings needed */ 137 if (retlen) *retlen = 0; 138 return UNICODE_REPLACEMENT; 139 } 140 } 141 142 /* The modern version allows anything that evaluates to a legal UV, but not 143 * overlongs nor an empty input */ 144 ret = _ppport_utf8_to_uvchr_buf_callee( 145 s, curlen, retlen, (UTF8_ALLOW_ANYUV 146 & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY))); 147 148 /* But actually, modern versions restrict the UV to being no more than what 149 * an IV can hold */ 150 if (ret > PERL_INT_MAX) { 151 overflows = 1; 152 } 153 154# if { VERSION < 5.26.0 } 155# ifndef EBCDIC 156 157 /* There are bugs in versions earlier than this on non-EBCDIC platforms 158 * in which it did not detect all instances of overflow, which could be 159 * a security hole. Also, earlier versions did not allow the overflow 160 * malformation under any circumstances, and modern ones do. So we 161 * need to check here. */ 162 163 else if (curlen > 0 && *s >= 0xFE) { 164 165 /* If the main routine detected overflow, great; it returned 0. But if the 166 * input's first byte indicates it could overflow, we need to verify. 167 * First, on a 32-bit machine the first byte being at least \xFE 168 * automatically is overflow */ 169 if (sizeof(ret) < 8) { 170 overflows = 1; 171 } 172 else { 173 const U8 highest[] = /* 2*63-1 */ 174 "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"; 175 const U8 *cur_h = highest; 176 177 for (cur_s = s; cur_s < send; cur_s++, cur_h++) { 178 if (UNLIKELY(*cur_s == *cur_h)) { 179 continue; 180 } 181 182 /* If this byte is larger than the corresponding highest UTF-8 183 * byte, the sequence overflows; otherwise the byte is less than 184 * (as we handled the equality case above), and so the sequence 185 * doesn't overflow */ 186 overflows = *cur_s > *cur_h; 187 break; 188 189 } 190 191 /* Here, either we set the bool and broke out of the loop, or got 192 * to the end and all bytes are the same which indicates it doesn't 193 * overflow. */ 194 } 195 } 196 197# endif 198# endif /* < 5.26 */ 199 200 if (UNLIKELY(overflows)) { 201 if (! do_warnings) { 202 if (retlen) { 203 *retlen = _ppport_MIN(*retlen, UTF8SKIP(s)); 204 *retlen = _ppport_MIN(*retlen, curlen); 205 } 206 return UNICODE_REPLACEMENT; 207 } 208 else { 209 210 /* On versions that correctly detect overflow, but forbid it 211 * always, 0 will be returned, but also a warning will have been 212 * raised. Don't repeat it */ 213 if (ret != 0) { 214 /* We use the error message in use from 5.8-5.14 */ 215 Perl_warner(aTHX_ packWARN(WARN_UTF8), 216 "Malformed UTF-8 character (overflow at 0x%" UVxf 217 ", byte 0x%02x, after start byte 0x%02x)", 218 ret, *cur_s, *s); 219 } 220 if (retlen) { 221 *retlen = (STRLEN) -1; 222 } 223 return 0; 224 } 225 } 226 227 /* If failed and warnings are off, to emulate the behavior of the real 228 * utf8_to_uvchr(), try again, allowing anything. (Note a return of 0 is 229 * ok if the input was '\0') */ 230 if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) { 231 232 /* If curlen is 0, we already handled the case where warnings are 233 * disabled, so this 'if' will be true, and we won't look at the 234 * contents of 's' */ 235 if (do_warnings) { 236 *retlen = (STRLEN) -1; 237 } 238 else { 239 ret = _ppport_utf8_to_uvchr_buf_callee( 240 s, curlen, retlen, UTF8_ALLOW_ANY); 241 /* Override with the REPLACEMENT character, as that is what the 242 * modern version of this function returns */ 243 ret = UNICODE_REPLACEMENT; 244 245# if { VERSION < 5.16.0 } 246 247 /* Versions earlier than this don't necessarily return the proper 248 * length. It should not extend past the end of string, nor past 249 * what the first byte indicates the length is, nor past the 250 * continuation characters */ 251 if (retlen && *retlen >= 0) { 252 *retlen = _ppport_MIN(*retlen, curlen); 253 *retlen = _ppport_MIN(*retlen, UTF8SKIP(s)); 254 unsigned int i = 1; 255 do { 256 if (s[i] < 0x80 || s[i] > 0xBF) { 257 *retlen = i; 258 break; 259 } 260 } while (++i < *retlen); 261 } 262 263# endif 264 265 } 266 } 267 268 return ret; 269} 270 271# endif 272#endif 273#endif 274 275#if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf) 276#undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses 277 to read past a NUL, making it much less likely to read 278 off the end of the buffer. A NUL indicates the start 279 of the next character anyway. If the input isn't 280 NUL-terminated, the function remains unsafe, as it 281 always has been. */ 282 283__UNDEFINED__ utf8_to_uvchr(s, lp) \ 284 ((*(s) == '\0') \ 285 ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \ 286 : utf8_to_uvchr_buf(s, (s) + my_strnlen((char *) (s), UTF8SKIP(s)), (lp))) 287 288#endif 289 290=xsinit 291 292#define NEED_my_strnlen 293#define NEED_utf8_to_uvchr_buf 294 295=xsubs 296 297SV * 298sv_setuv(uv) 299 UV uv 300 CODE: 301 RETVAL = newSViv(1); 302 sv_setuv(RETVAL, uv); 303 OUTPUT: 304 RETVAL 305 306SV * 307newSVuv(uv) 308 UV uv 309 CODE: 310 RETVAL = newSVuv(uv); 311 OUTPUT: 312 RETVAL 313 314UV 315sv_2uv(sv) 316 SV *sv 317 CODE: 318 RETVAL = sv_2uv(sv); 319 OUTPUT: 320 RETVAL 321 322UV 323SvUVx(sv) 324 SV *sv 325 CODE: 326 sv--; 327 RETVAL = SvUVx(++sv); 328 OUTPUT: 329 RETVAL 330 331void 332XSRETURN_UV() 333 PPCODE: 334 XSRETURN_UV(42); 335 336void 337PUSHu() 338 PREINIT: 339 dTARG; 340 PPCODE: 341 TARG = sv_newmortal(); 342 EXTEND(SP, 1); 343 PUSHu(42); 344 XSRETURN(1); 345 346void 347XPUSHu() 348 PREINIT: 349 dTARG; 350 PPCODE: 351 TARG = sv_newmortal(); 352 XPUSHu(43); 353 XSRETURN(1); 354 355STRLEN 356UTF8_SAFE_SKIP(s, adjustment) 357 unsigned char * s 358 int adjustment 359 CODE: 360 /* Instead of passing in an 'e' ptr, use the real end, adjusted */ 361 RETVAL = UTF8_SAFE_SKIP(s, s + UTF8SKIP(s) + adjustment); 362 OUTPUT: 363 RETVAL 364 365STRLEN 366my_strnlen(s, max) 367 char * s 368 STRLEN max 369 CODE: 370 RETVAL= my_strnlen(s, max); 371 OUTPUT: 372 RETVAL 373 374AV * 375utf8_to_uvchr_buf(s, adjustment) 376 unsigned char *s 377 int adjustment 378 PREINIT: 379 AV *av; 380 STRLEN len; 381 CODE: 382 av = newAV(); 383 av_push(av, newSVuv(utf8_to_uvchr_buf(s, 384 s + UTF8SKIP(s) + adjustment, 385 &len))); 386 if (len == (STRLEN) -1) { 387 av_push(av, newSViv(-1)); 388 } 389 else { 390 av_push(av, newSVuv(len)); 391 } 392 RETVAL = av; 393 OUTPUT: 394 RETVAL 395 396AV * 397utf8_to_uvchr(s) 398 unsigned char *s 399 PREINIT: 400 AV *av; 401 STRLEN len; 402 CODE: 403 av = newAV(); 404 av_push(av, newSVuv(utf8_to_uvchr(s, &len))); 405 if (len == (STRLEN) -1) { 406 av_push(av, newSViv(-1)); 407 } 408 else { 409 av_push(av, newSVuv(len)); 410 } 411 RETVAL = av; 412 OUTPUT: 413 RETVAL 414 415=tests plan => 52 416 417ok(&Devel::PPPort::sv_setuv(42), 42); 418ok(&Devel::PPPort::newSVuv(123), 123); 419ok(&Devel::PPPort::sv_2uv("4711"), 4711); 420ok(&Devel::PPPort::sv_2uv("1735928559"), 1735928559); 421ok(&Devel::PPPort::SvUVx("1735928559"), 1735928559); 422ok(&Devel::PPPort::SvUVx(1735928559), 1735928559); 423ok(&Devel::PPPort::SvUVx(0xdeadbeef), 0xdeadbeef); 424ok(&Devel::PPPort::XSRETURN_UV(), 42); 425ok(&Devel::PPPort::PUSHu(), 42); 426ok(&Devel::PPPort::XPUSHu(), 43); 427ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1); 428ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0); 429ok(&Devel::PPPort::my_strnlen("abc\0def", 7), 3); 430 431my $ret = &Devel::PPPort::utf8_to_uvchr("A"); 432ok($ret->[0], ord("A")); 433ok($ret->[1], 1); 434 435$ret = &Devel::PPPort::utf8_to_uvchr("\0"); 436ok($ret->[0], 0); 437ok($ret->[1], 1); 438 439$ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0); 440ok($ret->[0], ord("A")); 441ok($ret->[1], 1); 442 443$ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0); 444ok($ret->[0], 0); 445ok($ret->[1], 1); 446 447if (ord("A") != 65) { # tests not valid for EBCDIC 448 ok(1, 1) for 1 .. (2 + 4 + (5 * 5)); 449} 450else { 451 $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0); 452 ok($ret->[0], 0x100); 453 ok($ret->[1], 2); 454 455 my @warnings; 456 local $SIG{__WARN__} = sub { push @warnings, @_; }; 457 458 { 459 use warnings 'utf8'; 460 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80"); 461 ok($ret->[0], 0); 462 ok($ret->[1], -1); 463 464 no warnings; 465 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80"); 466 ok($ret->[0], 0xFFFD); 467 ok($ret->[1], 1); 468 } 469 470 my @buf_tests = ( 471 { 472 input => "A", 473 adjustment => -1, 474 warning => qr/empty/, 475 no_warnings_returned_length => 0, 476 }, 477 { 478 input => "\xc4\xc5", 479 adjustment => 0, 480 warning => qr/non-continuation/, 481 no_warnings_returned_length => 1, 482 }, 483 { 484 input => "\xc4\x80", 485 adjustment => -1, 486 warning => qr/short|1 byte, need 2/, 487 no_warnings_returned_length => 1, 488 }, 489 { 490 input => "\xc0\x81", 491 adjustment => 0, 492 warning => qr/overlong|2 bytes, need 1/, 493 no_warnings_returned_length => 2, 494 }, 495 { # Old algorithm supposedly failed to detect this 496 input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf", 497 adjustment => 0, 498 warning => qr/overflow/, 499 no_warnings_returned_length => 13, 500 }, 501 ); 502 503 # An empty input is an assertion failure on debugging builds. It is 504 # deliberately the first test. 505 require Config; import Config; 506 use vars '%Config'; 507 if ($Config{ccflags} =~ /-DDEBUGGING/) { 508 shift @buf_tests; 509 ok(1, 1) for 1..5; 510 } 511 512 for my $test (@buf_tests) { 513 my $input = $test->{'input'}; 514 my $adjustment = $test->{'adjustment'}; 515 my $display = 'utf8_to_uvchr_buf("'; 516 for (my $i = 0; $i < length($input) + $adjustment; $i++) { 517 $display .= sprintf "\\x%02x", ord substr($input, $i, 1); 518 } 519 520 $display .= '")'; 521 my $warning = $test->{'warning'}; 522 523 undef @warnings; 524 use warnings 'utf8'; 525 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment); 526 ok($ret->[0], 0, "returned value $display; warnings enabled"); 527 ok($ret->[1], -1, "returned length $display; warnings enabled"); 528 my $all_warnings = join "; ", @warnings; 529 my $contains = grep { $_ =~ $warning } $all_warnings; 530 ok($contains, 1, $display . "; '$all_warnings' contains '$warning'"); 531 532 undef @warnings; 533 no warnings 'utf8'; 534 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment); 535 ok($ret->[0], 0xFFFD, "returned value $display; warnings disabled"); 536 ok($ret->[1], $test->{'no_warnings_returned_length'}, 537 "returned length $display; warnings disabled"); 538 } 539} 540