1 /* handy.h 2 * 3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1999, 2000, 4 * 2001, 2002, 2004, 2005, 2006, 2007, 2008, 2012 by Larry Wall and others 5 * 6 * You may distribute under the terms of either the GNU General Public 7 * License or the Artistic License, as specified in the README file. 8 * 9 */ 10 11 /* IMPORTANT NOTE: Everything whose name begins with an underscore is for 12 * internal core Perl use only. */ 13 14 #ifndef PERL_HANDY_H_ /* Guard against nested #inclusion */ 15 #define PERL_HANDY_H_ 16 17 #ifndef PERL_CORE 18 # define Null(type) ((type)NULL) 19 20 /* 21 =for apidoc_section $string 22 =for apidoc AmnU||Nullch 23 Null character pointer. (No longer available when C<PERL_CORE> is 24 defined.) 25 26 =for apidoc_section $SV 27 =for apidoc AmnU||Nullsv 28 Null SV pointer. (No longer available when C<PERL_CORE> is defined.) 29 30 =cut 31 32 Below are signatures of functions from config.h which can't easily be gleaned 33 from it, and are very unlikely to change 34 35 =for apidoc_section $signals 36 =for apidoc Am|int|Sigsetjmp|jmp_buf env|int savesigs 37 =for apidoc Am|void|Siglongjmp|jmp_buf env|int val 38 39 =for apidoc_section $filesystem 40 =for apidoc Am|void *|FILE_ptr|FILE * f 41 =for apidoc Am|Size_t|FILE_cnt|FILE * f 42 =for apidoc Am|void *|FILE_base|FILE * f 43 =for apidoc Am|Size_t|FILE_bufsiz|FILE *f 44 45 =for apidoc_section $string 46 =for apidoc Amu|token|CAT2|token x|token y 47 =for apidoc Amu|string|STRINGIFY|token x 48 49 =for apidoc_section $numeric 50 =for apidoc Am|double|Drand01 51 =for apidoc Am|void|seedDrand01|Rand_seed_t x 52 =for apidoc Am|char *|Gconvert|double x|Size_t n|bool t|char * b 53 54 =cut 55 */ 56 57 # define Nullch Null(char*) 58 # define Nullfp Null(PerlIO*) 59 # define Nullsv Null(SV*) 60 #endif 61 62 #ifdef TRUE 63 #undef TRUE 64 #endif 65 #ifdef FALSE 66 #undef FALSE 67 #endif 68 #define TRUE (1) 69 #define FALSE (0) 70 71 /* 72 =for apidoc_section $SV 73 =for apidoc Am|void 
*|MUTABLE_PTR|void * p 74 =for apidoc_item |AV *|MUTABLE_AV|AV * p 75 =for apidoc_item |CV *|MUTABLE_CV|CV * p 76 =for apidoc_item |GV *|MUTABLE_GV|GV * p 77 =for apidoc_item |HV *|MUTABLE_HV|HV * p 78 =for apidoc_item |IO *|MUTABLE_IO|IO * p 79 =for apidoc_item |SV *|MUTABLE_SV|SV * p 80 81 The C<MUTABLE_I<*>>() macros cast pointers to the types shown, in such a way 82 (compiler permitting) that casting away const-ness will give a warning; 83 e.g.: 84 85 const SV *sv = ...; 86 AV *av1 = (AV*)sv; <== BAD: the const has been silently 87 cast away 88 AV *av2 = MUTABLE_AV(sv); <== GOOD: it may warn 89 90 C<MUTABLE_PTR> is the base macro used to derive new casts. The other 91 already-built-in ones return pointers to what their names indicate. 92 93 =cut 94 */ 95 96 #if defined(PERL_USE_GCC_BRACE_GROUPS) 97 # define MUTABLE_PTR(p) ({ void *p_ = (p); p_; }) 98 #else 99 # define MUTABLE_PTR(p) ((void *) (p)) 100 #endif 101 102 #define MUTABLE_AV(p) ((AV *)MUTABLE_PTR(p)) 103 #define MUTABLE_CV(p) ((CV *)MUTABLE_PTR(p)) 104 #define MUTABLE_GV(p) ((GV *)MUTABLE_PTR(p)) 105 #define MUTABLE_HV(p) ((HV *)MUTABLE_PTR(p)) 106 #define MUTABLE_IO(p) ((IO *)MUTABLE_PTR(p)) 107 #define MUTABLE_SV(p) ((SV *)MUTABLE_PTR(p)) 108 109 #if defined(I_STDBOOL) && !defined(PERL_BOOL_AS_CHAR) 110 # include <stdbool.h> 111 # ifndef HAS_BOOL 112 # define HAS_BOOL 1 113 # endif 114 #endif 115 116 /* bool is built-in for g++-2.6.3 and later, which might be used 117 for extensions. <_G_config.h> defines _G_HAVE_BOOL, but we can't 118 be sure _G_config.h will be included before this file. _G_config.h 119 also defines _G_HAVE_BOOL for both gcc and g++, but only g++ 120 actually has bool. Hence, _G_HAVE_BOOL is pretty useless for us. 121 g++ can be identified by __GNUG__. 
122 Andy Dougherty February 2000 123 */ 124 #ifdef __GNUG__ /* GNU g++ has bool built-in */ 125 # ifndef PERL_BOOL_AS_CHAR 126 # ifndef HAS_BOOL 127 # define HAS_BOOL 1 128 # endif 129 # endif 130 #endif 131 132 #ifndef HAS_BOOL 133 # ifdef bool 134 # undef bool 135 # endif 136 # define bool char 137 # define HAS_BOOL 1 138 #endif 139 140 /* 141 =for apidoc_section $casting 142 =for apidoc Am|bool|cBOOL|bool expr 143 144 Cast-to-bool. A simple S<C<(bool) I<expr>>> cast may not do the right thing: 145 if C<bool> is defined as C<char>, for example, then the cast from C<int> is 146 implementation-defined. 147 148 C<(bool)!!(cbool)> in a ternary triggers a bug in xlc on AIX 149 150 =cut 151 */ 152 #define cBOOL(cbool) ((cbool) ? (bool)1 : (bool)0) 153 154 /* Try to figure out __func__ or __FUNCTION__ equivalent, if any. 155 * XXX Should really be a Configure probe, with HAS__FUNCTION__ 156 * and FUNCTION__ as results. 157 * XXX Similarly, a Configure probe for __FILE__ and __LINE__ is needed. */ 158 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || (defined(__SUNPRO_C)) /* C99 or close enough. */ 159 # define FUNCTION__ __func__ 160 #elif (defined(__DECC_VER)) /* Tru64 or VMS, and strict C89 being used, but not modern enough cc (in Tur64, -c99 not known, only -std1). */ 161 # define FUNCTION__ "" 162 #else 163 # define FUNCTION__ __FUNCTION__ /* Common extension. */ 164 #endif 165 166 /* XXX A note on the perl source internal type system. The 167 original intent was that I32 be *exactly* 32 bits. 168 169 Currently, we only guarantee that I32 is *at least* 32 bits. 170 Specifically, if int is 64 bits, then so is I32. (This is the case 171 for the Cray.) This has the advantage of meshing nicely with 172 standard library calls (where we pass an I32 and the library is 173 expecting an int), but the disadvantage that an I32 is not 32 bits. 174 Andy Dougherty August 1996 175 176 There is no guarantee that there is *any* integral type with 177 exactly 32 bits. 
It is perfectly legal for a system to have 178 sizeof(short) == sizeof(int) == sizeof(long) == 8. 179 180 Similarly, there is no guarantee that I16 and U16 have exactly 16 181 bits. 182 183 For dealing with issues that may arise from various 32/64-bit 184 systems, we will ask Configure to check out 185 186 SHORTSIZE == sizeof(short) 187 INTSIZE == sizeof(int) 188 LONGSIZE == sizeof(long) 189 LONGLONGSIZE == sizeof(long long) (if HAS_LONG_LONG) 190 PTRSIZE == sizeof(void *) 191 DOUBLESIZE == sizeof(double) 192 LONG_DOUBLESIZE == sizeof(long double) (if HAS_LONG_DOUBLE). 193 194 */ 195 196 #ifdef I_INTTYPES /* e.g. Linux has int64_t without <inttypes.h> */ 197 # include <inttypes.h> 198 # ifdef INT32_MIN_BROKEN 199 # undef INT32_MIN 200 # define INT32_MIN (-2147483647-1) 201 # endif 202 # ifdef INT64_MIN_BROKEN 203 # undef INT64_MIN 204 # define INT64_MIN (-9223372036854775807LL-1) 205 # endif 206 #endif 207 208 typedef I8TYPE I8; 209 typedef U8TYPE U8; 210 typedef I16TYPE I16; 211 typedef U16TYPE U16; 212 typedef I32TYPE I32; 213 typedef U32TYPE U32; 214 215 #ifdef QUADKIND 216 typedef I64TYPE I64; 217 typedef U64TYPE U64; 218 #endif 219 220 /* I8_MAX and I8_MIN constants are not defined, as I8 is an ambiguous type. 221 Please search CHAR_MAX in perl.h for further details. 
*/ 222 #ifdef UINT8_MAX 223 # define U8_MAX UINT8_MAX 224 #else 225 # define U8_MAX PERL_UCHAR_MAX 226 #endif 227 #ifdef UINT8_MIN 228 # define U8_MIN UINT8_MIN 229 #else 230 # define U8_MIN PERL_UCHAR_MIN 231 #endif 232 233 #ifdef INT16_MAX 234 # define I16_MAX INT16_MAX 235 #else 236 # define I16_MAX PERL_SHORT_MAX 237 #endif 238 #ifdef INT16_MIN 239 # define I16_MIN INT16_MIN 240 #else 241 # define I16_MIN PERL_SHORT_MIN 242 #endif 243 #ifdef UINT16_MAX 244 # define U16_MAX UINT16_MAX 245 #else 246 # define U16_MAX PERL_USHORT_MAX 247 #endif 248 #ifdef UINT16_MIN 249 # define U16_MIN UINT16_MIN 250 #else 251 # define U16_MIN PERL_USHORT_MIN 252 #endif 253 254 #ifdef INT32_MAX 255 # define I32_MAX INT32_MAX 256 #elif LONGSIZE > 4 257 # define I32_MAX PERL_INT_MAX 258 #else 259 # define I32_MAX PERL_LONG_MAX 260 #endif 261 #ifdef INT32_MIN 262 # define I32_MIN INT32_MIN 263 #elif LONGSIZE > 4 264 # define I32_MIN PERL_INT_MIN 265 #else 266 # define I32_MIN PERL_LONG_MIN 267 #endif 268 #ifdef UINT32_MAX 269 # ifndef UINT32_MAX_BROKEN /* e.g. HP-UX with gcc messes this up */ 270 # define U32_MAX UINT_MAX 271 # else 272 # define U32_MAX 4294967295U 273 # endif 274 #elif LONGSIZE > 4 275 # define U32_MAX PERL_UINT_MAX 276 #else 277 # define U32_MAX PERL_ULONG_MAX 278 #endif 279 #ifdef UINT32_MIN 280 # define U32_MIN UINT32_MIN 281 #elif LONGSIZE > 4 282 # define U32_MIN PERL_UINT_MIN 283 #else 284 # define U32_MIN PERL_ULONG_MIN 285 #endif 286 287 /* 288 =for apidoc_section $integer 289 =for apidoc Ay|| PERL_INT_FAST8_T 290 =for apidoc_item PERL_INT_FAST16_T 291 =for apidoc_item PERL_UINT_FAST8_T 292 =for apidoc_item PERL_UINT_FAST16_T 293 294 These are equivalent to the correspondingly-named C99 typedefs on platforms 295 that have those; they evaluate to C<int> and C<unsigned int> on platforms that 296 don't, so that you can portably take advantage of this C99 feature. 
297 298 =cut 299 */ 300 # ifdef I_STDINT 301 typedef int_fast8_t PERL_INT_FAST8_T; 302 typedef uint_fast8_t PERL_UINT_FAST8_T; 303 typedef int_fast16_t PERL_INT_FAST16_T; 304 typedef uint_fast16_t PERL_UINT_FAST16_T; 305 # else 306 typedef int PERL_INT_FAST8_T; 307 typedef unsigned int PERL_UINT_FAST8_T; 308 typedef int PERL_INT_FAST16_T; 309 typedef unsigned int PERL_UINT_FAST16_T; 310 # endif 311 312 /* log(2) (i.e., log base 10 of 2) is pretty close to 0.30103, just in case 313 * anyone is grepping for it. So BIT_DIGITS gives the number of decimal digits 314 * required to represent any possible unsigned number containing N bits. 315 * TYPE_DIGITS gives the number of decimal digits required to represent any 316 * possible unsigned number of type T. */ 317 #define BIT_DIGITS(N) (((N)*146)/485 + 1) /* log10(2) =~ 146/485 */ 318 #define TYPE_DIGITS(T) BIT_DIGITS(sizeof(T) * 8) 319 #define TYPE_CHARS(T) (TYPE_DIGITS(T) + 2) /* sign, NUL */ 320 321 /* Unused by core; should be deprecated */ 322 #define Ctl(ch) ((ch) & 037) 323 324 #if defined(PERL_CORE) || defined(PERL_EXT) 325 # ifndef MIN 326 # define MIN(a,b) ((a) < (b) ? (a) : (b)) 327 # endif 328 # ifndef MAX 329 # define MAX(a,b) ((a) > (b) ? (a) : (b)) 330 # endif 331 #endif 332 333 /* Returns a boolean as to whether the input unsigned number is a power of 2 334 * (2**0, 2**1, etc). In other words if it has just a single bit set. 
335 * If not, subtracting 1 would leave the uppermost bit set, so the & would 336 * yield non-zero */ 337 #if defined(PERL_CORE) || defined(PERL_EXT) 338 # define isPOWER_OF_2(n) ((n) && ((n) & ((n)-1)) == 0) 339 #endif 340 341 /* Returns a mask with the lowest n bits set */ 342 #define nBIT_MASK(n) ((UINTMAX_C(1) << (n)) - 1) 343 344 /* The largest unsigned number that will fit into n bits */ 345 #define nBIT_UMAX(n) nBIT_MASK(n) 346 347 /* 348 =for apidoc_section $directives 349 =for apidoc Am||__ASSERT_|bool expr 350 351 This is a helper macro to avoid preprocessor issues, replaced by nothing 352 unless under DEBUGGING, where it expands to an assert of its argument, 353 followed by a comma (hence the comma operator). If we just used a straight 354 assert(), we would get a comma with nothing before it when not DEBUGGING. 355 356 =cut 357 358 We also use empty definition under Coverity since the __ASSERT_ 359 checks often check for things that Really Cannot Happen, and Coverity 360 detects that and gets all excited. */ 361 362 #if defined(DEBUGGING) && !defined(__COVERITY__) \ 363 && ! defined(PERL_SMALL_MACRO_BUFFER) 364 # define __ASSERT_(statement) assert(statement), 365 #else 366 # define __ASSERT_(statement) 367 #endif 368 369 /* 370 =for apidoc_section $SV 371 372 =for apidoc Ama|SV*|newSVpvs|"literal string" 373 Like C<newSVpvn>, but takes a literal string instead of a 374 string/length pair. 375 376 =for apidoc Ama|SV*|newSVpvs_flags|"literal string"|U32 flags 377 Like C<newSVpvn_flags>, but takes a literal string instead of 378 a string/length pair. 379 380 =for apidoc Ama|SV*|newSVpvs_share|"literal string" 381 Like C<newSVpvn_share>, but takes a literal string instead of 382 a string/length pair and omits the hash parameter. 383 384 =for apidoc Am|void|sv_catpvs_flags|SV* sv|"literal string"|I32 flags 385 Like C<sv_catpvn_flags>, but takes a literal string instead 386 of a string/length pair. 
387 388 =for apidoc Am|void|sv_catpvs_nomg|SV* sv|"literal string" 389 Like C<sv_catpvn_nomg>, but takes a literal string instead of 390 a string/length pair. 391 392 =for apidoc Am|void|sv_catpvs|SV* sv|"literal string" 393 Like C<sv_catpvn>, but takes a literal string instead of a 394 string/length pair. 395 396 =for apidoc Am|void|sv_catpvs_mg|SV* sv|"literal string" 397 Like C<sv_catpvn_mg>, but takes a literal string instead of a 398 string/length pair. 399 400 =for apidoc Am|void|sv_setpvs|SV* sv|"literal string" 401 Like C<sv_setpvn>, but takes a literal string instead of a 402 string/length pair. 403 404 =for apidoc Am|void|sv_setpvs_mg|SV* sv|"literal string" 405 Like C<sv_setpvn_mg>, but takes a literal string instead of a 406 string/length pair. 407 408 =for apidoc Am|SV *|sv_setref_pvs|SV *const rv|const char *const classname|"literal string" 409 Like C<sv_setref_pvn>, but takes a literal string instead of 410 a string/length pair. 411 412 =for apidoc_section $string 413 414 =for apidoc Ama|char*|savepvs|"literal string" 415 Like C<savepvn>, but takes a literal string instead of a 416 string/length pair. 417 418 =for apidoc Ama|char*|savesharedpvs|"literal string" 419 A version of C<savepvs()> which allocates the duplicate string in memory 420 which is shared between threads. 421 422 =for apidoc_section $GV 423 424 =for apidoc Am|HV*|gv_stashpvs|"name"|I32 create 425 Like C<gv_stashpvn>, but takes a literal string instead of a 426 string/length pair. 427 428 =for apidoc_section $HV 429 430 =for apidoc Am|SV**|hv_fetchs|HV* tb|"key"|I32 lval 431 Like C<hv_fetch>, but takes a literal string instead of a 432 string/length pair. 433 434 =for apidoc Am|SV**|hv_stores|HV* tb|"key"|SV* val 435 Like C<hv_store>, but takes a literal string instead of a 436 string/length pair 437 and omits the hash parameter. 
438 439 =for apidoc_section $lexer 440 441 =for apidoc Amx|void|lex_stuff_pvs|"pv"|U32 flags 442 443 Like L</lex_stuff_pvn>, but takes a literal string instead of 444 a string/length pair. 445 446 =cut 447 */ 448 449 /* 450 =for apidoc_section $string 451 452 =for apidoc Amu|pair|STR_WITH_LEN|"literal string" 453 454 Returns two comma separated tokens of the input literal string, and its length. 455 This is convenience macro which helps out in some API calls. 456 Note that it can't be used as an argument to macros or functions that under 457 some configurations might be macros, which means that it requires the full 458 Perl_xxx(aTHX_ ...) form for any API calls where it's used. 459 460 =cut 461 */ 462 463 #define STR_WITH_LEN(s) ("" s ""), (sizeof(s)-1) 464 465 /* STR_WITH_LEN() shortcuts */ 466 #define newSVpvs(str) Perl_newSVpvn(aTHX_ STR_WITH_LEN(str)) 467 #define newSVpvs_flags(str,flags) \ 468 Perl_newSVpvn_flags(aTHX_ STR_WITH_LEN(str), flags) 469 #define newSVpvs_share(str) Perl_newSVpvn_share(aTHX_ STR_WITH_LEN(str), 0) 470 #define sv_catpvs_flags(sv, str, flags) \ 471 Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), flags) 472 #define sv_catpvs_nomg(sv, str) \ 473 Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), 0) 474 #define sv_catpvs(sv, str) \ 475 Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), SV_GMAGIC) 476 #define sv_catpvs_mg(sv, str) \ 477 Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), SV_GMAGIC|SV_SMAGIC) 478 #define sv_setpvs(sv, str) Perl_sv_setpvn(aTHX_ sv, STR_WITH_LEN(str)) 479 #define sv_setpvs_mg(sv, str) Perl_sv_setpvn_mg(aTHX_ sv, STR_WITH_LEN(str)) 480 #define sv_setref_pvs(rv, classname, str) \ 481 Perl_sv_setref_pvn(aTHX_ rv, classname, STR_WITH_LEN(str)) 482 #define savepvs(str) Perl_savepvn(aTHX_ STR_WITH_LEN(str)) 483 #define savesharedpvs(str) Perl_savesharedpvn(aTHX_ STR_WITH_LEN(str)) 484 #define gv_stashpvs(str, create) \ 485 Perl_gv_stashpvn(aTHX_ STR_WITH_LEN(str), create) 486 487 #define gv_fetchpvs(namebeg, flags, 
sv_type) \ 488 Perl_gv_fetchpvn_flags(aTHX_ STR_WITH_LEN(namebeg), flags, sv_type) 489 #define gv_fetchpvn gv_fetchpvn_flags 490 #define sv_catxmlpvs(dsv, str, utf8) \ 491 Perl_sv_catxmlpvn(aTHX_ dsv, STR_WITH_LEN(str), utf8) 492 493 494 #define lex_stuff_pvs(pv,flags) Perl_lex_stuff_pvn(aTHX_ STR_WITH_LEN(pv), flags) 495 496 #define get_cvs(str, flags) \ 497 Perl_get_cvn_flags(aTHX_ STR_WITH_LEN(str), (flags)) 498 499 /* internal helpers */ 500 /* Transitional */ 501 #ifndef PERL_VERSION_MAJOR 502 # define PERL_VERSION_MAJOR PERL_REVISION 503 #else 504 # undef PERL_REVISION /* We don't want code to be using these */ 505 #endif 506 #ifndef PERL_VERSION_MINOR 507 # define PERL_VERSION_MINOR PERL_VERSION 508 #else 509 # undef PERL_VERSION 510 #endif 511 #ifndef PERL_VERSION_PATCH 512 # define PERL_VERSION_PATCH PERL_SUBVERSION 513 #else 514 # undef PERL_SUBVERSION 515 #endif 516 517 #define PERL_JNP_TO_DECIMAL_(maJor,miNor,Patch) \ 518 /* '10*' leaves room for things like alpha, beta, releases */ \ 519 (10 * ((maJor) * 1000000) + ((miNor) * 1000) + (Patch)) 520 #define PERL_DECIMAL_VERSION_ \ 521 PERL_JNP_TO_DECIMAL_(PERL_VERSION_MAJOR, PERL_VERSION_MINOR, \ 522 PERL_VERSION_PATCH) 523 524 /* 525 =for apidoc_section $versioning 526 =for apidoc AmR|bool|PERL_VERSION_EQ|const U8 major|const U8 minor|const U8 patch 527 =for apidoc_item PERL_VERSION_NE 528 =for apidoc_item PERL_VERSION_LT 529 =for apidoc_item PERL_VERSION_LE 530 =for apidoc_item PERL_VERSION_GT 531 =for apidoc_item PERL_VERSION_GE 532 533 Returns whether or not the perl currently being compiled has the specified 534 relationship to the perl given by the parameters. For example, 535 536 #if PERL_VERSION_GT(5,24,2) 537 code that will only be compiled on perls after v5.24.2 538 #else 539 fallback code 540 #endif 541 542 Note that this is usable in making compile-time decisions 543 544 You may use the special value '*' for the final number to mean ALL possible 545 values for it. 
Thus, 546 547 #if PERL_VERSION_EQ(5,31,'*') 548 549 means all perls in the 5.31 series. And 550 551 #if PERL_VERSION_NE(5,24,'*') 552 553 means all perls EXCEPT 5.24 ones. And 554 555 #if PERL_VERSION_LE(5,9,'*') 556 557 is effectively 558 559 #if PERL_VERSION_LT(5,10,0) 560 561 This means you don't have to think so much when converting from the existing 562 deprecated C<PERL_VERSION> to using this macro: 563 564 #if PERL_VERSION <= 9 565 566 becomes 567 568 #if PERL_VERSION_LE(5,9,'*') 569 570 =cut 571 */ 572 573 /* N.B. These don't work if the patch version is 42 or 92, as those are what 574 * '*' is in ASCII and EBCDIC respectively */ 575 # define PERL_VERSION_EQ(j,n,p) \ 576 (((p) == '*') \ 577 ? ( (j) == PERL_VERSION_MAJOR \ 578 && (n) == PERL_VERSION_MINOR) \ 579 : (PERL_DECIMAL_VERSION_ == PERL_JNP_TO_DECIMAL_(j,n,p))) 580 # define PERL_VERSION_NE(j,n,p) (! PERL_VERSION_EQ(j,n,p)) 581 582 # define PERL_VERSION_LT(j,n,p) /* < '*' effectively means < 0 */ \ 583 (PERL_DECIMAL_VERSION_ < PERL_JNP_TO_DECIMAL_( (j), \ 584 (n), \ 585 (((p) == '*') ? 0 : p))) 586 # define PERL_VERSION_GE(j,n,p) (! PERL_VERSION_LT(j,n,p)) 587 588 # define PERL_VERSION_LE(j,n,p) /* <= '*' effectively means < n+1 */ \ 589 (PERL_DECIMAL_VERSION_ < PERL_JNP_TO_DECIMAL_( (j), \ 590 (((p) == '*') ? ((n)+1) : (n)), \ 591 (((p) == '*') ? 0 : p))) 592 # define PERL_VERSION_GT(j,n,p) (! PERL_VERSION_LE(j,n,p)) 593 594 /* 595 =for apidoc_section $string 596 597 =for apidoc Am|bool|strNE|char* s1|char* s2 598 Test two C<NUL>-terminated strings to see if they are different. Returns true 599 or false. 600 601 =for apidoc Am|bool|strEQ|char* s1|char* s2 602 Test two C<NUL>-terminated strings to see if they are equal. Returns true or 603 false. 604 605 =for apidoc Am|bool|strLT|char* s1|char* s2 606 Test two C<NUL>-terminated strings to see if the first, C<s1>, is less than the 607 second, C<s2>. Returns true or false. 

=for apidoc Am|bool|strLE|char* s1|char* s2
Test two C<NUL>-terminated strings to see if the first, C<s1>, is less than or
equal to the second, C<s2>. Returns true or false.

=for apidoc Am|bool|strGT|char* s1|char* s2
Test two C<NUL>-terminated strings to see if the first, C<s1>, is greater than
the second, C<s2>. Returns true or false.

=for apidoc Am|bool|strGE|char* s1|char* s2
Test two C<NUL>-terminated strings to see if the first, C<s1>, is greater than
or equal to the second, C<s2>. Returns true or false.

=for apidoc Am|bool|strnNE|char* s1|char* s2|STRLEN len
Test two C<NUL>-terminated strings to see if they are different. The C<len>
parameter indicates the number of bytes to compare. Returns true or false. (A
wrapper for C<strncmp>).

=for apidoc Am|bool|strnEQ|char* s1|char* s2|STRLEN len
Test two C<NUL>-terminated strings to see if they are equal. The C<len>
parameter indicates the number of bytes to compare. Returns true or false. (A
wrapper for C<strncmp>).

=for apidoc Am|bool|memEQ|char* s1|char* s2|STRLEN len
Test two buffers (which may contain embedded C<NUL> characters) to see if they
are equal. The C<len> parameter indicates the number of bytes to compare.
Returns true or false. It is undefined behavior if either of the buffers
doesn't contain at least C<len> bytes.

=for apidoc Am|bool|memEQs|char* s1|STRLEN l1|"s2"
Like L</memEQ>, but the second string is a literal enclosed in double quotes,
C<l1> gives the number of bytes in C<s1>.
Returns true or false.

=for apidoc Am|bool|memNE|char* s1|char* s2|STRLEN len
Test two buffers (which may contain embedded C<NUL> characters) to see if they
are not equal. The C<len> parameter indicates the number of bytes to compare.
Returns true or false. It is undefined behavior if either of the buffers
doesn't contain at least C<len> bytes.

=for apidoc Am|bool|memNEs|char* s1|STRLEN l1|"s2"
Like L</memNE>, but the second string is a literal enclosed in double quotes,
C<l1> gives the number of bytes in C<s1>.
Returns true or false.

=for apidoc Am|const char *|memCHRs|"list"|char c
Returns the position of the first occurrence of the byte C<c> in the literal
string C<"list">, or NULL if C<c> doesn't appear in C<"list">. All bytes are
treated as unsigned char. Thus this macro can be used to determine if C<c> is
in a set of particular characters. Unlike L<strchr(3)>, it works even if C<c>
is C<NUL> (and the set doesn't include C<NUL>).

=cut

New macros should use the following conventions for their names (which are
based on the underlying C library functions):

  (mem | str n? ) (EQ | NE | LT | LE | GT | GE | (( BEGIN | END ) P? )) l? s?

Each has two main parameters, string-like operands that are compared
against each other, as specified by the macro name. Some macros may
additionally have one or potentially even two length parameters. If a length
parameter applies to both string parameters, it will be positioned third;
otherwise any length parameter immediately follows the string parameter it
applies to.

If the prefix to the name is 'str', the string parameter is a pointer to a C
language string. Such a string does not contain embedded NUL bytes; its
length may be unknown, but can be calculated by C<strlen()>, since it is
terminated by a NUL, which isn't included in its length.

The optional 'n' following 'str' means that there is a third parameter,
giving the maximum number of bytes to look at in each string. Even if both
strings are longer than the length parameter, those extra bytes will be
unexamined.

The 's' suffix means that the 2nd byte string parameter is a literal C
double-quoted string.
Its length will automatically be calculated by the 686 macro, so no length parameter will ever be needed for it. 687 688 If the prefix is 'mem', the string parameters don't have to be C strings; 689 they may contain embedded NUL bytes, do not necessarily have a terminating 690 NUL, and their lengths can be known only through other means, which in 691 practice are additional parameter(s) passed to the function. All 'mem' 692 functions have at least one length parameter. Barring any 'l' or 's' suffix, 693 there is a single length parameter, in position 3, which applies to both 694 string parameters. The 's' suffix means, as described above, that the 2nd 695 string is a literal double-quoted C string (hence its length is calculated by 696 the macro, and the length parameter to the function applies just to the first 697 string parameter, and hence is positioned just after it). An 'l' suffix 698 means that the 2nd string parameter has its own length parameter, and the 699 signature will look like memFOOl(s1, l1, s2, l2). 700 701 BEGIN (and END) are for testing if the 2nd string is an initial (or final) 702 substring of the 1st string. 'P' if present indicates that the substring 703 must be a "proper" one in tha mathematical sense that the first one must be 704 strictly larger than the 2nd. 705 706 */ 707 708 709 #define strNE(s1,s2) (strcmp(s1,s2) != 0) 710 #define strEQ(s1,s2) (strcmp(s1,s2) == 0) 711 #define strLT(s1,s2) (strcmp(s1,s2) < 0) 712 #define strLE(s1,s2) (strcmp(s1,s2) <= 0) 713 #define strGT(s1,s2) (strcmp(s1,s2) > 0) 714 #define strGE(s1,s2) (strcmp(s1,s2) >= 0) 715 716 #define strnNE(s1,s2,l) (strncmp(s1,s2,l) != 0) 717 #define strnEQ(s1,s2,l) (strncmp(s1,s2,l) == 0) 718 719 #define memEQ(s1,s2,l) (memcmp(((const void *) (s1)), ((const void *) (s2)), l) == 0) 720 #define memNE(s1,s2,l) (! 
memEQ(s1,s2,l)) 721 722 /* memEQ and memNE where second comparand is a string constant */ 723 #define memEQs(s1, l, s2) \ 724 (((sizeof(s2)-1) == (l)) && memEQ((s1), ("" s2 ""), (sizeof(s2)-1))) 725 #define memNEs(s1, l, s2) (! memEQs(s1, l, s2)) 726 727 /* Keep these private until we decide it was a good idea */ 728 #if defined(PERL_CORE) || defined(PERL_EXT) || defined(PERL_EXT_POSIX) 729 730 #define strBEGINs(s1,s2) (strncmp(s1,"" s2 "", sizeof(s2)-1) == 0) 731 732 #define memBEGINs(s1, l, s2) \ 733 ( (Ptrdiff_t) (l) >= (Ptrdiff_t) sizeof(s2) - 1 \ 734 && memEQ(s1, "" s2 "", sizeof(s2)-1)) 735 #define memBEGINPs(s1, l, s2) \ 736 ( (Ptrdiff_t) (l) > (Ptrdiff_t) sizeof(s2) - 1 \ 737 && memEQ(s1, "" s2 "", sizeof(s2)-1)) 738 #define memENDs(s1, l, s2) \ 739 ( (Ptrdiff_t) (l) >= (Ptrdiff_t) sizeof(s2) - 1 \ 740 && memEQ(s1 + (l) - (sizeof(s2) - 1), "" s2 "", sizeof(s2)-1)) 741 #define memENDPs(s1, l, s2) \ 742 ( (Ptrdiff_t) (l) > (Ptrdiff_t) sizeof(s2) \ 743 && memEQ(s1 + (l) - (sizeof(s2) - 1), "" s2 "", sizeof(s2)-1)) 744 #endif /* End of making macros private */ 745 746 #define memLT(s1,s2,l) (memcmp(s1,s2,l) < 0) 747 #define memLE(s1,s2,l) (memcmp(s1,s2,l) <= 0) 748 #define memGT(s1,s2,l) (memcmp(s1,s2,l) > 0) 749 #define memGE(s1,s2,l) (memcmp(s1,s2,l) >= 0) 750 751 #define memCHRs(s1,c) ((const char *) memchr("" s1 "" , c, sizeof(s1)-1)) 752 753 /* 754 * Character classes. 755 * 756 * Unfortunately, the introduction of locales means that we 757 * can't trust isupper(), etc. to tell the truth. And when 758 * it comes to /\w+/ with tainting enabled, we *must* be able 759 * to trust our character classes. 760 * 761 * Therefore, the default tests in the text of Perl will be 762 * independent of locale. Any code that wants to depend on 763 * the current locale will use the tests that begin with "lc". 
764 */ 765 766 #ifdef USE_LOCALE 767 # ifndef CTYPE256 768 # define CTYPE256 769 # endif 770 #endif 771 772 /* 773 774 =head1 Character classification 775 This section is about functions (really macros) that classify characters 776 into types, such as punctuation versus alphabetic, etc. Most of these are 777 analogous to regular expression character classes. (See 778 L<perlrecharclass/POSIX Character Classes>.) There are several variants for 779 each class. (Not all macros have all variants; each item below lists the 780 ones valid for it.) None are affected by C<use bytes>, and only the ones 781 with C<LC> in the name are affected by the current locale. 782 783 The base function, e.g., C<isALPHA()>, takes any signed or unsigned value, 784 treating it as a code point, and returns a boolean as to whether or not the 785 character represented by it is (or on non-ASCII platforms, corresponds to) an 786 ASCII character in the named class based on platform, Unicode, and Perl rules. 787 If the input is a number that doesn't fit in an octet, FALSE is returned. 788 789 Variant C<isI<FOO>_A> (e.g., C<isALPHA_A()>) is identical to the base function 790 with no suffix C<"_A">. This variant is used to emphasize by its name that 791 only ASCII-range characters can return TRUE. 792 793 Variant C<isI<FOO>_L1> imposes the Latin-1 (or EBCDIC equivalent) character set 794 onto the platform. That is, the code points that are ASCII are unaffected, 795 since ASCII is a subset of Latin-1. But the non-ASCII code points are treated 796 as if they are Latin-1 characters. For example, C<isWORDCHAR_L1()> will return 797 true when called with the code point 0xDF, which is a word character in both 798 ASCII and EBCDIC (though it represents different characters in each). 799 If the input is a number that doesn't fit in an octet, FALSE is returned. 800 (Perl's documentation uses a colloquial definition of Latin-1, to include all 801 code points below 256.) 
802 803 Variant C<isI<FOO>_uvchr> is exactly like the C<isI<FOO>_L1> variant, for 804 inputs below 256, but if the code point is larger than 255, Unicode rules are 805 used to determine if it is in the character class. For example, 806 C<isWORDCHAR_uvchr(0x100)> returns TRUE, since 0x100 is LATIN CAPITAL LETTER A 807 WITH MACRON in Unicode, and is a word character. 808 809 Variants C<isI<FOO>_utf8> and C<isI<FOO>_utf8_safe> are like C<isI<FOO>_uvchr>, 810 but are used for UTF-8 encoded strings. The two forms are different names for 811 the same thing. Each call to one of these classifies the first character of 812 the string starting at C<p>. The second parameter, C<e>, points to anywhere in 813 the string beyond the first character, up to one byte past the end of the 814 entire string. Although both variants are identical, the suffix C<_safe> in 815 one name emphasizes that it will not attempt to read beyond S<C<e - 1>>, 816 provided that the constraint S<C<s E<lt> e>> is true (this is asserted for in 817 C<-DDEBUGGING> builds). If the UTF-8 for the input character is malformed in 818 some way, the program may croak, or the function may return FALSE, at the 819 discretion of the implementation, and subject to change in future releases. 820 821 Variant C<isI<FOO>_LC> is like the C<isI<FOO>_A> and C<isI<FOO>_L1> variants, 822 but the result is based on the current locale, which is what C<LC> in the name 823 stands for. If Perl can determine that the current locale is a UTF-8 locale, 824 it uses the published Unicode rules; otherwise, it uses the C library function 825 that gives the named classification. For example, C<isDIGIT_LC()> when not in 826 a UTF-8 locale returns the result of calling C<isdigit()>. FALSE is always 827 returned if the input won't fit into an octet. On some platforms where the C 828 library function is known to be defective, Perl changes its result to follow 829 the POSIX standard's rules. 
830 831 Variant C<isI<FOO>_LC_uvchr> acts exactly like C<isI<FOO>_LC> for inputs less 832 than 256, but for larger ones it returns the Unicode classification of the code 833 point. 834 835 Variants C<isI<FOO>_LC_utf8> and C<isI<FOO>_LC_utf8_safe> are like 836 C<isI<FOO>_LC_uvchr>, but are used for UTF-8 encoded strings. The two forms 837 are different names for the same thing. Each call to one of these classifies 838 the first character of the string starting at C<p>. The second parameter, 839 C<e>, points to anywhere in the string beyond the first character, up to one 840 byte past the end of the entire string. Although both variants are identical, 841 the suffix C<_safe> in one name emphasizes that it will not attempt to read 842 beyond S<C<e - 1>>, provided that the constraint S<C<s E<lt> e>> is true (this 843 is asserted for in C<-DDEBUGGING> builds). If the UTF-8 for the input 844 character is malformed in some way, the program may croak, or the function may 845 return FALSE, at the discretion of the implementation, and subject to change in 846 future releases. 847 848 =for apidoc Am|bool|isALPHA|UV ch 849 =for apidoc_item ||isALPHA_A|UV ch 850 =for apidoc_item ||isALPHA_L1|UV ch 851 =for apidoc_item ||isALPHA_uvchr|UV ch 852 =for apidoc_item ||isALPHA_utf8_safe|U8 * s|U8 * end 853 =for apidoc_item ||isALPHA_utf8|U8 * s|U8 * end 854 =for apidoc_item ||isALPHA_LC|UV ch 855 =for apidoc_item ||isALPHA_LC_uvchr|UV ch 856 =for apidoc_item ||isALPHA_LC_utf8_safe|U8 * s| U8 *end 857 Returns a boolean indicating whether the specified input is one of C<[A-Za-z]>, 858 analogous to C<m/[[:alpha:]]/>. 859 See the L<top of this section|/Character classification> for an explanation of 860 the variants. 
861 862 =cut 863 864 Here and below, we add the prototypes of these macros for downstream programs 865 that would be interested in them, such as Devel::PPPort. 866 867 =for apidoc Am|bool|isALPHANUMERIC|UV ch 868 =for apidoc_item ||isALPHANUMERIC_A|UV ch 869 =for apidoc_item ||isALPHANUMERIC_L1|UV ch 870 =for apidoc_item ||isALPHANUMERIC_uvchr|UV ch 871 =for apidoc_item ||isALPHANUMERIC_utf8_safe|U8 * s|U8 * end 872 =for apidoc_item ||isALPHANUMERIC_utf8|U8 * s|U8 * end 873 =for apidoc_item ||isALPHANUMERIC_LC|UV ch 874 =for apidoc_item ||isALPHANUMERIC_LC_uvchr|UV ch 875 =for apidoc_item ||isALPHANUMERIC_LC_utf8_safe|U8 * s| U8 *end 876 =for apidoc_item ||isALNUMC|UV ch 877 =for apidoc_item ||isALNUMC_A|UV ch 878 =for apidoc_item ||isALNUMC_L1|UV ch 879 =for apidoc_item ||isALNUMC_LC|UV ch 880 =for apidoc_item ||isALNUMC_LC_uvchr|UV ch 881 Returns a boolean indicating whether the specified character is one of 882 C<[A-Za-z0-9]>, analogous to C<m/[[:alnum:]]/>. 883 See the L<top of this section|/Character classification> for an explanation of 884 the variants. 885 886 A (discouraged from use) synonym is C<isALNUMC> (where the C<C> suffix means 887 this corresponds to the C language alphanumeric definition). Also 888 there are the variants 889 C<isALNUMC_A>, C<isALNUMC_L1>, 890 C<isALNUMC_LC>, and C<isALNUMC_LC_uvchr>. 891 892 =for apidoc Am|bool|isASCII|UV ch 893 =for apidoc_item ||isASCII_A|UV ch 894 =for apidoc_item ||isASCII_L1|UV ch 895 =for apidoc_item ||isASCII_uvchr|UV ch 896 =for apidoc_item ||isASCII_utf8_safe|U8 * s|U8 * end 897 =for apidoc_item ||isASCII_utf8|U8 * s|U8 * end 898 =for apidoc_item ||isASCII_LC|UV ch 899 =for apidoc_item ||isASCII_LC_uvchr|UV ch 900 =for apidoc_item ||isASCII_LC_utf8_safe|U8 * s| U8 *end 901 Returns a boolean indicating whether the specified character is one of the 128 902 characters in the ASCII character set, analogous to C<m/[[:ascii:]]/>.
903 On non-ASCII platforms, it returns TRUE iff this 904 character corresponds to an ASCII character. Variants C<isASCII_A()> and 905 C<isASCII_L1()> are identical to C<isASCII()>. 906 See the L<top of this section|/Character classification> for an explanation of 907 the variants. 908 Note, however, that some platforms do not have the C library routine 909 C<isascii()>. In these cases, the variants whose names contain C<LC> are the 910 same as the corresponding ones without. 911 912 Also note, that because all ASCII characters are UTF-8 invariant (meaning they 913 have the exact same representation (always a single byte) whether encoded in 914 UTF-8 or not), C<isASCII> will give the correct results when called with any 915 byte in any string encoded or not in UTF-8. And similarly C<isASCII_utf8> and 916 C<isASCII_utf8_safe> will work properly on any string encoded or not in UTF-8. 917 918 =for apidoc Am|bool|isBLANK|UV ch 919 =for apidoc_item ||isBLANK_A|UV ch 920 =for apidoc_item ||isBLANK_L1|UV ch 921 =for apidoc_item ||isBLANK_uvchr|UV ch 922 =for apidoc_item ||isBLANK_utf8_safe|U8 * s|U8 * end 923 =for apidoc_item ||isBLANK_utf8|U8 * s|U8 * end 924 =for apidoc_item ||isBLANK_LC|UV ch 925 =for apidoc_item ||isBLANK_LC_uvchr|UV ch 926 =for apidoc_item ||isBLANK_LC_utf8_safe|U8 * s| U8 *end 927 Returns a boolean indicating whether the specified character is a 928 character considered to be a blank, analogous to C<m/[[:blank:]]/>. 929 See the L<top of this section|/Character classification> for an explanation of 930 the variants. 931 Note, 932 however, that some platforms do not have the C library routine 933 C<isblank()>. In these cases, the variants whose names contain C<LC> are 934 the same as the corresponding ones without. 
935 936 =for apidoc Am|bool|isCNTRL|UV ch 937 =for apidoc_item ||isCNTRL_A|UV ch 938 =for apidoc_item ||isCNTRL_L1|UV ch 939 =for apidoc_item ||isCNTRL_uvchr|UV ch 940 =for apidoc_item ||isCNTRL_utf8_safe|U8 * s|U8 * end 941 =for apidoc_item ||isCNTRL_utf8|U8 * s|U8 * end 942 =for apidoc_item ||isCNTRL_LC|UV ch 943 =for apidoc_item ||isCNTRL_LC_uvchr|UV ch 944 =for apidoc_item ||isCNTRL_LC_utf8_safe|U8 * s| U8 *end 945 946 Returns a boolean indicating whether the specified character is a 947 control character, analogous to C<m/[[:cntrl:]]/>. 948 See the L<top of this section|/Character classification> for an explanation of 949 the variants. 950 On EBCDIC platforms, you almost always want to use the C<isCNTRL_L1> variant. 951 952 =for apidoc Am|bool|isDIGIT|UV ch 953 =for apidoc_item ||isDIGIT_A|UV ch 954 =for apidoc_item ||isDIGIT_L1|UV ch 955 =for apidoc_item ||isDIGIT_uvchr|UV ch 956 =for apidoc_item ||isDIGIT_utf8_safe|U8 * s|U8 * end 957 =for apidoc_item ||isDIGIT_utf8|U8 * s|U8 * end 958 =for apidoc_item ||isDIGIT_LC|UV ch 959 =for apidoc_item ||isDIGIT_LC_uvchr|UV ch 960 =for apidoc_item ||isDIGIT_LC_utf8_safe|U8 * s| U8 *end 961 962 Returns a boolean indicating whether the specified character is a 963 digit, analogous to C<m/[[:digit:]]/>. 964 Variants C<isDIGIT_A> and C<isDIGIT_L1> are identical to C<isDIGIT>. 965 See the L<top of this section|/Character classification> for an explanation of 966 the variants. 967 968 =for apidoc Am|bool|isGRAPH|UV ch 969 =for apidoc_item ||isGRAPH_A|UV ch 970 =for apidoc_item ||isGRAPH_L1|UV ch 971 =for apidoc_item ||isGRAPH_uvchr|UV ch 972 =for apidoc_item ||isGRAPH_utf8_safe|U8 * s|U8 * end 973 =for apidoc_item ||isGRAPH_utf8|U8 * s|U8 * end 974 =for apidoc_item ||isGRAPH_LC|UV ch 975 =for apidoc_item ||isGRAPH_LC_uvchr|UV ch 976 =for apidoc_item ||isGRAPH_LC_utf8_safe|U8 * s| U8 *end 977 Returns a boolean indicating whether the specified character is a 978 graphic character, analogous to C<m/[[:graph:]]/>. 
979 See the L<top of this section|/Character classification> for an explanation of 980 the variants. 981 982 =for apidoc Am|bool|isLOWER|UV ch 983 =for apidoc_item ||isLOWER_A|UV ch 984 =for apidoc_item ||isLOWER_L1|UV ch 985 =for apidoc_item ||isLOWER_uvchr|UV ch 986 =for apidoc_item ||isLOWER_utf8_safe|U8 * s|U8 * end 987 =for apidoc_item ||isLOWER_utf8|U8 * s|U8 * end 988 =for apidoc_item ||isLOWER_LC|UV ch 989 =for apidoc_item ||isLOWER_LC_uvchr|UV ch 990 =for apidoc_item ||isLOWER_LC_utf8_safe|U8 * s| U8 *end 991 Returns a boolean indicating whether the specified character is a 992 lowercase character, analogous to C<m/[[:lower:]]/>. 993 See the L<top of this section|/Character classification> for an explanation of 994 the variants. 995 996 =for apidoc Am|bool|isOCTAL|UV ch 997 =for apidoc_item ||isOCTAL_A|UV ch 998 =for apidoc_item ||isOCTAL_L1|UV ch 999 Returns a boolean indicating whether the specified character is an 1000 octal digit, [0-7]. 1001 The only two variants are C<isOCTAL_A> and C<isOCTAL_L1>; each is identical to 1002 C<isOCTAL>. 1003 1004 =for apidoc Am|bool|isPUNCT|UV ch 1005 =for apidoc_item ||isPUNCT_A|UV ch 1006 =for apidoc_item ||isPUNCT_L1|UV ch 1007 =for apidoc_item ||isPUNCT_uvchr|UV ch 1008 =for apidoc_item ||isPUNCT_utf8_safe|U8 * s|U8 * end 1009 =for apidoc_item ||isPUNCT_utf8|U8 * s|U8 * end 1010 =for apidoc_item ||isPUNCT_LC|UV ch 1011 =for apidoc_item ||isPUNCT_LC_uvchr|UV ch 1012 =for apidoc_item ||isPUNCT_LC_utf8_safe|U8 * s| U8 *end 1013 Returns a boolean indicating whether the specified character is a 1014 punctuation character, analogous to C<m/[[:punct:]]/>. 1015 Note that the definition of what is punctuation isn't as 1016 straightforward as one might desire. See L<perlrecharclass/POSIX Character 1017 Classes> for details. 1018 See the L<top of this section|/Character classification> for an explanation of 1019 the variants.
1020 1021 =for apidoc Am|bool|isSPACE|UV ch 1022 =for apidoc_item ||isSPACE_A|UV ch 1023 =for apidoc_item ||isSPACE_L1|UV ch 1024 =for apidoc_item ||isSPACE_uvchr|UV ch 1025 =for apidoc_item ||isSPACE_utf8_safe|U8 * s|U8 * end 1026 =for apidoc_item ||isSPACE_utf8|U8 * s|U8 * end 1027 =for apidoc_item ||isSPACE_LC|UV ch 1028 =for apidoc_item ||isSPACE_LC_uvchr|UV ch 1029 =for apidoc_item ||isSPACE_LC_utf8_safe|U8 * s| U8 *end 1030 Returns a boolean indicating whether the specified character is a 1031 whitespace character. This is analogous 1032 to what C<m/\s/> matches in a regular expression. Starting in Perl 5.18 1033 this also matches what C<m/[[:space:]]/> does. Prior to 5.18, only the 1034 locale forms of this macro (the ones with C<LC> in their names) matched 1035 precisely what C<m/[[:space:]]/> does. In those releases, the only difference, 1036 in the non-locale variants, was that C<isSPACE()> did not match a vertical tab. 1037 (See L</isPSXSPC> for a macro that matches a vertical tab in all releases.) 1038 See the L<top of this section|/Character classification> for an explanation of 1039 the variants. 1040 1041 =for apidoc Am|bool|isPSXSPC|UV ch 1042 =for apidoc_item ||isPSXSPC_A|UV ch 1043 =for apidoc_item ||isPSXSPC_L1|UV ch 1044 =for apidoc_item ||isPSXSPC_uvchr|UV ch 1045 =for apidoc_item ||isPSXSPC_utf8_safe|U8 * s|U8 * end 1046 =for apidoc_item ||isPSXSPC_utf8|U8 * s|U8 * end 1047 =for apidoc_item ||isPSXSPC_LC|UV ch 1048 =for apidoc_item ||isPSXSPC_LC_uvchr|UV ch 1049 =for apidoc_item ||isPSXSPC_LC_utf8_safe|U8 * s| U8 *end 1050 (short for Posix Space) 1051 Starting in 5.18, this is identical in all its forms to the 1052 corresponding C<isSPACE()> macros. 1053 The locale forms of this macro are identical to their corresponding 1054 C<isSPACE()> forms in all Perl releases. 
In releases prior to 5.18, the 1055 non-locale forms differ from their C<isSPACE()> forms only in that the 1056 C<isSPACE()> forms don't match a Vertical Tab, and the C<isPSXSPC()> forms do. 1057 Otherwise they are identical. Thus this macro is analogous to what 1058 C<m/[[:space:]]/> matches in a regular expression. 1059 See the L<top of this section|/Character classification> for an explanation of 1060 the variants. 1061 1062 =for apidoc Am|bool|isUPPER|UV ch 1063 =for apidoc_item ||isUPPER_A|UV ch 1064 =for apidoc_item ||isUPPER_L1|UV ch 1065 =for apidoc_item ||isUPPER_uvchr|UV ch 1066 =for apidoc_item ||isUPPER_utf8_safe|U8 * s|U8 * end 1067 =for apidoc_item ||isUPPER_utf8|U8 * s|U8 * end 1068 =for apidoc_item ||isUPPER_LC|UV ch 1069 =for apidoc_item ||isUPPER_LC_uvchr|UV ch 1070 =for apidoc_item ||isUPPER_LC_utf8_safe|U8 * s| U8 *end 1071 Returns a boolean indicating whether the specified character is an 1072 uppercase character, analogous to C<m/[[:upper:]]/>. 1073 See the L<top of this section|/Character classification> for an explanation of 1074 the variants. 1075 1076 =for apidoc Am|bool|isPRINT|UV ch 1077 =for apidoc_item ||isPRINT_A|UV ch 1078 =for apidoc_item ||isPRINT_L1|UV ch 1079 =for apidoc_item ||isPRINT_uvchr|UV ch 1080 =for apidoc_item ||isPRINT_utf8_safe|U8 * s|U8 * end 1081 =for apidoc_item ||isPRINT_utf8|U8 * s|U8 * end 1082 =for apidoc_item ||isPRINT_LC|UV ch 1083 =for apidoc_item ||isPRINT_LC_uvchr|UV ch 1084 =for apidoc_item ||isPRINT_LC_utf8_safe|U8 * s| U8 *end 1085 Returns a boolean indicating whether the specified character is a 1086 printable character, analogous to C<m/[[:print:]]/>. 1087 See the L<top of this section|/Character classification> for an explanation of 1088 the variants. 
1089 1090 =for apidoc Am|bool|isWORDCHAR|UV ch 1091 =for apidoc_item ||isWORDCHAR_A|UV ch 1092 =for apidoc_item ||isWORDCHAR_L1|UV ch 1093 =for apidoc_item ||isWORDCHAR_uvchr|UV ch 1094 =for apidoc_item ||isWORDCHAR_utf8_safe|U8 * s|U8 * end 1095 =for apidoc_item ||isWORDCHAR_utf8|U8 * s|U8 * end 1096 =for apidoc_item ||isWORDCHAR_LC|UV ch 1097 =for apidoc_item ||isWORDCHAR_LC_uvchr|UV ch 1098 =for apidoc_item ||isWORDCHAR_LC_utf8_safe|U8 * s| U8 *end 1099 =for apidoc_item ||isALNUM|UV ch 1100 =for apidoc_item ||isALNUM_A|UV ch 1101 =for apidoc_item ||isALNUM_LC|UV ch 1102 =for apidoc_item ||isALNUM_LC_uvchr|UV ch 1103 Returns a boolean indicating whether the specified character is a character 1104 that is a word character, analogous to what C<m/\w/> and C<m/[[:word:]]/> match 1105 in a regular expression. A word character is an alphabetic character, a 1106 decimal digit, a connecting punctuation character (such as an underscore), or 1107 a "mark" character that attaches to one of those (like some sort of accent). 1108 C<isALNUM()> is a synonym provided for backward compatibility, even though a 1109 word character includes more than the standard C language meaning of 1110 alphanumeric. 1111 See the L<top of this section|/Character classification> for an explanation of 1112 the variants. 1113 C<isWORDCHAR_A>, C<isWORDCHAR_L1>, C<isWORDCHAR_uvchr>, 1114 C<isWORDCHAR_LC>, C<isWORDCHAR_LC_uvchr>, C<isWORDCHAR_LC_utf8>, and 1115 C<isWORDCHAR_LC_utf8_safe> are also as described there, but additionally 1116 include the platform's native underscore. 
1117 1118 =for apidoc Am|bool|isXDIGIT|UV ch 1119 =for apidoc_item ||isXDIGIT_A|UV ch 1120 =for apidoc_item ||isXDIGIT_L1|UV ch 1121 =for apidoc_item ||isXDIGIT_uvchr|UV ch 1122 =for apidoc_item ||isXDIGIT_utf8_safe|U8 * s|U8 * end 1123 =for apidoc_item ||isXDIGIT_utf8|U8 * s|U8 * end 1124 =for apidoc_item ||isXDIGIT_LC|UV ch 1125 =for apidoc_item ||isXDIGIT_LC_uvchr|UV ch 1126 =for apidoc_item ||isXDIGIT_LC_utf8_safe|U8 * s| U8 *end 1127 Returns a boolean indicating whether the specified character is a hexadecimal 1128 digit. In the ASCII range these are C<[0-9A-Fa-f]>. Variants C<isXDIGIT_A()> 1129 and C<isXDIGIT_L1()> are identical to C<isXDIGIT()>. 1130 See the L<top of this section|/Character classification> for an explanation of 1131 the variants. 1132 1133 =for apidoc Am|bool|isIDFIRST|UV ch 1134 =for apidoc_item ||isIDFIRST_A|UV ch 1135 =for apidoc_item ||isIDFIRST_L1|UV ch 1136 =for apidoc_item ||isIDFIRST_uvchr|UV ch 1137 =for apidoc_item ||isIDFIRST_utf8_safe|U8 * s|U8 * end 1138 =for apidoc_item ||isIDFIRST_utf8|U8 * s|U8 * end 1139 =for apidoc_item ||isIDFIRST_LC|UV ch 1140 =for apidoc_item ||isIDFIRST_LC_uvchr|UV ch 1141 =for apidoc_item ||isIDFIRST_LC_utf8_safe|U8 * s| U8 *end 1142 Returns a boolean indicating whether the specified character can be the first 1143 character of an identifier. This is very close to, but not quite the same as 1144 the official Unicode property C<XID_Start>. The difference is that this 1145 returns true only if the input character also matches L</isWORDCHAR>. 1146 See the L<top of this section|/Character classification> for an explanation of 1147 the variants. 
1148 1149 =for apidoc Am|bool|isIDCONT|UV ch 1150 =for apidoc_item ||isIDCONT_A|UV ch 1151 =for apidoc_item ||isIDCONT_L1|UV ch 1152 =for apidoc_item ||isIDCONT_uvchr|UV ch 1153 =for apidoc_item ||isIDCONT_utf8_safe|U8 * s|U8 * end 1154 =for apidoc_item ||isIDCONT_utf8|U8 * s|U8 * end 1155 =for apidoc_item ||isIDCONT_LC|UV ch 1156 =for apidoc_item ||isIDCONT_LC_uvchr|UV ch 1157 =for apidoc_item ||isIDCONT_LC_utf8_safe|U8 * s| U8 *end 1158 Returns a boolean indicating whether the specified character can be the 1159 second or succeeding character of an identifier. This is very close to, but 1160 not quite the same as the official Unicode property C<XID_Continue>. The 1161 difference is that this returns true only if the input character also matches 1162 L</isWORDCHAR>. See the L<top of this section|/Character classification> for 1163 an explanation of the variants. 1164 1165 =for apidoc_section $numeric 1166 1167 =for apidoc Am|U8|READ_XDIGIT|char str* 1168 Returns the value of an ASCII-range hex digit and advances the string pointer. 1169 Behaviour is only well defined when isXDIGIT(*str) is true. 1170 1171 =head1 Character case changing 1172 Perl uses "full" Unicode case mappings. This means that converting a single 1173 character to another case may result in a sequence of more than one character. 1174 For example, the uppercase of C<E<223>> (LATIN SMALL LETTER SHARP S) is the two 1175 character sequence C<SS>. This presents some complications. The lowercase of 1176 all characters in the range 0..255 is a single character, and thus 1177 C<L</toLOWER_L1>> is furnished. But, C<toUPPER_L1> can't exist, as it couldn't 1178 return a valid result for all legal inputs. Instead C<L</toUPPER_uvchr>> has 1179 an API that does allow every possible legal result to be returned. Likewise 1180 no other function that is crippled by not being able to give the correct 1181 results for the full range of possible inputs has been implemented here.
1182 1183 =for apidoc Am|UV|toUPPER|UV cp 1184 =for apidoc_item |UV|toUPPER_A|UV cp 1185 =for apidoc_item |UV|toUPPER_uvchr|UV cp|U8* s|STRLEN* lenp 1186 =for apidoc_item |UV|toUPPER_utf8|U8* p|U8* e|U8* s|STRLEN* lenp 1187 =for apidoc_item |UV|toUPPER_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp 1188 1189 These all return the uppercase of a character. The differences are what domain 1190 they operate on, and whether the input is specified as a code point (those 1191 forms with a C<cp> parameter) or as a UTF-8 string (the others). In the latter 1192 case, the code point to use is the first one in the buffer of UTF-8 encoded 1193 code points, delineated by the arguments S<C<p .. e - 1>>. 1194 1195 C<toUPPER> and C<toUPPER_A> are synonyms of each other. They return the 1196 uppercase of any lowercase ASCII-range code point. All other inputs are 1197 returned unchanged. Since these are macros, the input type may be any integral 1198 one, and the output will occupy the same number of bits as the input. 1199 1200 There is no C<toUPPER_L1> nor C<toUPPER_LATIN1> as the uppercase of some code 1201 points in the 0..255 range is above that range or consists of multiple 1202 characters. Instead use C<toUPPER_uvchr>. 1203 1204 C<toUPPER_uvchr> returns the uppercase of any Unicode code point. The return 1205 value is identical to that of C<toUPPER_A> for input code points in the ASCII 1206 range. The uppercase of the vast majority of Unicode code points is the same 1207 as the code point itself. For these, and for code points above the legal 1208 Unicode maximum, this returns the input code point unchanged. It additionally 1209 stores the UTF-8 of the result into the buffer beginning at C<s>, and its 1210 length in bytes into C<*lenp>. The caller must have made C<s> large enough to 1211 contain at least C<UTF8_MAXBYTES_CASE+1> bytes to avoid possible overflow. 1212 1213 NOTE: the uppercase of a code point may be more than one code point. 
The 1214 return value of this function is only the first of these. The entire uppercase 1215 is returned in C<s>. To determine if the result is more than a single code 1216 point, you can do something like this: 1217 1218 uc = toUPPER_uvchr(cp, s, &len); 1219 if (len > UTF8SKIP(s)) { is multiple code points } 1220 else { is a single code point } 1221 1222 C<toUPPER_utf8> and C<toUPPER_utf8_safe> are synonyms of each other. The only 1223 difference between these and C<toUPPER_uvchr> is that the source for these is 1224 encoded in UTF-8, instead of being a code point. It is passed as a buffer 1225 starting at C<p>, with C<e> pointing to one byte beyond its end. The C<p> 1226 buffer may certainly contain more than one code point; but only the first one 1227 (up through S<C<e - 1>>) is examined. If the UTF-8 for the input character is 1228 malformed in some way, the program may croak, or the function may return the 1229 REPLACEMENT CHARACTER, at the discretion of the implementation, and subject to 1230 change in future releases. 1231 1232 =for apidoc Am|UV|toFOLD|UV cp 1233 =for apidoc_item |UV|toFOLD_A|UV cp 1234 =for apidoc_item |UV|toFOLD_uvchr|UV cp|U8* s|STRLEN* lenp 1235 =for apidoc_item |UV|toFOLD_utf8|U8* p|U8* e|U8* s|STRLEN* lenp 1236 =for apidoc_item |UV|toFOLD_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp 1237 1238 These all return the foldcase of a character. "foldcase" is an internal case 1239 for C</i> pattern matching. If the foldcase of character A and the foldcase of 1240 character B are the same, they match caselessly; otherwise they don't. 1241 1242 The differences in the forms are what domain they operate on, and whether the 1243 input is specified as a code point (those forms with a C<cp> parameter) or as a 1244 UTF-8 string (the others). In the latter case, the code point to use is the 1245 first one in the buffer of UTF-8 encoded code points, delineated by the 1246 arguments S<C<p .. e - 1>>. 
1247 1248 C<toFOLD> and C<toFOLD_A> are synonyms of each other. They return the 1249 foldcase of any ASCII-range code point. In this range, the foldcase is 1250 identical to the lowercase. All other inputs are returned unchanged. Since 1251 these are macros, the input type may be any integral one, and the output will 1252 occupy the same number of bits as the input. 1253 1254 There is no C<toFOLD_L1> nor C<toFOLD_LATIN1> as the foldcase of some code 1255 points in the 0..255 range is above that range or consists of multiple 1256 characters. Instead use C<toFOLD_uvchr>. 1257 1258 C<toFOLD_uvchr> returns the foldcase of any Unicode code point. The return 1259 value is identical to that of C<toFOLD_A> for input code points in the ASCII 1260 range. The foldcase of the vast majority of Unicode code points is the same 1261 as the code point itself. For these, and for code points above the legal 1262 Unicode maximum, this returns the input code point unchanged. It additionally 1263 stores the UTF-8 of the result into the buffer beginning at C<s>, and its 1264 length in bytes into C<*lenp>. The caller must have made C<s> large enough to 1265 contain at least C<UTF8_MAXBYTES_CASE+1> bytes to avoid possible overflow. 1266 1267 NOTE: the foldcase of a code point may be more than one code point. The 1268 return value of this function is only the first of these. The entire foldcase 1269 is returned in C<s>. To determine if the result is more than a single code 1270 point, you can do something like this: 1271 1272 uc = toFOLD_uvchr(cp, s, &len); 1273 if (len > UTF8SKIP(s)) { is multiple code points } 1274 else { is a single code point } 1275 1276 C<toFOLD_utf8> and C<toFOLD_utf8_safe> are synonyms of each other. The only 1277 difference between these and C<toFOLD_uvchr> is that the source for these is 1278 encoded in UTF-8, instead of being a code point. It is passed as a buffer 1279 starting at C<p>, with C<e> pointing to one byte beyond its end. 
The C<p> 1280 buffer may certainly contain more than one code point; but only the first one 1281 (up through S<C<e - 1>>) is examined. If the UTF-8 for the input character is 1282 malformed in some way, the program may croak, or the function may return the 1283 REPLACEMENT CHARACTER, at the discretion of the implementation, and subject to 1284 change in future releases. 1285 1286 =for apidoc Am|UV|toLOWER|UV cp 1287 =for apidoc_item |UV|toLOWER_A|UV cp 1288 =for apidoc_item |UV|toLOWER_L1|UV cp 1289 =for apidoc_item |UV|toLOWER_LATIN1|UV cp 1290 =for apidoc_item |UV|toLOWER_LC|UV cp 1291 =for apidoc_item |UV|toLOWER_uvchr|UV cp|U8* s|STRLEN* lenp 1292 =for apidoc_item |UV|toLOWER_utf8|U8* p|U8* e|U8* s|STRLEN* lenp 1293 =for apidoc_item |UV|toLOWER_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp 1294 1295 These all return the lowercase of a character. The differences are what domain 1296 they operate on, and whether the input is specified as a code point (those 1297 forms with a C<cp> parameter) or as a UTF-8 string (the others). In the latter 1298 case, the code point to use is the first one in the buffer of UTF-8 encoded 1299 code points, delineated by the arguments S<C<p .. e - 1>>. 1300 1301 C<toLOWER> and C<toLOWER_A> are synonyms of each other. They return the 1302 lowercase of any uppercase ASCII-range code point. All other inputs are 1303 returned unchanged. Since these are macros, the input type may be any integral 1304 one, and the output will occupy the same number of bits as the input. 1305 1306 C<toLOWER_L1> and C<toLOWER_LATIN1> are synonyms of each other. They behave 1307 identically as C<toLOWER> for ASCII-range input. But additionally will return 1308 the lowercase of any uppercase code point in the entire 0..255 range, assuming 1309 a Latin-1 encoding (or the EBCDIC equivalent on such platforms). 1310 1311 C<toLOWER_LC> returns the lowercase of the input code point according to the 1312 rules of the current POSIX locale. 
Input code points outside the range 0..255 1313 are returned unchanged. 1314 1315 C<toLOWER_uvchr> returns the lowercase of any Unicode code point. The return 1316 value is identical to that of C<toLOWER_L1> for input code points in the 0..255 1317 range. The lowercase of the vast majority of Unicode code points is the same 1318 as the code point itself. For these, and for code points above the legal 1319 Unicode maximum, this returns the input code point unchanged. It additionally 1320 stores the UTF-8 of the result into the buffer beginning at C<s>, and its 1321 length in bytes into C<*lenp>. The caller must have made C<s> large enough to 1322 contain at least C<UTF8_MAXBYTES_CASE+1> bytes to avoid possible overflow. 1323 1324 NOTE: the lowercase of a code point may be more than one code point. The 1325 return value of this function is only the first of these. The entire lowercase 1326 is returned in C<s>. To determine if the result is more than a single code 1327 point, you can do something like this: 1328 1329 uc = toLOWER_uvchr(cp, s, &len); 1330 if (len > UTF8SKIP(s)) { is multiple code points } 1331 else { is a single code point } 1332 1333 C<toLOWER_utf8> and C<toLOWER_utf8_safe> are synonyms of each other. The only 1334 difference between these and C<toLOWER_uvchr> is that the source for these is 1335 encoded in UTF-8, instead of being a code point. It is passed as a buffer 1336 starting at C<p>, with C<e> pointing to one byte beyond its end. The C<p> 1337 buffer may certainly contain more than one code point; but only the first one 1338 (up through S<C<e - 1>>) is examined. If the UTF-8 for the input character is 1339 malformed in some way, the program may croak, or the function may return the 1340 REPLACEMENT CHARACTER, at the discretion of the implementation, and subject to 1341 change in future releases. 
1342 1343 =for apidoc Am|UV|toTITLE|UV cp 1344 =for apidoc_item |UV|toTITLE_A|UV cp 1345 =for apidoc_item |UV|toTITLE_uvchr|UV cp|U8* s|STRLEN* lenp 1346 =for apidoc_item |UV|toTITLE_utf8|U8* p|U8* e|U8* s|STRLEN* lenp 1347 =for apidoc_item |UV|toTITLE_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp 1348 1349 These all return the titlecase of a character. The differences are what domain 1350 they operate on, and whether the input is specified as a code point (those 1351 forms with a C<cp> parameter) or as a UTF-8 string (the others). In the latter 1352 case, the code point to use is the first one in the buffer of UTF-8 encoded 1353 code points, delineated by the arguments S<C<p .. e - 1>>. 1354 1355 C<toTITLE> and C<toTITLE_A> are synonyms of each other. They return the 1356 titlecase of any lowercase ASCII-range code point. In this range, the 1357 titlecase is identical to the uppercase. All other inputs are returned 1358 unchanged. Since these are macros, the input type may be any integral one, and 1359 the output will occupy the same number of bits as the input. 1360 1361 There is no C<toTITLE_L1> nor C<toTITLE_LATIN1> as the titlecase of some code 1362 points in the 0..255 range is above that range or consists of multiple 1363 characters. Instead use C<toTITLE_uvchr>. 1364 1365 C<toTITLE_uvchr> returns the titlecase of any Unicode code point. The return 1366 value is identical to that of C<toTITLE_A> for input code points in the ASCII 1367 range. The titlecase of the vast majority of Unicode code points is the same 1368 as the code point itself. For these, and for code points above the legal 1369 Unicode maximum, this returns the input code point unchanged. It additionally 1370 stores the UTF-8 of the result into the buffer beginning at C<s>, and its 1371 length in bytes into C<*lenp>. The caller must have made C<s> large enough to 1372 contain at least C<UTF8_MAXBYTES_CASE+1> bytes to avoid possible overflow. 
1373 1374 NOTE: the titlecase of a code point may be more than one code point. The 1375 return value of this function is only the first of these. The entire titlecase 1376 is returned in C<s>. To determine if the result is more than a single code 1377 point, you can do something like this: 1378 1379 uc = toTITLE_uvchr(cp, s, &len); 1380 if (len > UTF8SKIP(s)) { is multiple code points } 1381 else { is a single code point } 1382 1383 C<toTITLE_utf8> and C<toTITLE_utf8_safe> are synonyms of each other. The only 1384 difference between these and C<toTITLE_uvchr> is that the source for these is 1385 encoded in UTF-8, instead of being a code point. It is passed as a buffer 1386 starting at C<p>, with C<e> pointing to one byte beyond its end. The C<p> 1387 buffer may certainly contain more than one code point; but only the first one 1388 (up through S<C<e - 1>>) is examined. If the UTF-8 for the input character is 1389 malformed in some way, the program may croak, or the function may return the 1390 REPLACEMENT CHARACTER, at the discretion of the implementation, and subject to 1391 change in future releases. 1392 1393 =cut 1394 1395 XXX Still undocumented isVERTWS_uvchr and _utf8; it's unclear what their names 1396 really should be. Also toUPPER_LC and toFOLD_LC, which are subject to change, 1397 and aren't general purpose as they don't work on U+DF, and assert against that. 1398 1399 Note that these macros are repeated in Devel::PPPort, so should also be 1400 patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc 1401 1402 */ 1403 1404 /* 1405 void below because that's the best fit, and works for Devel::PPPort 1406 =for apidoc_section $integer 1407 =for apidoc AyT||WIDEST_UTYPE 1408 1409 Yields the widest unsigned integer type on the platform, currently either 1410 C<U32> or C<U64>. 
This can be used in declarations such as 1411 1412 WIDEST_UTYPE my_uv; 1413 1414 or casts 1415 1416 my_uv = (WIDEST_UTYPE) val; 1417 1418 =cut 1419 1420 */ 1421 #ifdef QUADKIND 1422 # define WIDEST_UTYPE U64 1423 #else 1424 # define WIDEST_UTYPE U32 1425 #endif 1426 1427 /* Where there could be some confusion, use this as a static assert in macros 1428 * to make sure that a parameter isn't a pointer. But some compilers can't 1429 * handle this. The only one known so far that doesn't is gcc 3.3.6; the check 1430 * below isn't thorough for such an old compiler, so may have to be revised if 1431 * experience so dictates. When the check is enabled, the '| 0' leaves an integer value unchanged but fails to compile for a pointer operand; the fallback definition is plain (x) and checks nothing. */ 1432 #if ! PERL_IS_GCC || PERL_GCC_VERSION_GT(3,3,6) 1433 # define ASSERT_NOT_PTR(x) ((x) | 0) 1434 #else 1435 # define ASSERT_NOT_PTR(x) (x) 1436 #endif 1437 1438 /* FITS_IN_8_BITS(c) returns true if c doesn't have a bit set other than in 1439 * the lower 8. It is designed to be hopefully bomb-proof, making sure that no 1440 * bits of information are lost even on a 64-bit machine, but to get the 1441 * compiler to optimize it out if possible. This is because Configure makes 1442 * sure that the machine has an 8-bit byte, so if c is stored in a byte, the 1443 * sizeof() guarantees that this evaluates to a constant true at compile time. sizeof() does not evaluate its operand, so c is evaluated at most once (in the shift test). 1444 * 1445 * For Coverity, be always true, because otherwise Coverity thinks 1446 * it finds several expressions that are always true, independent 1447 * of operands. Well, they are, but that is kind of the point. 1448 */ 1449 #ifndef __COVERITY__ 1450 /* The '| 0' part in ASSERT_NOT_PTR ensures a compiler error if c is not 1451 * integer (like e.g., a pointer) */ 1452 # define FITS_IN_8_BITS(c) ( (sizeof(c) == 1) \ 1453 || (((WIDEST_UTYPE) ASSERT_NOT_PTR(c)) >> 8) == 0) 1454 #else 1455 # define FITS_IN_8_BITS(c) (1) 1456 #endif 1457 1458 /* Returns true if l <= c <= (l + n), where 'l' and 'n' are non-negative. 1459 * Written this way so that after optimization, only one conditional test is 1460 * needed.
(The NV casts stop any warnings about comparison always being true 1461 * if called with an unsigned. The cast preserves the sign, which is all we 1462 * care about.) */ 1463 #define withinCOUNT(c, l, n) (__ASSERT_((NV) (l) >= 0) \ 1464 __ASSERT_((NV) (n) >= 0) \ 1465 withinCOUNT_KNOWN_VALID_((c), (l), (n))) 1466 1467 /* For internal use only, this can be used in places where it is known that the 1468 * parameters to withinCOUNT() are valid, to avoid the asserts. For example, 1469 * inRANGE() below, calls this several times, but does all the necessary 1470 * asserts itself, once. The reason that this is necessary is that the 1471 * duplicate asserts were exceeding the internal limits of some compilers. 'c' is cast to the unsigned WIDEST_UTYPE before 'l' is subtracted, so a 'c' below 'l' wraps around to a large value instead of going negative, and the single '<=' comparison against 'n' rejects it. */ 1472 #define withinCOUNT_KNOWN_VALID_(c, l, n) \ 1473 ((((WIDEST_UTYPE) (c)) - ASSERT_NOT_PTR(l)) \ 1474 <= ((WIDEST_UTYPE) ASSERT_NOT_PTR(n))) 1475 1476 /* Returns true if c is in the range l..u, where 'l' is non-negative. 1477 * Written this way so that after optimization, only one conditional test is 1478 * needed. The sizeof(c) chain selects the helper call whose cast type (U8, U16, U32 or WIDEST_UTYPE) has the same size as 'c'; any other size is caught by the assert in the final branch. */ 1479 #define inRANGE(c, l, u) (__ASSERT_((NV) (l) >= 0) __ASSERT_((u) >= (l)) \ 1480 ( (sizeof(c) == sizeof(U8)) ? inRANGE_helper_(U8, (c), (l), ((u))) \ 1481 : (sizeof(c) == sizeof(U16)) ? inRANGE_helper_(U16,(c), (l), ((u))) \ 1482 : (sizeof(c) == sizeof(U32)) ? inRANGE_helper_(U32,(c), (l), ((u))) \ 1483 : (__ASSERT_(sizeof(c) == sizeof(WIDEST_UTYPE)) \ 1484 inRANGE_helper_(WIDEST_UTYPE,(c), (l), ((u)))))) 1485 1486 /* For internal use, this is used by machine-generated code which generates 1487 * known valid calls, with a known sizeof(). This avoids the extra code and 1488 * asserts that were exceeding internal limits of some compilers. 'cast' is the type that 'c' is cast to; the range l..u is handed to withinCOUNT_KNOWN_VALID_() as the base 'l' and the count 'u - l'. */ 1489 #define inRANGE_helper_(cast, c, l, u) \ 1490 withinCOUNT_KNOWN_VALID_(((cast) (c)), (l), ((u) - (l))) 1491 1492 #ifdef EBCDIC 1493 # ifndef _ALL_SOURCE 1494 /* The native libc isascii() et al. functions return the wrong results 1495 * on at least z/OS unless this is defined.
*/ 1496 # error _ALL_SOURCE should probably be defined 1497 # endif 1498 #else 1499 /* There is a simple definition of ASCII for ASCII platforms. But the 1500 * EBCDIC one isn't so simple, so is defined using table look-up like the 1501 * other macros below. 1502 * 1503 * The cast here is used instead of '(c) >= 0', because some compilers emit 1504 * a warning that that test is always true when the parameter is an 1505 * unsigned type. khw supposes that it could be written as 1506 * && ((c) == '\0' || (c) > 0) 1507 * to avoid the message, but the cast will likely avoid extra branches even 1508 * with stupid compilers. A negative 'c' becomes a large unsigned value under the cast, so it also fails the '< 128' test. */ 1509 # define isASCII(c) (((WIDEST_UTYPE) ASSERT_NOT_PTR(c)) < 128) 1510 #endif 1511 1512 /* Take the eight possible bit patterns of the lower 3 bits and you get the 1513 * lower 3 bits of the 8 octal digits, in both ASCII and EBCDIC, so those bits 1514 * can be ignored. If the rest match '0', we have an octal digit. The '& ~7' mask is what clears those lower 3 bits before the comparison. */ 1515 #define isOCTAL_A(c) ((((WIDEST_UTYPE) ASSERT_NOT_PTR(c)) & ~7) == '0') 1516 1517 #ifdef H_PERL /* If have access to perl.h, lookup in its table */ 1518 1519 /* Character class numbers. For internal core Perl use only. The ones less 1520 * than 32 are used in PL_charclass[] and the ones up through the one that 1521 * corresponds to <_HIGHEST_REGCOMP_DOT_H_SYNC> are used by regcomp.h and 1522 * related files. PL_charclass ones use names used in l1_char_class_tab.h but 1523 * their actual definitions are here. If that file has a name not used here, 1524 * it won't compile. 1525 * 1526 * The first group of these is ordered in what I (khw) estimate to be the 1527 * frequency of their use. This gives a slight edge to exiting a loop earlier 1528 * (in reginclass() in regexec.c). Except \v should be last, as it isn't a 1529 * real Posix character class, and some (small) inefficiencies in regular 1530 * expression handling would be introduced by putting it in the middle of those 1531 * that are.
Also, cntrl and ascii come after the others as it may be useful 1532 * to group these which have no members that match above Latin1, (or above 1533 * ASCII in the latter case) */ 1534 1535 # define _CC_WORDCHAR 0 /* \w and [:word:] */ 1536 # define _CC_DIGIT 1 /* \d and [:digit:] */ 1537 # define _CC_ALPHA 2 /* [:alpha:] */ 1538 # define _CC_LOWER 3 /* [:lower:] */ 1539 # define _CC_UPPER 4 /* [:upper:] */ 1540 # define _CC_PUNCT 5 /* [:punct:] */ 1541 # define _CC_PRINT 6 /* [:print:] */ 1542 # define _CC_ALPHANUMERIC 7 /* [:alnum:] */ 1543 # define _CC_GRAPH 8 /* [:graph:] */ 1544 # define _CC_CASED 9 /* [:lower:] or [:upper:] under /i */ 1545 # define _CC_SPACE 10 /* \s, [:space:] */ 1546 # define _CC_BLANK 11 /* [:blank:] */ 1547 # define _CC_XDIGIT 12 /* [:xdigit:] */ 1548 # define _CC_CNTRL 13 /* [:cntrl:] */ 1549 # define _CC_ASCII 14 /* [:ascii:] */ 1550 # define _CC_VERTSPACE 15 /* \v */ 1551 1552 # define _HIGHEST_REGCOMP_DOT_H_SYNC _CC_VERTSPACE 1553 1554 /* The members of the third group below do not need to be coordinated with data 1555 * structures in regcomp.[ch] and regexec.c. */ 1556 # define _CC_IDFIRST 16 1557 # define _CC_CHARNAME_CONT 17 1558 # define _CC_NONLATIN1_FOLD 18 1559 # define _CC_NONLATIN1_SIMPLE_FOLD 19 1560 # define _CC_QUOTEMETA 20 1561 # define _CC_NON_FINAL_FOLD 21 1562 # define _CC_IS_IN_SOME_FOLD 22 1563 # define _CC_BINDIGIT 23 1564 # define _CC_OCTDIGIT 24 1565 # define _CC_MNEMONIC_CNTRL 25 1566 1567 /* Unused: 26-31 1568 * If more bits are needed, one could add a second word for non-64bit 1569 * QUAD_IS_INT systems, using some #ifdefs to distinguish between having a 2nd 1570 * word or not. 
The IS_IN_SOME_FOLD bit is the most easily expendable, as it 1571 * is used only for optimization (as of this writing), and differs in the 1572 * Latin1 range from the ALPHA bit only in two relatively unimportant 1573 * characters: the masculine and feminine ordinal indicators, so removing it 1574 * would just cause /i regexes which match them to run less efficiently. 1575 * Similarly the EBCDIC-only bits are used just for speed, and could be 1576 * replaced by other means */ 1577 1578 #if defined(PERL_CORE) || defined(PERL_EXT) 1579 /* An enum version of the character class numbers, to help compilers 1580 * optimize */ 1581 typedef enum { 1582 _CC_ENUM_ALPHA = _CC_ALPHA, 1583 _CC_ENUM_ALPHANUMERIC = _CC_ALPHANUMERIC, 1584 _CC_ENUM_ASCII = _CC_ASCII, 1585 _CC_ENUM_BLANK = _CC_BLANK, 1586 _CC_ENUM_CASED = _CC_CASED, 1587 _CC_ENUM_CNTRL = _CC_CNTRL, 1588 _CC_ENUM_DIGIT = _CC_DIGIT, 1589 _CC_ENUM_GRAPH = _CC_GRAPH, 1590 _CC_ENUM_LOWER = _CC_LOWER, 1591 _CC_ENUM_PRINT = _CC_PRINT, 1592 _CC_ENUM_PUNCT = _CC_PUNCT, 1593 _CC_ENUM_SPACE = _CC_SPACE, 1594 _CC_ENUM_UPPER = _CC_UPPER, 1595 _CC_ENUM_VERTSPACE = _CC_VERTSPACE, 1596 _CC_ENUM_WORDCHAR = _CC_WORDCHAR, 1597 _CC_ENUM_XDIGIT = _CC_XDIGIT 1598 } _char_class_number; 1599 #endif 1600 1601 #define POSIX_CC_COUNT (_HIGHEST_REGCOMP_DOT_H_SYNC + 1) 1602 1603 START_EXTERN_C 1604 # ifdef DOINIT 1605 EXTCONST U32 PL_charclass[] = { 1606 # include "l1_char_class_tab.h" 1607 }; 1608 1609 # else /* ! DOINIT */ 1610 EXTCONST U32 PL_charclass[]; 1611 # endif 1612 END_EXTERN_C 1613 1614 /* The 1U keeps Solaris from griping when shifting sets the uppermost bit */ 1615 # define _CC_mask(classnum) (1U << (classnum)) 1616 1617 /* For internal core Perl use only: the base macro for defining macros like 1618 * isALPHA */ 1619 # define _generic_isCC(c, classnum) cBOOL(FITS_IN_8_BITS(c) \ 1620 && (PL_charclass[(U8) (c)] & _CC_mask(classnum))) 1621 1622 /* The mask for the _A versions of the macros; it just adds in the bit for 1623 * ASCII. 
*/ 1624 # define _CC_mask_A(classnum) (_CC_mask(classnum) | _CC_mask(_CC_ASCII)) 1625 1626 /* For internal core Perl use only: the base macro for defining macros like 1627 * isALPHA_A. The foo_A version makes sure that both the desired bit and 1628 * the ASCII bit are present */ 1629 # define _generic_isCC_A(c, classnum) (FITS_IN_8_BITS(c) \ 1630 && ((PL_charclass[(U8) (c)] & _CC_mask_A(classnum)) \ 1631 == _CC_mask_A(classnum))) 1632 1633 /* On ASCII platforms certain classes form a single range. It's faster to 1634 * special case these. isDIGIT is a single range on all platforms */ 1635 # ifdef EBCDIC 1636 # define isALPHA_A(c) _generic_isCC_A(c, _CC_ALPHA) 1637 # define isGRAPH_A(c) _generic_isCC_A(c, _CC_GRAPH) 1638 # define isLOWER_A(c) _generic_isCC_A(c, _CC_LOWER) 1639 # define isPRINT_A(c) _generic_isCC_A(c, _CC_PRINT) 1640 # define isUPPER_A(c) _generic_isCC_A(c, _CC_UPPER) 1641 # else 1642 /* By folding the upper and lowercase, we can use a single range */ 1643 # define isALPHA_A(c) inRANGE((~('A' ^ 'a') & (c)), 'A', 'Z') 1644 # define isGRAPH_A(c) inRANGE(c, ' ' + 1, 0x7e) 1645 # define isLOWER_A(c) inRANGE(c, 'a', 'z') 1646 # define isPRINT_A(c) inRANGE(c, ' ', 0x7e) 1647 # define isUPPER_A(c) inRANGE(c, 'A', 'Z') 1648 # endif 1649 # define isALPHANUMERIC_A(c) _generic_isCC_A(c, _CC_ALPHANUMERIC) 1650 # define isBLANK_A(c) _generic_isCC_A(c, _CC_BLANK) 1651 # define isCNTRL_A(c) _generic_isCC_A(c, _CC_CNTRL) 1652 # define isDIGIT_A(c) inRANGE(c, '0', '9') 1653 # define isPUNCT_A(c) _generic_isCC_A(c, _CC_PUNCT) 1654 # define isSPACE_A(c) _generic_isCC_A(c, _CC_SPACE) 1655 # define isWORDCHAR_A(c) _generic_isCC_A(c, _CC_WORDCHAR) 1656 # define isXDIGIT_A(c) _generic_isCC(c, _CC_XDIGIT) /* No non-ASCII xdigits 1657 */ 1658 # define isIDFIRST_A(c) _generic_isCC_A(c, _CC_IDFIRST) 1659 # define isALPHA_L1(c) _generic_isCC(c, _CC_ALPHA) 1660 # define isALPHANUMERIC_L1(c) _generic_isCC(c, _CC_ALPHANUMERIC) 1661 # define isBLANK_L1(c) _generic_isCC(c, 
_CC_BLANK) 1662 1663 /* continuation character for legal NAME in \N{NAME} */ 1664 # define isCHARNAME_CONT(c) _generic_isCC(c, _CC_CHARNAME_CONT) 1665 1666 # define isCNTRL_L1(c) _generic_isCC(c, _CC_CNTRL) 1667 # define isGRAPH_L1(c) _generic_isCC(c, _CC_GRAPH) 1668 # define isLOWER_L1(c) _generic_isCC(c, _CC_LOWER) 1669 # define isPRINT_L1(c) _generic_isCC(c, _CC_PRINT) 1670 # define isPSXSPC_L1(c) isSPACE_L1(c) 1671 # define isPUNCT_L1(c) _generic_isCC(c, _CC_PUNCT) 1672 # define isSPACE_L1(c) _generic_isCC(c, _CC_SPACE) 1673 # define isUPPER_L1(c) _generic_isCC(c, _CC_UPPER) 1674 # define isWORDCHAR_L1(c) _generic_isCC(c, _CC_WORDCHAR) 1675 # define isIDFIRST_L1(c) _generic_isCC(c, _CC_IDFIRST) 1676 1677 # ifdef EBCDIC 1678 # define isASCII(c) _generic_isCC(c, _CC_ASCII) 1679 # endif 1680 1681 /* Participates in a single-character fold with a character above 255 */ 1682 # if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) 1683 # define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(c) \ 1684 (( ! cBOOL(FITS_IN_8_BITS(c))) \ 1685 || (PL_charclass[(U8) (c)] & _CC_mask(_CC_NONLATIN1_SIMPLE_FOLD))) 1686 1687 # define IS_NON_FINAL_FOLD(c) _generic_isCC(c, _CC_NON_FINAL_FOLD) 1688 # define IS_IN_SOME_FOLD_L1(c) _generic_isCC(c, _CC_IS_IN_SOME_FOLD) 1689 # endif 1690 1691 /* Like the above, but also can be part of a multi-char fold */ 1692 # define HAS_NONLATIN1_FOLD_CLOSURE(c) \ 1693 ( (! cBOOL(FITS_IN_8_BITS(c))) \ 1694 || (PL_charclass[(U8) (c)] & _CC_mask(_CC_NONLATIN1_FOLD))) 1695 1696 # define _isQUOTEMETA(c) _generic_isCC(c, _CC_QUOTEMETA) 1697 1698 /* is c a control character for which we have a mnemonic? */ 1699 # if defined(PERL_CORE) || defined(PERL_EXT) 1700 # define isMNEMONIC_CNTRL(c) _generic_isCC(c, _CC_MNEMONIC_CNTRL) 1701 # endif 1702 #else /* else we don't have perl.h H_PERL */ 1703 1704 /* If we don't have perl.h, we are compiling a utility program. Below we 1705 * hard-code various macro definitions that wouldn't otherwise be available 1706 * to it. 
Most are coded based on first principles. These are written to 1707 * avoid EBCDIC vs. ASCII #ifdef's as much as possible. */ 1708 # define isDIGIT_A(c) inRANGE(c, '0', '9') 1709 # define isBLANK_A(c) ((c) == ' ' || (c) == '\t') 1710 # define isSPACE_A(c) (isBLANK_A(c) \ 1711 || (c) == '\n' \ 1712 || (c) == '\r' \ 1713 || (c) == '\v' \ 1714 || (c) == '\f') 1715 /* On EBCDIC, there are gaps between 'i' and 'j'; 'r' and 's'. Same for 1716 * uppercase. The tests for those aren't necessary on ASCII, but hurt only 1717 * performance (if optimization isn't on), and allow the same code to be 1718 * used for both platform types */ 1719 # define isLOWER_A(c) inRANGE((c), 'a', 'i') \ 1720 || inRANGE((c), 'j', 'r') \ 1721 || inRANGE((c), 's', 'z') 1722 # define isUPPER_A(c) inRANGE((c), 'A', 'I') \ 1723 || inRANGE((c), 'J', 'R') \ 1724 || inRANGE((c), 'S', 'Z') 1725 # define isALPHA_A(c) (isUPPER_A(c) || isLOWER_A(c)) 1726 # define isALPHANUMERIC_A(c) (isALPHA_A(c) || isDIGIT_A(c)) 1727 # define isWORDCHAR_A(c) (isALPHANUMERIC_A(c) || (c) == '_') 1728 # define isIDFIRST_A(c) (isALPHA_A(c) || (c) == '_') 1729 # define isXDIGIT_A(c) ( isDIGIT_A(c) \ 1730 || inRANGE((c), 'a', 'f') \ 1731 || inRANGE((c), 'A', 'F') 1732 # define isPUNCT_A(c) ((c) == '-' || (c) == '!' || (c) == '"' \ 1733 || (c) == '#' || (c) == '$' || (c) == '%' \ 1734 || (c) == '&' || (c) == '\'' || (c) == '(' \ 1735 || (c) == ')' || (c) == '*' || (c) == '+' \ 1736 || (c) == ',' || (c) == '.' || (c) == '/' \ 1737 || (c) == ':' || (c) == ';' || (c) == '<' \ 1738 || (c) == '=' || (c) == '>' || (c) == '?' 
\ 1739 || (c) == '@' || (c) == '[' || (c) == '\\' \ 1740 || (c) == ']' || (c) == '^' || (c) == '_' \ 1741 || (c) == '`' || (c) == '{' || (c) == '|' \ 1742 || (c) == '}' || (c) == '~') 1743 # define isGRAPH_A(c) (isALPHANUMERIC_A(c) || isPUNCT_A(c)) 1744 # define isPRINT_A(c) (isGRAPH_A(c) || (c) == ' ') 1745 1746 # ifdef EBCDIC 1747 /* The below is accurate for the 3 EBCDIC code pages traditionally 1748 * supported by perl. The only difference between them in the controls 1749 * is the position of \n, and that is represented symbolically below */ 1750 # define isCNTRL_A(c) ((c) == '\0' || (c) == '\a' || (c) == '\b' \ 1751 || (c) == '\f' || (c) == '\n' || (c) == '\r' \ 1752 || (c) == '\t' || (c) == '\v' \ 1753 || inRANGE((c), 1, 3) /* SOH, STX, ETX */ \ 1754 || (c) == 7F /* U+7F DEL */ \ 1755 || inRANGE((c), 0x0E, 0x13) /* SO SI DLE \ 1756 DC[1-3] */ \ 1757 || (c) == 0x18 /* U+18 CAN */ \ 1758 || (c) == 0x19 /* U+19 EOM */ \ 1759 || inRANGE((c), 0x1C, 0x1F) /* [FGRU]S */ \ 1760 || (c) == 0x26 /* U+17 ETB */ \ 1761 || (c) == 0x27 /* U+1B ESC */ \ 1762 || (c) == 0x2D /* U+05 ENQ */ \ 1763 || (c) == 0x2E /* U+06 ACK */ \ 1764 || (c) == 0x32 /* U+16 SYN */ \ 1765 || (c) == 0x37 /* U+04 EOT */ \ 1766 || (c) == 0x3C /* U+14 DC4 */ \ 1767 || (c) == 0x3D /* U+15 NAK */ \ 1768 || (c) == 0x3F)/* U+1A SUB */ 1769 # define isASCII(c) (isCNTRL_A(c) || isPRINT_A(c)) 1770 # else /* isASCII is already defined for ASCII platforms, so can use that to 1771 define isCNTRL */ 1772 # define isCNTRL_A(c) (isASCII(c) && ! isPRINT_A(c)) 1773 # endif 1774 1775 /* The _L1 macros may be unnecessary for the utilities; I (khw) added them 1776 * during debugging, and it seems best to keep them. We may be called 1777 * without NATIVE_TO_LATIN1 being defined. On ASCII platforms, it doesn't 1778 * do anything anyway, so make it not a problem */ 1779 # if ! defined(EBCDIC) && ! 
defined(NATIVE_TO_LATIN1) 1780 # define NATIVE_TO_LATIN1(ch) (ch) 1781 # endif 1782 # define isALPHA_L1(c) (isUPPER_L1(c) || isLOWER_L1(c)) 1783 # define isALPHANUMERIC_L1(c) (isALPHA_L1(c) || isDIGIT_A(c)) 1784 # define isBLANK_L1(c) (isBLANK_A(c) \ 1785 || (FITS_IN_8_BITS(c) \ 1786 && NATIVE_TO_LATIN1((U8) c) == 0xA0)) 1787 # define isCNTRL_L1(c) (FITS_IN_8_BITS(c) && (! isPRINT_L1(c))) 1788 # define isGRAPH_L1(c) (isPRINT_L1(c) && (! isBLANK_L1(c))) 1789 # define isLOWER_L1(c) (isLOWER_A(c) \ 1790 || (FITS_IN_8_BITS(c) \ 1791 && (( NATIVE_TO_LATIN1((U8) c) >= 0xDF \ 1792 && NATIVE_TO_LATIN1((U8) c) != 0xF7) \ 1793 || NATIVE_TO_LATIN1((U8) c) == 0xAA \ 1794 || NATIVE_TO_LATIN1((U8) c) == 0xBA \ 1795 || NATIVE_TO_LATIN1((U8) c) == 0xB5))) 1796 # define isPRINT_L1(c) (isPRINT_A(c) \ 1797 || (FITS_IN_8_BITS(c) \ 1798 && NATIVE_TO_LATIN1((U8) c) >= 0xA0)) 1799 # define isPUNCT_L1(c) (isPUNCT_A(c) \ 1800 || (FITS_IN_8_BITS(c) \ 1801 && ( NATIVE_TO_LATIN1((U8) c) == 0xA1 \ 1802 || NATIVE_TO_LATIN1((U8) c) == 0xA7 \ 1803 || NATIVE_TO_LATIN1((U8) c) == 0xAB \ 1804 || NATIVE_TO_LATIN1((U8) c) == 0xB6 \ 1805 || NATIVE_TO_LATIN1((U8) c) == 0xB7 \ 1806 || NATIVE_TO_LATIN1((U8) c) == 0xBB \ 1807 || NATIVE_TO_LATIN1((U8) c) == 0xBF))) 1808 # define isSPACE_L1(c) (isSPACE_A(c) \ 1809 || (FITS_IN_8_BITS(c) \ 1810 && ( NATIVE_TO_LATIN1((U8) c) == 0x85 \ 1811 || NATIVE_TO_LATIN1((U8) c) == 0xA0))) 1812 # define isUPPER_L1(c) (isUPPER_A(c) \ 1813 || (FITS_IN_8_BITS(c) \ 1814 && ( IN_RANGE(NATIVE_TO_LATIN1((U8) c), \ 1815 0xC0, 0xDE) \ 1816 && NATIVE_TO_LATIN1((U8) c) != 0xD7))) 1817 # define isWORDCHAR_L1(c) (isIDFIRST_L1(c) || isDIGIT_A(c)) 1818 # define isIDFIRST_L1(c) (isALPHA_L1(c) || NATIVE_TO_LATIN1(c) == '_') 1819 # define isCHARNAME_CONT(c) (isWORDCHAR_L1(c) \ 1820 || isBLANK_L1(c) \ 1821 || (c) == '-' \ 1822 || (c) == '(' \ 1823 || (c) == ')') 1824 /* The following are not fully accurate in the above-ASCII range. 
I (khw) 1825 * don't think it's necessary to be so for the purposes where this gets 1826 * compiled */ 1827 # define _isQUOTEMETA(c) (FITS_IN_8_BITS(c) && ! isWORDCHAR_L1(c)) 1828 # define _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) isALPHA_L1(c) 1829 1830 /* And these aren't accurate at all. They are useful only for above 1831 * Latin1, which utilities and bootstrapping don't deal with */ 1832 # define _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) 0 1833 # define _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) 0 1834 # define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) 0 1835 1836 /* Many of the macros later in this file are defined in terms of these. By 1837 * implementing them with a function, which converts the class number into 1838 * a call to the desired macro, all of the later ones work. However, that 1839 * function won't be actually defined when building a utility program (no 1840 * perl.h), and so a compiler error will be generated if one is attempted 1841 * to be used. 
And the above-Latin1 code points require Unicode tables to 1842 * be present, something unlikely to be the case when bootstrapping */ 1843 # define _generic_isCC(c, classnum) \ 1844 (FITS_IN_8_BITS(c) && S_bootstrap_ctype((U8) (c), (classnum), TRUE)) 1845 # define _generic_isCC_A(c, classnum) \ 1846 (FITS_IN_8_BITS(c) && S_bootstrap_ctype((U8) (c), (classnum), FALSE)) 1847 #endif /* End of no perl.h H_PERL */ 1848 1849 #define isALPHANUMERIC(c) isALPHANUMERIC_A(c) 1850 #define isALPHA(c) isALPHA_A(c) 1851 #define isASCII_A(c) isASCII(c) 1852 #define isASCII_L1(c) isASCII(c) 1853 #define isBLANK(c) isBLANK_A(c) 1854 #define isCNTRL(c) isCNTRL_A(c) 1855 #define isDIGIT(c) isDIGIT_A(c) 1856 #define isGRAPH(c) isGRAPH_A(c) 1857 #define isIDFIRST(c) isIDFIRST_A(c) 1858 #define isLOWER(c) isLOWER_A(c) 1859 #define isPRINT(c) isPRINT_A(c) 1860 #define isPSXSPC_A(c) isSPACE_A(c) 1861 #define isPSXSPC(c) isPSXSPC_A(c) 1862 #define isPSXSPC_L1(c) isSPACE_L1(c) 1863 #define isPUNCT(c) isPUNCT_A(c) 1864 #define isSPACE(c) isSPACE_A(c) 1865 #define isUPPER(c) isUPPER_A(c) 1866 #define isWORDCHAR(c) isWORDCHAR_A(c) 1867 #define isXDIGIT(c) isXDIGIT_A(c) 1868 1869 /* ASCII casing. These could also be written as 1870 #define toLOWER(c) (isASCII(c) ? toLOWER_LATIN1(c) : (c)) 1871 #define toUPPER(c) (isASCII(c) ? toUPPER_LATIN1_MOD(c) : (c)) 1872 which uses table lookup and mask instead of subtraction. (This would 1873 work because the _MOD does not apply in the ASCII range). 1874 1875 These actually are UTF-8 invariant casing, not just ASCII, as any non-ASCII 1876 UTF-8 invariants are neither upper nor lower. (Only on EBCDIC platforms are 1877 there non-ASCII invariants, and all of them are controls.) */ 1878 #define toLOWER(c) (isUPPER(c) ? (U8)((c) + ('a' - 'A')) : (c)) 1879 #define toUPPER(c) (isLOWER(c) ? (U8)((c) - ('a' - 'A')) : (c)) 1880 1881 /* In the ASCII range, these are equivalent to what they're here defined to be. 
1882 * But by creating these definitions, other code doesn't have to be aware of 1883 * this detail. Actually this works for all UTF-8 invariants, not just the 1884 * ASCII range. (EBCDIC platforms can have non-ASCII invariants.) */ 1885 #define toFOLD(c) toLOWER(c) 1886 #define toTITLE(c) toUPPER(c) 1887 1888 #define toLOWER_A(c) toLOWER(c) 1889 #define toUPPER_A(c) toUPPER(c) 1890 #define toFOLD_A(c) toFOLD(c) 1891 #define toTITLE_A(c) toTITLE(c) 1892 1893 /* Use table lookup for speed; returns the input itself if is out-of-range */ 1894 #define toLOWER_LATIN1(c) ((! FITS_IN_8_BITS(c)) \ 1895 ? (c) \ 1896 : PL_latin1_lc[ (U8) (c) ]) 1897 #define toLOWER_L1(c) toLOWER_LATIN1(c) /* Synonym for consistency */ 1898 1899 /* Modified uc. Is correct uc except for three non-ascii chars which are 1900 * all mapped to one of them, and these need special handling; returns the 1901 * input itself if is out-of-range */ 1902 #define toUPPER_LATIN1_MOD(c) ((! FITS_IN_8_BITS(c)) \ 1903 ? (c) \ 1904 : PL_mod_latin1_uc[ (U8) (c) ]) 1905 #define IN_UTF8_CTYPE_LOCALE PL_in_utf8_CTYPE_locale 1906 1907 /* Use foo_LC_uvchr() instead of these for beyond the Latin1 range */ 1908 1909 /* For internal core Perl use only: the base macro for defining macros like 1910 * isALPHA_LC, which uses the current LC_CTYPE locale. 'c' is the code point 1911 * (0-255) to check. In a UTF-8 locale, the result is the same as calling 1912 * isFOO_L1(); the 'utf8_locale_classnum' parameter is something like 1913 * _CC_UPPER, which gives the class number for doing this. For non-UTF-8 1914 * locales, the code to actually do the test this is passed in 'non_utf8'. If 1915 * 'c' is above 255, 0 is returned. For accessing the full range of possible 1916 * code points under locale rules, use the macros based on _generic_LC_uvchr 1917 * instead of this. */ 1918 #define _generic_LC_base(c, utf8_locale_classnum, non_utf8) \ 1919 (! FITS_IN_8_BITS(c) \ 1920 ? 0 \ 1921 : IN_UTF8_CTYPE_LOCALE \ 1922 ? 
cBOOL(PL_charclass[(U8) (c)] & _CC_mask(utf8_locale_classnum)) \ 1923 : cBOOL(non_utf8)) 1924 1925 /* For internal core Perl use only: a helper macro for defining macros like 1926 * isALPHA_LC. 'c' is the code point (0-255) to check. The function name to 1927 * actually do this test is passed in 'non_utf8_func', which is called on 'c', 1928 * casting 'c' to the macro _LC_CAST, which should not be parenthesized. See 1929 * _generic_LC_base for more info */ 1930 #define _generic_LC(c, utf8_locale_classnum, non_utf8_func) \ 1931 _generic_LC_base(c,utf8_locale_classnum, \ 1932 non_utf8_func( (_LC_CAST) (c))) 1933 1934 /* For internal core Perl use only: like _generic_LC, but also returns TRUE if 1935 * 'c' is the platform's native underscore character */ 1936 #define _generic_LC_underscore(c,utf8_locale_classnum,non_utf8_func) \ 1937 _generic_LC_base(c, utf8_locale_classnum, \ 1938 (non_utf8_func( (_LC_CAST) (c)) \ 1939 || (char)(c) == '_')) 1940 1941 /* These next three are also for internal core Perl use only: case-change 1942 * helper macros. The reason for using the PL_latin arrays is in case the 1943 * system function is defective; it ensures uniform results that conform to the 1944 * Unicod standard. It does not handle the anomalies in UTF-8 Turkic locales */ 1945 #define _generic_toLOWER_LC(c, function, cast) (! FITS_IN_8_BITS(c) \ 1946 ? (c) \ 1947 : (IN_UTF8_CTYPE_LOCALE) \ 1948 ? PL_latin1_lc[ (U8) (c) ] \ 1949 : (cast)function((cast)(c))) 1950 1951 /* Note that the result can be larger than a byte in a UTF-8 locale. It 1952 * returns a single value, so can't adequately return the upper case of LATIN 1953 * SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two 1954 * values "SS"); instead it asserts against that under DEBUGGING, and 1955 * otherwise returns its input. It does not handle the anomalies in UTF-8 1956 * Turkic locales. */ 1957 #define _generic_toUPPER_LC(c, function, cast) \ 1958 (! FITS_IN_8_BITS(c) \ 1959 ? (c) \ 1960 : ((! 
IN_UTF8_CTYPE_LOCALE) \ 1961 ? (cast)function((cast)(c)) \ 1962 : ((((U8)(c)) == MICRO_SIGN) \ 1963 ? GREEK_CAPITAL_LETTER_MU \ 1964 : ((((U8)(c)) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) \ 1965 ? LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS \ 1966 : ((((U8)(c)) == LATIN_SMALL_LETTER_SHARP_S) \ 1967 ? (__ASSERT_(0) (c)) \ 1968 : PL_mod_latin1_uc[ (U8) (c) ]))))) 1969 1970 /* Note that the result can be larger than a byte in a UTF-8 locale. It 1971 * returns a single value, so can't adequately return the fold case of LATIN 1972 * SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two 1973 * values "ss"); instead it asserts against that under DEBUGGING, and 1974 * otherwise returns its input. It does not handle the anomalies in UTF-8 1975 * Turkic locales */ 1976 #define _generic_toFOLD_LC(c, function, cast) \ 1977 ((UNLIKELY((c) == MICRO_SIGN) && IN_UTF8_CTYPE_LOCALE) \ 1978 ? GREEK_SMALL_LETTER_MU \ 1979 : (__ASSERT_(! IN_UTF8_CTYPE_LOCALE \ 1980 || (c) != LATIN_SMALL_LETTER_SHARP_S) \ 1981 _generic_toLOWER_LC(c, function, cast))) 1982 1983 /* Use the libc versions for these if available. */ 1984 #if defined(HAS_ISASCII) 1985 # define isASCII_LC(c) (FITS_IN_8_BITS(c) && isascii( (U8) (c))) 1986 #else 1987 # define isASCII_LC(c) isASCII(c) 1988 #endif 1989 1990 #if defined(HAS_ISBLANK) 1991 # define isBLANK_LC(c) _generic_LC(c, _CC_BLANK, isblank) 1992 #else /* Unlike isASCII, varies if in a UTF-8 locale */ 1993 # define isBLANK_LC(c) ((IN_UTF8_CTYPE_LOCALE) ? isBLANK_L1(c) : isBLANK(c)) 1994 #endif 1995 1996 #define _LC_CAST U8 1997 1998 #ifdef WIN32 1999 /* The Windows functions don't bother to follow the POSIX standard, which 2000 * for example says that something can't both be a printable and a control. 2001 * But Windows treats the \t control as a printable, and does such things 2002 * as making superscripts into both digits and punctuation. 
This tames 2003 * these flaws by assuming that the definitions of both controls and space 2004 * are correct, and then making sure that other definitions don't have 2005 * weirdnesses, by making sure that isalnum() isn't also ispunct(), etc. 2006 * Not all possible weirdnesses are checked for, just the ones that were 2007 * detected on actual Microsoft code pages */ 2008 2009 # define isCNTRL_LC(c) _generic_LC(c, _CC_CNTRL, iscntrl) 2010 # define isSPACE_LC(c) _generic_LC(c, _CC_SPACE, isspace) 2011 2012 # define isALPHA_LC(c) (_generic_LC(c, _CC_ALPHA, isalpha) \ 2013 && isALPHANUMERIC_LC(c)) 2014 # define isALPHANUMERIC_LC(c) (_generic_LC(c, _CC_ALPHANUMERIC, isalnum) && \ 2015 ! isPUNCT_LC(c)) 2016 # define isDIGIT_LC(c) (_generic_LC(c, _CC_DIGIT, isdigit) && \ 2017 isALPHANUMERIC_LC(c)) 2018 # define isGRAPH_LC(c) (_generic_LC(c, _CC_GRAPH, isgraph) && isPRINT_LC(c)) 2019 # define isIDFIRST_LC(c) (((c) == '_') \ 2020 || (_generic_LC(c, _CC_IDFIRST, isalpha) && ! isPUNCT_LC(c))) 2021 # define isLOWER_LC(c) (_generic_LC(c, _CC_LOWER, islower) && isALPHA_LC(c)) 2022 # define isPRINT_LC(c) (_generic_LC(c, _CC_PRINT, isprint) && ! isCNTRL_LC(c)) 2023 # define isPUNCT_LC(c) (_generic_LC(c, _CC_PUNCT, ispunct) && ! 
isCNTRL_LC(c)) 2024 # define isUPPER_LC(c) (_generic_LC(c, _CC_UPPER, isupper) && isALPHA_LC(c)) 2025 # define isWORDCHAR_LC(c) (((c) == '_') || isALPHANUMERIC_LC(c)) 2026 # define isXDIGIT_LC(c) (_generic_LC(c, _CC_XDIGIT, isxdigit) \ 2027 && isALPHANUMERIC_LC(c)) 2028 2029 # define toLOWER_LC(c) _generic_toLOWER_LC((c), tolower, U8) 2030 # define toUPPER_LC(c) _generic_toUPPER_LC((c), toupper, U8) 2031 # define toFOLD_LC(c) _generic_toFOLD_LC((c), tolower, U8) 2032 2033 #elif defined(CTYPE256) || (!defined(isascii) && !defined(HAS_ISASCII)) 2034 /* For most other platforms */ 2035 2036 # define isALPHA_LC(c) _generic_LC(c, _CC_ALPHA, isalpha) 2037 # define isALPHANUMERIC_LC(c) _generic_LC(c, _CC_ALPHANUMERIC, isalnum) 2038 # define isCNTRL_LC(c) _generic_LC(c, _CC_CNTRL, iscntrl) 2039 # define isDIGIT_LC(c) _generic_LC(c, _CC_DIGIT, isdigit) 2040 # define isGRAPH_LC(c) _generic_LC(c, _CC_GRAPH, isgraph) 2041 # define isIDFIRST_LC(c) _generic_LC_underscore(c, _CC_IDFIRST, isalpha) 2042 # define isLOWER_LC(c) _generic_LC(c, _CC_LOWER, islower) 2043 # define isPRINT_LC(c) _generic_LC(c, _CC_PRINT, isprint) 2044 # define isPUNCT_LC(c) _generic_LC(c, _CC_PUNCT, ispunct) 2045 # define isSPACE_LC(c) _generic_LC(c, _CC_SPACE, isspace) 2046 # define isUPPER_LC(c) _generic_LC(c, _CC_UPPER, isupper) 2047 # define isWORDCHAR_LC(c) _generic_LC_underscore(c, _CC_WORDCHAR, isalnum) 2048 # define isXDIGIT_LC(c) _generic_LC(c, _CC_XDIGIT, isxdigit) 2049 2050 2051 # define toLOWER_LC(c) _generic_toLOWER_LC((c), tolower, U8) 2052 # define toUPPER_LC(c) _generic_toUPPER_LC((c), toupper, U8) 2053 # define toFOLD_LC(c) _generic_toFOLD_LC((c), tolower, U8) 2054 2055 #else /* The final fallback position */ 2056 2057 # define isALPHA_LC(c) (isascii(c) && isalpha(c)) 2058 # define isALPHANUMERIC_LC(c) (isascii(c) && isalnum(c)) 2059 # define isCNTRL_LC(c) (isascii(c) && iscntrl(c)) 2060 # define isDIGIT_LC(c) (isascii(c) && isdigit(c)) 2061 # define isGRAPH_LC(c) (isascii(c) && 
isgraph(c)) 2062 # define isIDFIRST_LC(c) (isascii(c) && (isalpha(c) || (c) == '_')) 2063 # define isLOWER_LC(c) (isascii(c) && islower(c)) 2064 # define isPRINT_LC(c) (isascii(c) && isprint(c)) 2065 # define isPUNCT_LC(c) (isascii(c) && ispunct(c)) 2066 # define isSPACE_LC(c) (isascii(c) && isspace(c)) 2067 # define isUPPER_LC(c) (isascii(c) && isupper(c)) 2068 # define isWORDCHAR_LC(c) (isascii(c) && (isalnum(c) || (c) == '_')) 2069 # define isXDIGIT_LC(c) (isascii(c) && isxdigit(c)) 2070 2071 # define toLOWER_LC(c) (isascii(c) ? tolower(c) : (c)) 2072 # define toUPPER_LC(c) (isascii(c) ? toupper(c) : (c)) 2073 # define toFOLD_LC(c) (isascii(c) ? tolower(c) : (c)) 2074 2075 #endif 2076 2077 #define isIDCONT(c) isWORDCHAR(c) 2078 #define isIDCONT_A(c) isWORDCHAR_A(c) 2079 #define isIDCONT_L1(c) isWORDCHAR_L1(c) 2080 #define isIDCONT_LC(c) isWORDCHAR_LC(c) 2081 #define isPSXSPC_LC(c) isSPACE_LC(c) 2082 2083 /* For internal core Perl use only: the base macros for defining macros like 2084 * isALPHA_uvchr. 'c' is the code point to check. 'classnum' is the POSIX class 2085 * number defined earlier in this file. _generic_uvchr() is used for POSIX 2086 * classes where there is a macro or function 'above_latin1' that takes the 2087 * single argument 'c' and returns the desired value. These exist for those 2088 * classes which have simple definitions, avoiding the overhead of an inversion 2089 * list binary search. _generic_invlist_uvchr() can be used 2090 * for classes where that overhead is faster than a direct lookup. 2091 * _generic_uvchr() won't compile if 'c' isn't unsigned, as it won't match the 2092 * 'above_latin1' prototype. _generic_isCC() macro does bounds checking, so 2093 * have duplicate checks here, so could create versions of the macros that 2094 * don't, but experiments show that gcc optimizes them out anyway. */ 2095 2096 /* Note that all ignore 'use bytes' */ 2097 #define _generic_uvchr(classnum, above_latin1, c) ((c) < 256 \ 2098 ? 
_generic_isCC(c, classnum) \ 2099 : above_latin1(c)) 2100 #define _generic_invlist_uvchr(classnum, c) ((c) < 256 \ 2101 ? _generic_isCC(c, classnum) \ 2102 : _is_uni_FOO(classnum, c)) 2103 #define isALPHA_uvchr(c) _generic_invlist_uvchr(_CC_ALPHA, c) 2104 #define isALPHANUMERIC_uvchr(c) _generic_invlist_uvchr(_CC_ALPHANUMERIC, c) 2105 #define isASCII_uvchr(c) isASCII(c) 2106 #define isBLANK_uvchr(c) _generic_uvchr(_CC_BLANK, is_HORIZWS_cp_high, c) 2107 #define isCNTRL_uvchr(c) isCNTRL_L1(c) /* All controls are in Latin1 */ 2108 #define isDIGIT_uvchr(c) _generic_invlist_uvchr(_CC_DIGIT, c) 2109 #define isGRAPH_uvchr(c) _generic_invlist_uvchr(_CC_GRAPH, c) 2110 #define isIDCONT_uvchr(c) \ 2111 _generic_uvchr(_CC_WORDCHAR, _is_uni_perl_idcont, c) 2112 #define isIDFIRST_uvchr(c) \ 2113 _generic_uvchr(_CC_IDFIRST, _is_uni_perl_idstart, c) 2114 #define isLOWER_uvchr(c) _generic_invlist_uvchr(_CC_LOWER, c) 2115 #define isPRINT_uvchr(c) _generic_invlist_uvchr(_CC_PRINT, c) 2116 2117 #define isPUNCT_uvchr(c) _generic_invlist_uvchr(_CC_PUNCT, c) 2118 #define isSPACE_uvchr(c) _generic_uvchr(_CC_SPACE, is_XPERLSPACE_cp_high, c) 2119 #define isPSXSPC_uvchr(c) isSPACE_uvchr(c) 2120 2121 #define isUPPER_uvchr(c) _generic_invlist_uvchr(_CC_UPPER, c) 2122 #define isVERTWS_uvchr(c) _generic_uvchr(_CC_VERTSPACE, is_VERTWS_cp_high, c) 2123 #define isWORDCHAR_uvchr(c) _generic_invlist_uvchr(_CC_WORDCHAR, c) 2124 #define isXDIGIT_uvchr(c) _generic_uvchr(_CC_XDIGIT, is_XDIGIT_cp_high, c) 2125 2126 #define toFOLD_uvchr(c,s,l) to_uni_fold(c,s,l) 2127 #define toLOWER_uvchr(c,s,l) to_uni_lower(c,s,l) 2128 #define toTITLE_uvchr(c,s,l) to_uni_title(c,s,l) 2129 #define toUPPER_uvchr(c,s,l) to_uni_upper(c,s,l) 2130 2131 /* For backwards compatibility, even though '_uni' should mean official Unicode 2132 * code points, in Perl it means native for those below 256 */ 2133 #define isALPHA_uni(c) isALPHA_uvchr(c) 2134 #define isALPHANUMERIC_uni(c) isALPHANUMERIC_uvchr(c) 2135 #define isASCII_uni(c) 
isASCII_uvchr(c) 2136 #define isBLANK_uni(c) isBLANK_uvchr(c) 2137 #define isCNTRL_uni(c) isCNTRL_uvchr(c) 2138 #define isDIGIT_uni(c) isDIGIT_uvchr(c) 2139 #define isGRAPH_uni(c) isGRAPH_uvchr(c) 2140 #define isIDCONT_uni(c) isIDCONT_uvchr(c) 2141 #define isIDFIRST_uni(c) isIDFIRST_uvchr(c) 2142 #define isLOWER_uni(c) isLOWER_uvchr(c) 2143 #define isPRINT_uni(c) isPRINT_uvchr(c) 2144 #define isPUNCT_uni(c) isPUNCT_uvchr(c) 2145 #define isSPACE_uni(c) isSPACE_uvchr(c) 2146 #define isPSXSPC_uni(c) isPSXSPC_uvchr(c) 2147 #define isUPPER_uni(c) isUPPER_uvchr(c) 2148 #define isVERTWS_uni(c) isVERTWS_uvchr(c) 2149 #define isWORDCHAR_uni(c) isWORDCHAR_uvchr(c) 2150 #define isXDIGIT_uni(c) isXDIGIT_uvchr(c) 2151 #define toFOLD_uni(c,s,l) toFOLD_uvchr(c,s,l) 2152 #define toLOWER_uni(c,s,l) toLOWER_uvchr(c,s,l) 2153 #define toTITLE_uni(c,s,l) toTITLE_uvchr(c,s,l) 2154 #define toUPPER_uni(c,s,l) toUPPER_uvchr(c,s,l) 2155 2156 /* For internal core Perl use only: the base macros for defining macros like 2157 * isALPHA_LC_uvchr. These are like isALPHA_LC, but the input can be any code 2158 * point, not just 0-255. Like _generic_uvchr, there are two versions, one for 2159 * simple class definitions; the other for more complex. These are like 2160 * _generic_uvchr, so see it for more info. */ 2161 #define _generic_LC_uvchr(latin1, above_latin1, c) \ 2162 (c < 256 ? latin1(c) : above_latin1(c)) 2163 #define _generic_LC_invlist_uvchr(latin1, classnum, c) \ 2164 (c < 256 ? latin1(c) : _is_uni_FOO(classnum, c)) 2165 2166 #define isALPHA_LC_uvchr(c) _generic_LC_invlist_uvchr(isALPHA_LC, _CC_ALPHA, c) 2167 #define isALPHANUMERIC_LC_uvchr(c) _generic_LC_invlist_uvchr(isALPHANUMERIC_LC, \ 2168 _CC_ALPHANUMERIC, c) 2169 #define isASCII_LC_uvchr(c) isASCII_LC(c) 2170 #define isBLANK_LC_uvchr(c) _generic_LC_uvchr(isBLANK_LC, \ 2171 is_HORIZWS_cp_high, c) 2172 #define isCNTRL_LC_uvchr(c) (c < 256 ? 
isCNTRL_LC(c) : 0) 2173 #define isDIGIT_LC_uvchr(c) _generic_LC_invlist_uvchr(isDIGIT_LC, _CC_DIGIT, c) 2174 #define isGRAPH_LC_uvchr(c) _generic_LC_invlist_uvchr(isGRAPH_LC, _CC_GRAPH, c) 2175 #define isIDCONT_LC_uvchr(c) _generic_LC_uvchr(isIDCONT_LC, \ 2176 _is_uni_perl_idcont, c) 2177 #define isIDFIRST_LC_uvchr(c) _generic_LC_uvchr(isIDFIRST_LC, \ 2178 _is_uni_perl_idstart, c) 2179 #define isLOWER_LC_uvchr(c) _generic_LC_invlist_uvchr(isLOWER_LC, _CC_LOWER, c) 2180 #define isPRINT_LC_uvchr(c) _generic_LC_invlist_uvchr(isPRINT_LC, _CC_PRINT, c) 2181 #define isPSXSPC_LC_uvchr(c) isSPACE_LC_uvchr(c) 2182 #define isPUNCT_LC_uvchr(c) _generic_LC_invlist_uvchr(isPUNCT_LC, _CC_PUNCT, c) 2183 #define isSPACE_LC_uvchr(c) _generic_LC_uvchr(isSPACE_LC, \ 2184 is_XPERLSPACE_cp_high, c) 2185 #define isUPPER_LC_uvchr(c) _generic_LC_invlist_uvchr(isUPPER_LC, _CC_UPPER, c) 2186 #define isWORDCHAR_LC_uvchr(c) _generic_LC_invlist_uvchr(isWORDCHAR_LC, \ 2187 _CC_WORDCHAR, c) 2188 #define isXDIGIT_LC_uvchr(c) _generic_LC_uvchr(isXDIGIT_LC, \ 2189 is_XDIGIT_cp_high, c) 2190 2191 #define isBLANK_LC_uni(c) isBLANK_LC_uvchr(UNI_TO_NATIVE(c)) 2192 2193 /* The "_safe" macros make sure that we don't attempt to read beyond 'e', but 2194 * they don't otherwise go out of their way to look for malformed UTF-8. If 2195 * they can return accurate results without knowing if the input is otherwise 2196 * malformed, they do so. For example isASCII is accurate in spite of any 2197 * non-length malformations because it looks only at a single byte. Likewise 2198 * isDIGIT looks just at the first byte for code points 0-255, as all UTF-8 2199 * variant ones return FALSE. But, if the input has to be well-formed in order 2200 * for the results to be accurate, the macros will test and if malformed will 2201 * call a routine to die 2202 * 2203 * Except for toke.c, the macros do assume that e > p, asserting that on 2204 * DEBUGGING builds. 
Much code that calls these depends on this being true, 2205 * for other reasons. toke.c is treated specially as using the regular 2206 * assertion breaks it in many ways. All strings that these operate on there 2207 * are supposed to have an extra NUL character at the end, so that *e = \0. A 2208 * bunch of code in toke.c assumes that this is true, so the assertion allows 2209 * for that */ 2210 #ifdef PERL_IN_TOKE_C 2211 # define _utf8_safe_assert(p,e) ((e) > (p) || ((e) == (p) && *(p) == '\0')) 2212 #else 2213 # define _utf8_safe_assert(p,e) ((e) > (p)) 2214 #endif 2215 2216 #define _generic_utf8_safe(classnum, p, e, above_latin1) \ 2217 ((! _utf8_safe_assert(p, e)) \ 2218 ? (_force_out_malformed_utf8_message((U8 *) (p), (U8 *) (e), 0, 1), 0)\ 2219 : (UTF8_IS_INVARIANT(*(p))) \ 2220 ? _generic_isCC(*(p), classnum) \ 2221 : (UTF8_IS_DOWNGRADEABLE_START(*(p)) \ 2222 ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \ 2223 ? _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1 )), \ 2224 classnum) \ 2225 : (_force_out_malformed_utf8_message( \ 2226 (U8 *) (p), (U8 *) (e), 0, 1), 0)) \ 2227 : above_latin1)) 2228 /* Like the above, but calls 'above_latin1(p)' to get the utf8 value. 2229 * 'above_latin1' can be a macro */ 2230 #define _generic_func_utf8_safe(classnum, above_latin1, p, e) \ 2231 _generic_utf8_safe(classnum, p, e, above_latin1(p, e)) 2232 #define _generic_non_invlist_utf8_safe(classnum, above_latin1, p, e) \ 2233 _generic_utf8_safe(classnum, p, e, \ 2234 (UNLIKELY((e) - (p) < UTF8SKIP(p)) \ 2235 ? 
(_force_out_malformed_utf8_message( \ 2236 (U8 *) (p), (U8 *) (e), 0, 1), 0) \ 2237 : above_latin1(p))) 2238 /* Like the above, but passes classnum to _isFOO_utf8(), instead of having an 2239 * 'above_latin1' parameter */ 2240 #define _generic_invlist_utf8_safe(classnum, p, e) \ 2241 _generic_utf8_safe(classnum, p, e, _is_utf8_FOO(classnum, p, e)) 2242 2243 /* Like the above, but should be used only when it is known that there are no 2244 * characters in the upper-Latin1 range (128-255 on ASCII platforms) which the 2245 * class is TRUE for. Hence it can skip the tests for this range. 2246 * 'above_latin1' should include its arguments */ 2247 #define _generic_utf8_safe_no_upper_latin1(classnum, p, e, above_latin1) \ 2248 (__ASSERT_(_utf8_safe_assert(p, e)) \ 2249 (isASCII(*(p))) \ 2250 ? _generic_isCC(*(p), classnum) \ 2251 : (UTF8_IS_DOWNGRADEABLE_START(*(p))) \ 2252 ? 0 /* Note that doesn't check validity for latin1 */ \ 2253 : above_latin1) 2254 2255 2256 #define isALPHA_utf8(p, e) isALPHA_utf8_safe(p, e) 2257 #define isALPHANUMERIC_utf8(p, e) isALPHANUMERIC_utf8_safe(p, e) 2258 #define isASCII_utf8(p, e) isASCII_utf8_safe(p, e) 2259 #define isBLANK_utf8(p, e) isBLANK_utf8_safe(p, e) 2260 #define isCNTRL_utf8(p, e) isCNTRL_utf8_safe(p, e) 2261 #define isDIGIT_utf8(p, e) isDIGIT_utf8_safe(p, e) 2262 #define isGRAPH_utf8(p, e) isGRAPH_utf8_safe(p, e) 2263 #define isIDCONT_utf8(p, e) isIDCONT_utf8_safe(p, e) 2264 #define isIDFIRST_utf8(p, e) isIDFIRST_utf8_safe(p, e) 2265 #define isLOWER_utf8(p, e) isLOWER_utf8_safe(p, e) 2266 #define isPRINT_utf8(p, e) isPRINT_utf8_safe(p, e) 2267 #define isPSXSPC_utf8(p, e) isPSXSPC_utf8_safe(p, e) 2268 #define isPUNCT_utf8(p, e) isPUNCT_utf8_safe(p, e) 2269 #define isSPACE_utf8(p, e) isSPACE_utf8_safe(p, e) 2270 #define isUPPER_utf8(p, e) isUPPER_utf8_safe(p, e) 2271 #define isVERTWS_utf8(p, e) isVERTWS_utf8_safe(p, e) 2272 #define isWORDCHAR_utf8(p, e) isWORDCHAR_utf8_safe(p, e) 2273 #define isXDIGIT_utf8(p, e) 
isXDIGIT_utf8_safe(p, e) 2274 2275 #define isALPHA_utf8_safe(p, e) _generic_invlist_utf8_safe(_CC_ALPHA, p, e) 2276 #define isALPHANUMERIC_utf8_safe(p, e) \ 2277 _generic_invlist_utf8_safe(_CC_ALPHANUMERIC, p, e) 2278 #define isASCII_utf8_safe(p, e) \ 2279 /* Because ASCII is invariant under utf8, the non-utf8 macro \ 2280 * works */ \ 2281 (__ASSERT_(_utf8_safe_assert(p, e)) isASCII(*(p))) 2282 #define isBLANK_utf8_safe(p, e) \ 2283 _generic_non_invlist_utf8_safe(_CC_BLANK, is_HORIZWS_high, p, e) 2284 2285 #ifdef EBCDIC 2286 /* Because all controls are UTF-8 invariants in EBCDIC, we can use this 2287 * more efficient macro instead of the more general one */ 2288 # define isCNTRL_utf8_safe(p, e) \ 2289 (__ASSERT_(_utf8_safe_assert(p, e)) isCNTRL_L1(*(p))) 2290 #else 2291 # define isCNTRL_utf8_safe(p, e) _generic_utf8_safe(_CC_CNTRL, p, e, 0) 2292 #endif 2293 2294 #define isDIGIT_utf8_safe(p, e) \ 2295 _generic_utf8_safe_no_upper_latin1(_CC_DIGIT, p, e, \ 2296 _is_utf8_FOO(_CC_DIGIT, p, e)) 2297 #define isGRAPH_utf8_safe(p, e) _generic_invlist_utf8_safe(_CC_GRAPH, p, e) 2298 #define isIDCONT_utf8_safe(p, e) _generic_func_utf8_safe(_CC_WORDCHAR, \ 2299 _is_utf8_perl_idcont, p, e) 2300 2301 /* To prevent S_scan_word in toke.c from hanging, we have to make sure that 2302 * IDFIRST is an alnum. See 2303 * https://github.com/Perl/perl5/issues/10275 for more detail than you 2304 * ever wanted to know about. (In the ASCII range, there isn't a difference.) 
2305 * This used to be not the XID version, but we decided to go with the more 2306 * modern Unicode definition */ 2307 #define isIDFIRST_utf8_safe(p, e) \ 2308 _generic_func_utf8_safe(_CC_IDFIRST, \ 2309 _is_utf8_perl_idstart, (U8 *) (p), (U8 *) (e)) 2310 2311 #define isLOWER_utf8_safe(p, e) _generic_invlist_utf8_safe(_CC_LOWER, p, e) 2312 #define isPRINT_utf8_safe(p, e) _generic_invlist_utf8_safe(_CC_PRINT, p, e) 2313 #define isPSXSPC_utf8_safe(p, e) isSPACE_utf8_safe(p, e) 2314 #define isPUNCT_utf8_safe(p, e) _generic_invlist_utf8_safe(_CC_PUNCT, p, e) 2315 #define isSPACE_utf8_safe(p, e) \ 2316 _generic_non_invlist_utf8_safe(_CC_SPACE, is_XPERLSPACE_high, p, e) 2317 #define isUPPER_utf8_safe(p, e) _generic_invlist_utf8_safe(_CC_UPPER, p, e) 2318 #define isVERTWS_utf8_safe(p, e) \ 2319 _generic_non_invlist_utf8_safe(_CC_VERTSPACE, is_VERTWS_high, p, e) 2320 #define isWORDCHAR_utf8_safe(p, e) \ 2321 _generic_invlist_utf8_safe(_CC_WORDCHAR, p, e) 2322 #define isXDIGIT_utf8_safe(p, e) \ 2323 _generic_utf8_safe_no_upper_latin1(_CC_XDIGIT, p, e, \ 2324 (UNLIKELY((e) - (p) < UTF8SKIP(p)) \ 2325 ? 
(_force_out_malformed_utf8_message( \ 2326 (U8 *) (p), (U8 *) (e), 0, 1), 0) \ 2327 : is_XDIGIT_high(p))) 2328 2329 #define toFOLD_utf8(p,e,s,l) toFOLD_utf8_safe(p,e,s,l) 2330 #define toLOWER_utf8(p,e,s,l) toLOWER_utf8_safe(p,e,s,l) 2331 #define toTITLE_utf8(p,e,s,l) toTITLE_utf8_safe(p,e,s,l) 2332 #define toUPPER_utf8(p,e,s,l) toUPPER_utf8_safe(p,e,s,l) 2333 2334 /* For internal core use only, subject to change */ 2335 #define _toFOLD_utf8_flags(p,e,s,l,f) _to_utf8_fold_flags (p,e,s,l,f) 2336 #define _toLOWER_utf8_flags(p,e,s,l,f) _to_utf8_lower_flags(p,e,s,l,f) 2337 #define _toTITLE_utf8_flags(p,e,s,l,f) _to_utf8_title_flags(p,e,s,l,f) 2338 #define _toUPPER_utf8_flags(p,e,s,l,f) _to_utf8_upper_flags(p,e,s,l,f) 2339 2340 #define toFOLD_utf8_safe(p,e,s,l) _toFOLD_utf8_flags(p,e,s,l, FOLD_FLAGS_FULL) 2341 #define toLOWER_utf8_safe(p,e,s,l) _toLOWER_utf8_flags(p,e,s,l, 0) 2342 #define toTITLE_utf8_safe(p,e,s,l) _toTITLE_utf8_flags(p,e,s,l, 0) 2343 #define toUPPER_utf8_safe(p,e,s,l) _toUPPER_utf8_flags(p,e,s,l, 0) 2344 2345 #define isALPHA_LC_utf8(p, e) isALPHA_LC_utf8_safe(p, e) 2346 #define isALPHANUMERIC_LC_utf8(p, e) isALPHANUMERIC_LC_utf8_safe(p, e) 2347 #define isASCII_LC_utf8(p, e) isASCII_LC_utf8_safe(p, e) 2348 #define isBLANK_LC_utf8(p, e) isBLANK_LC_utf8_safe(p, e) 2349 #define isCNTRL_LC_utf8(p, e) isCNTRL_LC_utf8_safe(p, e) 2350 #define isDIGIT_LC_utf8(p, e) isDIGIT_LC_utf8_safe(p, e) 2351 #define isGRAPH_LC_utf8(p, e) isGRAPH_LC_utf8_safe(p, e) 2352 #define isIDCONT_LC_utf8(p, e) isIDCONT_LC_utf8_safe(p, e) 2353 #define isIDFIRST_LC_utf8(p, e) isIDFIRST_LC_utf8_safe(p, e) 2354 #define isLOWER_LC_utf8(p, e) isLOWER_LC_utf8_safe(p, e) 2355 #define isPRINT_LC_utf8(p, e) isPRINT_LC_utf8_safe(p, e) 2356 #define isPSXSPC_LC_utf8(p, e) isPSXSPC_LC_utf8_safe(p, e) 2357 #define isPUNCT_LC_utf8(p, e) isPUNCT_LC_utf8_safe(p, e) 2358 #define isSPACE_LC_utf8(p, e) isSPACE_LC_utf8_safe(p, e) 2359 #define isUPPER_LC_utf8(p, e) isUPPER_LC_utf8_safe(p, e) 2360 #define 
isWORDCHAR_LC_utf8(p, e) isWORDCHAR_LC_utf8_safe(p, e) 2361 #define isXDIGIT_LC_utf8(p, e) isXDIGIT_LC_utf8_safe(p, e) 2362 2363 /* For internal core Perl use only: the base macros for defining macros like 2364 * isALPHA_LC_utf8_safe. These are like _generic_utf8, but if the first code 2365 * point in 'p' is within the 0-255 range, it uses locale rules from the 2366 * passed-in 'macro' parameter */ 2367 #define _generic_LC_utf8_safe(macro, p, e, above_latin1) \ 2368 (__ASSERT_(_utf8_safe_assert(p, e)) \ 2369 (UTF8_IS_INVARIANT(*(p))) \ 2370 ? macro(*(p)) \ 2371 : (UTF8_IS_DOWNGRADEABLE_START(*(p)) \ 2372 ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \ 2373 ? macro(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1))) \ 2374 : (_force_out_malformed_utf8_message( \ 2375 (U8 *) (p), (U8 *) (e), 0, 1), 0)) \ 2376 : above_latin1)) 2377 2378 #define _generic_LC_invlist_utf8_safe(macro, classnum, p, e) \ 2379 _generic_LC_utf8_safe(macro, p, e, \ 2380 _is_utf8_FOO(classnum, p, e)) 2381 2382 #define _generic_LC_func_utf8_safe(macro, above_latin1, p, e) \ 2383 _generic_LC_utf8_safe(macro, p, e, above_latin1(p, e)) 2384 2385 #define _generic_LC_non_invlist_utf8_safe(classnum, above_latin1, p, e) \ 2386 _generic_LC_utf8_safe(classnum, p, e, \ 2387 (UNLIKELY((e) - (p) < UTF8SKIP(p)) \ 2388 ? 
(_force_out_malformed_utf8_message( \ 2389 (U8 *) (p), (U8 *) (e), 0, 1), 0) \ 2390 : above_latin1(p))) 2391 2392 #define isALPHANUMERIC_LC_utf8_safe(p, e) \ 2393 _generic_LC_invlist_utf8_safe(isALPHANUMERIC_LC, \ 2394 _CC_ALPHANUMERIC, p, e) 2395 #define isALPHA_LC_utf8_safe(p, e) \ 2396 _generic_LC_invlist_utf8_safe(isALPHA_LC, _CC_ALPHA, p, e) 2397 #define isASCII_LC_utf8_safe(p, e) \ 2398 (__ASSERT_(_utf8_safe_assert(p, e)) isASCII_LC(*(p))) 2399 #define isBLANK_LC_utf8_safe(p, e) \ 2400 _generic_LC_non_invlist_utf8_safe(isBLANK_LC, is_HORIZWS_high, p, e) 2401 #define isCNTRL_LC_utf8_safe(p, e) \ 2402 _generic_LC_utf8_safe(isCNTRL_LC, p, e, 0) 2403 #define isDIGIT_LC_utf8_safe(p, e) \ 2404 _generic_LC_invlist_utf8_safe(isDIGIT_LC, _CC_DIGIT, p, e) 2405 #define isGRAPH_LC_utf8_safe(p, e) \ 2406 _generic_LC_invlist_utf8_safe(isGRAPH_LC, _CC_GRAPH, p, e) 2407 #define isIDCONT_LC_utf8_safe(p, e) \ 2408 _generic_LC_func_utf8_safe(isIDCONT_LC, \ 2409 _is_utf8_perl_idcont, p, e) 2410 #define isIDFIRST_LC_utf8_safe(p, e) \ 2411 _generic_LC_func_utf8_safe(isIDFIRST_LC, \ 2412 _is_utf8_perl_idstart, p, e) 2413 #define isLOWER_LC_utf8_safe(p, e) \ 2414 _generic_LC_invlist_utf8_safe(isLOWER_LC, _CC_LOWER, p, e) 2415 #define isPRINT_LC_utf8_safe(p, e) \ 2416 _generic_LC_invlist_utf8_safe(isPRINT_LC, _CC_PRINT, p, e) 2417 #define isPSXSPC_LC_utf8_safe(p, e) isSPACE_LC_utf8_safe(p, e) 2418 #define isPUNCT_LC_utf8_safe(p, e) \ 2419 _generic_LC_invlist_utf8_safe(isPUNCT_LC, _CC_PUNCT, p, e) 2420 #define isSPACE_LC_utf8_safe(p, e) \ 2421 _generic_LC_non_invlist_utf8_safe(isSPACE_LC, is_XPERLSPACE_high, p, e) 2422 #define isUPPER_LC_utf8_safe(p, e) \ 2423 _generic_LC_invlist_utf8_safe(isUPPER_LC, _CC_UPPER, p, e) 2424 #define isWORDCHAR_LC_utf8_safe(p, e) \ 2425 _generic_LC_invlist_utf8_safe(isWORDCHAR_LC, _CC_WORDCHAR, p, e) 2426 #define isXDIGIT_LC_utf8_safe(p, e) \ 2427 _generic_LC_non_invlist_utf8_safe(isXDIGIT_LC, is_XDIGIT_high, p, e) 2428 2429 /* Macros for backwards 
compatibility and for completeness when the ASCII and 2430 * Latin1 values are identical */ 2431 #define isALPHAU(c) isALPHA_L1(c) 2432 #define isDIGIT_L1(c) isDIGIT_A(c) 2433 #define isOCTAL(c) isOCTAL_A(c) 2434 #define isOCTAL_L1(c) isOCTAL_A(c) 2435 #define isXDIGIT_L1(c) isXDIGIT_A(c) 2436 #define isALNUM(c) isWORDCHAR(c) 2437 #define isALNUM_A(c) isALNUM(c) 2438 #define isALNUMU(c) isWORDCHAR_L1(c) 2439 #define isALNUM_LC(c) isWORDCHAR_LC(c) 2440 #define isALNUM_uni(c) isWORDCHAR_uni(c) 2441 #define isALNUM_LC_uvchr(c) isWORDCHAR_LC_uvchr(c) 2442 #define isALNUM_utf8(p,e) isWORDCHAR_utf8(p,e) 2443 #define isALNUM_utf8_safe(p,e) isWORDCHAR_utf8_safe(p,e) 2444 #define isALNUM_LC_utf8(p,e)isWORDCHAR_LC_utf8(p,e) 2445 #define isALNUM_LC_utf8_safe(p,e)isWORDCHAR_LC_utf8_safe(p,e) 2446 #define isALNUMC_A(c) isALPHANUMERIC_A(c) /* Mnemonic: "C's alnum" */ 2447 #define isALNUMC_L1(c) isALPHANUMERIC_L1(c) 2448 #define isALNUMC(c) isALPHANUMERIC(c) 2449 #define isALNUMC_LC(c) isALPHANUMERIC_LC(c) 2450 #define isALNUMC_uni(c) isALPHANUMERIC_uni(c) 2451 #define isALNUMC_LC_uvchr(c) isALPHANUMERIC_LC_uvchr(c) 2452 #define isALNUMC_utf8(p,e) isALPHANUMERIC_utf8(p,e) 2453 #define isALNUMC_utf8_safe(p,e) isALPHANUMERIC_utf8_safe(p,e) 2454 #define isALNUMC_LC_utf8_safe(p,e) isALPHANUMERIC_LC_utf8_safe(p,e) 2455 2456 /* On EBCDIC platforms, CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII, 2457 * except that they don't necessarily mean the same characters, e.g. CTRL-D is 2458 * 4 on both systems, but that is EOT on ASCII; ST on EBCDIC. 2459 * '?' is special-cased on EBCDIC to APC, which is the control there that is 2460 * the outlier from the block that contains the other controls, just like 2461 * toCTRL('?') on ASCII yields DEL, the control that is the outlier from the C0 2462 * block. If it weren't special cased, it would yield a non-control. 2463 * The conversion works both ways, so toCTRL('D') is 4, and toCTRL(4) is D, 2464 * etc. 
*/ 2465 #ifndef EBCDIC 2466 # define toCTRL(c) (__ASSERT_(FITS_IN_8_BITS(c)) toUPPER(((U8)(c))) ^ 64) 2467 #else 2468 # define toCTRL(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ 2469 ((isPRINT_A(c)) \ 2470 ? (UNLIKELY((c) == '?') \ 2471 ? QUESTION_MARK_CTRL \ 2472 : (NATIVE_TO_LATIN1(toUPPER((U8) (c))) ^ 64)) \ 2473 : (UNLIKELY((c) == QUESTION_MARK_CTRL) \ 2474 ? '?' \ 2475 : (LATIN1_TO_NATIVE(((U8) (c)) ^ 64))))) 2476 #endif 2477 2478 /* 2479 =for apidoc Ay||line_t 2480 The typedef to use to declare variables that are to hold line numbers. 2481 2482 =cut 2483 2484 Line numbers are unsigned, 32 bits. 2485 */ 2486 typedef U32 line_t; 2487 #define NOLINE ((line_t) 4294967295UL) /* = FFFFFFFF */ 2488 2489 /* Helpful alias for version prescan */ 2490 #define is_LAX_VERSION(a,b) \ 2491 (a != Perl_prescan_version(aTHX_ a, FALSE, b, NULL, NULL, NULL, NULL)) 2492 2493 #define is_STRICT_VERSION(a,b) \ 2494 (a != Perl_prescan_version(aTHX_ a, TRUE, b, NULL, NULL, NULL, NULL)) 2495 2496 #define BADVERSION(a,b,c) \ 2497 if (b) { \ 2498 *b = c; \ 2499 } \ 2500 return a; 2501 2502 /* Converts a character KNOWN to represent a hexadecimal digit (0-9, A-F, or 2503 * a-f) to its numeric value without using any branches. The input is 2504 * validated only by an assert() in DEBUGGING builds. 2505 * 2506 * It works by right shifting and isolating the bit that is 0 for the digits, 2507 * and 1 for at least the alphas A-F, a-f. The bit is shifted to the ones 2508 * position, and then to the eights position. Both are added together to form 2509 * 0 if the input is '0'-'9' and to form 9 if alpha. This is added to the 2510 * final four bits of the input to form the correct value. */ 2511 #define XDIGIT_VALUE(c) (__ASSERT_(isXDIGIT(c)) \ 2512 ((NATIVE_TO_LATIN1(c) >> 6) & 1) /* 1 if alpha; 0 if not */ \ 2513 + ((NATIVE_TO_LATIN1(c) >> 3) & 8) /* 8 if alpha; 0 if not */ \ 2514 + ((c) & 0xF)) /* 0-9 if input valid hex digit */ 2515 2516 /* The argument is a string pointer, which is advanced. 
*/ 2517 #define READ_XDIGIT(s) ((s)++, XDIGIT_VALUE(*((s) - 1))) 2518 2519 /* Converts a character known to represent an octal digit (0-7) to its numeric 2520 * value. The input is validated only by an assert() in DEBUGGING builds. In 2521 * both ASCII and EBCDIC the last 3 bits of the octal digits range from 0-7. */ 2522 #define OCTAL_VALUE(c) (__ASSERT_(isOCTAL(c)) (7 & (c))) 2523 2524 /* Efficiently returns a boolean as to if two native characters are equivalent 2525 * case-insensitively. At least one of the characters must be one of [A-Za-z]; 2526 * the ALPHA in the name is to remind you of that. This is asserted() in 2527 * DEBUGGING builds. Because [A-Za-z] are invariant under UTF-8, this macro 2528 * works (on valid input) for both non- and UTF-8-encoded bytes. 2529 * 2530 * When one of the inputs is a compile-time constant and gets folded by the 2531 * compiler, this reduces to an AND and a TEST. On both EBCDIC and ASCII 2532 * machines, 'A' and 'a' differ by a single bit; the same with the upper and 2533 * lower case of all other ASCII-range alphabetics. On ASCII platforms, they 2534 * are 32 apart; on EBCDIC, they are 64. At compile time, this uses an 2535 * exclusive 'or' to find that bit and then inverts it to form a mask, with 2536 * just a single 0, in the bit position where the upper- and lowercase differ. 2537 * */ 2538 #define isALPHA_FOLD_EQ(c1, c2) \ 2539 (__ASSERT_(isALPHA_A(c1) || isALPHA_A(c2)) \ 2540 ((c1) & ~('A' ^ 'a')) == ((c2) & ~('A' ^ 'a'))) 2541 #define isALPHA_FOLD_NE(c1, c2) (! isALPHA_FOLD_EQ((c1), (c2))) 2542 2543 /* 2544 =for apidoc_section $memory 2545 2546 =for apidoc Am|void|Newx|void* ptr|int nitems|type 2547 The XSUB-writer's interface to the C C<malloc> function. 2548 2549 Memory obtained by this should B<ONLY> be freed with L</"Safefree">. 2550 2551 In 5.9.3, Newx() and friends replace the older New() API, and drops 2552 the first parameter, I<x>, a debug aid which allowed callers to identify 2553 themselves. 
This aid has been superseded by a new build option, 2554 PERL_MEM_LOG (see L<perlhacktips/PERL_MEM_LOG>). The older API is still 2555 there for use in XS modules supporting older perls. 2556 2557 =for apidoc Am|void|Newxc|void* ptr|int nitems|type|cast 2558 The XSUB-writer's interface to the C C<malloc> function, with 2559 cast. See also C<L</Newx>>. 2560 2561 Memory obtained by this should B<ONLY> be freed with L</"Safefree">. 2562 2563 =for apidoc Am|void|Newxz|void* ptr|int nitems|type 2564 The XSUB-writer's interface to the C C<malloc> function. The allocated 2565 memory is zeroed with C<memzero>. See also C<L</Newx>>. 2566 2567 Memory obtained by this should B<ONLY> be freed with L</"Safefree">. 2568 2569 =for apidoc Am|void|Renew|void* ptr|int nitems|type 2570 The XSUB-writer's interface to the C C<realloc> function. 2571 2572 Memory obtained by this should B<ONLY> be freed with L</"Safefree">. 2573 2574 =for apidoc Am|void|Renewc|void* ptr|int nitems|type|cast 2575 The XSUB-writer's interface to the C C<realloc> function, with 2576 cast. 2577 2578 Memory obtained by this should B<ONLY> be freed with L</"Safefree">. 2579 2580 =for apidoc Am|void|Safefree|void* ptr 2581 The XSUB-writer's interface to the C C<free> function. 2582 2583 This should B<ONLY> be used on memory obtained using L</"Newx"> and friends. 2584 2585 =for apidoc_section $string 2586 =for apidoc Am|void|Move|void* src|void* dest|int nitems|type 2587 The XSUB-writer's interface to the C C<memmove> function. The C<src> is the 2588 source, C<dest> is the destination, C<nitems> is the number of items, and 2589 C<type> is the type. Can do overlapping moves. See also C<L</Copy>>. 2590 2591 =for apidoc Am|void *|MoveD|void* src|void* dest|int nitems|type 2592 Like C<Move> but returns C<dest>. Useful 2593 for encouraging compilers to tail-call 2594 optimise. 2595 2596 =for apidoc Am|void|Copy|void* src|void* dest|int nitems|type 2597 The XSUB-writer's interface to the C C<memcpy> function. 
The C<src> is the 2598 source, C<dest> is the destination, C<nitems> is the number of items, and 2599 C<type> is the type. May fail on overlapping copies. See also C<L</Move>>. 2600 2601 =for apidoc Am|void *|CopyD|void* src|void* dest|int nitems|type 2602 2603 Like C<Copy> but returns C<dest>. Useful 2604 for encouraging compilers to tail-call 2605 optimise. 2606 2607 =for apidoc Am|void|Zero|void* dest|int nitems|type 2608 2609 The XSUB-writer's interface to the C C<memzero> function. The C<dest> is the 2610 destination, C<nitems> is the number of items, and C<type> is the type. 2611 2612 =for apidoc Am|void *|ZeroD|void* dest|int nitems|type 2613 2614 Like C<Zero> but returns dest. Useful 2615 for encouraging compilers to tail-call 2616 optimise. 2617 2618 =for apidoc_section $utility 2619 =for apidoc Amu|void|StructCopy|type *src|type *dest|type 2620 This is an architecture-independent macro to copy one structure to another. 2621 2622 =for apidoc Am|void|PoisonWith|void* dest|int nitems|type|U8 byte 2623 2624 Fill up memory with a byte pattern (a byte repeated over and over 2625 again) that hopefully catches attempts to access uninitialized memory. 2626 2627 =for apidoc Am|void|PoisonNew|void* dest|int nitems|type 2628 2629 PoisonWith(0xAB) for catching access to allocated but uninitialized memory. 2630 2631 =for apidoc Am|void|PoisonFree|void* dest|int nitems|type 2632 2633 PoisonWith(0xEF) for catching access to freed memory. 2634 2635 =for apidoc Am|void|Poison|void* dest|int nitems|type 2636 2637 PoisonWith(0xEF) for catching access to freed memory. 2638 2639 =cut */ 2640 2641 /* Maintained for backwards-compatibility only. Use newSV() instead. 
*/ 2642 #ifndef PERL_CORE 2643 #define NEWSV(x,len) newSV(len) 2644 #endif 2645 2646 #define MEM_SIZE_MAX ((MEM_SIZE)-1) 2647 2648 #define _PERL_STRLEN_ROUNDUP_UNCHECKED(n) (((n) - 1 + PERL_STRLEN_ROUNDUP_QUANTUM) & ~((MEM_SIZE)PERL_STRLEN_ROUNDUP_QUANTUM - 1)) 2649 2650 #ifdef PERL_MALLOC_WRAP 2651 2652 /* This expression will be constant-folded at compile time. It checks 2653 * whether or not the type of the count n is so small (e.g. U8 or U16, or 2654 * U32 on 64-bit systems) that there's no way a wrap-around could occur. 2655 * As well as avoiding the need for a run-time check in some cases, it's 2656 * designed to avoid compiler warnings like: 2657 * comparison is always false due to limited range of data type 2658 * It's mathematically equivalent to 2659 * max(n) * sizeof(t) > MEM_SIZE_MAX 2660 */ 2661 2662 # define _MEM_WRAP_NEEDS_RUNTIME_CHECK(n,t) \ 2663 ( sizeof(MEM_SIZE) < sizeof(n) \ 2664 || sizeof(t) > ((MEM_SIZE)1 << 8*(sizeof(MEM_SIZE) - sizeof(n)))) 2665 2666 /* This is written in a slightly odd way to avoid various spurious 2667 * compiler warnings. We *want* to write the expression as 2668 * _MEM_WRAP_NEEDS_RUNTIME_CHECK(n,t) && (n > C) 2669 * (for some compile-time constant C), but even when the LHS 2670 * constant-folds to false at compile-time, g++ insists on emitting 2671 * warnings about the RHS (e.g. "comparison is always false"), so instead 2672 * we write it as 2673 * 2674 * (cond ? n : X) > C 2675 * 2676 * where X is a constant with X > C always false. Choosing a value for X 2677 * is tricky. If 0, some compilers will complain about 0 > C always being 2678 * false; if 1, Coverity complains when n happens to be the constant value 2679 * '1', that cond ? 1 : 1 has the same value on both branches; so use C 2680 * for X and hope that nothing else whines. 2681 */ 2682 2683 # define _MEM_WRAP_WILL_WRAP(n,t) \ 2684 ((_MEM_WRAP_NEEDS_RUNTIME_CHECK(n,t) ? 
(MEM_SIZE)(n) : \ 2685 MEM_SIZE_MAX/sizeof(t)) > MEM_SIZE_MAX/sizeof(t)) 2686 2687 # define MEM_WRAP_CHECK(n,t) \ 2688 (void)(UNLIKELY(_MEM_WRAP_WILL_WRAP(n,t)) \ 2689 && (croak_memory_wrap(),0)) 2690 2691 # define MEM_WRAP_CHECK_1(n,t,a) \ 2692 (void)(UNLIKELY(_MEM_WRAP_WILL_WRAP(n,t)) \ 2693 && (Perl_croak_nocontext("%s",(a)),0)) 2694 2695 /* "a" arg must be a string literal */ 2696 # define MEM_WRAP_CHECK_s(n,t,a) \ 2697 (void)(UNLIKELY(_MEM_WRAP_WILL_WRAP(n,t)) \ 2698 && (Perl_croak_nocontext("" a ""),0)) 2699 2700 #define MEM_WRAP_CHECK_(n,t) MEM_WRAP_CHECK(n,t), 2701 2702 #define PERL_STRLEN_ROUNDUP(n) ((void)(((n) > MEM_SIZE_MAX - 2 * PERL_STRLEN_ROUNDUP_QUANTUM) ? (croak_memory_wrap(),0) : 0), _PERL_STRLEN_ROUNDUP_UNCHECKED(n)) 2703 #else 2704 2705 #define MEM_WRAP_CHECK(n,t) 2706 #define MEM_WRAP_CHECK_1(n,t,a) 2707 #define MEM_WRAP_CHECK_s(n,t,a) 2708 #define MEM_WRAP_CHECK_(n,t) 2709 2710 #define PERL_STRLEN_ROUNDUP(n) _PERL_STRLEN_ROUNDUP_UNCHECKED(n) 2711 2712 #endif 2713 2714 #ifdef PERL_MEM_LOG 2715 /* 2716 * If PERL_MEM_LOG is defined, all Newx()s, Renew()s, and Safefree()s 2717 * go through functions, which are handy for debugging breakpoints, but 2718 * which more importantly get the immediate calling environment (file and 2719 * line number, and C function name if available) passed in. This info can 2720 * then be used for logging the calls, for which one gets a sample 2721 * implementation unless -DPERL_MEM_LOG_NOIMPL is also defined. 
2722 * 2723 * Known problems: 2724 * - not all memory allocs get logged, only those 2725 * that go through Newx() and derivatives (while all 2726 * Safefrees do get logged) 2727 * - __FILE__ and __LINE__ do not work everywhere 2728 * - __func__ or __FUNCTION__ even less so 2729 * - I think more goes on after the perlio frees but 2730 * the thing is that STDERR gets closed (as do all 2731 * the file descriptors) 2732 * - no deeper calling stack than the caller of the Newx() 2733 * or the kind, but do I look like a C reflection/introspection 2734 * utility to you? 2735 * - the function prototypes for the logging functions 2736 * probably should maybe be somewhere else than handy.h 2737 * - one could consider inlining (macrofying) the logging 2738 * for speed, but I am too lazy 2739 * - one could imagine recording the allocations in a hash, 2740 * (keyed by the allocation address?), and maintain that 2741 * through reallocs and frees, but how to do that without 2742 * any News() happening...? 
2743 * - lots of -Ddefines to get useful/controllable output 2744 * - lots of ENV reads 2745 */ 2746 2747 # ifdef PERL_CORE 2748 # ifndef PERL_MEM_LOG_NOIMPL 2749 enum mem_log_type { 2750 MLT_ALLOC, 2751 MLT_REALLOC, 2752 MLT_FREE, 2753 MLT_NEW_SV, 2754 MLT_DEL_SV 2755 }; 2756 # endif 2757 # if defined(PERL_IN_SV_C) /* those are only used in sv.c */ 2758 void Perl_mem_log_new_sv(const SV *sv, const char *filename, const int linenumber, const char *funcname); 2759 void Perl_mem_log_del_sv(const SV *sv, const char *filename, const int linenumber, const char *funcname); 2760 # endif 2761 # endif 2762 2763 #endif 2764 2765 #ifdef PERL_MEM_LOG 2766 #define MEM_LOG_ALLOC(n,t,a) Perl_mem_log_alloc(n,sizeof(t),STRINGIFY(t),a,__FILE__,__LINE__,FUNCTION__) 2767 #define MEM_LOG_REALLOC(n,t,v,a) Perl_mem_log_realloc(n,sizeof(t),STRINGIFY(t),v,a,__FILE__,__LINE__,FUNCTION__) 2768 #define MEM_LOG_FREE(a) Perl_mem_log_free(a,__FILE__,__LINE__,FUNCTION__) 2769 #endif 2770 2771 #ifndef MEM_LOG_ALLOC 2772 #define MEM_LOG_ALLOC(n,t,a) (a) 2773 #endif 2774 #ifndef MEM_LOG_REALLOC 2775 #define MEM_LOG_REALLOC(n,t,v,a) (a) 2776 #endif 2777 #ifndef MEM_LOG_FREE 2778 #define MEM_LOG_FREE(a) (a) 2779 #endif 2780 2781 #define Newx(v,n,t) (v = (MEM_WRAP_CHECK_(n,t) (t*)MEM_LOG_ALLOC(n,t,safemalloc((MEM_SIZE)((n)*sizeof(t)))))) 2782 #define Newxc(v,n,t,c) (v = (MEM_WRAP_CHECK_(n,t) (c*)MEM_LOG_ALLOC(n,t,safemalloc((MEM_SIZE)((n)*sizeof(t)))))) 2783 #define Newxz(v,n,t) (v = (MEM_WRAP_CHECK_(n,t) (t*)MEM_LOG_ALLOC(n,t,safecalloc((n),sizeof(t))))) 2784 2785 #ifndef PERL_CORE 2786 /* pre 5.9.x compatibility */ 2787 #define New(x,v,n,t) Newx(v,n,t) 2788 #define Newc(x,v,n,t,c) Newxc(v,n,t,c) 2789 #define Newz(x,v,n,t) Newxz(v,n,t) 2790 #endif 2791 2792 #define Renew(v,n,t) \ 2793 (v = (MEM_WRAP_CHECK_(n,t) (t*)MEM_LOG_REALLOC(n,t,v,saferealloc((Malloc_t)(v),(MEM_SIZE)((n)*sizeof(t)))))) 2794 #define Renewc(v,n,t,c) \ 2795 (v = (MEM_WRAP_CHECK_(n,t) 
(c*)MEM_LOG_REALLOC(n,t,v,saferealloc((Malloc_t)(v),(MEM_SIZE)((n)*sizeof(t)))))) 2796 2797 #ifdef PERL_POISON 2798 #define Safefree(d) \ 2799 ((d) ? (void)(safefree(MEM_LOG_FREE((Malloc_t)(d))), Poison(&(d), 1, Malloc_t)) : (void) 0) 2800 #else 2801 #define Safefree(d) safefree(MEM_LOG_FREE((Malloc_t)(d))) 2802 #endif 2803 2804 /* assert that a valid ptr has been supplied - use this instead of assert(ptr) * 2805 * as it handles cases like constant string arguments without throwing warnings * 2806 * the cast is required, as is the inequality check, to avoid warnings */ 2807 #define perl_assert_ptr(p) assert( ((void*)(p)) != 0 ) 2808 2809 2810 #define Move(s,d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), (void)memmove((char*)(d),(const char*)(s), (n) * sizeof(t))) 2811 #define Copy(s,d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), (void)memcpy((char*)(d),(const char*)(s), (n) * sizeof(t))) 2812 #define Zero(d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), (void)memzero((char*)(d), (n) * sizeof(t))) 2813 2814 /* Like above, but returns a pointer to 'd' */ 2815 #define MoveD(s,d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), memmove((char*)(d),(const char*)(s), (n) * sizeof(t))) 2816 #define CopyD(s,d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), memcpy((char*)(d),(const char*)(s), (n) * sizeof(t))) 2817 #define ZeroD(d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), memzero((char*)(d), (n) * sizeof(t))) 2818 2819 #define PoisonWith(d,n,t,b) (MEM_WRAP_CHECK_(n,t) (void)memset((char*)(d), (U8)(b), (n) * sizeof(t))) 2820 #define PoisonNew(d,n,t) PoisonWith(d,n,t,0xAB) 2821 #define PoisonFree(d,n,t) PoisonWith(d,n,t,0xEF) 2822 #define Poison(d,n,t) PoisonFree(d,n,t) 2823 2824 #ifdef PERL_POISON 2825 # define PERL_POISON_EXPR(x) x 2826 #else 2827 # define PERL_POISON_EXPR(x) 2828 #endif 2829 2830 /* Shallow copy */ 2831 #define StructCopy(s,d,t) (*((t*)(d)) = *((t*)(s))) 2832 2833 /* 2834 
=for apidoc_section $utility 2835 2836 =for apidoc Am|STRLEN|C_ARRAY_LENGTH|void *a 2837 2838 Returns the number of elements in the input C array (so you want your 2839 zero-based indices to be less than but not equal to). 2840 2841 =for apidoc Am|void *|C_ARRAY_END|void *a 2842 2843 Returns a pointer to one element past the final element of the input C array. 2844 2845 =cut 2846 2847 C_ARRAY_END is one past the last: half-open/half-closed range, not 2848 last-inclusive range. 2849 */ 2850 #define C_ARRAY_LENGTH(a) (sizeof(a)/sizeof((a)[0])) 2851 #define C_ARRAY_END(a) ((a) + C_ARRAY_LENGTH(a)) 2852 2853 #ifdef NEED_VA_COPY 2854 # ifdef va_copy 2855 # define Perl_va_copy(s, d) va_copy(d, s) 2856 # elif defined(__va_copy) 2857 # define Perl_va_copy(s, d) __va_copy(d, s) 2858 # else 2859 # define Perl_va_copy(s, d) Copy(s, d, 1, va_list) 2860 # endif 2861 #endif 2862 2863 /* convenience debug macros */ 2864 #ifdef USE_ITHREADS 2865 #define pTHX_FORMAT "Perl interpreter: 0x%p" 2866 #define pTHX__FORMAT ", Perl interpreter: 0x%p" 2867 #define pTHX_VALUE_ (void *)my_perl, 2868 #define pTHX_VALUE (void *)my_perl 2869 #define pTHX__VALUE_ ,(void *)my_perl, 2870 #define pTHX__VALUE ,(void *)my_perl 2871 #else 2872 #define pTHX_FORMAT 2873 #define pTHX__FORMAT 2874 #define pTHX_VALUE_ 2875 #define pTHX_VALUE 2876 #define pTHX__VALUE_ 2877 #define pTHX__VALUE 2878 #endif /* USE_ITHREADS */ 2879 2880 /* Perl_deprecate was not part of the public API, and did not have a deprecate() 2881 shortcut macro defined without -DPERL_CORE. Neither codesearch.google.com nor 2882 CPAN::Unpack show any users outside the core. 
*/ 2883 #ifdef PERL_CORE 2884 # define deprecate(s) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ 2885 "Use of " s " is deprecated") 2886 # define deprecate_disappears_in(when,message) \ 2887 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ 2888 message ", and will disappear in Perl " when) 2889 # define deprecate_fatal_in(when,message) \ 2890 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ 2891 message ". Its use will be fatal in Perl " when) 2892 #endif 2893 2894 /* Internal macros to deal with gids and uids */ 2895 #ifdef PERL_CORE 2896 2897 # if Uid_t_size > IVSIZE 2898 # define sv_setuid(sv, uid) sv_setnv((sv), (NV)(uid)) 2899 # define SvUID(sv) SvNV(sv) 2900 # elif Uid_t_sign <= 0 2901 # define sv_setuid(sv, uid) sv_setiv((sv), (IV)(uid)) 2902 # define SvUID(sv) SvIV(sv) 2903 # else 2904 # define sv_setuid(sv, uid) sv_setuv((sv), (UV)(uid)) 2905 # define SvUID(sv) SvUV(sv) 2906 # endif /* Uid_t_size */ 2907 2908 # if Gid_t_size > IVSIZE 2909 # define sv_setgid(sv, gid) sv_setnv((sv), (NV)(gid)) 2910 # define SvGID(sv) SvNV(sv) 2911 # elif Gid_t_sign <= 0 2912 # define sv_setgid(sv, gid) sv_setiv((sv), (IV)(gid)) 2913 # define SvGID(sv) SvIV(sv) 2914 # else 2915 # define sv_setgid(sv, gid) sv_setuv((sv), (UV)(gid)) 2916 # define SvGID(sv) SvUV(sv) 2917 # endif /* Gid_t_size */ 2918 2919 #endif 2920 2921 #endif /* PERL_HANDY_H_ */ 2922 2923 /* 2924 * ex: set ts=8 sts=4 sw=4 et: 2925 */ 2926