1 /* handy.h 2 * 3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1999, 2000, 4 * 2001, 2002, 2004, 2005, 2006, 2007, 2008, 2012 by Larry Wall and others 5 * 6 * You may distribute under the terms of either the GNU General Public 7 * License or the Artistic License, as specified in the README file. 8 * 9 */ 10 11 /* IMPORTANT NOTE: Everything whose name begins with an underscore is for 12 * internal core Perl use only. */ 13 14 #ifndef PERL_HANDY_H_ /* Guard against nested #inclusion */ 15 #define PERL_HANDY_H_ 16 17 #ifndef PERL_CORE 18 # define Null(type) ((type)NULL) 19 20 /* 21 =head1 Handy Values 22 23 =for apidoc AmnU||Nullch 24 Null character pointer. (No longer available when C<PERL_CORE> is 25 defined.) 26 27 =for apidoc AmnU||Nullsv 28 Null SV pointer. (No longer available when C<PERL_CORE> is defined.) 29 30 =cut 31 */ 32 33 # define Nullch Null(char*) 34 # define Nullfp Null(PerlIO*) 35 # define Nullsv Null(SV*) 36 #endif 37 38 #ifdef TRUE 39 #undef TRUE 40 #endif 41 #ifdef FALSE 42 #undef FALSE 43 #endif 44 #define TRUE (1) 45 #define FALSE (0) 46 47 /* The MUTABLE_*() macros cast pointers to the types shown, in such a way 48 * (compiler permitting) that casting away const-ness will give a warning; 49 * e.g.: 50 * 51 * const SV *sv = ...; 52 * AV *av1 = (AV*)sv; <== BAD: the const has been silently cast away 53 * AV *av2 = MUTABLE_AV(sv); <== GOOD: it may warn 54 */ 55 56 #if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN) 57 # define MUTABLE_PTR(p) ({ void *_p = (p); _p; }) 58 #else 59 # define MUTABLE_PTR(p) ((void *) (p)) 60 #endif 61 62 #define MUTABLE_AV(p) ((AV *)MUTABLE_PTR(p)) 63 #define MUTABLE_CV(p) ((CV *)MUTABLE_PTR(p)) 64 #define MUTABLE_GV(p) ((GV *)MUTABLE_PTR(p)) 65 #define MUTABLE_HV(p) ((HV *)MUTABLE_PTR(p)) 66 #define MUTABLE_IO(p) ((IO *)MUTABLE_PTR(p)) 67 #define MUTABLE_SV(p) ((SV *)MUTABLE_PTR(p)) 68 69 #if defined(I_STDBOOL) && !defined(PERL_BOOL_AS_CHAR) 70 # include <stdbool.h> 71 # ifndef HAS_BOOL 72 
# define HAS_BOOL 1 73 # endif 74 #endif 75 76 /* bool is built-in for g++-2.6.3 and later, which might be used 77 for extensions. <_G_config.h> defines _G_HAVE_BOOL, but we can't 78 be sure _G_config.h will be included before this file. _G_config.h 79 also defines _G_HAVE_BOOL for both gcc and g++, but only g++ 80 actually has bool. Hence, _G_HAVE_BOOL is pretty useless for us. 81 g++ can be identified by __GNUG__. 82 Andy Dougherty February 2000 83 */ 84 #ifdef __GNUG__ /* GNU g++ has bool built-in */ 85 # ifndef PERL_BOOL_AS_CHAR 86 # ifndef HAS_BOOL 87 # define HAS_BOOL 1 88 # endif 89 # endif 90 #endif 91 92 #ifndef HAS_BOOL 93 # ifdef bool 94 # undef bool 95 # endif 96 # define bool char 97 # define HAS_BOOL 1 98 #endif 99 100 /* 101 =for apidoc Am|bool|cBOOL|bool expr 102 103 Cast-to-bool. A simple S<C<(bool) I<expr>>> cast may not do the right thing: 104 if C<bool> is defined as C<char>, for example, then the cast from C<int> is 105 implementation-defined. 106 107 C<(bool)!!(cbool)> in a ternary triggers a bug in xlc on AIX 108 109 =cut 110 */ 111 #define cBOOL(cbool) ((cbool) ? (bool)1 : (bool)0) 112 113 /* Try to figure out __func__ or __FUNCTION__ equivalent, if any. 114 * XXX Should really be a Configure probe, with HAS__FUNCTION__ 115 * and FUNCTION__ as results. 116 * XXX Similarly, a Configure probe for __FILE__ and __LINE__ is needed. */ 117 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || (defined(__SUNPRO_C)) /* C99 or close enough. */ 118 # define FUNCTION__ __func__ 119 #elif (defined(__DECC_VER)) /* Tru64 or VMS, and strict C89 being used, but not modern enough cc (in Tur64, -c99 not known, only -std1). */ 120 # define FUNCTION__ "" 121 #else 122 # define FUNCTION__ __FUNCTION__ /* Common extension. */ 123 #endif 124 125 /* XXX A note on the perl source internal type system. The 126 original intent was that I32 be *exactly* 32 bits. 127 128 Currently, we only guarantee that I32 is *at least* 32 bits. 
129 Specifically, if int is 64 bits, then so is I32. (This is the case 130 for the Cray.) This has the advantage of meshing nicely with 131 standard library calls (where we pass an I32 and the library is 132 expecting an int), but the disadvantage that an I32 is not 32 bits. 133 Andy Dougherty August 1996 134 135 There is no guarantee that there is *any* integral type with 136 exactly 32 bits. It is perfectly legal for a system to have 137 sizeof(short) == sizeof(int) == sizeof(long) == 8. 138 139 Similarly, there is no guarantee that I16 and U16 have exactly 16 140 bits. 141 142 For dealing with issues that may arise from various 32/64-bit 143 systems, we will ask Configure to check out 144 145 SHORTSIZE == sizeof(short) 146 INTSIZE == sizeof(int) 147 LONGSIZE == sizeof(long) 148 LONGLONGSIZE == sizeof(long long) (if HAS_LONG_LONG) 149 PTRSIZE == sizeof(void *) 150 DOUBLESIZE == sizeof(double) 151 LONG_DOUBLESIZE == sizeof(long double) (if HAS_LONG_DOUBLE). 152 153 */ 154 155 #ifdef I_INTTYPES /* e.g. Linux has int64_t without <inttypes.h> */ 156 # include <inttypes.h> 157 # ifdef INT32_MIN_BROKEN 158 # undef INT32_MIN 159 # define INT32_MIN (-2147483647-1) 160 # endif 161 # ifdef INT64_MIN_BROKEN 162 # undef INT64_MIN 163 # define INT64_MIN (-9223372036854775807LL-1) 164 # endif 165 #endif 166 167 typedef I8TYPE I8; 168 typedef U8TYPE U8; 169 typedef I16TYPE I16; 170 typedef U16TYPE U16; 171 typedef I32TYPE I32; 172 typedef U32TYPE U32; 173 174 #ifdef QUADKIND 175 typedef I64TYPE I64; 176 typedef U64TYPE U64; 177 #endif 178 179 #if defined(UINT8_MAX) && defined(INT16_MAX) && defined(INT32_MAX) 180 181 /* I8_MAX and I8_MIN constants are not defined, as I8 is an ambiguous type. 182 Please search CHAR_MAX in perl.h for further details. 
*/ 183 #define U8_MAX UINT8_MAX 184 #define U8_MIN UINT8_MIN 185 186 #define I16_MAX INT16_MAX 187 #define I16_MIN INT16_MIN 188 #define U16_MAX UINT16_MAX 189 #define U16_MIN UINT16_MIN 190 191 #define I32_MAX INT32_MAX 192 #define I32_MIN INT32_MIN 193 #ifndef UINT32_MAX_BROKEN /* e.g. HP-UX with gcc messes this up */ 194 # define U32_MAX UINT32_MAX 195 #else 196 # define U32_MAX 4294967295U 197 #endif 198 #define U32_MIN UINT32_MIN 199 200 #else 201 202 /* I8_MAX and I8_MIN constants are not defined, as I8 is an ambiguous type. 203 Please search CHAR_MAX in perl.h for further details. */ 204 #define U8_MAX PERL_UCHAR_MAX 205 #define U8_MIN PERL_UCHAR_MIN 206 207 #define I16_MAX PERL_SHORT_MAX 208 #define I16_MIN PERL_SHORT_MIN 209 #define U16_MAX PERL_USHORT_MAX 210 #define U16_MIN PERL_USHORT_MIN 211 212 #if LONGSIZE > 4 213 # define I32_MAX PERL_INT_MAX 214 # define I32_MIN PERL_INT_MIN 215 # define U32_MAX PERL_UINT_MAX 216 # define U32_MIN PERL_UINT_MIN 217 #else 218 # define I32_MAX PERL_LONG_MAX 219 # define I32_MIN PERL_LONG_MIN 220 # define U32_MAX PERL_ULONG_MAX 221 # define U32_MIN PERL_ULONG_MIN 222 #endif 223 224 #endif 225 226 /* These C99 typedefs are useful sometimes for, say, loop variables whose 227 * maximum values are small, but for which speed trumps size. If we have a C99 228 * compiler, use that. Otherwise, a plain 'int' should be good enough. 229 * 230 * Restrict these to core for now until we are more certain this is a good 231 * idea. 
*/ 232 #if defined(PERL_CORE) || defined(PERL_EXT) 233 # ifdef I_STDINT 234 typedef int_fast8_t PERL_INT_FAST8_T; 235 typedef uint_fast8_t PERL_UINT_FAST8_T; 236 typedef int_fast16_t PERL_INT_FAST16_T; 237 typedef uint_fast16_t PERL_UINT_FAST16_T; 238 # else 239 typedef int PERL_INT_FAST8_T; 240 typedef unsigned int PERL_UINT_FAST8_T; 241 typedef int PERL_INT_FAST16_T; 242 typedef unsigned int PERL_UINT_FAST16_T; 243 # endif 244 #endif 245 246 /* log(2) (i.e., log base 10 of 2) is pretty close to 0.30103, just in case 247 * anyone is grepping for it */ 248 #define BIT_DIGITS(N) (((N)*146)/485 + 1) /* log10(2) =~ 146/485 */ 249 #define TYPE_DIGITS(T) BIT_DIGITS(sizeof(T) * 8) 250 #define TYPE_CHARS(T) (TYPE_DIGITS(T) + 2) /* sign, NUL */ 251 252 /* Unused by core; should be deprecated */ 253 #define Ctl(ch) ((ch) & 037) 254 255 #if defined(PERL_CORE) || defined(PERL_EXT) 256 # ifndef MIN 257 # define MIN(a,b) ((a) < (b) ? (a) : (b)) 258 # endif 259 # ifndef MAX 260 # define MAX(a,b) ((a) > (b) ? (a) : (b)) 261 # endif 262 #endif 263 264 /* Returns a boolean as to whether the input unsigned number is a power of 2 265 * (2**0, 2**1, etc). In other words if it has just a single bit set. 266 * If not, subtracting 1 would leave the uppermost bit set, so the & would 267 * yield non-zero */ 268 #if defined(PERL_CORE) || defined(PERL_EXT) 269 # define isPOWER_OF_2(n) ((n) && ((n) & ((n)-1)) == 0) 270 #endif 271 272 /* 273 =for apidoc Am|void|__ASSERT_|bool expr 274 275 This is a helper macro to avoid preprocessor issues, replaced by nothing 276 unless under DEBUGGING, where it expands to an assert of its argument, 277 followed by a comma (hence the comma operator). If we just used a straight 278 assert(), we would get a comma with nothing before it when not DEBUGGING. 279 280 =cut 281 282 We also use empty definition under Coverity since the __ASSERT__ 283 checks often check for things that Really Cannot Happen, and Coverity 284 detects that and gets all excited. 
*/ 285 286 #if defined(DEBUGGING) && !defined(__COVERITY__) \ 287 && ! defined(PERL_SMALL_MACRO_BUFFER) 288 # define __ASSERT_(statement) assert(statement), 289 #else 290 # define __ASSERT_(statement) 291 #endif 292 293 /* 294 =head1 SV Manipulation Functions 295 296 =for apidoc Ama|SV*|newSVpvs|"literal string" 297 Like C<newSVpvn>, but takes a literal string instead of a 298 string/length pair. 299 300 =for apidoc Ama|SV*|newSVpvs_flags|"literal string"|U32 flags 301 Like C<newSVpvn_flags>, but takes a literal string instead of 302 a string/length pair. 303 304 =for apidoc Ama|SV*|newSVpvs_share|"literal string" 305 Like C<newSVpvn_share>, but takes a literal string instead of 306 a string/length pair and omits the hash parameter. 307 308 =for apidoc Am|void|sv_catpvs_flags|SV* sv|"literal string"|I32 flags 309 Like C<sv_catpvn_flags>, but takes a literal string instead 310 of a string/length pair. 311 312 =for apidoc Am|void|sv_catpvs_nomg|SV* sv|"literal string" 313 Like C<sv_catpvn_nomg>, but takes a literal string instead of 314 a string/length pair. 315 316 =for apidoc Am|void|sv_catpvs|SV* sv|"literal string" 317 Like C<sv_catpvn>, but takes a literal string instead of a 318 string/length pair. 319 320 =for apidoc Am|void|sv_catpvs_mg|SV* sv|"literal string" 321 Like C<sv_catpvn_mg>, but takes a literal string instead of a 322 string/length pair. 323 324 =for apidoc Am|void|sv_setpvs|SV* sv|"literal string" 325 Like C<sv_setpvn>, but takes a literal string instead of a 326 string/length pair. 327 328 =for apidoc Am|void|sv_setpvs_mg|SV* sv|"literal string" 329 Like C<sv_setpvn_mg>, but takes a literal string instead of a 330 string/length pair. 331 332 =for apidoc Am|SV *|sv_setref_pvs|SV *const rv|const char *const classname|"literal string" 333 Like C<sv_setref_pvn>, but takes a literal string instead of 334 a string/length pair. 

=head1 Memory Management

=for apidoc Ama|char*|savepvs|"literal string"
Like C<savepvn>, but takes a literal string instead of a
string/length pair.

=for apidoc Ama|char*|savesharedpvs|"literal string"
A version of C<savepvs()> which allocates the duplicate string in memory
which is shared between threads.

=head1 GV Functions

=for apidoc Am|HV*|gv_stashpvs|"name"|I32 create
Like C<gv_stashpvn>, but takes a literal string instead of a
string/length pair.

=head1 Hash Manipulation Functions

=for apidoc Am|SV**|hv_fetchs|HV* tb|"key"|I32 lval
Like C<hv_fetch>, but takes a literal string instead of a
string/length pair.

=for apidoc Am|SV**|hv_stores|HV* tb|"key"|SV* val
Like C<hv_store>, but takes a literal string instead of a
string/length pair
and omits the hash parameter.

=head1 Lexer interface

=for apidoc Amx|void|lex_stuff_pvs|"pv"|U32 flags

Like L</lex_stuff_pvn>, but takes a literal string instead of
a string/length pair.

=cut
*/

/*
=head1 Handy Values

=for apidoc Amu|pair|STR_WITH_LEN|"literal string"

Returns two comma separated tokens of the input literal string, and its length.
This is a convenience macro which helps out in some API calls.
Note that it can't be used as an argument to macros or functions that under
some configurations might be macros, which means that it requires the full
Perl_xxx(aTHX_ ...) form for any API calls where it's used.
383 384 =cut 385 */ 386 387 388 #define STR_WITH_LEN(s) ("" s ""), (sizeof(s)-1) 389 390 /* STR_WITH_LEN() shortcuts */ 391 #define newSVpvs(str) Perl_newSVpvn(aTHX_ STR_WITH_LEN(str)) 392 #define newSVpvs_flags(str,flags) \ 393 Perl_newSVpvn_flags(aTHX_ STR_WITH_LEN(str), flags) 394 #define newSVpvs_share(str) Perl_newSVpvn_share(aTHX_ STR_WITH_LEN(str), 0) 395 #define sv_catpvs_flags(sv, str, flags) \ 396 Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), flags) 397 #define sv_catpvs_nomg(sv, str) \ 398 Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), 0) 399 #define sv_catpvs(sv, str) \ 400 Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), SV_GMAGIC) 401 #define sv_catpvs_mg(sv, str) \ 402 Perl_sv_catpvn_flags(aTHX_ sv, STR_WITH_LEN(str), SV_GMAGIC|SV_SMAGIC) 403 #define sv_setpvs(sv, str) Perl_sv_setpvn(aTHX_ sv, STR_WITH_LEN(str)) 404 #define sv_setpvs_mg(sv, str) Perl_sv_setpvn_mg(aTHX_ sv, STR_WITH_LEN(str)) 405 #define sv_setref_pvs(rv, classname, str) \ 406 Perl_sv_setref_pvn(aTHX_ rv, classname, STR_WITH_LEN(str)) 407 #define savepvs(str) Perl_savepvn(aTHX_ STR_WITH_LEN(str)) 408 #define savesharedpvs(str) Perl_savesharedpvn(aTHX_ STR_WITH_LEN(str)) 409 #define gv_stashpvs(str, create) \ 410 Perl_gv_stashpvn(aTHX_ STR_WITH_LEN(str), create) 411 #define gv_fetchpvs(namebeg, add, sv_type) \ 412 Perl_gv_fetchpvn_flags(aTHX_ STR_WITH_LEN(namebeg), add, sv_type) 413 #define gv_fetchpvn(namebeg, len, add, sv_type) \ 414 Perl_gv_fetchpvn_flags(aTHX_ namebeg, len, add, sv_type) 415 #define sv_catxmlpvs(dsv, str, utf8) \ 416 Perl_sv_catxmlpvn(aTHX_ dsv, STR_WITH_LEN(str), utf8) 417 418 419 #define lex_stuff_pvs(pv,flags) Perl_lex_stuff_pvn(aTHX_ STR_WITH_LEN(pv), flags) 420 421 #define get_cvs(str, flags) \ 422 Perl_get_cvn_flags(aTHX_ STR_WITH_LEN(str), (flags)) 423 424 /* 425 =head1 Miscellaneous Functions 426 427 =for apidoc Am|bool|strNE|char* s1|char* s2 428 Test two C<NUL>-terminated strings to see if they are different. Returns true 429 or false. 

=for apidoc Am|bool|strEQ|char* s1|char* s2
Test two C<NUL>-terminated strings to see if they are equal. Returns true or
false.

=for apidoc Am|bool|strLT|char* s1|char* s2
Test two C<NUL>-terminated strings to see if the first, C<s1>, is less than the
second, C<s2>. Returns true or false.

=for apidoc Am|bool|strLE|char* s1|char* s2
Test two C<NUL>-terminated strings to see if the first, C<s1>, is less than or
equal to the second, C<s2>. Returns true or false.

=for apidoc Am|bool|strGT|char* s1|char* s2
Test two C<NUL>-terminated strings to see if the first, C<s1>, is greater than
the second, C<s2>. Returns true or false.

=for apidoc Am|bool|strGE|char* s1|char* s2
Test two C<NUL>-terminated strings to see if the first, C<s1>, is greater than
or equal to the second, C<s2>. Returns true or false.

=for apidoc Am|bool|strnNE|char* s1|char* s2|STRLEN len
Test two C<NUL>-terminated strings to see if they are different. The C<len>
parameter indicates the number of bytes to compare. Returns true or false. (A
wrapper for C<strncmp>).

=for apidoc Am|bool|strnEQ|char* s1|char* s2|STRLEN len
Test two C<NUL>-terminated strings to see if they are equal. The C<len>
parameter indicates the number of bytes to compare. Returns true or false. (A
wrapper for C<strncmp>).

=for apidoc Am|bool|memEQ|char* s1|char* s2|STRLEN len
Test two buffers (which may contain embedded C<NUL> characters) to see if they
are equal. The C<len> parameter indicates the number of bytes to compare.
Returns true (non-zero) if equal, or false (zero) if non-equal.

=for apidoc Am|bool|memEQs|char* s1|STRLEN l1|"s2"
Like L</memEQ>, but the second string is a literal enclosed in double quotes,
C<l1> gives the number of bytes in C<s1>.
Returns true (non-zero) if equal, or false (zero) if non-equal.

=for apidoc Am|bool|memNE|char* s1|char* s2|STRLEN len
Test two buffers (which may contain embedded C<NUL> characters) to see if they
are not equal. The C<len> parameter indicates the number of bytes to compare.
Returns true (non-zero) if non-equal, or false (zero) if equal.

=for apidoc Am|bool|memNEs|char* s1|STRLEN l1|"s2"
Like L</memNE>, but the second string is a literal enclosed in double quotes,
C<l1> gives the number of bytes in C<s1>.
Returns true (non-zero) if non-equal, or false (zero) if equal.

=for apidoc Am|bool|memCHRs|"list"|char c
Returns the position of the first occurrence of the byte C<c> in the literal
string C<"list">, or NULL if C<c> doesn't appear in C<"list">. All bytes are
treated as unsigned char. Thus this macro can be used to determine if C<c> is
in a set of particular characters. Unlike L<strchr(3)>, it works even if C<c>
is C<NUL> (and the set doesn't include C<NUL>).

=cut

New macros should use the following conventions for their names (which are
based on the underlying C library functions):

    (mem | str n? ) (EQ | NE | LT | LE | GT | GE | (( BEGIN | END ) P? )) l? s?

Each has two main parameters, string-like operands that are compared
against each other, as specified by the macro name. Some macros may
additionally have one or potentially even two length parameters. If a length
parameter applies to both string parameters, it will be positioned third;
otherwise any length parameter immediately follows the string parameter it
applies to.

If the prefix to the name is 'str', the string parameter is a pointer to a C
language string. Such a string does not contain embedded NUL bytes; its
length may be unknown, but can be calculated by C<strlen()>, since it is
terminated by a NUL, which isn't included in its length.

The optional 'n' following 'str' means that there is a third parameter,
giving the maximum number of bytes to look at in each string. Even if both
strings are longer than the length parameter, those extra bytes will be
unexamined.

The 's' suffix means that the 2nd byte string parameter is a literal C
double-quoted string. Its length will automatically be calculated by the
macro, so no length parameter will ever be needed for it.

If the prefix is 'mem', the string parameters don't have to be C strings;
they may contain embedded NUL bytes, do not necessarily have a terminating
NUL, and their lengths can be known only through other means, which in
practice are additional parameter(s) passed to the function. All 'mem'
functions have at least one length parameter. Barring any 'l' or 's' suffix,
there is a single length parameter, in position 3, which applies to both
string parameters. The 's' suffix means, as described above, that the 2nd
string is a literal double-quoted C string (hence its length is calculated by
the macro, and the length parameter to the function applies just to the first
string parameter, and hence is positioned just after it). An 'l' suffix
means that the 2nd string parameter has its own length parameter, and the
signature will look like memFOOl(s1, l1, s2, l2).

BEGIN (and END) are for testing if the 2nd string is an initial (or final)
substring of the 1st string. 'P' if present indicates that the substring
must be a "proper" one in the mathematical sense that the first one must be
strictly larger than the 2nd.
533 534 */ 535 536 537 #define strNE(s1,s2) (strcmp(s1,s2) != 0) 538 #define strEQ(s1,s2) (strcmp(s1,s2) == 0) 539 #define strLT(s1,s2) (strcmp(s1,s2) < 0) 540 #define strLE(s1,s2) (strcmp(s1,s2) <= 0) 541 #define strGT(s1,s2) (strcmp(s1,s2) > 0) 542 #define strGE(s1,s2) (strcmp(s1,s2) >= 0) 543 544 #define strnNE(s1,s2,l) (strncmp(s1,s2,l) != 0) 545 #define strnEQ(s1,s2,l) (strncmp(s1,s2,l) == 0) 546 547 #define memEQ(s1,s2,l) (memcmp(((const void *) (s1)), ((const void *) (s2)), l) == 0) 548 #define memNE(s1,s2,l) (! memEQ(s1,s2,l)) 549 550 /* memEQ and memNE where second comparand is a string constant */ 551 #define memEQs(s1, l, s2) \ 552 (((sizeof(s2)-1) == (l)) && memEQ((s1), ("" s2 ""), (sizeof(s2)-1))) 553 #define memNEs(s1, l, s2) (! memEQs(s1, l, s2)) 554 555 /* Keep these private until we decide it was a good idea */ 556 #if defined(PERL_CORE) || defined(PERL_EXT) || defined(PERL_EXT_POSIX) 557 558 #define strBEGINs(s1,s2) (strncmp(s1,"" s2 "", sizeof(s2)-1) == 0) 559 560 #define memBEGINs(s1, l, s2) \ 561 ( (Ptrdiff_t) (l) >= (Ptrdiff_t) sizeof(s2) - 1 \ 562 && memEQ(s1, "" s2 "", sizeof(s2)-1)) 563 #define memBEGINPs(s1, l, s2) \ 564 ( (Ptrdiff_t) (l) > (Ptrdiff_t) sizeof(s2) - 1 \ 565 && memEQ(s1, "" s2 "", sizeof(s2)-1)) 566 #define memENDs(s1, l, s2) \ 567 ( (Ptrdiff_t) (l) >= (Ptrdiff_t) sizeof(s2) - 1 \ 568 && memEQ(s1 + (l) - (sizeof(s2) - 1), "" s2 "", sizeof(s2)-1)) 569 #define memENDPs(s1, l, s2) \ 570 ( (Ptrdiff_t) (l) > (Ptrdiff_t) sizeof(s2) \ 571 && memEQ(s1 + (l) - (sizeof(s2) - 1), "" s2 "", sizeof(s2)-1)) 572 #endif /* End of making macros private */ 573 574 #define memLT(s1,s2,l) (memcmp(s1,s2,l) < 0) 575 #define memLE(s1,s2,l) (memcmp(s1,s2,l) <= 0) 576 #define memGT(s1,s2,l) (memcmp(s1,s2,l) > 0) 577 #define memGE(s1,s2,l) (memcmp(s1,s2,l) >= 0) 578 579 #define memCHRs(s1,c) ((const char *) memchr("" s1 "" , c, sizeof(s1)-1)) 580 581 /* 582 * Character classes. 
583 * 584 * Unfortunately, the introduction of locales means that we 585 * can't trust isupper(), etc. to tell the truth. And when 586 * it comes to /\w+/ with tainting enabled, we *must* be able 587 * to trust our character classes. 588 * 589 * Therefore, the default tests in the text of Perl will be 590 * independent of locale. Any code that wants to depend on 591 * the current locale will use the tests that begin with "lc". 592 */ 593 594 #ifdef HAS_SETLOCALE /* XXX Is there a better test for this? */ 595 # ifndef CTYPE256 596 # define CTYPE256 597 # endif 598 #endif 599 600 /* 601 602 =head1 Character classification 603 This section is about functions (really macros) that classify characters 604 into types, such as punctuation versus alphabetic, etc. Most of these are 605 analogous to regular expression character classes. (See 606 L<perlrecharclass/POSIX Character Classes>.) There are several variants for 607 each class. (Not all macros have all variants; each item below lists the 608 ones valid for it.) None are affected by C<use bytes>, and only the ones 609 with C<LC> in the name are affected by the current locale. 610 611 The base function, e.g., C<isALPHA()>, takes any signed or unsigned value, 612 treating it as a code point, and returns a boolean as to whether or not the 613 character represented by it is (or on non-ASCII platforms, corresponds to) an 614 ASCII character in the named class based on platform, Unicode, and Perl rules. 615 If the input is a number that doesn't fit in an octet, FALSE is returned. 616 617 Variant C<isI<FOO>_A> (e.g., C<isALPHA_A()>) is identical to the base function 618 with no suffix C<"_A">. This variant is used to emphasize by its name that 619 only ASCII-range characters can return TRUE. 620 621 Variant C<isI<FOO>_L1> imposes the Latin-1 (or EBCDIC equivalent) character set 622 onto the platform. That is, the code points that are ASCII are unaffected, 623 since ASCII is a subset of Latin-1. 
But the non-ASCII code points are treated
as if they are Latin-1 characters. For example, C<isWORDCHAR_L1()> will return
true when called with the code point 0xDF, which is a word character in both
ASCII and EBCDIC (though it represents different characters in each).
If the input is a number that doesn't fit in an octet, FALSE is returned.
(Perl's documentation uses a colloquial definition of Latin-1, to include all
code points below 256.)

Variant C<isI<FOO>_uvchr> is exactly like the C<isI<FOO>_L1> variant, for
inputs below 256, but if the code point is larger than 255, Unicode rules are
used to determine if it is in the character class. For example,
C<isWORDCHAR_uvchr(0x100)> returns TRUE, since 0x100 is LATIN CAPITAL LETTER A
WITH MACRON in Unicode, and is a word character.

Variants C<isI<FOO>_utf8> and C<isI<FOO>_utf8_safe> are like C<isI<FOO>_uvchr>,
but are used for UTF-8 encoded strings. The two forms are different names for
the same thing. Each call to one of these classifies the first character of
the string starting at C<p>. The second parameter, C<e>, points to anywhere in
the string beyond the first character, up to one byte past the end of the
entire string. Although both variants are identical, the suffix C<_safe> in
one name emphasizes that it will not attempt to read beyond S<C<e - 1>>,
provided that the constraint S<C<p E<lt> e>> is true (this is asserted for in
C<-DDEBUGGING> builds). If the UTF-8 for the input character is malformed in
some way, the program may croak, or the function may return FALSE, at the
discretion of the implementation, and subject to change in future releases.

Variant C<isI<FOO>_LC> is like the C<isI<FOO>_A> and C<isI<FOO>_L1> variants,
but the result is based on the current locale, which is what C<LC> in the name
stands for.
If Perl can determine that the current locale is a UTF-8 locale,
it uses the published Unicode rules; otherwise, it uses the C library function
that gives the named classification. For example, C<isDIGIT_LC()> when not in
a UTF-8 locale returns the result of calling C<isdigit()>. FALSE is always
returned if the input won't fit into an octet. On some platforms where the C
library function is known to be defective, Perl changes its result to follow
the POSIX standard's rules.

Variant C<isI<FOO>_LC_uvchr> acts exactly like C<isI<FOO>_LC> for inputs less
than 256, but for larger ones it returns the Unicode classification of the code
point.

Variants C<isI<FOO>_LC_utf8> and C<isI<FOO>_LC_utf8_safe> are like
C<isI<FOO>_LC_uvchr>, but are used for UTF-8 encoded strings. The two forms
are different names for the same thing. Each call to one of these classifies
the first character of the string starting at C<p>. The second parameter,
C<e>, points to anywhere in the string beyond the first character, up to one
byte past the end of the entire string. Although both variants are identical,
the suffix C<_safe> in one name emphasizes that it will not attempt to read
beyond S<C<e - 1>>, provided that the constraint S<C<p E<lt> e>> is true (this
is asserted for in C<-DDEBUGGING> builds). If the UTF-8 for the input
character is malformed in some way, the program may croak, or the function may
return FALSE, at the discretion of the implementation, and subject to change in
future releases.

=for apidoc Am|bool|isALPHA|int ch
Returns a boolean indicating whether the specified input is one of C<[A-Za-z]>,
analogous to C<m/[[:alpha:]]/>.
See the L<top of this section|/Character classification> for an explanation of
variants
C<isALPHA_A>, C<isALPHA_L1>, C<isALPHA_uvchr>, C<isALPHA_utf8>,
C<isALPHA_utf8_safe>, C<isALPHA_LC>, C<isALPHA_LC_uvchr>, C<isALPHA_LC_utf8>,
and C<isALPHA_LC_utf8_safe>.

=cut

Here and below, we add the prototypes of these macros for downstream programs
that would be interested in them, such as Devel::PPPort

=for apidoc Amh|bool|isALPHA_A|int ch
=for apidoc Amh|bool|isALPHA_L1|int ch
=for apidoc Amh|bool|isALPHA_uvchr|int ch
=for apidoc Amh|bool|isALPHA_utf8_safe|U8 * s|U8 * end
=for apidoc Amh|bool|isALPHA_utf8|U8 * s|U8 * end
=for apidoc Amh|bool|isALPHA_LC|int ch
=for apidoc Amh|bool|isALPHA_LC_uvchr|int ch
=for apidoc Amh|bool|isALPHA_LC_utf8_safe|U8 * s| U8 *end

=for apidoc Am|bool|isALPHANUMERIC|int ch
Returns a boolean indicating whether the specified character is one of
C<[A-Za-z0-9]>, analogous to C<m/[[:alnum:]]/>.
See the L<top of this section|/Character classification> for an explanation of
variants
C<isALPHANUMERIC_A>, C<isALPHANUMERIC_L1>, C<isALPHANUMERIC_uvchr>,
C<isALPHANUMERIC_utf8>, C<isALPHANUMERIC_utf8_safe>, C<isALPHANUMERIC_LC>,
C<isALPHANUMERIC_LC_uvchr>, C<isALPHANUMERIC_LC_utf8>, and
C<isALPHANUMERIC_LC_utf8_safe>.

A (discouraged from use) synonym is C<isALNUMC> (where the C<C> suffix means
this corresponds to the C language alphanumeric definition). Also
there are the variants
C<isALNUMC_A>, C<isALNUMC_L1>,
C<isALNUMC_LC>, and C<isALNUMC_LC_uvchr>.

=for apidoc Amh|bool|isALPHANUMERIC_A|int ch
=for apidoc Amh|bool|isALPHANUMERIC_L1|int ch
=for apidoc Amh|bool|isALPHANUMERIC_uvchr|int ch
=for apidoc Amh|bool|isALPHANUMERIC_utf8_safe|U8 * s|U8 * end
=for apidoc Amh|bool|isALPHANUMERIC_utf8|U8 * s|U8 * end
=for apidoc Amh|bool|isALPHANUMERIC_LC|int ch
=for apidoc Amh|bool|isALPHANUMERIC_LC_uvchr|int ch
=for apidoc Amh|bool|isALPHANUMERIC_LC_utf8_safe|U8 * s| U8 *end
=for apidoc Amh|bool|isALNUMC|int ch
=for apidoc Amh|bool|isALNUMC_A|int ch
=for apidoc Amh|bool|isALNUMC_L1|int ch
=for apidoc Amh|bool|isALNUMC_LC|int ch
=for apidoc Amh|bool|isALNUMC_LC_uvchr|int ch

=for apidoc Am|bool|isASCII|int ch
Returns a boolean indicating whether the specified character is one of the 128
characters in the ASCII character set, analogous to C<m/[[:ascii:]]/>.
On non-ASCII platforms, it returns TRUE iff this
character corresponds to an ASCII character. Variants C<isASCII_A()> and
C<isASCII_L1()> are identical to C<isASCII()>.
See the L<top of this section|/Character classification> for an explanation of
variants
C<isASCII_uvchr>, C<isASCII_utf8>, C<isASCII_utf8_safe>, C<isASCII_LC>,
C<isASCII_LC_uvchr>, C<isASCII_LC_utf8>, and C<isASCII_LC_utf8_safe>.
Note, however, that some platforms do not have the C library routine
C<isascii()>. In these cases, the variants whose names contain C<LC> are the
same as the corresponding ones without.

=for apidoc Amh|bool|isASCII_A|int ch
=for apidoc Amh|bool|isASCII_L1|int ch
=for apidoc Amh|bool|isASCII_uvchr|int ch
=for apidoc Amh|bool|isASCII_utf8_safe|U8 * s|U8 * end
=for apidoc Amh|bool|isASCII_utf8|U8 * s|U8 * end
=for apidoc Amh|bool|isASCII_LC|int ch
=for apidoc Amh|bool|isASCII_LC_uvchr|int ch
=for apidoc Amh|bool|isASCII_LC_utf8_safe|U8 * s| U8 *end

Also note that, because all ASCII characters are UTF-8 invariant (meaning they
have the exact same representation (always a single byte) whether encoded in
UTF-8 or not), C<isASCII> will give the correct results when called with any
byte in any string encoded or not in UTF-8. And similarly C<isASCII_utf8> and
C<isASCII_utf8_safe> will work properly on any string encoded or not in UTF-8.

=for apidoc Am|bool|isBLANK|char ch
Returns a boolean indicating whether the specified character is a
character considered to be a blank, analogous to C<m/[[:blank:]]/>.
See the L<top of this section|/Character classification> for an explanation of
variants
C<isBLANK_A>, C<isBLANK_L1>, C<isBLANK_uvchr>, C<isBLANK_utf8>,
C<isBLANK_utf8_safe>, C<isBLANK_LC>, C<isBLANK_LC_uvchr>, C<isBLANK_LC_utf8>,
and C<isBLANK_LC_utf8_safe>. Note,
however, that some platforms do not have the C library routine
C<isblank()>. In these cases, the variants whose names contain C<LC> are
the same as the corresponding ones without.
769 770 =for apidoc Amh|bool|isBLANK_A|int ch 771 =for apidoc Amh|bool|isBLANK_L1|int ch 772 =for apidoc Amh|bool|isBLANK_uvchr|int ch 773 =for apidoc Amh|bool|isBLANK_utf8_safe|U8 * s|U8 * end 774 =for apidoc Amh|bool|isBLANK_utf8|U8 * s|U8 * end 775 =for apidoc Amh|bool|isBLANK_LC|int ch 776 =for apidoc Amh|bool|isBLANK_LC_uvchr|int ch 777 =for apidoc Amh|bool|isBLANK_LC_utf8_safe|U8 * s| U8 *end 778 779 =for apidoc Am|bool|isCNTRL|char ch 780 Returns a boolean indicating whether the specified character is a 781 control character, analogous to C<m/[[:cntrl:]]/>. 782 See the L<top of this section|/Character classification> for an explanation of 783 variants 784 C<isCNTRL_A>, C<isCNTRL_L1>, C<isCNTRL_uvchr>, C<isCNTRL_utf8>, 785 C<isCNTRL_utf8_safe>, C<isCNTRL_LC>, C<isCNTRL_LC_uvchr>, C<isCNTRL_LC_utf8> 786 and C<isCNTRL_LC_utf8_safe>. On EBCDIC 787 platforms, you almost always want to use the C<isCNTRL_L1> variant. 788 789 =for apidoc Amh|bool|isCNTRL_A|int ch 790 =for apidoc Amh|bool|isCNTRL_L1|int ch 791 =for apidoc Amh|bool|isCNTRL_uvchr|int ch 792 =for apidoc Amh|bool|isCNTRL_utf8_safe|U8 * s|U8 * end 793 =for apidoc Amh|bool|isCNTRL_utf8|U8 * s|U8 * end 794 =for apidoc Amh|bool|isCNTRL_LC|int ch 795 =for apidoc Amh|bool|isCNTRL_LC_uvchr|int ch 796 =for apidoc Amh|bool|isCNTRL_LC_utf8_safe|U8 * s| U8 *end 797 798 =for apidoc Am|bool|isDIGIT|char ch 799 Returns a boolean indicating whether the specified character is a 800 digit, analogous to C<m/[[:digit:]]/>. 801 Variants C<isDIGIT_A> and C<isDIGIT_L1> are identical to C<isDIGIT>. 802 See the L<top of this section|/Character classification> for an explanation of 803 variants 804 C<isDIGIT_uvchr>, C<isDIGIT_utf8>, C<isDIGIT_utf8_safe>, C<isDIGIT_LC>, 805 C<isDIGIT_LC_uvchr>, C<isDIGIT_LC_utf8>, and C<isDIGIT_LC_utf8_safe>. 
806 807 =for apidoc Amh|bool|isDIGIT_A|int ch 808 =for apidoc Amh|bool|isDIGIT_L1|int ch 809 =for apidoc Amh|bool|isDIGIT_uvchr|int ch 810 =for apidoc Amh|bool|isDIGIT_utf8_safe|U8 * s|U8 * end 811 =for apidoc Amh|bool|isDIGIT_utf8|U8 * s|U8 * end 812 =for apidoc Amh|bool|isDIGIT_LC|int ch 813 =for apidoc Amh|bool|isDIGIT_LC_uvchr|int ch 814 =for apidoc Amh|bool|isDIGIT_LC_utf8_safe|U8 * s| U8 *end 815 816 =for apidoc Am|bool|isGRAPH|char ch 817 Returns a boolean indicating whether the specified character is a 818 graphic character, analogous to C<m/[[:graph:]]/>. 819 See the L<top of this section|/Character classification> for an explanation of 820 variants C<isGRAPH_A>, C<isGRAPH_L1>, C<isGRAPH_uvchr>, C<isGRAPH_utf8>, 821 C<isGRAPH_utf8_safe>, C<isGRAPH_LC>, C<isGRAPH_LC_uvchr>, 822 C<isGRAPH_LC_utf8>, and C<isGRAPH_LC_utf8_safe>. 823 824 =for apidoc Amh|bool|isGRAPH_A|int ch 825 =for apidoc Amh|bool|isGRAPH_L1|int ch 826 =for apidoc Amh|bool|isGRAPH_uvchr|int ch 827 =for apidoc Amh|bool|isGRAPH_utf8_safe|U8 * s|U8 * end 828 =for apidoc Amh|bool|isGRAPH_utf8|U8 * s|U8 * end 829 =for apidoc Amh|bool|isGRAPH_LC|int ch 830 =for apidoc Amh|bool|isGRAPH_LC_uvchr|int ch 831 =for apidoc Amh|bool|isGRAPH_LC_utf8_safe|U8 * s| U8 *end 832 833 =for apidoc Am|bool|isLOWER|char ch 834 Returns a boolean indicating whether the specified character is a 835 lowercase character, analogous to C<m/[[:lower:]]/>. 836 See the L<top of this section|/Character classification> for an explanation of 837 variants 838 C<isLOWER_A>, C<isLOWER_L1>, C<isLOWER_uvchr>, C<isLOWER_utf8>, 839 C<isLOWER_utf8_safe>, C<isLOWER_LC>, C<isLOWER_LC_uvchr>, C<isLOWER_LC_utf8>, 840 and C<isLOWER_LC_utf8_safe>. 
841 842 =for apidoc Amh|bool|isLOWER_A|int ch 843 =for apidoc Amh|bool|isLOWER_L1|int ch 844 =for apidoc Amh|bool|isLOWER_uvchr|int ch 845 =for apidoc Amh|bool|isLOWER_utf8_safe|U8 * s|U8 * end 846 =for apidoc Amh|bool|isLOWER_utf8|U8 * s|U8 * end 847 =for apidoc Amh|bool|isLOWER_LC|int ch 848 =for apidoc Amh|bool|isLOWER_LC_uvchr|int ch 849 =for apidoc Amh|bool|isLOWER_LC_utf8_safe|U8 * s| U8 *end 850 851 =for apidoc Am|bool|isOCTAL|char ch 852 Returns a boolean indicating whether the specified character is an 853 octal digit, [0-7]. 854 The only two variants are C<isOCTAL_A> and C<isOCTAL_L1>; each is identical to 855 C<isOCTAL>. 856 857 =for apidoc Amh|bool|isOCTAL_A|int ch 858 =for apidoc Amh|bool|isOCTAL_L1|int ch 859 860 =for apidoc Am|bool|isPUNCT|char ch 861 Returns a boolean indicating whether the specified character is a 862 punctuation character, analogous to C<m/[[:punct:]]/>. 863 Note that the definition of what is punctuation isn't as 864 straightforward as one might desire. See L<perlrecharclass/POSIX Character 865 Classes> for details. 866 See the L<top of this section|/Character classification> for an explanation of 867 variants C<isPUNCT_A>, C<isPUNCT_L1>, C<isPUNCT_uvchr>, C<isPUNCT_utf8>, 868 C<isPUNCT_utf8_safe>, C<isPUNCT_LC>, C<isPUNCT_LC_uvchr>, C<isPUNCT_LC_utf8>, 869 and C<isPUNCT_LC_utf8_safe>. 870 871 =for apidoc Amh|bool|isPUNCT_A|int ch 872 =for apidoc Amh|bool|isPUNCT_L1|int ch 873 =for apidoc Amh|bool|isPUNCT_uvchr|int ch 874 =for apidoc Amh|bool|isPUNCT_utf8_safe|U8 * s|U8 * end 875 =for apidoc Amh|bool|isPUNCT_utf8|U8 * s|U8 * end 876 =for apidoc Amh|bool|isPUNCT_LC|int ch 877 =for apidoc Amh|bool|isPUNCT_LC_uvchr|int ch 878 =for apidoc Amh|bool|isPUNCT_LC_utf8_safe|U8 * s| U8 *end 879 880 =for apidoc Am|bool|isSPACE|char ch 881 Returns a boolean indicating whether the specified character is a 882 whitespace character. This is analogous 883 to what C<m/\s/> matches in a regular expression. 
Starting in Perl 5.18 884 this also matches what C<m/[[:space:]]/> does. Prior to 5.18, only the 885 locale forms of this macro (the ones with C<LC> in their names) matched 886 precisely what C<m/[[:space:]]/> does. In those releases, the only difference, 887 in the non-locale variants, was that C<isSPACE()> did not match a vertical tab. 888 (See L</isPSXSPC> for a macro that matches a vertical tab in all releases.) 889 See the L<top of this section|/Character classification> for an explanation of 890 variants 891 C<isSPACE_A>, C<isSPACE_L1>, C<isSPACE_uvchr>, C<isSPACE_utf8>, 892 C<isSPACE_utf8_safe>, C<isSPACE_LC>, C<isSPACE_LC_uvchr>, C<isSPACE_LC_utf8>, 893 and C<isSPACE_LC_utf8_safe>. 894 895 =for apidoc Amh|bool|isSPACE_A|int ch 896 =for apidoc Amh|bool|isSPACE_L1|int ch 897 =for apidoc Amh|bool|isSPACE_uvchr|int ch 898 =for apidoc Amh|bool|isSPACE_utf8_safe|U8 * s|U8 * end 899 =for apidoc Amh|bool|isSPACE_utf8|U8 * s|U8 * end 900 =for apidoc Amh|bool|isSPACE_LC|int ch 901 =for apidoc Amh|bool|isSPACE_LC_uvchr|int ch 902 =for apidoc Amh|bool|isSPACE_LC_utf8_safe|U8 * s| U8 *end 903 904 =for apidoc Am|bool|isPSXSPC|char ch 905 (short for Posix Space) 906 Starting in 5.18, this is identical in all its forms to the 907 corresponding C<isSPACE()> macros. 908 The locale forms of this macro are identical to their corresponding 909 C<isSPACE()> forms in all Perl releases. In releases prior to 5.18, the 910 non-locale forms differ from their C<isSPACE()> forms only in that the 911 C<isSPACE()> forms don't match a Vertical Tab, and the C<isPSXSPC()> forms do. 912 Otherwise they are identical. Thus this macro is analogous to what 913 C<m/[[:space:]]/> matches in a regular expression. 914 See the L<top of this section|/Character classification> for an explanation of 915 variants C<isPSXSPC_A>, C<isPSXSPC_L1>, C<isPSXSPC_uvchr>, C<isPSXSPC_utf8>, 916 C<isPSXSPC_utf8_safe>, C<isPSXSPC_LC>, C<isPSXSPC_LC_uvchr>, 917 C<isPSXSPC_LC_utf8>, and C<isPSXSPC_LC_utf8_safe>. 
918 919 =for apidoc Amh|bool|isPSXSPC_A|int ch 920 =for apidoc Amh|bool|isPSXSPC_L1|int ch 921 =for apidoc Amh|bool|isPSXSPC_uvchr|int ch 922 =for apidoc Amh|bool|isPSXSPC_utf8_safe|U8 * s|U8 * end 923 =for apidoc Amh|bool|isPSXSPC_utf8|U8 * s|U8 * end 924 =for apidoc Amh|bool|isPSXSPC_LC|int ch 925 =for apidoc Amh|bool|isPSXSPC_LC_uvchr|int ch 926 =for apidoc Amh|bool|isPSXSPC_LC_utf8_safe|U8 * s| U8 *end 927 928 =for apidoc Am|bool|isUPPER|char ch 929 Returns a boolean indicating whether the specified character is an 930 uppercase character, analogous to C<m/[[:upper:]]/>. 931 See the L<top of this section|/Character classification> for an explanation of 932 variants C<isUPPER_A>, C<isUPPER_L1>, C<isUPPER_uvchr>, C<isUPPER_utf8>, 933 C<isUPPER_utf8_safe>, C<isUPPER_LC>, C<isUPPER_LC_uvchr>, C<isUPPER_LC_utf8>, 934 and C<isUPPER_LC_utf8_safe>. 935 936 =for apidoc Amh|bool|isUPPER_A|int ch 937 =for apidoc Amh|bool|isUPPER_L1|int ch 938 =for apidoc Amh|bool|isUPPER_uvchr|int ch 939 =for apidoc Amh|bool|isUPPER_utf8_safe|U8 * s|U8 * end 940 =for apidoc Amh|bool|isUPPER_utf8|U8 * s|U8 * end 941 =for apidoc Amh|bool|isUPPER_LC|int ch 942 =for apidoc Amh|bool|isUPPER_LC_uvchr|int ch 943 =for apidoc Amh|bool|isUPPER_LC_utf8_safe|U8 * s| U8 *end 944 945 =for apidoc Am|bool|isPRINT|char ch 946 Returns a boolean indicating whether the specified character is a 947 printable character, analogous to C<m/[[:print:]]/>. 948 See the L<top of this section|/Character classification> for an explanation of 949 variants 950 C<isPRINT_A>, C<isPRINT_L1>, C<isPRINT_uvchr>, C<isPRINT_utf8>, 951 C<isPRINT_utf8_safe>, C<isPRINT_LC>, C<isPRINT_LC_uvchr>, C<isPRINT_LC_utf8>, 952 and C<isPRINT_LC_utf8_safe>. 
953 954 =for apidoc Amh|bool|isPRINT_A|int ch 955 =for apidoc Amh|bool|isPRINT_L1|int ch 956 =for apidoc Amh|bool|isPRINT_uvchr|int ch 957 =for apidoc Amh|bool|isPRINT_utf8_safe|U8 * s|U8 * end 958 =for apidoc Amh|bool|isPRINT_utf8|U8 * s|U8 * end 959 =for apidoc Amh|bool|isPRINT_LC|int ch 960 =for apidoc Amh|bool|isPRINT_LC_uvchr|int ch 961 =for apidoc Amh|bool|isPRINT_LC_utf8_safe|U8 * s| U8 *end 962 963 =for apidoc Am|bool|isWORDCHAR|char ch 964 Returns a boolean indicating whether the specified character is a character 965 that is a word character, analogous to what C<m/\w/> and C<m/[[:word:]]/> match 966 in a regular expression. A word character is an alphabetic character, a 967 decimal digit, a connecting punctuation character (such as an underscore), or 968 a "mark" character that attaches to one of those (like some sort of accent). 969 C<isALNUM()> is a synonym provided for backward compatibility, even though a 970 word character includes more than the standard C language meaning of 971 alphanumeric. 972 See the L<top of this section|/Character classification> for an explanation of 973 variants C<isWORDCHAR_A>, C<isWORDCHAR_L1>, C<isWORDCHAR_uvchr>, 974 C<isWORDCHAR_utf8>, and C<isWORDCHAR_utf8_safe>. C<isWORDCHAR_LC>, 975 C<isWORDCHAR_LC_uvchr>, C<isWORDCHAR_LC_utf8>, and C<isWORDCHAR_LC_utf8_safe> 976 are also as described there, but additionally include the platform's native 977 underscore. 
978 979 =for apidoc Amh|bool|isWORDCHAR_A|int ch 980 =for apidoc Amh|bool|isWORDCHAR_L1|int ch 981 =for apidoc Amh|bool|isWORDCHAR_uvchr|int ch 982 =for apidoc Amh|bool|isWORDCHAR_utf8_safe|U8 * s|U8 * end 983 =for apidoc Amh|bool|isWORDCHAR_utf8|U8 * s|U8 * end 984 =for apidoc Amh|bool|isWORDCHAR_LC|int ch 985 =for apidoc Amh|bool|isWORDCHAR_LC_uvchr|int ch 986 =for apidoc Amh|bool|isWORDCHAR_LC_utf8_safe|U8 * s| U8 *end 987 =for apidoc Amh|bool|isALNUM|int ch 988 =for apidoc Amh|bool|isALNUM_A|int ch 989 =for apidoc Amh|bool|isALNUM_LC|int ch 990 =for apidoc Amh|bool|isALNUM_LC_uvchr|int ch 991 992 =for apidoc Am|bool|isXDIGIT|char ch 993 Returns a boolean indicating whether the specified character is a hexadecimal 994 digit. In the ASCII range these are C<[0-9A-Fa-f]>. Variants C<isXDIGIT_A()> 995 and C<isXDIGIT_L1()> are identical to C<isXDIGIT()>. 996 See the L<top of this section|/Character classification> for an explanation of 997 variants 998 C<isXDIGIT_uvchr>, C<isXDIGIT_utf8>, C<isXDIGIT_utf8_safe>, C<isXDIGIT_LC>, 999 C<isXDIGIT_LC_uvchr>, C<isXDIGIT_LC_utf8>, and C<isXDIGIT_LC_utf8_safe>. 1000 1001 =for apidoc Amh|bool|isXDIGIT_A|int ch 1002 =for apidoc Amh|bool|isXDIGIT_L1|int ch 1003 =for apidoc Amh|bool|isXDIGIT_uvchr|int ch 1004 =for apidoc Amh|bool|isXDIGIT_utf8_safe|U8 * s|U8 * end 1005 =for apidoc Amh|bool|isXDIGIT_utf8|U8 * s|U8 * end 1006 =for apidoc Amh|bool|isXDIGIT_LC|int ch 1007 =for apidoc Amh|bool|isXDIGIT_LC_uvchr|int ch 1008 =for apidoc Amh|bool|isXDIGIT_LC_utf8_safe|U8 * s| U8 *end 1009 1010 =for apidoc Am|bool|isIDFIRST|char ch 1011 Returns a boolean indicating whether the specified character can be the first 1012 character of an identifier. This is very close to, but not quite the same as 1013 the official Unicode property C<XID_Start>. The difference is that this 1014 returns true only if the input character also matches L</isWORDCHAR>. 
1015 See the L<top of this section|/Character classification> for an explanation of 1016 variants 1017 C<isIDFIRST_A>, C<isIDFIRST_L1>, C<isIDFIRST_uvchr>, C<isIDFIRST_utf8>, 1018 C<isIDFIRST_utf8_safe>, C<isIDFIRST_LC>, C<isIDFIRST_LC_uvchr>, 1019 C<isIDFIRST_LC_utf8>, and C<isIDFIRST_LC_utf8_safe>. 1020 1021 =for apidoc Amh|bool|isIDFIRST_A|int ch 1022 =for apidoc Amh|bool|isIDFIRST_L1|int ch 1023 =for apidoc Amh|bool|isIDFIRST_uvchr|int ch 1024 =for apidoc Amh|bool|isIDFIRST_utf8_safe|U8 * s|U8 * end 1025 =for apidoc Amh|bool|isIDFIRST_utf8|U8 * s|U8 * end 1026 =for apidoc Amh|bool|isIDFIRST_LC|int ch 1027 =for apidoc Amh|bool|isIDFIRST_LC_uvchr|int ch 1028 =for apidoc Amh|bool|isIDFIRST_LC_utf8_safe|U8 * s| U8 *end 1029 1030 =for apidoc Am|bool|isIDCONT|char ch 1031 Returns a boolean indicating whether the specified character can be the 1032 second or succeeding character of an identifier. This is very close to, but 1033 not quite the same as the official Unicode property C<XID_Continue>. The 1034 difference is that this returns true only if the input character also matches 1035 L</isWORDCHAR>. See the L<top of this section|/Character classification> for 1036 an explanation of variants C<isIDCONT_A>, C<isIDCONT_L1>, C<isIDCONT_uvchr>, 1037 C<isIDCONT_utf8>, C<isIDCONT_utf8_safe>, C<isIDCONT_LC>, C<isIDCONT_LC_uvchr>, 1038 C<isIDCONT_LC_utf8>, and C<isIDCONT_LC_utf8_safe>. 1039 1040 =for apidoc Amh|bool|isIDCONT_A|int ch 1041 =for apidoc Amh|bool|isIDCONT_L1|int ch 1042 =for apidoc Amh|bool|isIDCONT_uvchr|int ch 1043 =for apidoc Amh|bool|isIDCONT_utf8_safe|U8 * s|U8 * end 1044 =for apidoc Amh|bool|isIDCONT_utf8|U8 * s|U8 * end 1045 =for apidoc Amh|bool|isIDCONT_LC|int ch 1046 =for apidoc Amh|bool|isIDCONT_LC_uvchr|int ch 1047 =for apidoc Amh|bool|isIDCONT_LC_utf8_safe|U8 * s| U8 *end 1048 1049 =head1 Miscellaneous Functions 1050 1051 =for apidoc Am|U8|READ_XDIGIT|char str* 1052 Returns the value of an ASCII-range hex digit and advances the string pointer. 
1053 Behaviour is only well defined when isXDIGIT(*str) is true. 1054 1055 =head1 Character case changing 1056 Perl uses "full" Unicode case mappings. This means that converting a single 1057 character to another case may result in a sequence of more than one character. 1058 For example, the uppercase of C<E<223>> (LATIN SMALL LETTER SHARP S) is the two 1059 character sequence C<SS>. This presents some complications. The lowercase of 1060 all characters in the range 0..255 is a single character, and thus 1061 C<L</toLOWER_L1>> is furnished. But, C<toUPPER_L1> can't exist, as it couldn't 1062 return a valid result for all legal inputs. Instead C<L</toUPPER_uvchr>> has 1063 an API that does allow every possible legal result to be returned. Likewise 1064 no other function that is crippled by not being able to give the correct 1065 results for the full range of possible inputs has been implemented here. 1066 1067 =for apidoc Am|U8|toUPPER|int ch 1068 Converts the specified character to uppercase. If the input is anything but an 1069 ASCII lowercase character, that input character itself is returned. Variant 1070 C<toUPPER_A> is equivalent. 1071 1072 =for apidoc Am|UV|toUPPER_uvchr|UV cp|U8* s|STRLEN* lenp 1073 Converts the code point C<cp> to its uppercase version, and 1074 stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. The code 1075 point is interpreted as native if less than 256; otherwise as Unicode. Note 1076 that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1> 1077 bytes since the uppercase version may be longer than the original character. 1078 1079 The first code point of the uppercased version is returned 1080 (but note, as explained at L<the top of this section|/Character case 1081 changing>, that there may be more). 
1082 1083 =for apidoc Am|UV|toUPPER_utf8|U8* p|U8* e|U8* s|STRLEN* lenp 1084 Converts the first UTF-8 encoded character in the sequence starting at C<p> and 1085 extending no further than S<C<e - 1>> to its uppercase version, and 1086 stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note 1087 that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1> 1088 bytes since the uppercase version may be longer than the original character. 1089 1090 The first code point of the uppercased version is returned 1091 (but note, as explained at L<the top of this section|/Character case 1092 changing>, that there may be more). 1093 1094 It will not attempt to read beyond S<C<e - 1>>, provided that the constraint 1095 S<C<p E<lt> e>> is true (this is asserted for in C<-DDEBUGGING> builds). If 1096 the UTF-8 for the input character is malformed in some way, the program may 1097 croak, or the function may return the REPLACEMENT CHARACTER, at the discretion 1098 of the implementation, and subject to change in future releases. 1099 1100 =for apidoc Am|UV|toUPPER_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp 1101 Same as L</toUPPER_utf8>. 1102 1103 =for apidoc Am|U8|toFOLD|U8 ch 1104 Converts the specified character to foldcase. If the input is anything but an 1105 ASCII uppercase character, that input character itself is returned. Variant 1106 C<toFOLD_A> is equivalent. (There is no equivalent C<toFOLD_L1> for the full 1107 Latin1 range, as the full generality of L</toFOLD_uvchr> is needed there.) 1108 1109 =for apidoc Am|UV|toFOLD_uvchr|UV cp|U8* s|STRLEN* lenp 1110 Converts the code point C<cp> to its foldcase version, and 1111 stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. The code 1112 point is interpreted as native if less than 256; otherwise as Unicode. Note 1113 that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1> 1114 bytes since the foldcase version may be longer than the original character. 
1115 1116 The first code point of the foldcased version is returned 1117 (but note, as explained at L<the top of this section|/Character case 1118 changing>, that there may be more). 1119 1120 =for apidoc Am|UV|toFOLD_utf8|U8* p|U8* e|U8* s|STRLEN* lenp 1121 Converts the first UTF-8 encoded character in the sequence starting at C<p> and 1122 extending no further than S<C<e - 1>> to its foldcase version, and 1123 stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note 1124 that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1> 1125 bytes since the foldcase version may be longer than the original character. 1126 1127 The first code point of the foldcased version is returned 1128 (but note, as explained at L<the top of this section|/Character case 1129 changing>, that there may be more). 1130 1131 It will not attempt 1132 to read beyond S<C<e - 1>>, provided that the constraint S<C<p E<lt> e>> is 1133 true (this is asserted for in C<-DDEBUGGING> builds). If the UTF-8 for the 1134 input character is malformed in some way, the program may croak, or the 1135 function may return the REPLACEMENT CHARACTER, at the discretion of the 1136 implementation, and subject to change in future releases. 1137 1138 =for apidoc Am|UV|toFOLD_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp 1139 Same as L</toFOLD_utf8>. 1140 1141 =for apidoc Am|U8|toLOWER|U8 ch 1142 Converts the specified character to lowercase. If the input is anything but an 1143 ASCII uppercase character, that input character itself is returned. Variant 1144 C<toLOWER_A> is equivalent. 1145 1146 =for apidoc Am|U8|toLOWER_L1|U8 ch 1147 Converts the specified Latin1 character to lowercase. The results are 1148 undefined if the input doesn't fit in a byte. 1149 1150 =for apidoc Am|U8|toLOWER_LC|U8 ch 1151 Converts the specified character to lowercase using the current locale's rules, 1152 if possible; otherwise returns the input character itself. 
1153 1154 =for apidoc Am|UV|toLOWER_uvchr|UV cp|U8* s|STRLEN* lenp 1155 Converts the code point C<cp> to its lowercase version, and 1156 stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. The code 1157 point is interpreted as native if less than 256; otherwise as Unicode. Note 1158 that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1> 1159 bytes since the lowercase version may be longer than the original character. 1160 1161 The first code point of the lowercased version is returned 1162 (but note, as explained at L<the top of this section|/Character case 1163 changing>, that there may be more). 1164 1165 =for apidoc Am|UV|toLOWER_utf8|U8* p|U8* e|U8* s|STRLEN* lenp 1166 Converts the first UTF-8 encoded character in the sequence starting at C<p> and 1167 extending no further than S<C<e - 1>> to its lowercase version, and 1168 stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note 1169 that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1> 1170 bytes since the lowercase version may be longer than the original character. 1171 1172 The first code point of the lowercased version is returned 1173 (but note, as explained at L<the top of this section|/Character case 1174 changing>, that there may be more). 1175 It will not attempt to read beyond S<C<e - 1>>, provided that the constraint 1176 S<C<p E<lt> e>> is true (this is asserted for in C<-DDEBUGGING> builds). If 1177 the UTF-8 for the input character is malformed in some way, the program may 1178 croak, or the function may return the REPLACEMENT CHARACTER, at the discretion 1179 of the implementation, and subject to change in future releases. 1180 1181 =for apidoc Am|UV|toLOWER_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp 1182 Same as L</toLOWER_utf8>. 1183 1184 =for apidoc Am|U8|toTITLE|U8 ch 1185 Converts the specified character to titlecase. 
If the input is anything but an 1186 ASCII lowercase character, that input character itself is returned. Variant 1187 C<toTITLE_A> is equivalent. (There is no C<toTITLE_L1> for the full Latin1 1188 range, as the full generality of L</toTITLE_uvchr> is needed there. Titlecase is 1189 not a concept used in locale handling, so there is no functionality for that.) 1190 1191 =for apidoc Am|UV|toTITLE_uvchr|UV cp|U8* s|STRLEN* lenp 1192 Converts the code point C<cp> to its titlecase version, and 1193 stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. The code 1194 point is interpreted as native if less than 256; otherwise as Unicode. Note 1195 that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1> 1196 bytes since the titlecase version may be longer than the original character. 1197 1198 The first code point of the titlecased version is returned 1199 (but note, as explained at L<the top of this section|/Character case 1200 changing>, that there may be more). 1201 1202 =for apidoc Am|UV|toTITLE_utf8|U8* p|U8* e|U8* s|STRLEN* lenp 1203 Converts the first UTF-8 encoded character in the sequence starting at C<p> and 1204 extending no further than S<C<e - 1>> to its titlecase version, and 1205 stores that in UTF-8 in C<s>, and its length in bytes in C<lenp>. Note 1206 that the buffer pointed to by C<s> needs to be at least C<UTF8_MAXBYTES_CASE+1> 1207 bytes since the titlecase version may be longer than the original character. 1208 1209 The first code point of the titlecased version is returned 1210 (but note, as explained at L<the top of this section|/Character case 1211 changing>, that there may be more). 1212 1213 It will not attempt 1214 to read beyond S<C<e - 1>>, provided that the constraint S<C<p E<lt> e>> is 1215 true (this is asserted for in C<-DDEBUGGING> builds). 
If the UTF-8 for the 1216 input character is malformed in some way, the program may croak, or the 1217 function may return the REPLACEMENT CHARACTER, at the discretion of the 1218 implementation, and subject to change in future releases. 1219 1220 =for apidoc Am|UV|toTITLE_utf8_safe|U8* p|U8* e|U8* s|STRLEN* lenp 1221 Same as L</toTITLE_utf8>. 1222 1223 =cut 1224 1225 XXX Still undocumented isVERTWS_uvchr and _utf8; it's unclear what their names 1226 really should be. Also toUPPER_LC and toFOLD_LC, which are subject to change, 1227 and aren't general purpose as they don't work on U+DF, and assert against that. 1228 1229 Note that these macros are repeated in Devel::PPPort, so should also be 1230 patched there. The file as of this writing is cpan/Devel-PPPort/parts/inc/misc 1231 1232 */ 1233 1234 /* 1235 void below because that's the best fit, and works for Devel::PPPort 1236 =for apidoc AmnU|void|WIDEST_UTYPE 1237 1238 Yields the widest unsigned integer type on the platform, currently either 1239 C<U32> or C<U64> (C<U64> is chosen when C<QUADKIND> is defined). This can be used in declarations such as 1240 1241 WIDEST_UTYPE my_uv; 1242 1243 or casts 1244 1245 my_uv = (WIDEST_UTYPE) val; 1246 1247 =cut 1248 1249 */ 1250 #ifdef QUADKIND 1251 # define WIDEST_UTYPE U64 1252 #else 1253 # define WIDEST_UTYPE U32 1254 #endif 1255 1256 /* FITS_IN_8_BITS(c) returns true if c doesn't have a bit set other than in 1257 * the lower 8. It is designed to be hopefully bomb-proof, making sure that no 1258 * bits of information are lost even on a 64-bit machine, but to get the 1259 * compiler to optimize it out if possible. This is because Configure makes 1260 * sure that the machine has an 8-bit byte, so if c is stored in a byte, the 1261 * sizeof() guarantees that this evaluates to a constant true at compile time. 1262 * 1263 * For Coverity, be always true, because otherwise Coverity thinks 1264 * it finds several expressions that are always true, independent 1265 * of operands. Well, they are, but that is kind of the point. 
1266 */ 1267 #ifndef __COVERITY__ 1268 /* The '| 0' part ensures a compiler error if c is not integer (like e.g., a 1269 * pointer) */ 1270 #define FITS_IN_8_BITS(c) ( (sizeof(c) == 1) \ 1271 || !(((WIDEST_UTYPE)((c) | 0)) & ~0xFF)) 1272 #else 1273 #define FITS_IN_8_BITS(c) (1) 1274 #endif 1275 1276 /* Returns true if l <= c <= (l + n), where 'l' and 'n' are non-negative. 1277 * Written this way so that after optimization, only one conditional test is 1278 * needed. (The NV casts stop any warnings about comparison always being true 1279 * if called with an unsigned. The cast preserves the sign, which is all we 1280 * care about.) The difference 'c - l' is cast to WIDEST_UTYPE, so a 'c' less than 'l' wraps around to a large unsigned value and is rejected by the same '<=' test that checks the upper bound. */ 1281 #define withinCOUNT(c, l, n) (__ASSERT_((NV) (l) >= 0) \ 1282 __ASSERT_((NV) (n) >= 0) \ 1283 (((WIDEST_UTYPE) (((c)) - ((l) | 0))) <= (((WIDEST_UTYPE) ((n) | 0))))) 1284 1285 /* Returns true if c is in the range l..u, where 'l' is non-negative. 1286 * Written this way so that after optimization, only one conditional test is 1287 * needed. 'c' is first cast to the unsigned type that matches sizeof(c) (U8, U32, or else WIDEST_UTYPE, which is asserted), and the comparison itself is done by withinCOUNT() with a count of 'u - l'. */ 1288 #define inRANGE(c, l, u) (__ASSERT_((u) >= (l)) \ 1289 ( (sizeof(c) == sizeof(U8)) ? withinCOUNT(((U8) (c)), (l), ((u) - (l))) \ 1290 : (sizeof(c) == sizeof(U32)) ? withinCOUNT(((U32) (c)), (l), ((u) - (l))) \ 1291 : (__ASSERT_(sizeof(c) == sizeof(WIDEST_UTYPE)) \ 1292 withinCOUNT(((WIDEST_UTYPE) (c)), (l), ((u) - (l)))))) 1293 1294 #ifdef EBCDIC 1295 # ifndef _ALL_SOURCE 1296 /* The native libc isascii() et al. functions return the wrong results 1297 * on at least z/OS unless this is defined. */ 1298 # error _ALL_SOURCE should probably be defined 1299 # endif 1300 #else 1301 /* There is a simple definition of ASCII for ASCII platforms. But the 1302 * EBCDIC one isn't so simple, so is defined using table look-up like the 1303 * other macros below. 1304 * 1305 * The cast here is used instead of '(c) >= 0', because some compilers emit 1306 * a warning that that test is always true when the parameter is an 1307 * unsigned type. 
khw supposes that it could be written as 1308 * && ((c) == '\0' || (c) > 0) 1309 * to avoid the message, but the cast will likely avoid extra branches even 1310 * with stupid compilers. 1311 * 1312 * The '| 0' part ensures a compiler error if c is not integer (like e.g., 1313 * a pointer) */ 1314 # define isASCII(c) ((WIDEST_UTYPE)((c) | 0) < 128) 1315 #endif 1316 1317 /* Take the eight possible bit patterns of the lower 3 bits and you get the 1318 * lower 3 bits of the 8 octal digits, in both ASCII and EBCDIC, so those bits 1319 * can be ignored. If the rest match '0', we have an octal digit. As elsewhere in this file, the '| 0' part ensures a compiler error if c is not integer (like e.g., a pointer) */ 1320 #define isOCTAL_A(c) (((WIDEST_UTYPE)((c) | 0) & ~7) == '0') 1321 1322 #ifdef H_PERL /* If we have access to perl.h, look up in its table */ 1323 1324 /* Character class numbers. For internal core Perl use only. The ones less 1325 * than 32 are used in PL_charclass[] and the ones up through the one that 1326 * corresponds to <_HIGHEST_REGCOMP_DOT_H_SYNC> are used by regcomp.h and 1327 * related files. PL_charclass ones use names used in l1_char_class_tab.h but 1328 * their actual definitions are here. If that file has a name not used here, 1329 * it won't compile. 1330 * 1331 * The first group of these is ordered in what I (khw) estimate to be the 1332 * frequency of their use. This gives a slight edge to exiting a loop earlier 1333 * (in reginclass() in regexec.c). Except \v should be last, as it isn't a 1334 * real Posix character class, and some (small) inefficiencies in regular 1335 * expression handling would be introduced by putting it in the middle of those 1336 * that are. 
Also, cntrl and ascii come after the others as it may be useful 1337 * to group these which have no members that match above Latin1, (or above 1338 * ASCII in the latter case) */ 1339 1340 # define _CC_WORDCHAR 0 /* \w and [:word:] */ 1341 # define _CC_DIGIT 1 /* \d and [:digit:] */ 1342 # define _CC_ALPHA 2 /* [:alpha:] */ 1343 # define _CC_LOWER 3 /* [:lower:] */ 1344 # define _CC_UPPER 4 /* [:upper:] */ 1345 # define _CC_PUNCT 5 /* [:punct:] */ 1346 # define _CC_PRINT 6 /* [:print:] */ 1347 # define _CC_ALPHANUMERIC 7 /* [:alnum:] */ 1348 # define _CC_GRAPH 8 /* [:graph:] */ 1349 # define _CC_CASED 9 /* [:lower:] or [:upper:] under /i */ 1350 # define _CC_SPACE 10 /* \s, [:space:] */ 1351 # define _CC_BLANK 11 /* [:blank:] */ 1352 # define _CC_XDIGIT 12 /* [:xdigit:] */ 1353 # define _CC_CNTRL 13 /* [:cntrl:] */ 1354 # define _CC_ASCII 14 /* [:ascii:] */ 1355 # define _CC_VERTSPACE 15 /* \v */ 1356 1357 # define _HIGHEST_REGCOMP_DOT_H_SYNC _CC_VERTSPACE /* the last class number of the group above */ 1358 1359 /* The members of the group below do not need to be coordinated with data 1360 * structures in regcomp.[ch] and regexec.c. */ 1361 # define _CC_IDFIRST 16 1362 # define _CC_CHARNAME_CONT 17 1363 # define _CC_NONLATIN1_FOLD 18 1364 # define _CC_NONLATIN1_SIMPLE_FOLD 19 1365 # define _CC_QUOTEMETA 20 1366 # define _CC_NON_FINAL_FOLD 21 1367 # define _CC_IS_IN_SOME_FOLD 22 1368 # define _CC_BINDIGIT 23 1369 # define _CC_OCTDIGIT 24 1370 # define _CC_MNEMONIC_CNTRL 25 1371 1372 /* This next group is only used on EBCDIC platforms, so theoretically could be 1373 * shared with something entirely different that's only on ASCII platforms */ 1374 # define _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE 31 1375 /* Unused: 26-30 1376 * If more bits are needed, one could add a second word for non-64bit 1377 * QUAD_IS_INT systems, using some #ifdefs to distinguish between having a 2nd 1378 * word or not. 
The IS_IN_SOME_FOLD bit is the most easily expendable, as it 1379 * is used only for optimization (as of this writing), and differs in the 1380 * Latin1 range from the ALPHA bit only in two relatively unimportant 1381 * characters: the masculine and feminine ordinal indicators, so removing it 1382 * would just cause /i regexes which match them to run less efficiently. 1383 * Similarly the EBCDIC-only bits are used just for speed, and could be 1384 * replaced by other means */ 1385 1386 #if defined(PERL_CORE) || defined(PERL_EXT) 1387 /* An enum version of the character class numbers, to help compilers 1388 * optimize. Each enumerator simply takes the value of the correspondingly named _CC_ macro; it is only declared when PERL_CORE or PERL_EXT is defined */ 1389 typedef enum { 1390 _CC_ENUM_ALPHA = _CC_ALPHA, 1391 _CC_ENUM_ALPHANUMERIC = _CC_ALPHANUMERIC, 1392 _CC_ENUM_ASCII = _CC_ASCII, 1393 _CC_ENUM_BLANK = _CC_BLANK, 1394 _CC_ENUM_CASED = _CC_CASED, 1395 _CC_ENUM_CNTRL = _CC_CNTRL, 1396 _CC_ENUM_DIGIT = _CC_DIGIT, 1397 _CC_ENUM_GRAPH = _CC_GRAPH, 1398 _CC_ENUM_LOWER = _CC_LOWER, 1399 _CC_ENUM_PRINT = _CC_PRINT, 1400 _CC_ENUM_PUNCT = _CC_PUNCT, 1401 _CC_ENUM_SPACE = _CC_SPACE, 1402 _CC_ENUM_UPPER = _CC_UPPER, 1403 _CC_ENUM_VERTSPACE = _CC_VERTSPACE, 1404 _CC_ENUM_WORDCHAR = _CC_WORDCHAR, 1405 _CC_ENUM_XDIGIT = _CC_XDIGIT 1406 } _char_class_number; 1407 #endif 1408 /* The count of class numbers 0 through _HIGHEST_REGCOMP_DOT_H_SYNC, inclusive */ 1409 #define POSIX_CC_COUNT (_HIGHEST_REGCOMP_DOT_H_SYNC + 1) 1410 /* PL_charclass[] holds one U32 of class bits per entry; the macros below index it with (U8) c. Its initializer comes from l1_char_class_tab.h when DOINIT is defined; otherwise it is only declared here. */ 1411 START_EXTERN_C 1412 # ifdef DOINIT 1413 EXTCONST U32 PL_charclass[] = { 1414 # include "l1_char_class_tab.h" 1415 }; 1416 1417 # else /* ! DOINIT */ 1418 EXTCONST U32 PL_charclass[]; 1419 # endif 1420 END_EXTERN_C 1421 1422 /* The 1U keeps Solaris from griping when shifting sets the uppermost bit */ 1423 # define _CC_mask(classnum) (1U << (classnum)) 1424 1425 /* For internal core Perl use only: the base macro for defining macros like 1426 * isALPHA. It is false for any c that doesn't fit in 8 bits; otherwise it is true iff the bit for 'classnum' is set in PL_charclass[(U8) c]. The result is normalized with cBOOL() */ 1427 # define _generic_isCC(c, classnum) cBOOL(FITS_IN_8_BITS(c) \ 1428 && (PL_charclass[(U8) (c)] & _CC_mask(classnum))) 1429 1430 /* The mask for the _A versions of the macros; it just adds in the bit for 1431 * ASCII. 
*/ 1432 # define _CC_mask_A(classnum) (_CC_mask(classnum) | _CC_mask(_CC_ASCII)) 1433 1434 /* For internal core Perl use only: the base macro for defining macros like 1435 * isALPHA_A. The foo_A version makes sure that both the desired bit and 1436 * the ASCII bit are present */ 1437 # define _generic_isCC_A(c, classnum) (FITS_IN_8_BITS(c) \ 1438 && ((PL_charclass[(U8) (c)] & _CC_mask_A(classnum)) \ 1439 == _CC_mask_A(classnum))) 1440 1441 /* On ASCII platforms certain classes form a single range. It's faster to 1442 * special case these. isDIGIT is a single range on all platforms */ 1443 # ifdef EBCDIC 1444 # define isALPHA_A(c) _generic_isCC_A(c, _CC_ALPHA) 1445 # define isGRAPH_A(c) _generic_isCC_A(c, _CC_GRAPH) 1446 # define isLOWER_A(c) _generic_isCC_A(c, _CC_LOWER) 1447 # define isPRINT_A(c) _generic_isCC_A(c, _CC_PRINT) 1448 # define isUPPER_A(c) _generic_isCC_A(c, _CC_UPPER) 1449 # else 1450 /* By folding the upper and lowercase, we can use a single range */ 1451 # define isALPHA_A(c) inRANGE((~('A' ^ 'a') & (c)), 'A', 'Z') 1452 # define isGRAPH_A(c) inRANGE(c, ' ' + 1, 0x7e) 1453 # define isLOWER_A(c) inRANGE(c, 'a', 'z') 1454 # define isPRINT_A(c) inRANGE(c, ' ', 0x7e) 1455 # define isUPPER_A(c) inRANGE(c, 'A', 'Z') 1456 # endif 1457 # define isALPHANUMERIC_A(c) _generic_isCC_A(c, _CC_ALPHANUMERIC) 1458 # define isBLANK_A(c) _generic_isCC_A(c, _CC_BLANK) 1459 # define isCNTRL_A(c) _generic_isCC_A(c, _CC_CNTRL) 1460 # define isDIGIT_A(c) inRANGE(c, '0', '9') 1461 # define isPUNCT_A(c) _generic_isCC_A(c, _CC_PUNCT) 1462 # define isSPACE_A(c) _generic_isCC_A(c, _CC_SPACE) 1463 # define isWORDCHAR_A(c) _generic_isCC_A(c, _CC_WORDCHAR) 1464 # define isXDIGIT_A(c) _generic_isCC(c, _CC_XDIGIT) /* No non-ASCII xdigits 1465 */ 1466 # define isIDFIRST_A(c) _generic_isCC_A(c, _CC_IDFIRST) 1467 # define isALPHA_L1(c) _generic_isCC(c, _CC_ALPHA) 1468 # define isALPHANUMERIC_L1(c) _generic_isCC(c, _CC_ALPHANUMERIC) 1469 # define isBLANK_L1(c) _generic_isCC(c, 
_CC_BLANK) 1470 1471 /* continuation character for legal NAME in \N{NAME} */ 1472 # define isCHARNAME_CONT(c) _generic_isCC(c, _CC_CHARNAME_CONT) 1473 1474 # define isCNTRL_L1(c) _generic_isCC(c, _CC_CNTRL) 1475 # define isGRAPH_L1(c) _generic_isCC(c, _CC_GRAPH) 1476 # define isLOWER_L1(c) _generic_isCC(c, _CC_LOWER) 1477 # define isPRINT_L1(c) _generic_isCC(c, _CC_PRINT) 1478 # define isPSXSPC_L1(c) isSPACE_L1(c) 1479 # define isPUNCT_L1(c) _generic_isCC(c, _CC_PUNCT) 1480 # define isSPACE_L1(c) _generic_isCC(c, _CC_SPACE) 1481 # define isUPPER_L1(c) _generic_isCC(c, _CC_UPPER) 1482 # define isWORDCHAR_L1(c) _generic_isCC(c, _CC_WORDCHAR) 1483 # define isIDFIRST_L1(c) _generic_isCC(c, _CC_IDFIRST) 1484 1485 # ifdef EBCDIC 1486 # define isASCII(c) _generic_isCC(c, _CC_ASCII) 1487 # endif 1488 1489 /* Participates in a single-character fold with a character above 255 */ 1490 # define _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) (c)] & _CC_mask(_CC_NONLATIN1_SIMPLE_FOLD))) 1491 1492 /* Like the above, but also can be part of a multi-char fold */ 1493 # define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) (c)] & _CC_mask(_CC_NONLATIN1_FOLD))) 1494 1495 # define _isQUOTEMETA(c) _generic_isCC(c, _CC_QUOTEMETA) 1496 # define _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \ 1497 _generic_isCC(c, _CC_NON_FINAL_FOLD) 1498 # define _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) \ 1499 _generic_isCC(c, _CC_IS_IN_SOME_FOLD) 1500 1501 /* is c a control character for which we have a mnemonic? */ 1502 # if defined(PERL_CORE) || defined(PERL_EXT) 1503 # define isMNEMONIC_CNTRL(c) _generic_isCC(c, _CC_MNEMONIC_CNTRL) 1504 # endif 1505 #else /* else we don't have perl.h H_PERL */ 1506 1507 /* If we don't have perl.h, we are compiling a utility program. 
Below we 1508 * hard-code various macro definitions that wouldn't otherwise be available 1509 * to it. Most are coded based on first principles. These are written to 1510 * avoid EBCDIC vs. ASCII #ifdef's as much as possible. */ 1511 # define isDIGIT_A(c) inRANGE(c, '0', '9') 1512 # define isBLANK_A(c) ((c) == ' ' || (c) == '\t') 1513 # define isSPACE_A(c) (isBLANK_A(c) \ 1514 || (c) == '\n' \ 1515 || (c) == '\r' \ 1516 || (c) == '\v' \ 1517 || (c) == '\f') 1518 /* On EBCDIC, there are gaps between 'i' and 'j'; 'r' and 's'. Same for 1519 * uppercase. The tests for those aren't necessary on ASCII, but hurt only 1520 * performance (if optimization isn't on), and allow the same code to be 1521 * used for both platform types */ 1522 # define isLOWER_A(c) inRANGE((c), 'a', 'i') \ 1523 || inRANGE((c), 'j', 'r') \ 1524 || inRANGE((c), 's', 'z') 1525 # define isUPPER_A(c) inRANGE((c), 'A', 'I') \ 1526 || inRANGE((c), 'J', 'R') \ 1527 || inRANGE((c), 'S', 'Z') 1528 # define isALPHA_A(c) (isUPPER_A(c) || isLOWER_A(c)) 1529 # define isALPHANUMERIC_A(c) (isALPHA_A(c) || isDIGIT_A(c)) 1530 # define isWORDCHAR_A(c) (isALPHANUMERIC_A(c) || (c) == '_') 1531 # define isIDFIRST_A(c) (isALPHA_A(c) || (c) == '_') 1532 # define isXDIGIT_A(c) ( isDIGIT_A(c) \ 1533 || inRANGE((c), 'a', 'f') \ 1534 || inRANGE((c), 'A', 'F') 1535 # define isPUNCT_A(c) ((c) == '-' || (c) == '!' || (c) == '"' \ 1536 || (c) == '#' || (c) == '$' || (c) == '%' \ 1537 || (c) == '&' || (c) == '\'' || (c) == '(' \ 1538 || (c) == ')' || (c) == '*' || (c) == '+' \ 1539 || (c) == ',' || (c) == '.' || (c) == '/' \ 1540 || (c) == ':' || (c) == ';' || (c) == '<' \ 1541 || (c) == '=' || (c) == '>' || (c) == '?' 
\ 1542 || (c) == '@' || (c) == '[' || (c) == '\\' \ 1543 || (c) == ']' || (c) == '^' || (c) == '_' \ 1544 || (c) == '`' || (c) == '{' || (c) == '|' \ 1545 || (c) == '}' || (c) == '~') 1546 # define isGRAPH_A(c) (isALPHANUMERIC_A(c) || isPUNCT_A(c)) 1547 # define isPRINT_A(c) (isGRAPH_A(c) || (c) == ' ') 1548 1549 # ifdef EBCDIC 1550 /* The below is accurate for the 3 EBCDIC code pages traditionally 1551 * supported by perl. The only difference between them in the controls 1552 * is the position of \n, and that is represented symbolically below */ 1553 # define isCNTRL_A(c) ((c) == '\0' || (c) == '\a' || (c) == '\b' \ 1554 || (c) == '\f' || (c) == '\n' || (c) == '\r' \ 1555 || (c) == '\t' || (c) == '\v' \ 1556 || inRANGE((c), 1, 3) /* SOH, STX, ETX */ \ 1557 || (c) == 7F /* U+7F DEL */ \ 1558 || inRANGE((c), 0x0E, 0x13) /* SO SI DLE \ 1559 DC[1-3] */ \ 1560 || (c) == 0x18 /* U+18 CAN */ \ 1561 || (c) == 0x19 /* U+19 EOM */ \ 1562 || inRANGE((c), 0x1C, 0x1F) /* [FGRU]S */ \ 1563 || (c) == 0x26 /* U+17 ETB */ \ 1564 || (c) == 0x27 /* U+1B ESC */ \ 1565 || (c) == 0x2D /* U+05 ENQ */ \ 1566 || (c) == 0x2E /* U+06 ACK */ \ 1567 || (c) == 0x32 /* U+16 SYN */ \ 1568 || (c) == 0x37 /* U+04 EOT */ \ 1569 || (c) == 0x3C /* U+14 DC4 */ \ 1570 || (c) == 0x3D /* U+15 NAK */ \ 1571 || (c) == 0x3F)/* U+1A SUB */ 1572 # define isASCII(c) (isCNTRL_A(c) || isPRINT_A(c)) 1573 # else /* isASCII is already defined for ASCII platforms, so can use that to 1574 define isCNTRL */ 1575 # define isCNTRL_A(c) (isASCII(c) && ! isPRINT_A(c)) 1576 # endif 1577 1578 /* The _L1 macros may be unnecessary for the utilities; I (khw) added them 1579 * during debugging, and it seems best to keep them. We may be called 1580 * without NATIVE_TO_LATIN1 being defined. On ASCII platforms, it doesn't 1581 * do anything anyway, so make it not a problem */ 1582 # if ! defined(EBCDIC) && ! 
defined(NATIVE_TO_LATIN1) 1583 # define NATIVE_TO_LATIN1(ch) (ch) 1584 # endif 1585 # define isALPHA_L1(c) (isUPPER_L1(c) || isLOWER_L1(c)) 1586 # define isALPHANUMERIC_L1(c) (isALPHA_L1(c) || isDIGIT_A(c)) 1587 # define isBLANK_L1(c) (isBLANK_A(c) \ 1588 || (FITS_IN_8_BITS(c) \ 1589 && NATIVE_TO_LATIN1((U8) c) == 0xA0)) 1590 # define isCNTRL_L1(c) (FITS_IN_8_BITS(c) && (! isPRINT_L1(c))) 1591 # define isGRAPH_L1(c) (isPRINT_L1(c) && (! isBLANK_L1(c))) 1592 # define isLOWER_L1(c) (isLOWER_A(c) \ 1593 || (FITS_IN_8_BITS(c) \ 1594 && (( NATIVE_TO_LATIN1((U8) c) >= 0xDF \ 1595 && NATIVE_TO_LATIN1((U8) c) != 0xF7) \ 1596 || NATIVE_TO_LATIN1((U8) c) == 0xAA \ 1597 || NATIVE_TO_LATIN1((U8) c) == 0xBA \ 1598 || NATIVE_TO_LATIN1((U8) c) == 0xB5))) 1599 # define isPRINT_L1(c) (isPRINT_A(c) \ 1600 || (FITS_IN_8_BITS(c) \ 1601 && NATIVE_TO_LATIN1((U8) c) >= 0xA0)) 1602 # define isPUNCT_L1(c) (isPUNCT_A(c) \ 1603 || (FITS_IN_8_BITS(c) \ 1604 && ( NATIVE_TO_LATIN1((U8) c) == 0xA1 \ 1605 || NATIVE_TO_LATIN1((U8) c) == 0xA7 \ 1606 || NATIVE_TO_LATIN1((U8) c) == 0xAB \ 1607 || NATIVE_TO_LATIN1((U8) c) == 0xB6 \ 1608 || NATIVE_TO_LATIN1((U8) c) == 0xB7 \ 1609 || NATIVE_TO_LATIN1((U8) c) == 0xBB \ 1610 || NATIVE_TO_LATIN1((U8) c) == 0xBF))) 1611 # define isSPACE_L1(c) (isSPACE_A(c) \ 1612 || (FITS_IN_8_BITS(c) \ 1613 && ( NATIVE_TO_LATIN1((U8) c) == 0x85 \ 1614 || NATIVE_TO_LATIN1((U8) c) == 0xA0))) 1615 # define isUPPER_L1(c) (isUPPER_A(c) \ 1616 || (FITS_IN_8_BITS(c) \ 1617 && ( IN_RANGE(NATIVE_TO_LATIN1((U8) c), \ 1618 0xC0, 0xDE) \ 1619 && NATIVE_TO_LATIN1((U8) c) != 0xD7))) 1620 # define isWORDCHAR_L1(c) (isIDFIRST_L1(c) || isDIGIT_A(c)) 1621 # define isIDFIRST_L1(c) (isALPHA_L1(c) || NATIVE_TO_LATIN1(c) == '_') 1622 # define isCHARNAME_CONT(c) (isWORDCHAR_L1(c) \ 1623 || isBLANK_L1(c) \ 1624 || (c) == '-' \ 1625 || (c) == '(' \ 1626 || (c) == ')') 1627 /* The following are not fully accurate in the above-ASCII range. 
I (khw) 1628 * don't think it's necessary to be so for the purposes where this gets 1629 * compiled */ 1630 # define _isQUOTEMETA(c) (FITS_IN_8_BITS(c) && ! isWORDCHAR_L1(c)) 1631 # define _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) isALPHA_L1(c) 1632 1633 /* And these aren't accurate at all. They are useful only for above 1634 * Latin1, which utilities and bootstrapping don't deal with */ 1635 # define _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) 0 1636 # define _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) 0 1637 # define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) 0 1638 1639 /* Many of the macros later in this file are defined in terms of these. By 1640 * implementing them with a function, which converts the class number into 1641 * a call to the desired macro, all of the later ones work. However, that 1642 * function won't be actually defined when building a utility program (no 1643 * perl.h), and so a compiler error will be generated if one is attempted 1644 * to be used. 
And the above-Latin1 code points require Unicode tables to 1645 * be present, something unlikely to be the case when bootstrapping */ 1646 # define _generic_isCC(c, classnum) \ 1647 (FITS_IN_8_BITS(c) && S_bootstrap_ctype((U8) (c), (classnum), TRUE)) 1648 # define _generic_isCC_A(c, classnum) \ 1649 (FITS_IN_8_BITS(c) && S_bootstrap_ctype((U8) (c), (classnum), FALSE)) 1650 #endif /* End of no perl.h H_PERL */ 1651 1652 #define isALPHANUMERIC(c) isALPHANUMERIC_A(c) 1653 #define isALPHA(c) isALPHA_A(c) 1654 #define isASCII_A(c) isASCII(c) 1655 #define isASCII_L1(c) isASCII(c) 1656 #define isBLANK(c) isBLANK_A(c) 1657 #define isCNTRL(c) isCNTRL_A(c) 1658 #define isDIGIT(c) isDIGIT_A(c) 1659 #define isGRAPH(c) isGRAPH_A(c) 1660 #define isIDFIRST(c) isIDFIRST_A(c) 1661 #define isLOWER(c) isLOWER_A(c) 1662 #define isPRINT(c) isPRINT_A(c) 1663 #define isPSXSPC_A(c) isSPACE_A(c) 1664 #define isPSXSPC(c) isPSXSPC_A(c) 1665 #define isPSXSPC_L1(c) isSPACE_L1(c) 1666 #define isPUNCT(c) isPUNCT_A(c) 1667 #define isSPACE(c) isSPACE_A(c) 1668 #define isUPPER(c) isUPPER_A(c) 1669 #define isWORDCHAR(c) isWORDCHAR_A(c) 1670 #define isXDIGIT(c) isXDIGIT_A(c) 1671 1672 /* ASCII casing. These could also be written as 1673 #define toLOWER(c) (isASCII(c) ? toLOWER_LATIN1(c) : (c)) 1674 #define toUPPER(c) (isASCII(c) ? toUPPER_LATIN1_MOD(c) : (c)) 1675 which uses table lookup and mask instead of subtraction. (This would 1676 work because the _MOD does not apply in the ASCII range). 1677 1678 These actually are UTF-8 invariant casing, not just ASCII, as any non-ASCII 1679 UTF-8 invariants are neither upper nor lower. (Only on EBCDIC platforms are 1680 there non-ASCII invariants, and all of them are controls.) */ 1681 #define toLOWER(c) (isUPPER(c) ? (U8)((c) + ('a' - 'A')) : (c)) 1682 #define toUPPER(c) (isLOWER(c) ? (U8)((c) - ('a' - 'A')) : (c)) 1683 1684 /* In the ASCII range, these are equivalent to what they're here defined to be. 
1685 * But by creating these definitions, other code doesn't have to be aware of 1686 * this detail. Actually this works for all UTF-8 invariants, not just the 1687 * ASCII range. (EBCDIC platforms can have non-ASCII invariants.) */ 1688 #define toFOLD(c) toLOWER(c) 1689 #define toTITLE(c) toUPPER(c) 1690 1691 #define toLOWER_A(c) toLOWER(c) 1692 #define toUPPER_A(c) toUPPER(c) 1693 #define toFOLD_A(c) toFOLD(c) 1694 #define toTITLE_A(c) toTITLE(c) 1695 1696 /* Use table lookup for speed; returns the input itself if is out-of-range */ 1697 #define toLOWER_LATIN1(c) ((! FITS_IN_8_BITS(c)) \ 1698 ? (c) \ 1699 : PL_latin1_lc[ (U8) (c) ]) 1700 #define toLOWER_L1(c) toLOWER_LATIN1(c) /* Synonym for consistency */ 1701 1702 /* Modified uc. Is correct uc except for three non-ascii chars which are 1703 * all mapped to one of them, and these need special handling; returns the 1704 * input itself if is out-of-range */ 1705 #define toUPPER_LATIN1_MOD(c) ((! FITS_IN_8_BITS(c)) \ 1706 ? (c) \ 1707 : PL_mod_latin1_uc[ (U8) (c) ]) 1708 #define IN_UTF8_CTYPE_LOCALE PL_in_utf8_CTYPE_locale 1709 1710 /* Use foo_LC_uvchr() instead of these for beyond the Latin1 range */ 1711 1712 /* For internal core Perl use only: the base macro for defining macros like 1713 * isALPHA_LC, which uses the current LC_CTYPE locale. 'c' is the code point 1714 * (0-255) to check. In a UTF-8 locale, the result is the same as calling 1715 * isFOO_L1(); the 'utf8_locale_classnum' parameter is something like 1716 * _CC_UPPER, which gives the class number for doing this. For non-UTF-8 1717 * locales, the code to actually do the test this is passed in 'non_utf8'. If 1718 * 'c' is above 255, 0 is returned. For accessing the full range of possible 1719 * code points under locale rules, use the macros based on _generic_LC_uvchr 1720 * instead of this. */ 1721 #define _generic_LC_base(c, utf8_locale_classnum, non_utf8) \ 1722 (! FITS_IN_8_BITS(c) \ 1723 ? 0 \ 1724 : IN_UTF8_CTYPE_LOCALE \ 1725 ? 
cBOOL(PL_charclass[(U8) (c)] & _CC_mask(utf8_locale_classnum)) \ 1726 : cBOOL(non_utf8)) 1727 1728 /* For internal core Perl use only: a helper macro for defining macros like 1729 * isALPHA_LC. 'c' is the code point (0-255) to check. The function name to 1730 * actually do this test is passed in 'non_utf8_func', which is called on 'c', 1731 * casting 'c' to the macro _LC_CAST, which should not be parenthesized. See 1732 * _generic_LC_base for more info */ 1733 #define _generic_LC(c, utf8_locale_classnum, non_utf8_func) \ 1734 _generic_LC_base(c,utf8_locale_classnum, \ 1735 non_utf8_func( (_LC_CAST) (c))) 1736 1737 /* For internal core Perl use only: like _generic_LC, but also returns TRUE if 1738 * 'c' is the platform's native underscore character */ 1739 #define _generic_LC_underscore(c,utf8_locale_classnum,non_utf8_func) \ 1740 _generic_LC_base(c, utf8_locale_classnum, \ 1741 (non_utf8_func( (_LC_CAST) (c)) \ 1742 || (char)(c) == '_')) 1743 1744 /* These next three are also for internal core Perl use only: case-change 1745 * helper macros. The reason for using the PL_latin arrays is in case the 1746 * system function is defective; it ensures uniform results that conform to the 1747 * Unicod standard. It does not handle the anomalies in UTF-8 Turkic locales */ 1748 #define _generic_toLOWER_LC(c, function, cast) (! FITS_IN_8_BITS(c) \ 1749 ? (c) \ 1750 : (IN_UTF8_CTYPE_LOCALE) \ 1751 ? PL_latin1_lc[ (U8) (c) ] \ 1752 : (cast)function((cast)(c))) 1753 1754 /* Note that the result can be larger than a byte in a UTF-8 locale. It 1755 * returns a single value, so can't adequately return the upper case of LATIN 1756 * SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two 1757 * values "SS"); instead it asserts against that under DEBUGGING, and 1758 * otherwise returns its input. It does not handle the anomalies in UTF-8 1759 * Turkic locales. */ 1760 #define _generic_toUPPER_LC(c, function, cast) \ 1761 (! FITS_IN_8_BITS(c) \ 1762 ? (c) \ 1763 : ((! 
IN_UTF8_CTYPE_LOCALE) \ 1764 ? (cast)function((cast)(c)) \ 1765 : ((((U8)(c)) == MICRO_SIGN) \ 1766 ? GREEK_CAPITAL_LETTER_MU \ 1767 : ((((U8)(c)) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) \ 1768 ? LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS \ 1769 : ((((U8)(c)) == LATIN_SMALL_LETTER_SHARP_S) \ 1770 ? (__ASSERT_(0) (c)) \ 1771 : PL_mod_latin1_uc[ (U8) (c) ]))))) 1772 1773 /* Note that the result can be larger than a byte in a UTF-8 locale. It 1774 * returns a single value, so can't adequately return the fold case of LATIN 1775 * SMALL LETTER SHARP S in a UTF-8 locale (which should be a string of two 1776 * values "ss"); instead it asserts against that under DEBUGGING, and 1777 * otherwise returns its input. It does not handle the anomalies in UTF-8 1778 * Turkic locales */ 1779 #define _generic_toFOLD_LC(c, function, cast) \ 1780 ((UNLIKELY((c) == MICRO_SIGN) && IN_UTF8_CTYPE_LOCALE) \ 1781 ? GREEK_SMALL_LETTER_MU \ 1782 : (__ASSERT_(! IN_UTF8_CTYPE_LOCALE \ 1783 || (c) != LATIN_SMALL_LETTER_SHARP_S) \ 1784 _generic_toLOWER_LC(c, function, cast))) 1785 1786 /* Use the libc versions for these if available. */ 1787 #if defined(HAS_ISASCII) 1788 # define isASCII_LC(c) (FITS_IN_8_BITS(c) && isascii( (U8) (c))) 1789 #else 1790 # define isASCII_LC(c) isASCII(c) 1791 #endif 1792 1793 #if defined(HAS_ISBLANK) 1794 # define isBLANK_LC(c) _generic_LC(c, _CC_BLANK, isblank) 1795 #else /* Unlike isASCII, varies if in a UTF-8 locale */ 1796 # define isBLANK_LC(c) ((IN_UTF8_CTYPE_LOCALE) ? isBLANK_L1(c) : isBLANK(c)) 1797 #endif 1798 1799 #define _LC_CAST U8 1800 1801 #ifdef WIN32 1802 /* The Windows functions don't bother to follow the POSIX standard, which 1803 * for example says that something can't both be a printable and a control. 1804 * But Windows treats the \t control as a printable, and does such things 1805 * as making superscripts into both digits and punctuation. 
This tames 1806 * these flaws by assuming that the definitions of both controls and space 1807 * are correct, and then making sure that other definitions don't have 1808 * weirdnesses, by making sure that isalnum() isn't also ispunct(), etc. 1809 * Not all possible weirdnesses are checked for, just the ones that were 1810 * detected on actual Microsoft code pages */ 1811 1812 # define isCNTRL_LC(c) _generic_LC(c, _CC_CNTRL, iscntrl) 1813 # define isSPACE_LC(c) _generic_LC(c, _CC_SPACE, isspace) 1814 1815 # define isALPHA_LC(c) (_generic_LC(c, _CC_ALPHA, isalpha) \ 1816 && isALPHANUMERIC_LC(c)) 1817 # define isALPHANUMERIC_LC(c) (_generic_LC(c, _CC_ALPHANUMERIC, isalnum) && \ 1818 ! isPUNCT_LC(c)) 1819 # define isDIGIT_LC(c) (_generic_LC(c, _CC_DIGIT, isdigit) && \ 1820 isALPHANUMERIC_LC(c)) 1821 # define isGRAPH_LC(c) (_generic_LC(c, _CC_GRAPH, isgraph) && isPRINT_LC(c)) 1822 # define isIDFIRST_LC(c) (((c) == '_') \ 1823 || (_generic_LC(c, _CC_IDFIRST, isalpha) && ! isPUNCT_LC(c))) 1824 # define isLOWER_LC(c) (_generic_LC(c, _CC_LOWER, islower) && isALPHA_LC(c)) 1825 # define isPRINT_LC(c) (_generic_LC(c, _CC_PRINT, isprint) && ! isCNTRL_LC(c)) 1826 # define isPUNCT_LC(c) (_generic_LC(c, _CC_PUNCT, ispunct) && ! 
isCNTRL_LC(c)) 1827 # define isUPPER_LC(c) (_generic_LC(c, _CC_UPPER, isupper) && isALPHA_LC(c)) 1828 # define isWORDCHAR_LC(c) (((c) == '_') || isALPHANUMERIC_LC(c)) 1829 # define isXDIGIT_LC(c) (_generic_LC(c, _CC_XDIGIT, isxdigit) \ 1830 && isALPHANUMERIC_LC(c)) 1831 1832 # define toLOWER_LC(c) _generic_toLOWER_LC((c), tolower, U8) 1833 # define toUPPER_LC(c) _generic_toUPPER_LC((c), toupper, U8) 1834 # define toFOLD_LC(c) _generic_toFOLD_LC((c), tolower, U8) 1835 1836 #elif defined(CTYPE256) || (!defined(isascii) && !defined(HAS_ISASCII)) 1837 /* For most other platforms */ 1838 1839 # define isALPHA_LC(c) _generic_LC(c, _CC_ALPHA, isalpha) 1840 # define isALPHANUMERIC_LC(c) _generic_LC(c, _CC_ALPHANUMERIC, isalnum) 1841 # define isCNTRL_LC(c) _generic_LC(c, _CC_CNTRL, iscntrl) 1842 # define isDIGIT_LC(c) _generic_LC(c, _CC_DIGIT, isdigit) 1843 # define isGRAPH_LC(c) _generic_LC(c, _CC_GRAPH, isgraph) 1844 # define isIDFIRST_LC(c) _generic_LC_underscore(c, _CC_IDFIRST, isalpha) 1845 # define isLOWER_LC(c) _generic_LC(c, _CC_LOWER, islower) 1846 # define isPRINT_LC(c) _generic_LC(c, _CC_PRINT, isprint) 1847 # define isPUNCT_LC(c) _generic_LC(c, _CC_PUNCT, ispunct) 1848 # define isSPACE_LC(c) _generic_LC(c, _CC_SPACE, isspace) 1849 # define isUPPER_LC(c) _generic_LC(c, _CC_UPPER, isupper) 1850 # define isWORDCHAR_LC(c) _generic_LC_underscore(c, _CC_WORDCHAR, isalnum) 1851 # define isXDIGIT_LC(c) _generic_LC(c, _CC_XDIGIT, isxdigit) 1852 1853 1854 # define toLOWER_LC(c) _generic_toLOWER_LC((c), tolower, U8) 1855 # define toUPPER_LC(c) _generic_toUPPER_LC((c), toupper, U8) 1856 # define toFOLD_LC(c) _generic_toFOLD_LC((c), tolower, U8) 1857 1858 #else /* The final fallback position */ 1859 1860 # define isALPHA_LC(c) (isascii(c) && isalpha(c)) 1861 # define isALPHANUMERIC_LC(c) (isascii(c) && isalnum(c)) 1862 # define isCNTRL_LC(c) (isascii(c) && iscntrl(c)) 1863 # define isDIGIT_LC(c) (isascii(c) && isdigit(c)) 1864 # define isGRAPH_LC(c) (isascii(c) && 
isgraph(c)) 1865 # define isIDFIRST_LC(c) (isascii(c) && (isalpha(c) || (c) == '_')) 1866 # define isLOWER_LC(c) (isascii(c) && islower(c)) 1867 # define isPRINT_LC(c) (isascii(c) && isprint(c)) 1868 # define isPUNCT_LC(c) (isascii(c) && ispunct(c)) 1869 # define isSPACE_LC(c) (isascii(c) && isspace(c)) 1870 # define isUPPER_LC(c) (isascii(c) && isupper(c)) 1871 # define isWORDCHAR_LC(c) (isascii(c) && (isalnum(c) || (c) == '_')) 1872 # define isXDIGIT_LC(c) (isascii(c) && isxdigit(c)) 1873 1874 # define toLOWER_LC(c) (isascii(c) ? tolower(c) : (c)) 1875 # define toUPPER_LC(c) (isascii(c) ? toupper(c) : (c)) 1876 # define toFOLD_LC(c) (isascii(c) ? tolower(c) : (c)) 1877 1878 #endif 1879 1880 #define isIDCONT(c) isWORDCHAR(c) 1881 #define isIDCONT_A(c) isWORDCHAR_A(c) 1882 #define isIDCONT_L1(c) isWORDCHAR_L1(c) 1883 #define isIDCONT_LC(c) isWORDCHAR_LC(c) 1884 #define isPSXSPC_LC(c) isSPACE_LC(c) 1885 1886 /* For internal core Perl use only: the base macros for defining macros like 1887 * isALPHA_uvchr. 'c' is the code point to check. 'classnum' is the POSIX class 1888 * number defined earlier in this file. _generic_uvchr() is used for POSIX 1889 * classes where there is a macro or function 'above_latin1' that takes the 1890 * single argument 'c' and returns the desired value. These exist for those 1891 * classes which have simple definitions, avoiding the overhead of an inversion 1892 * list binary search. _generic_invlist_uvchr() can be used 1893 * for classes where that overhead is faster than a direct lookup. 1894 * _generic_uvchr() won't compile if 'c' isn't unsigned, as it won't match the 1895 * 'above_latin1' prototype. _generic_isCC() macro does bounds checking, so 1896 * have duplicate checks here, so could create versions of the macros that 1897 * don't, but experiments show that gcc optimizes them out anyway. */ 1898 1899 /* Note that all ignore 'use bytes' */ 1900 #define _generic_uvchr(classnum, above_latin1, c) ((c) < 256 \ 1901 ? 
_generic_isCC(c, classnum) \ 1902 : above_latin1(c)) 1903 #define _generic_invlist_uvchr(classnum, c) ((c) < 256 \ 1904 ? _generic_isCC(c, classnum) \ 1905 : _is_uni_FOO(classnum, c)) 1906 #define isALPHA_uvchr(c) _generic_invlist_uvchr(_CC_ALPHA, c) 1907 #define isALPHANUMERIC_uvchr(c) _generic_invlist_uvchr(_CC_ALPHANUMERIC, c) 1908 #define isASCII_uvchr(c) isASCII(c) 1909 #define isBLANK_uvchr(c) _generic_uvchr(_CC_BLANK, is_HORIZWS_cp_high, c) 1910 #define isCNTRL_uvchr(c) isCNTRL_L1(c) /* All controls are in Latin1 */ 1911 #define isDIGIT_uvchr(c) _generic_invlist_uvchr(_CC_DIGIT, c) 1912 #define isGRAPH_uvchr(c) _generic_invlist_uvchr(_CC_GRAPH, c) 1913 #define isIDCONT_uvchr(c) \ 1914 _generic_uvchr(_CC_WORDCHAR, _is_uni_perl_idcont, c) 1915 #define isIDFIRST_uvchr(c) \ 1916 _generic_uvchr(_CC_IDFIRST, _is_uni_perl_idstart, c) 1917 #define isLOWER_uvchr(c) _generic_invlist_uvchr(_CC_LOWER, c) 1918 #define isPRINT_uvchr(c) _generic_invlist_uvchr(_CC_PRINT, c) 1919 1920 #define isPUNCT_uvchr(c) _generic_invlist_uvchr(_CC_PUNCT, c) 1921 #define isSPACE_uvchr(c) _generic_uvchr(_CC_SPACE, is_XPERLSPACE_cp_high, c) 1922 #define isPSXSPC_uvchr(c) isSPACE_uvchr(c) 1923 1924 #define isUPPER_uvchr(c) _generic_invlist_uvchr(_CC_UPPER, c) 1925 #define isVERTWS_uvchr(c) _generic_uvchr(_CC_VERTSPACE, is_VERTWS_cp_high, c) 1926 #define isWORDCHAR_uvchr(c) _generic_invlist_uvchr(_CC_WORDCHAR, c) 1927 #define isXDIGIT_uvchr(c) _generic_uvchr(_CC_XDIGIT, is_XDIGIT_cp_high, c) 1928 1929 #define toFOLD_uvchr(c,s,l) to_uni_fold(c,s,l) 1930 #define toLOWER_uvchr(c,s,l) to_uni_lower(c,s,l) 1931 #define toTITLE_uvchr(c,s,l) to_uni_title(c,s,l) 1932 #define toUPPER_uvchr(c,s,l) to_uni_upper(c,s,l) 1933 1934 /* For backwards compatibility, even though '_uni' should mean official Unicode 1935 * code points, in Perl it means native for those below 256 */ 1936 #define isALPHA_uni(c) isALPHA_uvchr(c) 1937 #define isALPHANUMERIC_uni(c) isALPHANUMERIC_uvchr(c) 1938 #define isASCII_uni(c) 
isASCII_uvchr(c) 1939 #define isBLANK_uni(c) isBLANK_uvchr(c) 1940 #define isCNTRL_uni(c) isCNTRL_uvchr(c) 1941 #define isDIGIT_uni(c) isDIGIT_uvchr(c) 1942 #define isGRAPH_uni(c) isGRAPH_uvchr(c) 1943 #define isIDCONT_uni(c) isIDCONT_uvchr(c) 1944 #define isIDFIRST_uni(c) isIDFIRST_uvchr(c) 1945 #define isLOWER_uni(c) isLOWER_uvchr(c) 1946 #define isPRINT_uni(c) isPRINT_uvchr(c) 1947 #define isPUNCT_uni(c) isPUNCT_uvchr(c) 1948 #define isSPACE_uni(c) isSPACE_uvchr(c) 1949 #define isPSXSPC_uni(c) isPSXSPC_uvchr(c) 1950 #define isUPPER_uni(c) isUPPER_uvchr(c) 1951 #define isVERTWS_uni(c) isVERTWS_uvchr(c) 1952 #define isWORDCHAR_uni(c) isWORDCHAR_uvchr(c) 1953 #define isXDIGIT_uni(c) isXDIGIT_uvchr(c) 1954 #define toFOLD_uni(c,s,l) toFOLD_uvchr(c,s,l) 1955 #define toLOWER_uni(c,s,l) toLOWER_uvchr(c,s,l) 1956 #define toTITLE_uni(c,s,l) toTITLE_uvchr(c,s,l) 1957 #define toUPPER_uni(c,s,l) toUPPER_uvchr(c,s,l) 1958 1959 /* For internal core Perl use only: the base macros for defining macros like 1960 * isALPHA_LC_uvchr. These are like isALPHA_LC, but the input can be any code 1961 * point, not just 0-255. Like _generic_uvchr, there are two versions, one for 1962 * simple class definitions; the other for more complex. These are like 1963 * _generic_uvchr, so see it for more info. */ 1964 #define _generic_LC_uvchr(latin1, above_latin1, c) \ 1965 (c < 256 ? latin1(c) : above_latin1(c)) 1966 #define _generic_LC_invlist_uvchr(latin1, classnum, c) \ 1967 (c < 256 ? latin1(c) : _is_uni_FOO(classnum, c)) 1968 1969 #define isALPHA_LC_uvchr(c) _generic_LC_invlist_uvchr(isALPHA_LC, _CC_ALPHA, c) 1970 #define isALPHANUMERIC_LC_uvchr(c) _generic_LC_invlist_uvchr(isALPHANUMERIC_LC, \ 1971 _CC_ALPHANUMERIC, c) 1972 #define isASCII_LC_uvchr(c) isASCII_LC(c) 1973 #define isBLANK_LC_uvchr(c) _generic_LC_uvchr(isBLANK_LC, \ 1974 is_HORIZWS_cp_high, c) 1975 #define isCNTRL_LC_uvchr(c) (c < 256 ? 
isCNTRL_LC(c) : 0) 1976 #define isDIGIT_LC_uvchr(c) _generic_LC_invlist_uvchr(isDIGIT_LC, _CC_DIGIT, c) 1977 #define isGRAPH_LC_uvchr(c) _generic_LC_invlist_uvchr(isGRAPH_LC, _CC_GRAPH, c) 1978 #define isIDCONT_LC_uvchr(c) _generic_LC_uvchr(isIDCONT_LC, \ 1979 _is_uni_perl_idcont, c) 1980 #define isIDFIRST_LC_uvchr(c) _generic_LC_uvchr(isIDFIRST_LC, \ 1981 _is_uni_perl_idstart, c) 1982 #define isLOWER_LC_uvchr(c) _generic_LC_invlist_uvchr(isLOWER_LC, _CC_LOWER, c) 1983 #define isPRINT_LC_uvchr(c) _generic_LC_invlist_uvchr(isPRINT_LC, _CC_PRINT, c) 1984 #define isPSXSPC_LC_uvchr(c) isSPACE_LC_uvchr(c) 1985 #define isPUNCT_LC_uvchr(c) _generic_LC_invlist_uvchr(isPUNCT_LC, _CC_PUNCT, c) 1986 #define isSPACE_LC_uvchr(c) _generic_LC_uvchr(isSPACE_LC, \ 1987 is_XPERLSPACE_cp_high, c) 1988 #define isUPPER_LC_uvchr(c) _generic_LC_invlist_uvchr(isUPPER_LC, _CC_UPPER, c) 1989 #define isWORDCHAR_LC_uvchr(c) _generic_LC_invlist_uvchr(isWORDCHAR_LC, \ 1990 _CC_WORDCHAR, c) 1991 #define isXDIGIT_LC_uvchr(c) _generic_LC_uvchr(isXDIGIT_LC, \ 1992 is_XDIGIT_cp_high, c) 1993 1994 #define isBLANK_LC_uni(c) isBLANK_LC_uvchr(UNI_TO_NATIVE(c)) 1995 1996 /* The "_safe" macros make sure that we don't attempt to read beyond 'e', but 1997 * they don't otherwise go out of their way to look for malformed UTF-8. If 1998 * they can return accurate results without knowing if the input is otherwise 1999 * malformed, they do so. For example isASCII is accurate in spite of any 2000 * non-length malformations because it looks only at a single byte. Likewise 2001 * isDIGIT looks just at the first byte for code points 0-255, as all UTF-8 2002 * variant ones return FALSE. But, if the input has to be well-formed in order 2003 * for the results to be accurate, the macros will test and if malformed will 2004 * call a routine to die 2005 * 2006 * Except for toke.c, the macros do assume that e > p, asserting that on 2007 * DEBUGGING builds. 
Much code that calls these depends on this being true, 2008 * for other reasons. toke.c is treated specially as using the regular 2009 * assertion breaks it in many ways. All strings that these operate on there 2010 * are supposed to have an extra NUL character at the end, so that *e = \0. A 2011 * bunch of code in toke.c assumes that this is true, so the assertion allows 2012 * for that */ 2013 #ifdef PERL_IN_TOKE_C 2014 # define _utf8_safe_assert(p,e) ((e) > (p) || ((e) == (p) && *(p) == '\0')) 2015 #else 2016 # define _utf8_safe_assert(p,e) ((e) > (p)) 2017 #endif 2018 2019 #define _generic_utf8_safe(classnum, p, e, above_latin1) \ 2020 ((! _utf8_safe_assert(p, e)) \ 2021 ? (_force_out_malformed_utf8_message((U8 *) (p), (U8 *) (e), 0, 1), 0)\ 2022 : (UTF8_IS_INVARIANT(*(p))) \ 2023 ? _generic_isCC(*(p), classnum) \ 2024 : (UTF8_IS_DOWNGRADEABLE_START(*(p)) \ 2025 ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \ 2026 ? _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1 )), \ 2027 classnum) \ 2028 : (_force_out_malformed_utf8_message( \ 2029 (U8 *) (p), (U8 *) (e), 0, 1), 0)) \ 2030 : above_latin1)) 2031 /* Like the above, but calls 'above_latin1(p)' to get the utf8 value. 2032 * 'above_latin1' can be a macro */ 2033 #define _generic_func_utf8_safe(classnum, above_latin1, p, e) \ 2034 _generic_utf8_safe(classnum, p, e, above_latin1(p, e)) 2035 #define _generic_non_invlist_utf8_safe(classnum, above_latin1, p, e) \ 2036 _generic_utf8_safe(classnum, p, e, \ 2037 (UNLIKELY((e) - (p) < UTF8SKIP(p)) \ 2038 ? 
(_force_out_malformed_utf8_message( \ 2039 (U8 *) (p), (U8 *) (e), 0, 1), 0) \ 2040 : above_latin1(p))) 2041 /* Like the above, but passes classnum to _isFOO_utf8(), instead of having an 2042 * 'above_latin1' parameter */ 2043 #define _generic_invlist_utf8_safe(classnum, p, e) \ 2044 _generic_utf8_safe(classnum, p, e, _is_utf8_FOO(classnum, p, e)) 2045 2046 /* Like the above, but should be used only when it is known that there are no 2047 * characters in the upper-Latin1 range (128-255 on ASCII platforms) which the 2048 * class is TRUE for. Hence it can skip the tests for this range. 2049 * 'above_latin1' should include its arguments */ 2050 #define _generic_utf8_safe_no_upper_latin1(classnum, p, e, above_latin1) \ 2051 (__ASSERT_(_utf8_safe_assert(p, e)) \ 2052 (UTF8_IS_INVARIANT(*(p))) \ 2053 ? _generic_isCC(*(p), classnum) \ 2054 : (UTF8_IS_DOWNGRADEABLE_START(*(p))) \ 2055 ? 0 /* Note that doesn't check validity for latin1 */ \ 2056 : above_latin1) 2057 2058 2059 #define isALPHA_utf8(p, e) isALPHA_utf8_safe(p, e) 2060 #define isALPHANUMERIC_utf8(p, e) isALPHANUMERIC_utf8_safe(p, e) 2061 #define isASCII_utf8(p, e) isASCII_utf8_safe(p, e) 2062 #define isBLANK_utf8(p, e) isBLANK_utf8_safe(p, e) 2063 #define isCNTRL_utf8(p, e) isCNTRL_utf8_safe(p, e) 2064 #define isDIGIT_utf8(p, e) isDIGIT_utf8_safe(p, e) 2065 #define isGRAPH_utf8(p, e) isGRAPH_utf8_safe(p, e) 2066 #define isIDCONT_utf8(p, e) isIDCONT_utf8_safe(p, e) 2067 #define isIDFIRST_utf8(p, e) isIDFIRST_utf8_safe(p, e) 2068 #define isLOWER_utf8(p, e) isLOWER_utf8_safe(p, e) 2069 #define isPRINT_utf8(p, e) isPRINT_utf8_safe(p, e) 2070 #define isPSXSPC_utf8(p, e) isPSXSPC_utf8_safe(p, e) 2071 #define isPUNCT_utf8(p, e) isPUNCT_utf8_safe(p, e) 2072 #define isSPACE_utf8(p, e) isSPACE_utf8_safe(p, e) 2073 #define isUPPER_utf8(p, e) isUPPER_utf8_safe(p, e) 2074 #define isVERTWS_utf8(p, e) isVERTWS_utf8_safe(p, e) 2075 #define isWORDCHAR_utf8(p, e) isWORDCHAR_utf8_safe(p, e) 2076 #define isXDIGIT_utf8(p, e) 
isXDIGIT_utf8_safe(p, e) 2077 2078 #define isALPHA_utf8_safe(p, e) _generic_invlist_utf8_safe(_CC_ALPHA, p, e) 2079 #define isALPHANUMERIC_utf8_safe(p, e) \ 2080 _generic_invlist_utf8_safe(_CC_ALPHANUMERIC, p, e) 2081 #define isASCII_utf8_safe(p, e) \ 2082 /* Because ASCII is invariant under utf8, the non-utf8 macro \ 2083 * works */ \ 2084 (__ASSERT_(_utf8_safe_assert(p, e)) isASCII(*(p))) 2085 #define isBLANK_utf8_safe(p, e) \ 2086 _generic_non_invlist_utf8_safe(_CC_BLANK, is_HORIZWS_high, p, e) 2087 2088 #ifdef EBCDIC 2089 /* Because all controls are UTF-8 invariants in EBCDIC, we can use this 2090 * more efficient macro instead of the more general one */ 2091 # define isCNTRL_utf8_safe(p, e) \ 2092 (__ASSERT_(_utf8_safe_assert(p, e)) isCNTRL_L1(*(p))) 2093 #else 2094 # define isCNTRL_utf8_safe(p, e) _generic_utf8_safe(_CC_CNTRL, p, e, 0) 2095 #endif 2096 2097 #define isDIGIT_utf8_safe(p, e) \ 2098 _generic_utf8_safe_no_upper_latin1(_CC_DIGIT, p, e, \ 2099 _is_utf8_FOO(_CC_DIGIT, p, e)) 2100 #define isGRAPH_utf8_safe(p, e) _generic_invlist_utf8_safe(_CC_GRAPH, p, e) 2101 #define isIDCONT_utf8_safe(p, e) _generic_func_utf8_safe(_CC_WORDCHAR, \ 2102 _is_utf8_perl_idcont, p, e) 2103 2104 /* To prevent S_scan_word in toke.c from hanging, we have to make sure that 2105 * IDFIRST is an alnum. See 2106 * https://github.com/Perl/perl5/issues/10275 for more detail than you 2107 * ever wanted to know about. (In the ASCII range, there isn't a difference.) 
2108 * This used to be not the XID version, but we decided to go with the more 2109 * modern Unicode definition */ 2110 #define isIDFIRST_utf8_safe(p, e) \ 2111 _generic_func_utf8_safe(_CC_IDFIRST, \ 2112 _is_utf8_perl_idstart, (U8 *) (p), (U8 *) (e)) 2113 2114 #define isLOWER_utf8_safe(p, e) _generic_invlist_utf8_safe(_CC_LOWER, p, e) 2115 #define isPRINT_utf8_safe(p, e) _generic_invlist_utf8_safe(_CC_PRINT, p, e) 2116 #define isPSXSPC_utf8_safe(p, e) isSPACE_utf8_safe(p, e) 2117 #define isPUNCT_utf8_safe(p, e) _generic_invlist_utf8_safe(_CC_PUNCT, p, e) 2118 #define isSPACE_utf8_safe(p, e) \ 2119 _generic_non_invlist_utf8_safe(_CC_SPACE, is_XPERLSPACE_high, p, e) 2120 #define isUPPER_utf8_safe(p, e) _generic_invlist_utf8_safe(_CC_UPPER, p, e) 2121 #define isVERTWS_utf8_safe(p, e) \ 2122 _generic_non_invlist_utf8_safe(_CC_VERTSPACE, is_VERTWS_high, p, e) 2123 #define isWORDCHAR_utf8_safe(p, e) \ 2124 _generic_invlist_utf8_safe(_CC_WORDCHAR, p, e) 2125 #define isXDIGIT_utf8_safe(p, e) \ 2126 _generic_utf8_safe_no_upper_latin1(_CC_XDIGIT, p, e, \ 2127 (UNLIKELY((e) - (p) < UTF8SKIP(p)) \ 2128 ? 
(_force_out_malformed_utf8_message( \ 2129 (U8 *) (p), (U8 *) (e), 0, 1), 0) \ 2130 : is_XDIGIT_high(p))) 2131 2132 #define toFOLD_utf8(p,e,s,l) toFOLD_utf8_safe(p,e,s,l) 2133 #define toLOWER_utf8(p,e,s,l) toLOWER_utf8_safe(p,e,s,l) 2134 #define toTITLE_utf8(p,e,s,l) toTITLE_utf8_safe(p,e,s,l) 2135 #define toUPPER_utf8(p,e,s,l) toUPPER_utf8_safe(p,e,s,l) 2136 2137 /* For internal core use only, subject to change */ 2138 #define _toFOLD_utf8_flags(p,e,s,l,f) _to_utf8_fold_flags (p,e,s,l,f) 2139 #define _toLOWER_utf8_flags(p,e,s,l,f) _to_utf8_lower_flags(p,e,s,l,f) 2140 #define _toTITLE_utf8_flags(p,e,s,l,f) _to_utf8_title_flags(p,e,s,l,f) 2141 #define _toUPPER_utf8_flags(p,e,s,l,f) _to_utf8_upper_flags(p,e,s,l,f) 2142 2143 #define toFOLD_utf8_safe(p,e,s,l) _toFOLD_utf8_flags(p,e,s,l, FOLD_FLAGS_FULL) 2144 #define toLOWER_utf8_safe(p,e,s,l) _toLOWER_utf8_flags(p,e,s,l, 0) 2145 #define toTITLE_utf8_safe(p,e,s,l) _toTITLE_utf8_flags(p,e,s,l, 0) 2146 #define toUPPER_utf8_safe(p,e,s,l) _toUPPER_utf8_flags(p,e,s,l, 0) 2147 2148 #define isALPHA_LC_utf8(p, e) isALPHA_LC_utf8_safe(p, e) 2149 #define isALPHANUMERIC_LC_utf8(p, e) isALPHANUMERIC_LC_utf8_safe(p, e) 2150 #define isASCII_LC_utf8(p, e) isASCII_LC_utf8_safe(p, e) 2151 #define isBLANK_LC_utf8(p, e) isBLANK_LC_utf8_safe(p, e) 2152 #define isCNTRL_LC_utf8(p, e) isCNTRL_LC_utf8_safe(p, e) 2153 #define isDIGIT_LC_utf8(p, e) isDIGIT_LC_utf8_safe(p, e) 2154 #define isGRAPH_LC_utf8(p, e) isGRAPH_LC_utf8_safe(p, e) 2155 #define isIDCONT_LC_utf8(p, e) isIDCONT_LC_utf8_safe(p, e) 2156 #define isIDFIRST_LC_utf8(p, e) isIDFIRST_LC_utf8_safe(p, e) 2157 #define isLOWER_LC_utf8(p, e) isLOWER_LC_utf8_safe(p, e) 2158 #define isPRINT_LC_utf8(p, e) isPRINT_LC_utf8_safe(p, e) 2159 #define isPSXSPC_LC_utf8(p, e) isPSXSPC_LC_utf8_safe(p, e) 2160 #define isPUNCT_LC_utf8(p, e) isPUNCT_LC_utf8_safe(p, e) 2161 #define isSPACE_LC_utf8(p, e) isSPACE_LC_utf8_safe(p, e) 2162 #define isUPPER_LC_utf8(p, e) isUPPER_LC_utf8_safe(p, e) 2163 #define 
isWORDCHAR_LC_utf8(p, e) isWORDCHAR_LC_utf8_safe(p, e) 2164 #define isXDIGIT_LC_utf8(p, e) isXDIGIT_LC_utf8_safe(p, e) 2165 2166 /* For internal core Perl use only: the base macros for defining macros like 2167 * isALPHA_LC_utf8_safe. These are like _generic_utf8, but if the first code 2168 * point in 'p' is within the 0-255 range, it uses locale rules from the 2169 * passed-in 'macro' parameter */ 2170 #define _generic_LC_utf8_safe(macro, p, e, above_latin1) \ 2171 (__ASSERT_(_utf8_safe_assert(p, e)) \ 2172 (UTF8_IS_INVARIANT(*(p))) \ 2173 ? macro(*(p)) \ 2174 : (UTF8_IS_DOWNGRADEABLE_START(*(p)) \ 2175 ? ((LIKELY((e) - (p) > 1 && UTF8_IS_CONTINUATION(*((p)+1)))) \ 2176 ? macro(EIGHT_BIT_UTF8_TO_NATIVE(*(p), *((p)+1))) \ 2177 : (_force_out_malformed_utf8_message( \ 2178 (U8 *) (p), (U8 *) (e), 0, 1), 0)) \ 2179 : above_latin1)) 2180 2181 #define _generic_LC_invlist_utf8_safe(macro, classnum, p, e) \ 2182 _generic_LC_utf8_safe(macro, p, e, \ 2183 _is_utf8_FOO(classnum, p, e)) 2184 2185 #define _generic_LC_func_utf8_safe(macro, above_latin1, p, e) \ 2186 _generic_LC_utf8_safe(macro, p, e, above_latin1(p, e)) 2187 2188 #define _generic_LC_non_invlist_utf8_safe(classnum, above_latin1, p, e) \ 2189 _generic_LC_utf8_safe(classnum, p, e, \ 2190 (UNLIKELY((e) - (p) < UTF8SKIP(p)) \ 2191 ? 
(_force_out_malformed_utf8_message( \ 2192 (U8 *) (p), (U8 *) (e), 0, 1), 0) \ 2193 : above_latin1(p))) 2194 2195 #define isALPHANUMERIC_LC_utf8_safe(p, e) \ 2196 _generic_LC_invlist_utf8_safe(isALPHANUMERIC_LC, \ 2197 _CC_ALPHANUMERIC, p, e) 2198 #define isALPHA_LC_utf8_safe(p, e) \ 2199 _generic_LC_invlist_utf8_safe(isALPHA_LC, _CC_ALPHA, p, e) 2200 #define isASCII_LC_utf8_safe(p, e) \ 2201 (__ASSERT_(_utf8_safe_assert(p, e)) isASCII_LC(*(p))) 2202 #define isBLANK_LC_utf8_safe(p, e) \ 2203 _generic_LC_non_invlist_utf8_safe(isBLANK_LC, is_HORIZWS_high, p, e) 2204 #define isCNTRL_LC_utf8_safe(p, e) \ 2205 _generic_LC_utf8_safe(isCNTRL_LC, p, e, 0) 2206 #define isDIGIT_LC_utf8_safe(p, e) \ 2207 _generic_LC_invlist_utf8_safe(isDIGIT_LC, _CC_DIGIT, p, e) 2208 #define isGRAPH_LC_utf8_safe(p, e) \ 2209 _generic_LC_invlist_utf8_safe(isGRAPH_LC, _CC_GRAPH, p, e) 2210 #define isIDCONT_LC_utf8_safe(p, e) \ 2211 _generic_LC_func_utf8_safe(isIDCONT_LC, \ 2212 _is_utf8_perl_idcont, p, e) 2213 #define isIDFIRST_LC_utf8_safe(p, e) \ 2214 _generic_LC_func_utf8_safe(isIDFIRST_LC, \ 2215 _is_utf8_perl_idstart, p, e) 2216 #define isLOWER_LC_utf8_safe(p, e) \ 2217 _generic_LC_invlist_utf8_safe(isLOWER_LC, _CC_LOWER, p, e) 2218 #define isPRINT_LC_utf8_safe(p, e) \ 2219 _generic_LC_invlist_utf8_safe(isPRINT_LC, _CC_PRINT, p, e) 2220 #define isPSXSPC_LC_utf8_safe(p, e) isSPACE_LC_utf8_safe(p, e) 2221 #define isPUNCT_LC_utf8_safe(p, e) \ 2222 _generic_LC_invlist_utf8_safe(isPUNCT_LC, _CC_PUNCT, p, e) 2223 #define isSPACE_LC_utf8_safe(p, e) \ 2224 _generic_LC_non_invlist_utf8_safe(isSPACE_LC, is_XPERLSPACE_high, p, e) 2225 #define isUPPER_LC_utf8_safe(p, e) \ 2226 _generic_LC_invlist_utf8_safe(isUPPER_LC, _CC_UPPER, p, e) 2227 #define isWORDCHAR_LC_utf8_safe(p, e) \ 2228 _generic_LC_invlist_utf8_safe(isWORDCHAR_LC, _CC_WORDCHAR, p, e) 2229 #define isXDIGIT_LC_utf8_safe(p, e) \ 2230 _generic_LC_non_invlist_utf8_safe(isXDIGIT_LC, is_XDIGIT_high, p, e) 2231 2232 /* Macros for backwards 
compatibility and for completeness when the ASCII and 2233 * Latin1 values are identical */ 2234 #define isALPHAU(c) isALPHA_L1(c) 2235 #define isDIGIT_L1(c) isDIGIT_A(c) 2236 #define isOCTAL(c) isOCTAL_A(c) 2237 #define isOCTAL_L1(c) isOCTAL_A(c) 2238 #define isXDIGIT_L1(c) isXDIGIT_A(c) 2239 #define isALNUM(c) isWORDCHAR(c) 2240 #define isALNUM_A(c) isALNUM(c) 2241 #define isALNUMU(c) isWORDCHAR_L1(c) 2242 #define isALNUM_LC(c) isWORDCHAR_LC(c) 2243 #define isALNUM_uni(c) isWORDCHAR_uni(c) 2244 #define isALNUM_LC_uvchr(c) isWORDCHAR_LC_uvchr(c) 2245 #define isALNUM_utf8(p,e) isWORDCHAR_utf8(p,e) 2246 #define isALNUM_utf8_safe(p,e) isWORDCHAR_utf8_safe(p,e) 2247 #define isALNUM_LC_utf8(p,e)isWORDCHAR_LC_utf8(p,e) 2248 #define isALNUM_LC_utf8_safe(p,e)isWORDCHAR_LC_utf8_safe(p,e) 2249 #define isALNUMC_A(c) isALPHANUMERIC_A(c) /* Mnemonic: "C's alnum" */ 2250 #define isALNUMC_L1(c) isALPHANUMERIC_L1(c) 2251 #define isALNUMC(c) isALPHANUMERIC(c) 2252 #define isALNUMC_LC(c) isALPHANUMERIC_LC(c) 2253 #define isALNUMC_uni(c) isALPHANUMERIC_uni(c) 2254 #define isALNUMC_LC_uvchr(c) isALPHANUMERIC_LC_uvchr(c) 2255 #define isALNUMC_utf8(p,e) isALPHANUMERIC_utf8(p,e) 2256 #define isALNUMC_utf8_safe(p,e) isALPHANUMERIC_utf8_safe(p,e) 2257 #define isALNUMC_LC_utf8_safe(p,e) isALPHANUMERIC_LC_utf8_safe(p,e) 2258 2259 /* On EBCDIC platforms, CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII, 2260 * except that they don't necessarily mean the same characters, e.g. CTRL-D is 2261 * 4 on both systems, but that is EOT on ASCII; ST on EBCDIC. 2262 * '?' is special-cased on EBCDIC to APC, which is the control there that is 2263 * the outlier from the block that contains the other controls, just like 2264 * toCTRL('?') on ASCII yields DEL, the control that is the outlier from the C0 2265 * block. If it weren't special cased, it would yield a non-control. 2266 * The conversion works both ways, so toCTRL('D') is 4, and toCTRL(4) is D, 2267 * etc. 
*/ 2268 #ifndef EBCDIC 2269 # define toCTRL(c) (__ASSERT_(FITS_IN_8_BITS(c)) toUPPER(((U8)(c))) ^ 64) 2270 #else 2271 # define toCTRL(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ 2272 ((isPRINT_A(c)) \ 2273 ? (UNLIKELY((c) == '?') \ 2274 ? QUESTION_MARK_CTRL \ 2275 : (NATIVE_TO_LATIN1(toUPPER((U8) (c))) ^ 64)) \ 2276 : (UNLIKELY((c) == QUESTION_MARK_CTRL) \ 2277 ? '?' \ 2278 : (LATIN1_TO_NATIVE(((U8) (c)) ^ 64))))) 2279 #endif 2280 2281 /* Line numbers are unsigned, 32 bits. */ 2282 typedef U32 line_t; 2283 #define NOLINE ((line_t) 4294967295UL) /* = FFFFFFFF */ 2284 2285 /* Helpful alias for version prescan */ 2286 #define is_LAX_VERSION(a,b) \ 2287 (a != Perl_prescan_version(aTHX_ a, FALSE, b, NULL, NULL, NULL, NULL)) 2288 2289 #define is_STRICT_VERSION(a,b) \ 2290 (a != Perl_prescan_version(aTHX_ a, TRUE, b, NULL, NULL, NULL, NULL)) 2291 2292 #define BADVERSION(a,b,c) \ 2293 if (b) { \ 2294 *b = c; \ 2295 } \ 2296 return a; 2297 2298 /* Converts a character KNOWN to represent a hexadecimal digit (0-9, A-F, or 2299 * a-f) to its numeric value without using any branches. The input is 2300 * validated only by an assert() in DEBUGGING builds. 2301 * 2302 * It works by right shifting and isolating the bit that is 0 for the digits, 2303 * and 1 for at least the alphas A-F, a-f. The bit is shifted to the ones 2304 * position, and then to the eights position. Both are added together to form 2305 * 0 if the input is '0'-'9' and to form 9 if alpha. This is added to the 2306 * final four bits of the input to form the correct value. */ 2307 #define XDIGIT_VALUE(c) (__ASSERT_(isXDIGIT(c)) \ 2308 ((NATIVE_TO_LATIN1(c) >> 6) & 1) /* 1 if alpha; 0 if not */ \ 2309 + ((NATIVE_TO_LATIN1(c) >> 3) & 8) /* 8 if alpha; 0 if not */ \ 2310 + ((c) & 0xF)) /* 0-9 if input valid hex digit */ 2311 2312 /* The argument is a string pointer, which is advanced. 
*/ 2313 #define READ_XDIGIT(s) ((s)++, XDIGIT_VALUE(*((s) - 1))) 2314 2315 /* Converts a character known to represent an octal digit (0-7) to its numeric 2316 * value. The input is validated only by an assert() in DEBUGGING builds. In 2317 * both ASCII and EBCDIC the last 3 bits of the octal digits range from 0-7. */ 2318 #define OCTAL_VALUE(c) (__ASSERT_(isOCTAL(c)) (7 & (c))) 2319 2320 /* Efficiently returns a boolean as to if two native characters are equivalent 2321 * case-insenstively. At least one of the characters must be one of [A-Za-z]; 2322 * the ALPHA in the name is to remind you of that. This is asserted() in 2323 * DEBUGGING builds. Because [A-Za-z] are invariant under UTF-8, this macro 2324 * works (on valid input) for both non- and UTF-8-encoded bytes. 2325 * 2326 * When one of the inputs is a compile-time constant and gets folded by the 2327 * compiler, this reduces to an AND and a TEST. On both EBCDIC and ASCII 2328 * machines, 'A' and 'a' differ by a single bit; the same with the upper and 2329 * lower case of all other ASCII-range alphabetics. On ASCII platforms, they 2330 * are 32 apart; on EBCDIC, they are 64. At compile time, this uses an 2331 * exclusive 'or' to find that bit and then inverts it to form a mask, with 2332 * just a single 0, in the bit position where the upper- and lowercase differ. 2333 * */ 2334 #define isALPHA_FOLD_EQ(c1, c2) \ 2335 (__ASSERT_(isALPHA_A(c1) || isALPHA_A(c2)) \ 2336 ((c1) & ~('A' ^ 'a')) == ((c2) & ~('A' ^ 'a'))) 2337 #define isALPHA_FOLD_NE(c1, c2) (! isALPHA_FOLD_EQ((c1), (c2))) 2338 2339 /* 2340 =head1 Memory Management 2341 2342 =for apidoc Am|void|Newx|void* ptr|int nitems|type 2343 The XSUB-writer's interface to the C C<malloc> function. 2344 2345 Memory obtained by this should B<ONLY> be freed with L</"Safefree">. 2346 2347 In 5.9.3, Newx() and friends replace the older New() API, and drops 2348 the first parameter, I<x>, a debug aid which allowed callers to identify 2349 themselves. 
This aid has been superseded by a new build option,
PERL_MEM_LOG (see L<perlhacktips/PERL_MEM_LOG>).  The older API is still
there for use in XS modules supporting older perls.

=for apidoc Am|void|Newxc|void* ptr|int nitems|type|cast
The XSUB-writer's interface to the C C<malloc> function, with
cast.  See also C<L</Newx>>.

Memory obtained by this should B<ONLY> be freed with L</"Safefree">.

=for apidoc Am|void|Newxz|void* ptr|int nitems|type
The XSUB-writer's interface to the C C<malloc> function.  The allocated
memory is zeroed with C<memzero>.  See also C<L</Newx>>.

Memory obtained by this should B<ONLY> be freed with L</"Safefree">.

=for apidoc Am|void|Renew|void* ptr|int nitems|type
The XSUB-writer's interface to the C C<realloc> function.

Memory obtained by this should B<ONLY> be freed with L</"Safefree">.

=for apidoc Am|void|Renewc|void* ptr|int nitems|type|cast
The XSUB-writer's interface to the C C<realloc> function, with
cast.

Memory obtained by this should B<ONLY> be freed with L</"Safefree">.

=for apidoc Am|void|Safefree|void* ptr
The XSUB-writer's interface to the C C<free> function.

This should B<ONLY> be used on memory obtained using L</"Newx"> and friends.

=for apidoc Am|void|Move|void* src|void* dest|int nitems|type
The XSUB-writer's interface to the C C<memmove> function.  The C<src> is the
source, C<dest> is the destination, C<nitems> is the number of items, and
C<type> is the type.  Can do overlapping moves.  See also C<L</Copy>>.

=for apidoc Am|void *|MoveD|void* src|void* dest|int nitems|type
Like C<Move> but returns C<dest>.  Useful
for encouraging compilers to tail-call
optimise.

=for apidoc Am|void|Copy|void* src|void* dest|int nitems|type
The XSUB-writer's interface to the C C<memcpy> function.
The C<src> is the
source, C<dest> is the destination, C<nitems> is the number of items, and
C<type> is the type.  May fail on overlapping copies.  See also C<L</Move>>.

=for apidoc Am|void *|CopyD|void* src|void* dest|int nitems|type

Like C<Copy> but returns C<dest>.  Useful
for encouraging compilers to tail-call
optimise.

=for apidoc Am|void|Zero|void* dest|int nitems|type

The XSUB-writer's interface to the C C<memzero> function.  The C<dest> is the
destination, C<nitems> is the number of items, and C<type> is the type.

=for apidoc Am|void *|ZeroD|void* dest|int nitems|type

Like C<Zero> but returns C<dest>.  Useful
for encouraging compilers to tail-call
optimise.

=for apidoc Am|void|StructCopy|type *src|type *dest|type
This is an architecture-independent macro to copy one structure to another.

=for apidoc Am|void|PoisonWith|void* dest|int nitems|type|U8 byte

Fill up memory with a byte pattern (a byte repeated over and over
again) that hopefully catches attempts to access uninitialized memory.

=for apidoc Am|void|PoisonNew|void* dest|int nitems|type

PoisonWith(0xAB) for catching access to allocated but uninitialized memory.

=for apidoc Am|void|PoisonFree|void* dest|int nitems|type

PoisonWith(0xEF) for catching access to freed memory.

=for apidoc Am|void|Poison|void* dest|int nitems|type

PoisonWith(0xEF) for catching access to freed memory.

=cut */

/* Maintained for backwards-compatibility only. Use newSV() instead.
*/ 2436 #ifndef PERL_CORE 2437 #define NEWSV(x,len) newSV(len) 2438 #endif 2439 2440 #define MEM_SIZE_MAX ((MEM_SIZE)-1) 2441 2442 #define _PERL_STRLEN_ROUNDUP_UNCHECKED(n) (((n) - 1 + PERL_STRLEN_ROUNDUP_QUANTUM) & ~((MEM_SIZE)PERL_STRLEN_ROUNDUP_QUANTUM - 1)) 2443 2444 #ifdef PERL_MALLOC_WRAP 2445 2446 /* This expression will be constant-folded at compile time. It checks 2447 * whether or not the type of the count n is so small (e.g. U8 or U16, or 2448 * U32 on 64-bit systems) that there's no way a wrap-around could occur. 2449 * As well as avoiding the need for a run-time check in some cases, it's 2450 * designed to avoid compiler warnings like: 2451 * comparison is always false due to limited range of data type 2452 * It's mathematically equivalent to 2453 * max(n) * sizeof(t) > MEM_SIZE_MAX 2454 */ 2455 2456 # define _MEM_WRAP_NEEDS_RUNTIME_CHECK(n,t) \ 2457 ( sizeof(MEM_SIZE) < sizeof(n) \ 2458 || sizeof(t) > ((MEM_SIZE)1 << 8*(sizeof(MEM_SIZE) - sizeof(n)))) 2459 2460 /* This is written in a slightly odd way to avoid various spurious 2461 * compiler warnings. We *want* to write the expression as 2462 * _MEM_WRAP_NEEDS_RUNTIME_CHECK(n,t) && (n > C) 2463 * (for some compile-time constant C), but even when the LHS 2464 * constant-folds to false at compile-time, g++ insists on emitting 2465 * warnings about the RHS (e.g. "comparison is always false"), so instead 2466 * we write it as 2467 * 2468 * (cond ? n : X) > C 2469 * 2470 * where X is a constant with X > C always false. Choosing a value for X 2471 * is tricky. If 0, some compilers will complain about 0 > C always being 2472 * false; if 1, Coverity complains when n happens to be the constant value 2473 * '1', that cond ? 1 : 1 has the same value on both branches; so use C 2474 * for X and hope that nothing else whines. 2475 */ 2476 2477 # define _MEM_WRAP_WILL_WRAP(n,t) \ 2478 ((_MEM_WRAP_NEEDS_RUNTIME_CHECK(n,t) ? 
(MEM_SIZE)(n) : \ 2479 MEM_SIZE_MAX/sizeof(t)) > MEM_SIZE_MAX/sizeof(t)) 2480 2481 # define MEM_WRAP_CHECK(n,t) \ 2482 (void)(UNLIKELY(_MEM_WRAP_WILL_WRAP(n,t)) \ 2483 && (croak_memory_wrap(),0)) 2484 2485 # define MEM_WRAP_CHECK_1(n,t,a) \ 2486 (void)(UNLIKELY(_MEM_WRAP_WILL_WRAP(n,t)) \ 2487 && (Perl_croak_nocontext("%s",(a)),0)) 2488 2489 /* "a" arg must be a string literal */ 2490 # define MEM_WRAP_CHECK_s(n,t,a) \ 2491 (void)(UNLIKELY(_MEM_WRAP_WILL_WRAP(n,t)) \ 2492 && (Perl_croak_nocontext("" a ""),0)) 2493 2494 #define MEM_WRAP_CHECK_(n,t) MEM_WRAP_CHECK(n,t), 2495 2496 #define PERL_STRLEN_ROUNDUP(n) ((void)(((n) > MEM_SIZE_MAX - 2 * PERL_STRLEN_ROUNDUP_QUANTUM) ? (croak_memory_wrap(),0) : 0), _PERL_STRLEN_ROUNDUP_UNCHECKED(n)) 2497 #else 2498 2499 #define MEM_WRAP_CHECK(n,t) 2500 #define MEM_WRAP_CHECK_1(n,t,a) 2501 #define MEM_WRAP_CHECK_s(n,t,a) 2502 #define MEM_WRAP_CHECK_(n,t) 2503 2504 #define PERL_STRLEN_ROUNDUP(n) _PERL_STRLEN_ROUNDUP_UNCHECKED(n) 2505 2506 #endif 2507 2508 #ifdef PERL_MEM_LOG 2509 /* 2510 * If PERL_MEM_LOG is defined, all Newx()s, Renew()s, and Safefree()s 2511 * go through functions, which are handy for debugging breakpoints, but 2512 * which more importantly get the immediate calling environment (file and 2513 * line number, and C function name if available) passed in. This info can 2514 * then be used for logging the calls, for which one gets a sample 2515 * implementation unless -DPERL_MEM_LOG_NOIMPL is also defined. 
*
 * Known problems:
 * - not all memory allocs get logged, only those
 *   that go through Newx() and derivatives (while all
 *   Safefrees do get logged)
 * - __FILE__ and __LINE__ do not work everywhere
 * - __func__ or __FUNCTION__ even less so
 * - I think more goes on after the perlio frees but
 *   the thing is that STDERR gets closed (as do all
 *   the file descriptors)
 * - no deeper calling stack than the caller of the Newx()
 *   or the kind, but do I look like a C reflection/introspection
 *   utility to you?
 * - the function prototypes for the logging functions
 *   probably should maybe be somewhere else than handy.h
 * - one could consider inlining (macrofying) the logging
 *   for speed, but I am too lazy
 * - one could imagine recording the allocations in a hash,
 *   (keyed by the allocation address?), and maintain that
 *   through reallocs and frees, but how to do that without
 *   any News() happening...?
2537 * - lots of -Ddefines to get useful/controllable output 2538 * - lots of ENV reads 2539 */ 2540 2541 # ifdef PERL_CORE 2542 # ifndef PERL_MEM_LOG_NOIMPL 2543 enum mem_log_type { 2544 MLT_ALLOC, 2545 MLT_REALLOC, 2546 MLT_FREE, 2547 MLT_NEW_SV, 2548 MLT_DEL_SV 2549 }; 2550 # endif 2551 # if defined(PERL_IN_SV_C) /* those are only used in sv.c */ 2552 void Perl_mem_log_new_sv(const SV *sv, const char *filename, const int linenumber, const char *funcname); 2553 void Perl_mem_log_del_sv(const SV *sv, const char *filename, const int linenumber, const char *funcname); 2554 # endif 2555 # endif 2556 2557 #endif 2558 2559 #ifdef PERL_MEM_LOG 2560 #define MEM_LOG_ALLOC(n,t,a) Perl_mem_log_alloc(n,sizeof(t),STRINGIFY(t),a,__FILE__,__LINE__,FUNCTION__) 2561 #define MEM_LOG_REALLOC(n,t,v,a) Perl_mem_log_realloc(n,sizeof(t),STRINGIFY(t),v,a,__FILE__,__LINE__,FUNCTION__) 2562 #define MEM_LOG_FREE(a) Perl_mem_log_free(a,__FILE__,__LINE__,FUNCTION__) 2563 #endif 2564 2565 #ifndef MEM_LOG_ALLOC 2566 #define MEM_LOG_ALLOC(n,t,a) (a) 2567 #endif 2568 #ifndef MEM_LOG_REALLOC 2569 #define MEM_LOG_REALLOC(n,t,v,a) (a) 2570 #endif 2571 #ifndef MEM_LOG_FREE 2572 #define MEM_LOG_FREE(a) (a) 2573 #endif 2574 2575 #define Newx(v,n,t) (v = (MEM_WRAP_CHECK_(n,t) (t*)MEM_LOG_ALLOC(n,t,safemalloc((MEM_SIZE)((n)*sizeof(t)))))) 2576 #define Newxc(v,n,t,c) (v = (MEM_WRAP_CHECK_(n,t) (c*)MEM_LOG_ALLOC(n,t,safemalloc((MEM_SIZE)((n)*sizeof(t)))))) 2577 #define Newxz(v,n,t) (v = (MEM_WRAP_CHECK_(n,t) (t*)MEM_LOG_ALLOC(n,t,safecalloc((n),sizeof(t))))) 2578 2579 #ifndef PERL_CORE 2580 /* pre 5.9.x compatibility */ 2581 #define New(x,v,n,t) Newx(v,n,t) 2582 #define Newc(x,v,n,t,c) Newxc(v,n,t,c) 2583 #define Newz(x,v,n,t) Newxz(v,n,t) 2584 #endif 2585 2586 #define Renew(v,n,t) \ 2587 (v = (MEM_WRAP_CHECK_(n,t) (t*)MEM_LOG_REALLOC(n,t,v,saferealloc((Malloc_t)(v),(MEM_SIZE)((n)*sizeof(t)))))) 2588 #define Renewc(v,n,t,c) \ 2589 (v = (MEM_WRAP_CHECK_(n,t) 
(c*)MEM_LOG_REALLOC(n,t,v,saferealloc((Malloc_t)(v),(MEM_SIZE)((n)*sizeof(t)))))) 2590 2591 #ifdef PERL_POISON 2592 #define Safefree(d) \ 2593 ((d) ? (void)(safefree(MEM_LOG_FREE((Malloc_t)(d))), Poison(&(d), 1, Malloc_t)) : (void) 0) 2594 #else 2595 #define Safefree(d) safefree(MEM_LOG_FREE((Malloc_t)(d))) 2596 #endif 2597 2598 /* assert that a valid ptr has been supplied - use this instead of assert(ptr) * 2599 * as it handles cases like constant string arguments without throwing warnings * 2600 * the cast is required, as is the inequality check, to avoid warnings */ 2601 #define perl_assert_ptr(p) assert( ((void*)(p)) != 0 ) 2602 2603 2604 #define Move(s,d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), (void)memmove((char*)(d),(const char*)(s), (n) * sizeof(t))) 2605 #define Copy(s,d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), (void)memcpy((char*)(d),(const char*)(s), (n) * sizeof(t))) 2606 #define Zero(d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), (void)memzero((char*)(d), (n) * sizeof(t))) 2607 2608 /* Like above, but returns a pointer to 'd' */ 2609 #define MoveD(s,d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), memmove((char*)(d),(const char*)(s), (n) * sizeof(t))) 2610 #define CopyD(s,d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), perl_assert_ptr(s), memcpy((char*)(d),(const char*)(s), (n) * sizeof(t))) 2611 #define ZeroD(d,n,t) (MEM_WRAP_CHECK_(n,t) perl_assert_ptr(d), memzero((char*)(d), (n) * sizeof(t))) 2612 2613 #define PoisonWith(d,n,t,b) (MEM_WRAP_CHECK_(n,t) (void)memset((char*)(d), (U8)(b), (n) * sizeof(t))) 2614 #define PoisonNew(d,n,t) PoisonWith(d,n,t,0xAB) 2615 #define PoisonFree(d,n,t) PoisonWith(d,n,t,0xEF) 2616 #define Poison(d,n,t) PoisonFree(d,n,t) 2617 2618 #ifdef PERL_POISON 2619 # define PERL_POISON_EXPR(x) x 2620 #else 2621 # define PERL_POISON_EXPR(x) 2622 #endif 2623 2624 #define StructCopy(s,d,t) (*((t*)(d)) = *((t*)(s))) 2625 2626 /* 2627 =head1 Handy Values 2628 
2629 =for apidoc Am|STRLEN|C_ARRAY_LENGTH|void *a 2630 2631 Returns the number of elements in the input C array (so you want your 2632 zero-based indices to be less than but not equal to). 2633 2634 =for apidoc Am|void *|C_ARRAY_END|void *a 2635 2636 Returns a pointer to one element past the final element of the input C array. 2637 2638 =cut 2639 2640 C_ARRAY_END is one past the last: half-open/half-closed range, not 2641 last-inclusive range. 2642 */ 2643 #define C_ARRAY_LENGTH(a) (sizeof(a)/sizeof((a)[0])) 2644 #define C_ARRAY_END(a) ((a) + C_ARRAY_LENGTH(a)) 2645 2646 #ifdef NEED_VA_COPY 2647 # ifdef va_copy 2648 # define Perl_va_copy(s, d) va_copy(d, s) 2649 # elif defined(__va_copy) 2650 # define Perl_va_copy(s, d) __va_copy(d, s) 2651 # else 2652 # define Perl_va_copy(s, d) Copy(s, d, 1, va_list) 2653 # endif 2654 #endif 2655 2656 /* convenience debug macros */ 2657 #ifdef USE_ITHREADS 2658 #define pTHX_FORMAT "Perl interpreter: 0x%p" 2659 #define pTHX__FORMAT ", Perl interpreter: 0x%p" 2660 #define pTHX_VALUE_ (void *)my_perl, 2661 #define pTHX_VALUE (void *)my_perl 2662 #define pTHX__VALUE_ ,(void *)my_perl, 2663 #define pTHX__VALUE ,(void *)my_perl 2664 #else 2665 #define pTHX_FORMAT 2666 #define pTHX__FORMAT 2667 #define pTHX_VALUE_ 2668 #define pTHX_VALUE 2669 #define pTHX__VALUE_ 2670 #define pTHX__VALUE 2671 #endif /* USE_ITHREADS */ 2672 2673 /* Perl_deprecate was not part of the public API, and did not have a deprecate() 2674 shortcut macro defined without -DPERL_CORE. Neither codesearch.google.com nor 2675 CPAN::Unpack show any users outside the core. 
*/ 2676 #ifdef PERL_CORE 2677 # define deprecate(s) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ 2678 "Use of " s " is deprecated") 2679 # define deprecate_disappears_in(when,message) \ 2680 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ 2681 message ", and will disappear in Perl " when) 2682 # define deprecate_fatal_in(when,message) \ 2683 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ 2684 message ". Its use will be fatal in Perl " when) 2685 #endif 2686 2687 /* Internal macros to deal with gids and uids */ 2688 #ifdef PERL_CORE 2689 2690 # if Uid_t_size > IVSIZE 2691 # define sv_setuid(sv, uid) sv_setnv((sv), (NV)(uid)) 2692 # define SvUID(sv) SvNV(sv) 2693 # elif Uid_t_sign <= 0 2694 # define sv_setuid(sv, uid) sv_setiv((sv), (IV)(uid)) 2695 # define SvUID(sv) SvIV(sv) 2696 # else 2697 # define sv_setuid(sv, uid) sv_setuv((sv), (UV)(uid)) 2698 # define SvUID(sv) SvUV(sv) 2699 # endif /* Uid_t_size */ 2700 2701 # if Gid_t_size > IVSIZE 2702 # define sv_setgid(sv, gid) sv_setnv((sv), (NV)(gid)) 2703 # define SvGID(sv) SvNV(sv) 2704 # elif Gid_t_sign <= 0 2705 # define sv_setgid(sv, gid) sv_setiv((sv), (IV)(gid)) 2706 # define SvGID(sv) SvIV(sv) 2707 # else 2708 # define sv_setgid(sv, gid) sv_setuv((sv), (UV)(gid)) 2709 # define SvGID(sv) SvUV(sv) 2710 # endif /* Gid_t_size */ 2711 2712 #endif 2713 2714 #endif /* PERL_HANDY_H_ */ 2715 2716 /* 2717 * ex: set ts=8 sts=4 sw=4 et: 2718 */ 2719