1 /* locale.c 2 * 3 * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 4 * 2002, 2003, 2005, 2006, 2007, 2008 by Larry Wall and others 5 * 6 * You may distribute under the terms of either the GNU General Public 7 * License or the Artistic License, as specified in the README file. 8 * 9 */ 10 11 /* 12 * A Elbereth Gilthoniel, 13 * silivren penna míriel 14 * o menel aglar elenath! 15 * Na-chaered palan-díriel 16 * o galadhremmin ennorath, 17 * Fanuilos, le linnathon 18 * nef aear, si nef aearon! 19 * 20 * [p.238 of _The Lord of the Rings_, II/i: "Many Meetings"] 21 */ 22 23 /* utility functions for handling locale-specific stuff like what 24 * character represents the decimal point. 25 * 26 * All C programs have an underlying locale. Perl code generally doesn't pay 27 * any attention to it except within the scope of a 'use locale'. For most 28 * categories, it accomplishes this by just using different operations if it is 29 * in such scope than if not. However, various libc functions called by Perl 30 * are affected by the LC_NUMERIC category, so there are macros in perl.h that 31 * are used to toggle between the current locale and the C locale depending on 32 * the desired behavior of those functions at the moment. And, LC_MESSAGES is 33 * switched to the C locale for outputting the message unless within the scope 34 * of 'use locale'. 35 * 36 * This code now has multi-thread-safe locale handling on systems that support 37 * that. This is completely transparent to most XS code. On earlier systems, 38 * it would be possible to emulate thread-safe locales, but this likely would 39 * involve a lot of locale switching, and would require XS code changes. 40 * Macros could be written so that the code wouldn't have to know which type of 41 * system is being used. It's unlikely that we would ever do that, since most 42 * modern systems support thread-safe locales, but there was code written to 43 * this end, and is retained, #ifdef'd out. 44 */ 45 46 #include "EXTERN.h" 47 #define PERL_IN_LOCALE_C 48 #include "perl_langinfo.h" 49 #include "perl.h" 50 51 #include "reentr.h" 52 53 #ifdef I_WCHAR 54 # include <wchar.h> 55 #endif 56 #ifdef I_WCTYPE 57 # include <wctype.h> 58 #endif 59 60 /* If the environment says to, we can output debugging information during 61 * initialization. This is done before option parsing, and before any thread 62 * creation, so can be a file-level static */ 63 #if ! defined(DEBUGGING) 64 # define debug_initialization 0 65 # define DEBUG_INITIALIZATION_set(v) 66 #else 67 static bool debug_initialization = FALSE; 68 # define DEBUG_INITIALIZATION_set(v) (debug_initialization = v) 69 #endif 70 71 72 /* Returns the Unix errno portion; ignoring any others. This is a macro here 73 * instead of putting it into perl.h, because unclear to khw what should be 74 * done generally. */ 75 #define GET_ERRNO saved_errno 76 77 /* strlen() of a literal string constant. We might want this more general, 78 * but using it in just this file for now. A problem with more generality is 79 * the compiler warnings about comparing unlike signs */ 80 #define STRLENs(s) (sizeof("" s "") - 1) 81 82 /* Is the C string input 'name' "C" or "POSIX"? If so, and 'name' is the 83 * return of setlocale(), then this is extremely likely to be the C or POSIX 84 * locale. However, the output of setlocale() is documented to be opaque, but 85 * the odds are extremely small that it would return these two strings for some 86 * other locale. Note that VMS in these two locales includes many non-ASCII 87 * characters as controls and punctuation (below are hex bytes): 88 * cntrl: 84-97 9B-9F 89 * punct: A1-A3 A5 A7-AB B0-B3 B5-B7 B9-BD BF-CF D1-DD DF-EF F1-FD 90 * Oddly, none there are listed as alphas, though some represent alphabetics 91 * http://www.nntp.perl.org/group/perl.perl5.porters/2013/02/msg198753.html */ 92 #define isNAME_C_OR_POSIX(name) \ 93 ( (name) != NULL \ 94 && (( *(name) == 'C' && (*(name + 1)) == '\0') \ 95 || strEQ((name), "POSIX"))) 96 97 #ifdef USE_LOCALE 98 99 /* This code keeps a LRU cache of the UTF-8ness of the locales it has so-far 100 * looked up. This is in the form of a C string: */ 101 102 #define UTF8NESS_SEP "\v" 103 #define UTF8NESS_PREFIX "\f" 104 105 /* So, the string looks like: 106 * 107 * \vC\a0\vPOSIX\a0\vam_ET\a0\vaf_ZA.utf8\a1\ven_US.UTF-8\a1\0 108 * 109 * where the digit 0 after the \a indicates that the locale starting just 110 * after the preceding \v is not UTF-8, and the digit 1 mean it is. */ 111 112 STATIC_ASSERT_DECL(STRLENs(UTF8NESS_SEP) == 1); 113 STATIC_ASSERT_DECL(STRLENs(UTF8NESS_PREFIX) == 1); 114 115 #define C_and_POSIX_utf8ness UTF8NESS_SEP "C" UTF8NESS_PREFIX "0" \ 116 UTF8NESS_SEP "POSIX" UTF8NESS_PREFIX "0" 117 118 /* The cache is initialized to C_and_POSIX_utf8ness at start up. These are 119 * kept there always. The remining portion of the cache is LRU, with the 120 * oldest looked-up locale at the tail end */ 121 122 STATIC char * 123 S_stdize_locale(pTHX_ char *locs) 124 { 125 /* Standardize the locale name from a string returned by 'setlocale', 126 * possibly modifying that string. 127 * 128 * The typical return value of setlocale() is either 129 * (1) "xx_YY" if the first argument of setlocale() is not LC_ALL 130 * (2) "xa_YY xb_YY ..." if the first argument of setlocale() is LC_ALL 131 * (the space-separated values represent the various sublocales, 132 * in some unspecified order). This is not handled by this function. 133 * 134 * In some platforms it has a form like "LC_SOMETHING=Lang_Country.866\n", 135 * which is harmful for further use of the string in setlocale(). This 136 * function removes the trailing new line and everything up through the '=' 137 * */ 138 139 const char * const s = strchr(locs, '='); 140 bool okay = TRUE; 141 142 PERL_ARGS_ASSERT_STDIZE_LOCALE; 143 144 if (s) { 145 const char * const t = strchr(s, '.'); 146 okay = FALSE; 147 if (t) { 148 const char * const u = strchr(t, '\n'); 149 if (u && (u[1] == 0)) { 150 const STRLEN len = u - s; 151 Move(s + 1, locs, len, char); 152 locs[len] = 0; 153 okay = TRUE; 154 } 155 } 156 } 157 158 if (!okay) 159 Perl_croak(aTHX_ "Can't fix broken locale name \"%s\"", locs); 160 161 return locs; 162 } 163 164 /* Two parallel arrays; first the locale categories Perl uses on this system; 165 * the second array is their names. These arrays are in mostly arbitrary 166 * order. */ 167 168 const int categories[] = { 169 170 # ifdef USE_LOCALE_NUMERIC 171 LC_NUMERIC, 172 # endif 173 # ifdef USE_LOCALE_CTYPE 174 LC_CTYPE, 175 # endif 176 # ifdef USE_LOCALE_COLLATE 177 LC_COLLATE, 178 # endif 179 # ifdef USE_LOCALE_TIME 180 LC_TIME, 181 # endif 182 # ifdef USE_LOCALE_MESSAGES 183 LC_MESSAGES, 184 # endif 185 # ifdef USE_LOCALE_MONETARY 186 LC_MONETARY, 187 # endif 188 # ifdef USE_LOCALE_ADDRESS 189 LC_ADDRESS, 190 # endif 191 # ifdef USE_LOCALE_IDENTIFICATION 192 LC_IDENTIFICATION, 193 # endif 194 # ifdef USE_LOCALE_MEASUREMENT 195 LC_MEASUREMENT, 196 # endif 197 # ifdef USE_LOCALE_PAPER 198 LC_PAPER, 199 # endif 200 # ifdef USE_LOCALE_TELEPHONE 201 LC_TELEPHONE, 202 # endif 203 # ifdef USE_LOCALE_SYNTAX 204 LC_SYNTAX, 205 # endif 206 # ifdef USE_LOCALE_TOD 207 LC_TOD, 208 # endif 209 # ifdef LC_ALL 210 LC_ALL, 211 # endif 212 -1 /* Placeholder because C doesn't allow a 213 trailing comma, and it would get complicated 214 with all the #ifdef's */ 215 }; 216 217 /* The top-most real element is LC_ALL */ 218 219 const char * const category_names[] = { 220 221 # ifdef USE_LOCALE_NUMERIC 222 "LC_NUMERIC", 223 # endif 224 # ifdef USE_LOCALE_CTYPE 225 "LC_CTYPE", 226 # endif 227 # ifdef USE_LOCALE_COLLATE 228 "LC_COLLATE", 229 # endif 230 # ifdef USE_LOCALE_TIME 231 "LC_TIME", 232 # endif 233 # ifdef USE_LOCALE_MESSAGES 234 "LC_MESSAGES", 235 # endif 236 # ifdef USE_LOCALE_MONETARY 237 "LC_MONETARY", 238 # endif 239 # ifdef USE_LOCALE_ADDRESS 240 "LC_ADDRESS", 241 # endif 242 # ifdef USE_LOCALE_IDENTIFICATION 243 "LC_IDENTIFICATION", 244 # endif 245 # ifdef USE_LOCALE_MEASUREMENT 246 "LC_MEASUREMENT", 247 # endif 248 # ifdef USE_LOCALE_PAPER 249 "LC_PAPER", 250 # endif 251 # ifdef USE_LOCALE_TELEPHONE 252 "LC_TELEPHONE", 253 # endif 254 # ifdef USE_LOCALE_SYNTAX 255 "LC_SYNTAX", 256 # endif 257 # ifdef USE_LOCALE_TOD 258 "LC_TOD", 259 # endif 260 # ifdef LC_ALL 261 "LC_ALL", 262 # endif 263 NULL /* Placeholder */ 264 }; 265 266 # ifdef LC_ALL 267 268 /* On systems with LC_ALL, it is kept in the highest index position. (-2 269 * to account for the final unused placeholder element.) */ 270 # define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 2) 271 272 # else 273 274 /* On systems without LC_ALL, we pretend it is there, one beyond the real 275 * top element, hence in the unused placeholder element. */ 276 # define NOMINAL_LC_ALL_INDEX (C_ARRAY_LENGTH(categories) - 1) 277 278 # endif 279 280 /* Pretending there is an LC_ALL element just above allows us to avoid most 281 * special cases. Most loops through these arrays in the code below are 282 * written like 'for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++)'. They will work 283 * on either type of system. But the code must be written to not access the 284 * element at 'LC_ALL_INDEX' except on platforms that have it. This can be 285 * checked for at compile time by using the #define LC_ALL_INDEX which is only 286 * defined if we do have LC_ALL. */ 287 288 STATIC const char * 289 S_category_name(const int category) 290 { 291 unsigned int i; 292 293 #ifdef LC_ALL 294 295 if (category == LC_ALL) { 296 return "LC_ALL"; 297 } 298 299 #endif 300 301 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { 302 if (category == categories[i]) { 303 return category_names[i]; 304 } 305 } 306 307 { 308 const char suffix[] = " (unknown)"; 309 int temp = category; 310 Size_t length = sizeof(suffix) + 1; 311 char * unknown; 312 dTHX; 313 314 if (temp < 0) { 315 length++; 316 temp = - temp; 317 } 318 319 /* Calculate the number of digits */ 320 while (temp >= 10) { 321 temp /= 10; 322 length++; 323 } 324 325 Newx(unknown, length, char); 326 my_snprintf(unknown, length, "%d%s", category, suffix); 327 SAVEFREEPV(unknown); 328 return unknown; 329 } 330 } 331 332 /* Now create LC_foo_INDEX #defines for just those categories on this system */ 333 # ifdef USE_LOCALE_NUMERIC 334 # define LC_NUMERIC_INDEX 0 335 # define _DUMMY_NUMERIC LC_NUMERIC_INDEX 336 # else 337 # define _DUMMY_NUMERIC -1 338 # endif 339 # ifdef USE_LOCALE_CTYPE 340 # define LC_CTYPE_INDEX _DUMMY_NUMERIC + 1 341 # define _DUMMY_CTYPE LC_CTYPE_INDEX 342 # else 343 # define _DUMMY_CTYPE _DUMMY_NUMERIC 344 # endif 345 # ifdef USE_LOCALE_COLLATE 346 # define LC_COLLATE_INDEX _DUMMY_CTYPE + 1 347 # define _DUMMY_COLLATE LC_COLLATE_INDEX 348 # else 349 # define _DUMMY_COLLATE _DUMMY_CTYPE 350 # endif 351 # ifdef USE_LOCALE_TIME 352 # define LC_TIME_INDEX _DUMMY_COLLATE + 1 353 # define _DUMMY_TIME LC_TIME_INDEX 354 # else 355 # define _DUMMY_TIME _DUMMY_COLLATE 356 # endif 357 # ifdef USE_LOCALE_MESSAGES 358 # define LC_MESSAGES_INDEX _DUMMY_TIME + 1 359 # define _DUMMY_MESSAGES LC_MESSAGES_INDEX 360 # else 361 # define _DUMMY_MESSAGES _DUMMY_TIME 362 # endif 363 # ifdef USE_LOCALE_MONETARY 364 # define LC_MONETARY_INDEX _DUMMY_MESSAGES + 1 365 # define _DUMMY_MONETARY LC_MONETARY_INDEX 366 # else 367 # define _DUMMY_MONETARY _DUMMY_MESSAGES 368 # endif 369 # ifdef USE_LOCALE_ADDRESS 370 # define LC_ADDRESS_INDEX _DUMMY_MONETARY + 1 371 # define _DUMMY_ADDRESS LC_ADDRESS_INDEX 372 # else 373 # define _DUMMY_ADDRESS _DUMMY_MONETARY 374 # endif 375 # ifdef USE_LOCALE_IDENTIFICATION 376 # define LC_IDENTIFICATION_INDEX _DUMMY_ADDRESS + 1 377 # define _DUMMY_IDENTIFICATION LC_IDENTIFICATION_INDEX 378 # else 379 # define _DUMMY_IDENTIFICATION _DUMMY_ADDRESS 380 # endif 381 # ifdef USE_LOCALE_MEASUREMENT 382 # define LC_MEASUREMENT_INDEX _DUMMY_IDENTIFICATION + 1 383 # define _DUMMY_MEASUREMENT LC_MEASUREMENT_INDEX 384 # else 385 # define _DUMMY_MEASUREMENT _DUMMY_IDENTIFICATION 386 # endif 387 # ifdef USE_LOCALE_PAPER 388 # define LC_PAPER_INDEX _DUMMY_MEASUREMENT + 1 389 # define _DUMMY_PAPER LC_PAPER_INDEX 390 # else 391 # define _DUMMY_PAPER _DUMMY_MEASUREMENT 392 # endif 393 # ifdef USE_LOCALE_TELEPHONE 394 # define LC_TELEPHONE_INDEX _DUMMY_PAPER + 1 395 # define _DUMMY_TELEPHONE LC_TELEPHONE_INDEX 396 # else 397 # define _DUMMY_TELEPHONE _DUMMY_PAPER 398 # endif 399 # ifdef USE_LOCALE_SYNTAX 400 # define LC_SYNTAX_INDEX _DUMMY_TELEPHONE + 1 401 # define _DUMMY_SYNTAX LC_SYNTAX_INDEX 402 # else 403 # define _DUMMY_SYNTAX _DUMMY_TELEPHONE 404 # endif 405 # ifdef USE_LOCALE_TOD 406 # define LC_TOD_INDEX _DUMMY_SYNTAX + 1 407 # define _DUMMY_TOD LC_TOD_INDEX 408 # else 409 # define _DUMMY_TOD _DUMMY_SYNTAX 410 # endif 411 # ifdef LC_ALL 412 # define LC_ALL_INDEX _DUMMY_TOD + 1 413 # endif 414 #endif /* ifdef USE_LOCALE */ 415 416 /* Windows requres a customized base-level setlocale() */ 417 #ifdef WIN32 418 # define my_setlocale(cat, locale) win32_setlocale(cat, locale) 419 #else 420 # define my_setlocale(cat, locale) setlocale(cat, locale) 421 #endif 422 423 #ifndef USE_POSIX_2008_LOCALE 424 425 /* "do_setlocale_c" is intended to be called when the category is a constant 426 * known at compile time; "do_setlocale_r", not known until run time */ 427 # define do_setlocale_c(cat, locale) my_setlocale(cat, locale) 428 # define do_setlocale_r(cat, locale) my_setlocale(cat, locale) 429 # define FIX_GLIBC_LC_MESSAGES_BUG(i) 430 431 #else /* Below uses POSIX 2008 */ 432 433 /* We emulate setlocale with our own function. LC_foo is not valid for the 434 * POSIX 2008 functions. Instead LC_foo_MASK is used, which we use an array 435 * lookup to convert to. At compile time we have defined LC_foo_INDEX as the 436 * proper offset into the array 'category_masks[]'. At runtime, we have to 437 * search through the array (as the actual numbers may not be small contiguous 438 * positive integers which would lend themselves to array lookup). */ 439 # define do_setlocale_c(cat, locale) \ 440 emulate_setlocale(cat, locale, cat ## _INDEX, TRUE) 441 # define do_setlocale_r(cat, locale) emulate_setlocale(cat, locale, 0, FALSE) 442 443 # if ! defined(__GLIBC__) || ! defined(USE_LOCALE_MESSAGES) 444 445 # define FIX_GLIBC_LC_MESSAGES_BUG(i) 446 447 # else /* Invalidate glibc cache of loaded translations, see [perl #134264] */ 448 449 # include <libintl.h> 450 # define FIX_GLIBC_LC_MESSAGES_BUG(i) \ 451 STMT_START { \ 452 if ((i) == LC_MESSAGES_INDEX) { \ 453 textdomain(textdomain(NULL)); \ 454 } \ 455 } STMT_END 456 457 # endif 458 459 /* A third array, parallel to the ones above to map from category to its 460 * equivalent mask */ 461 const int category_masks[] = { 462 # ifdef USE_LOCALE_NUMERIC 463 LC_NUMERIC_MASK, 464 # endif 465 # ifdef USE_LOCALE_CTYPE 466 LC_CTYPE_MASK, 467 # endif 468 # ifdef USE_LOCALE_COLLATE 469 LC_COLLATE_MASK, 470 # endif 471 # ifdef USE_LOCALE_TIME 472 LC_TIME_MASK, 473 # endif 474 # ifdef USE_LOCALE_MESSAGES 475 LC_MESSAGES_MASK, 476 # endif 477 # ifdef USE_LOCALE_MONETARY 478 LC_MONETARY_MASK, 479 # endif 480 # ifdef USE_LOCALE_ADDRESS 481 LC_ADDRESS_MASK, 482 # endif 483 # ifdef USE_LOCALE_IDENTIFICATION 484 LC_IDENTIFICATION_MASK, 485 # endif 486 # ifdef USE_LOCALE_MEASUREMENT 487 LC_MEASUREMENT_MASK, 488 # endif 489 # ifdef USE_LOCALE_PAPER 490 LC_PAPER_MASK, 491 # endif 492 # ifdef USE_LOCALE_TELEPHONE 493 LC_TELEPHONE_MASK, 494 # endif 495 # ifdef USE_LOCALE_SYNTAX 496 LC_SYNTAX_MASK, 497 # endif 498 # ifdef USE_LOCALE_TOD 499 LC_TOD_MASK, 500 # endif 501 /* LC_ALL can't be turned off by a Configure 502 * option, and in Posix 2008, should always be 503 * here, so compile it in unconditionally. 504 * This could catch some glitches at compile 505 * time */ 506 LC_ALL_MASK 507 }; 508 509 STATIC const char * 510 S_emulate_setlocale(const int category, 511 const char * locale, 512 unsigned int index, 513 const bool is_index_valid 514 ) 515 { 516 /* This function effectively performs a setlocale() on just the current 517 * thread; thus it is thread-safe. It does this by using the POSIX 2008 518 * locale functions to emulate the behavior of setlocale(). Similar to 519 * regular setlocale(), the return from this function points to memory that 520 * can be overwritten by other system calls, so needs to be copied 521 * immediately if you need to retain it. The difference here is that 522 * system calls besides another setlocale() can overwrite it. 523 * 524 * By doing this, most locale-sensitive functions become thread-safe. The 525 * exceptions are mostly those that return a pointer to static memory. 526 * 527 * This function takes the same parameters, 'category' and 'locale', that 528 * the regular setlocale() function does, but it also takes two additional 529 * ones. This is because the 2008 functions don't use a category; instead 530 * they use a corresponding mask. Because this function operates in both 531 * worlds, it may need one or the other or both. This function can 532 * calculate the mask from the input category, but to avoid this 533 * calculation, if the caller knows at compile time what the mask is, it 534 * can pass it, setting 'is_index_valid' to TRUE; otherwise the mask 535 * parameter is ignored. 536 * 537 * POSIX 2008, for some sick reason, chose not to provide a method to find 538 * the category name of a locale. Some vendors have created a 539 * querylocale() function to do just that. This function is a lot simpler 540 * to implement on systems that have this. Otherwise, we have to keep 541 * track of what the locale has been set to, so that we can return its 542 * name to emulate setlocale(). It's also possible for C code in some 543 * library to change the locale without us knowing it, though as of 544 * September 2017, there are no occurrences in CPAN of uselocale(). Some 545 * libraries do use setlocale(), but that changes the global locale, and 546 * threads using per-thread locales will just ignore those changes. 547 * Another problem is that without querylocale(), we have to guess at what 548 * was meant by setting a locale of "". We handle this by not actually 549 * ever setting to "" (unless querylocale exists), but to emulate what we 550 * think should happen for "". 551 */ 552 553 int mask; 554 locale_t old_obj; 555 locale_t new_obj; 556 const char * safelocale = locale ? locale : "(null)"; 557 dTHX; 558 559 # ifdef DEBUGGING 560 561 if (DEBUG_Lv_TEST || debug_initialization) { 562 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale input=%d (%s), \"%s\", %d, %d\n", __FILE__, __LINE__, category, category_name(category), safelocale, index, is_index_valid); 563 } 564 565 # endif 566 567 /* If the input mask might be incorrect, calculate the correct one */ 568 if (! is_index_valid) { 569 unsigned int i; 570 571 # ifdef DEBUGGING 572 573 if (DEBUG_Lv_TEST || debug_initialization) { 574 PerlIO_printf(Perl_debug_log, "%s:%d: finding index of category %d (%s)\n", __FILE__, __LINE__, category, category_name(category)); 575 } 576 577 # endif 578 579 for (i = 0; i <= LC_ALL_INDEX; i++) { 580 if (category == categories[i]) { 581 index = i; 582 goto found_index; 583 } 584 } 585 586 /* Here, we don't know about this category, so can't handle it. 587 * Fallback to the early POSIX usages */ 588 Perl_warner(aTHX_ packWARN(WARN_LOCALE), 589 "Unknown locale category %d; can't set it to %s\n", 590 category, safelocale); 591 return NULL; 592 593 found_index: ; 594 595 # ifdef DEBUGGING 596 597 if (DEBUG_Lv_TEST || debug_initialization) { 598 PerlIO_printf(Perl_debug_log, "%s:%d: index is %d for %s\n", __FILE__, __LINE__, index, category_name(category)); 599 } 600 601 # endif 602 603 } 604 605 mask = category_masks[index]; 606 607 # ifdef DEBUGGING 608 609 if (DEBUG_Lv_TEST || debug_initialization) { 610 PerlIO_printf(Perl_debug_log, "%s:%d: category name is %s; mask is 0x%x\n", __FILE__, __LINE__, category_names[index], mask); 611 } 612 613 # endif 614 615 /* If just querying what the existing locale is ... */ 616 if (locale == NULL) { 617 locale_t cur_obj = uselocale((locale_t) 0); 618 619 # ifdef DEBUGGING 620 621 if (DEBUG_Lv_TEST || debug_initialization) { 622 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale querying %p\n", __FILE__, __LINE__, cur_obj); 623 } 624 625 # endif 626 627 if (cur_obj == LC_GLOBAL_LOCALE) { 628 return my_setlocale(category, NULL); 629 } 630 631 # ifdef HAS_QUERYLOCALE 632 633 return (char *) querylocale(mask, cur_obj); 634 635 # else 636 637 /* If this assert fails, adjust the size of curlocales in intrpvar.h */ 638 STATIC_ASSERT_STMT(C_ARRAY_LENGTH(PL_curlocales) > LC_ALL_INDEX); 639 640 # if defined(_NL_LOCALE_NAME) \ 641 && defined(DEBUGGING) \ 642 /* On systems that accept any locale name, the real underlying \ 643 * locale is often returned by this internal function, so we \ 644 * can't use it */ \ 645 && ! defined(SETLOCALE_ACCEPTS_ANY_LOCALE_NAME) 646 { 647 /* Internal glibc for querylocale(), but doesn't handle 648 * empty-string ("") locale properly; who knows what other 649 * glitches. Check for it now, under debug. */ 650 651 char * temp_name = nl_langinfo_l(_NL_LOCALE_NAME(category), 652 uselocale((locale_t) 0)); 653 /* 654 PerlIO_printf(Perl_debug_log, "%s:%d: temp_name=%s\n", __FILE__, __LINE__, temp_name ? temp_name : "NULL"); 655 PerlIO_printf(Perl_debug_log, "%s:%d: index=%d\n", __FILE__, __LINE__, index); 656 PerlIO_printf(Perl_debug_log, "%s:%d: PL_curlocales[index]=%s\n", __FILE__, __LINE__, PL_curlocales[index]); 657 */ 658 if (temp_name && PL_curlocales[index] && strNE(temp_name, "")) { 659 if ( strNE(PL_curlocales[index], temp_name) 660 && ! ( isNAME_C_OR_POSIX(temp_name) 661 && isNAME_C_OR_POSIX(PL_curlocales[index]))) { 662 663 # ifdef USE_C_BACKTRACE 664 665 dump_c_backtrace(Perl_debug_log, 20, 1); 666 667 # endif 668 669 Perl_croak(aTHX_ "panic: Mismatch between what Perl thinks %s is" 670 " (%s) and what internal glibc thinks" 671 " (%s)\n", category_names[index], 672 PL_curlocales[index], temp_name); 673 } 674 675 return temp_name; 676 } 677 } 678 679 # endif 680 681 /* Without querylocale(), we have to use our record-keeping we've 682 * done. */ 683 684 if (category != LC_ALL) { 685 686 # ifdef DEBUGGING 687 688 if (DEBUG_Lv_TEST || debug_initialization) { 689 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, PL_curlocales[index]); 690 } 691 692 # endif 693 694 return PL_curlocales[index]; 695 } 696 else { /* For LC_ALL */ 697 unsigned int i; 698 Size_t names_len = 0; 699 char * all_string; 700 bool are_all_categories_the_same_locale = TRUE; 701 702 /* If we have a valid LC_ALL value, just return it */ 703 if (PL_curlocales[LC_ALL_INDEX]) { 704 705 # ifdef DEBUGGING 706 707 if (DEBUG_Lv_TEST || debug_initialization) { 708 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, PL_curlocales[LC_ALL_INDEX]); 709 } 710 711 # endif 712 713 return PL_curlocales[LC_ALL_INDEX]; 714 } 715 716 /* Otherwise, we need to construct a string of name=value pairs. 717 * We use the glibc syntax, like 718 * LC_NUMERIC=C;LC_TIME=en_US.UTF-8;... 719 * First calculate the needed size. Along the way, check if all 720 * the locale names are the same */ 721 for (i = 0; i < LC_ALL_INDEX; i++) { 722 723 # ifdef DEBUGGING 724 725 if (DEBUG_Lv_TEST || debug_initialization) { 726 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale i=%d, name=%s, locale=%s\n", __FILE__, __LINE__, i, category_names[i], PL_curlocales[i]); 727 } 728 729 # endif 730 731 names_len += strlen(category_names[i]) 732 + 1 /* '=' */ 733 + strlen(PL_curlocales[i]) 734 + 1; /* ';' */ 735 736 if (i > 0 && strNE(PL_curlocales[i], PL_curlocales[i-1])) { 737 are_all_categories_the_same_locale = FALSE; 738 } 739 } 740 741 /* If they are the same, we don't actually have to construct the 742 * string; we just make the entry in LC_ALL_INDEX valid, and be 743 * that single name */ 744 if (are_all_categories_the_same_locale) { 745 PL_curlocales[LC_ALL_INDEX] = savepv(PL_curlocales[0]); 746 return PL_curlocales[LC_ALL_INDEX]; 747 } 748 749 names_len++; /* Trailing '\0' */ 750 SAVEFREEPV(Newx(all_string, names_len, char)); 751 *all_string = '\0'; 752 753 /* Then fill in the string */ 754 for (i = 0; i < LC_ALL_INDEX; i++) { 755 756 # ifdef DEBUGGING 757 758 if (DEBUG_Lv_TEST || debug_initialization) { 759 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale i=%d, name=%s, locale=%s\n", __FILE__, __LINE__, i, category_names[i], PL_curlocales[i]); 760 } 761 762 # endif 763 764 my_strlcat(all_string, category_names[i], names_len); 765 my_strlcat(all_string, "=", names_len); 766 my_strlcat(all_string, PL_curlocales[i], names_len); 767 my_strlcat(all_string, ";", names_len); 768 } 769 770 # ifdef DEBUGGING 771 772 if (DEBUG_L_TEST || debug_initialization) { 773 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale returning %s\n", __FILE__, __LINE__, all_string); 774 } 775 776 #endif 777 778 return all_string; 779 } 780 781 # ifdef EINVAL 782 783 SETERRNO(EINVAL, LIB_INVARG); 784 785 # endif 786 787 return NULL; 788 789 # endif 790 791 } /* End of this being setlocale(LC_foo, NULL) */ 792 793 /* Here, we are switching locales. */ 794 795 # ifndef HAS_QUERYLOCALE 796 797 if (strEQ(locale, "")) { 798 799 /* For non-querylocale() systems, we do the setting of "" ourselves to 800 * be sure that we really know what's going on. We follow the Linux 801 * documented behavior (but if that differs from the actual behavior, 802 * this won't work exactly as the OS implements). We go out and 803 * examine the environment based on our understanding of how the system 804 * works, and use that to figure things out */ 805 806 const char * const lc_all = PerlEnv_getenv("LC_ALL"); 807 808 /* Use any "LC_ALL" environment variable, as it overrides everything 809 * else. */ 810 if (lc_all && strNE(lc_all, "")) { 811 locale = lc_all; 812 } 813 else { 814 815 /* Otherwise, we need to dig deeper. Unless overridden, the 816 * default is the LANG environment variable; if it doesn't exist, 817 * then "C" */ 818 819 const char * default_name; 820 821 default_name = PerlEnv_getenv("LANG"); 822 823 if (! default_name || strEQ(default_name, "")) { 824 default_name = "C"; 825 } 826 827 if (category != LC_ALL) { 828 const char * const name = PerlEnv_getenv(category_names[index]); 829 830 /* Here we are setting a single category. Assume will have the 831 * default name */ 832 locale = default_name; 833 834 /* But then look for an overriding environment variable */ 835 if (name && strNE(name, "")) { 836 locale = name; 837 } 838 } 839 else { 840 bool did_override = FALSE; 841 unsigned int i; 842 843 /* Here, we are getting LC_ALL. Any categories that don't have 844 * a corresponding environment variable set should be set to 845 * LANG, or to "C" if there is no LANG. If no individual 846 * categories differ from this, we can just set LC_ALL. This 847 * is buggy on systems that have extra categories that we don't 848 * know about. If there is an environment variable that sets 849 * that category, we won't know to look for it, and so our use 850 * of LANG or "C" improperly overrides it. On the other hand, 851 * if we don't do what is done here, and there is no 852 * environment variable, the category's locale should be set to 853 * LANG or "C". So there is no good solution. khw thinks the 854 * best is to look at systems to see what categories they have, 855 * and include them, and then to assume that we know the 856 * complete set */ 857 858 for (i = 0; i < LC_ALL_INDEX; i++) { 859 const char * const env_override 860 = PerlEnv_getenv(category_names[i]); 861 const char * this_locale = ( env_override 862 && strNE(env_override, "")) 863 ? env_override 864 : default_name; 865 if (! emulate_setlocale(categories[i], this_locale, i, TRUE)) 866 { 867 return NULL; 868 } 869 870 if (strNE(this_locale, default_name)) { 871 did_override = TRUE; 872 } 873 } 874 875 /* If all the categories are the same, we can set LC_ALL to 876 * that */ 877 if (! did_override) { 878 locale = default_name; 879 } 880 else { 881 882 /* Here, LC_ALL is no longer valid, as some individual 883 * categories don't match it. We call ourselves 884 * recursively, as that will execute the code that 885 * generates the proper locale string for this situation. 886 * We don't do the remainder of this function, as that is 887 * to update our records, and we've just done that for the 888 * individual categories in the loop above, and doing so 889 * would cause LC_ALL to be done as well */ 890 return emulate_setlocale(LC_ALL, NULL, LC_ALL_INDEX, TRUE); 891 } 892 } 893 } 894 } /* End of this being setlocale(LC_foo, "") */ 895 else if (strchr(locale, ';')) { 896 897 /* LC_ALL may actually incude a conglomeration of various categories. 898 * Without querylocale, this code uses the glibc (as of this writing) 899 * syntax for representing that, but that is not a stable API, and 900 * other platforms do it differently, so we have to handle all cases 901 * ourselves */ 902 903 unsigned int i; 904 const char * s = locale; 905 const char * e = locale + strlen(locale); 906 const char * p = s; 907 const char * category_end; 908 const char * name_start; 909 const char * name_end; 910 911 /* If the string that gives what to set doesn't include all categories, 912 * the omitted ones get set to "C". To get this behavior, first set 913 * all the individual categories to "C", and override the furnished 914 * ones below */ 915 for (i = 0; i < LC_ALL_INDEX; i++) { 916 if (! emulate_setlocale(categories[i], "C", i, TRUE)) { 917 return NULL; 918 } 919 } 920 921 while (s < e) { 922 923 /* Parse through the category */ 924 while (isWORDCHAR(*p)) { 925 p++; 926 } 927 category_end = p; 928 929 if (*p++ != '=') { 930 Perl_croak(aTHX_ 931 "panic: %s: %d: Unexpected character in locale name '%02X", 932 __FILE__, __LINE__, *(p-1)); 933 } 934 935 /* Parse through the locale name */ 936 name_start = p; 937 while (p < e && *p != ';') { 938 if (! isGRAPH(*p)) { 939 Perl_croak(aTHX_ 940 "panic: %s: %d: Unexpected character in locale name '%02X", 941 __FILE__, __LINE__, *(p-1)); 942 } 943 p++; 944 } 945 name_end = p; 946 947 /* Space past the semi-colon */ 948 if (p < e) { 949 p++; 950 } 951 952 /* Find the index of the category name in our lists */ 953 for (i = 0; i < LC_ALL_INDEX; i++) { 954 char * individ_locale; 955 956 /* Keep going if this isn't the index. The strnNE() avoids a 957 * Perl_form(), but would fail if ever a category name could be 958 * a substring of another one, like if there were a 959 * "LC_TIME_DATE" */ 960 if strnNE(s, category_names[i], category_end - s) { 961 continue; 962 } 963 964 /* If this index is for the single category we're changing, we 965 * have found the locale to set it to. */ 966 if (category == categories[i]) { 967 locale = Perl_form(aTHX_ "%.*s", 968 (int) (name_end - name_start), 969 name_start); 970 goto ready_to_set; 971 } 972 973 assert(category == LC_ALL); 974 individ_locale = Perl_form(aTHX_ "%.*s", 975 (int) (name_end - name_start), name_start); 976 if (! emulate_setlocale(categories[i], individ_locale, i, TRUE)) 977 { 978 return NULL; 979 } 980 } 981 982 s = p; 983 } 984 985 /* Here we have set all the individual categories by recursive calls. 986 * These collectively should have fixed up LC_ALL, so can just query 987 * what that now is */ 988 assert(category == LC_ALL); 989 990 return do_setlocale_c(LC_ALL, NULL); 991 } /* End of this being setlocale(LC_ALL, 992 "LC_CTYPE=foo;LC_NUMERIC=bar;...") */ 993 994 ready_to_set: ; 995 996 /* Here at the end of having to deal with the absence of querylocale(). 997 * Some cases have already been fully handled by recursive calls to this 998 * function. But at this point, we haven't dealt with those, but are now 999 * prepared to, knowing what the locale name to set this category to is. 1000 * This would have come for free if this system had had querylocale() */ 1001 1002 # endif /* end of ! querylocale */ 1003 1004 assert(PL_C_locale_obj); 1005 1006 /* Switching locales generally entails freeing the current one's space (at 1007 * the C library's discretion). We need to stop using that locale before 1008 * the switch. So switch to a known locale object that we don't otherwise 1009 * mess with. This returns the locale object in effect at the time of the 1010 * switch. */ 1011 old_obj = uselocale(PL_C_locale_obj); 1012 1013 # ifdef DEBUGGING 1014 1015 if (DEBUG_Lv_TEST || debug_initialization) { 1016 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale was using %p\n", __FILE__, __LINE__, old_obj); 1017 } 1018 1019 # endif 1020 1021 if (! old_obj) { 1022 1023 # ifdef DEBUGGING 1024 1025 if (DEBUG_L_TEST || debug_initialization) { 1026 dSAVE_ERRNO; 1027 PerlIO_printf(Perl_debug_log, "%s:%d: emulate_setlocale switching to C failed: %d\n", __FILE__, __LINE__, GET_ERRNO); 1028 RESTORE_ERRNO; 1029 } 1030 1031 # endif 1032 1033 return NULL; 1034 } 1035 1036 # ifdef DEBUGGING 1037 1038 if (DEBUG_Lv_TEST || debug_initialization) { 1039 PerlIO_printf(Perl_debug_log, 1040 "%s:%d: emulate_setlocale now using %p\n", 1041 __FILE__, __LINE__, PL_C_locale_obj); 1042 } 1043 1044 # endif 1045 1046 /* If this call is to switch to the LC_ALL C locale, it already exists, and 1047 * in fact, we already have switched to it (in preparation for what 1048 * normally is to come). But since we're already there, continue to use 1049 * it instead of trying to create a new locale */ 1050 if (mask == LC_ALL_MASK && isNAME_C_OR_POSIX(locale)) { 1051 1052 # ifdef DEBUGGING 1053 1054 if (DEBUG_Lv_TEST || debug_initialization) { 1055 PerlIO_printf(Perl_debug_log, 1056 "%s:%d: will stay in C object\n", __FILE__, __LINE__); 1057 } 1058 1059 # endif 1060 1061 new_obj = PL_C_locale_obj; 1062 1063 /* We already had switched to the C locale in preparation for freeing 1064 * 'old_obj' */ 1065 if (old_obj != LC_GLOBAL_LOCALE && old_obj != PL_C_locale_obj) { 1066 freelocale(old_obj); 1067 } 1068 } 1069 else { 1070 /* If we weren't in a thread safe locale, set so that newlocale() below 1071 * which uses 'old_obj', uses an empty one. Same for our reserved C 1072 * object. The latter is defensive coding, so that, even if there is 1073 * some bug, we will never end up trying to modify either of these, as 1074 * if passed to newlocale(), they can be. */ 1075 if (old_obj == LC_GLOBAL_LOCALE || old_obj == PL_C_locale_obj) { 1076 old_obj = (locale_t) 0; 1077 } 1078 1079 /* Ready to create a new locale by modification of the exising one */ 1080 new_obj = newlocale(mask, locale, old_obj); 1081 1082 if (! new_obj) { 1083 dSAVE_ERRNO; 1084 1085 # ifdef DEBUGGING 1086 1087 if (DEBUG_L_TEST || debug_initialization) { 1088 PerlIO_printf(Perl_debug_log, 1089 "%s:%d: emulate_setlocale creating new object" 1090 " failed: %d\n", __FILE__, __LINE__, GET_ERRNO); 1091 } 1092 1093 # endif 1094 1095 if (! uselocale(old_obj)) { 1096 1097 # ifdef DEBUGGING 1098 1099 if (DEBUG_L_TEST || debug_initialization) { 1100 PerlIO_printf(Perl_debug_log, 1101 "%s:%d: switching back failed: %d\n", 1102 __FILE__, __LINE__, GET_ERRNO); 1103 } 1104 1105 # endif 1106 1107 } 1108 RESTORE_ERRNO; 1109 return NULL; 1110 } 1111 1112 # ifdef DEBUGGING 1113 1114 if (DEBUG_Lv_TEST || debug_initialization) { 1115 PerlIO_printf(Perl_debug_log, 1116 "%s:%d: emulate_setlocale created %p", 1117 __FILE__, __LINE__, new_obj); 1118 if (old_obj) { 1119 PerlIO_printf(Perl_debug_log, 1120 "; should have freed %p", old_obj); 1121 } 1122 PerlIO_printf(Perl_debug_log, "\n"); 1123 } 1124 1125 # endif 1126 1127 /* And switch into it */ 1128 if (! uselocale(new_obj)) { 1129 dSAVE_ERRNO; 1130 1131 # ifdef DEBUGGING 1132 1133 if (DEBUG_L_TEST || debug_initialization) { 1134 PerlIO_printf(Perl_debug_log, 1135 "%s:%d: emulate_setlocale switching to new object" 1136 " failed\n", __FILE__, __LINE__); 1137 } 1138 1139 # endif 1140 1141 if (! uselocale(old_obj)) { 1142 1143 # ifdef DEBUGGING 1144 1145 if (DEBUG_L_TEST || debug_initialization) { 1146 PerlIO_printf(Perl_debug_log, 1147 "%s:%d: switching back failed: %d\n", 1148 __FILE__, __LINE__, GET_ERRNO); 1149 } 1150 1151 # endif 1152 1153 } 1154 freelocale(new_obj); 1155 RESTORE_ERRNO; 1156 return NULL; 1157 } 1158 } 1159 1160 # ifdef DEBUGGING 1161 1162 if (DEBUG_Lv_TEST || debug_initialization) { 1163 PerlIO_printf(Perl_debug_log, 1164 "%s:%d: emulate_setlocale now using %p\n", 1165 __FILE__, __LINE__, new_obj); 1166 } 1167 1168 # endif 1169 1170 /* We are done, except for updating our records (if the system doesn't keep 1171 * them) and in the case of locale "", we don't actually know what the 1172 * locale that got switched to is, as it came from the environment. So 1173 * have to find it */ 1174 1175 # ifdef HAS_QUERYLOCALE 1176 1177 if (strEQ(locale, "")) { 1178 locale = querylocale(mask, new_obj); 1179 } 1180 1181 # else 1182 1183 /* Here, 'locale' is the return value */ 1184 1185 /* Without querylocale(), we have to update our records */ 1186 1187 if (category == LC_ALL) { 1188 unsigned int i; 1189 1190 /* For LC_ALL, we change all individual categories to correspond */ 1191 /* PL_curlocales is a parallel array, so has same 1192 * length as 'categories' */ 1193 for (i = 0; i <= LC_ALL_INDEX; i++) { 1194 Safefree(PL_curlocales[i]); 1195 PL_curlocales[i] = savepv(locale); 1196 } 1197 1198 FIX_GLIBC_LC_MESSAGES_BUG(LC_MESSAGES_INDEX); 1199 } 1200 else { 1201 1202 /* For a single category, if it's not the same as the one in LC_ALL, we 1203 * nullify LC_ALL */ 1204 1205 if (PL_curlocales[LC_ALL_INDEX] && strNE(PL_curlocales[LC_ALL_INDEX], locale)) { 1206 Safefree(PL_curlocales[LC_ALL_INDEX]); 1207 PL_curlocales[LC_ALL_INDEX] = NULL; 1208 } 1209 1210 /* Then update the category's record */ 1211 Safefree(PL_curlocales[index]); 1212 PL_curlocales[index] = savepv(locale); 1213 1214 FIX_GLIBC_LC_MESSAGES_BUG(index); 1215 } 1216 1217 # endif 1218 1219 return locale; 1220 } 1221 1222 #endif /* USE_POSIX_2008_LOCALE */ 1223 1224 #ifdef USE_LOCALE 1225 1226 STATIC void 1227 S_set_numeric_radix(pTHX_ const bool use_locale) 1228 { 1229 /* If 'use_locale' is FALSE, set to use a dot for the radix character. If 1230 * TRUE, use the radix character derived from the current locale */ 1231 1232 #if defined(USE_LOCALE_NUMERIC) && ( defined(HAS_LOCALECONV) \ 1233 || defined(HAS_NL_LANGINFO)) 1234 1235 const char * radix = (use_locale) 1236 ? my_nl_langinfo(RADIXCHAR, FALSE) 1237 /* FALSE => already in dest locale */ 1238 : "."; 1239 1240 sv_setpv(PL_numeric_radix_sv, radix); 1241 1242 /* If this is valid UTF-8 that isn't totally ASCII, and we are in 1243 * a UTF-8 locale, then mark the radix as being in UTF-8 */ 1244 if (is_utf8_non_invariant_string((U8 *) SvPVX(PL_numeric_radix_sv), 1245 SvCUR(PL_numeric_radix_sv)) 1246 && _is_cur_LC_category_utf8(LC_NUMERIC)) 1247 { 1248 SvUTF8_on(PL_numeric_radix_sv); 1249 } 1250 1251 # ifdef DEBUGGING 1252 1253 if (DEBUG_L_TEST || debug_initialization) { 1254 PerlIO_printf(Perl_debug_log, "Locale radix is '%s', ?UTF-8=%d\n", 1255 SvPVX(PL_numeric_radix_sv), 1256 cBOOL(SvUTF8(PL_numeric_radix_sv))); 1257 } 1258 1259 # endif 1260 #else 1261 1262 PERL_UNUSED_ARG(use_locale); 1263 1264 #endif /* USE_LOCALE_NUMERIC and can find the radix char */ 1265 1266 } 1267 1268 STATIC void 1269 S_new_numeric(pTHX_ const char *newnum) 1270 { 1271 1272 #ifndef USE_LOCALE_NUMERIC 1273 1274 PERL_UNUSED_ARG(newnum); 1275 1276 #else 1277 1278 /* Called after each libc setlocale() call affecting LC_NUMERIC, to tell 1279 * core Perl this and that 'newnum' is the name of the new locale. 1280 * It installs this locale as the current underlying default. 1281 * 1282 * The default locale and the C locale can be toggled between by use of the 1283 * set_numeric_underlying() and set_numeric_standard() functions, which 1284 * should probably not be called directly, but only via macros like 1285 * SET_NUMERIC_STANDARD() in perl.h. 1286 * 1287 * The toggling is necessary mainly so that a non-dot radix decimal point 1288 * character can be output, while allowing internal calculations to use a 1289 * dot. 1290 * 1291 * This sets several interpreter-level variables: 1292 * PL_numeric_name The underlying locale's name: a copy of 'newnum' 1293 * PL_numeric_underlying A boolean indicating if the toggled state is such 1294 * that the current locale is the program's underlying 1295 * locale 1296 * PL_numeric_standard An int indicating if the toggled state is such 1297 * that the current locale is the C locale or 1298 * indistinguishable from the C locale. If non-zero, it 1299 * is in C; if > 1, it means it may not be toggled away 1300 * from C. 1301 * PL_numeric_underlying_is_standard A bool kept by this function 1302 * indicating that the underlying locale and the standard 1303 * C locale are indistinguishable for the purposes of 1304 * LC_NUMERIC. This happens when both of the above two 1305 * variables are true at the same time. (Toggling is a 1306 * no-op under these circumstances.) This variable is 1307 * used to avoid having to recalculate. 1308 */ 1309 1310 char *save_newnum; 1311 1312 if (! newnum) { 1313 Safefree(PL_numeric_name); 1314 PL_numeric_name = NULL; 1315 PL_numeric_standard = TRUE; 1316 PL_numeric_underlying = TRUE; 1317 PL_numeric_underlying_is_standard = TRUE; 1318 return; 1319 } 1320 1321 save_newnum = stdize_locale(savepv(newnum)); 1322 PL_numeric_underlying = TRUE; 1323 PL_numeric_standard = isNAME_C_OR_POSIX(save_newnum); 1324 1325 #ifndef TS_W32_BROKEN_LOCALECONV 1326 1327 /* If its name isn't C nor POSIX, it could still be indistinguishable from 1328 * them. But on broken Windows systems calling my_nl_langinfo() for 1329 * THOUSEP can currently (but rarely) cause a race, so avoid doing that, 1330 * and just always change the locale if not C nor POSIX on those systems */ 1331 if (! PL_numeric_standard) { 1332 PL_numeric_standard = cBOOL(strEQ(".", my_nl_langinfo(RADIXCHAR, 1333 FALSE /* Don't toggle locale */ )) 1334 && strEQ("", my_nl_langinfo(THOUSEP, FALSE))); 1335 } 1336 1337 #endif 1338 1339 /* Save the new name if it isn't the same as the previous one, if any */ 1340 if (! PL_numeric_name || strNE(PL_numeric_name, save_newnum)) { 1341 Safefree(PL_numeric_name); 1342 PL_numeric_name = save_newnum; 1343 } 1344 else { 1345 Safefree(save_newnum); 1346 } 1347 1348 PL_numeric_underlying_is_standard = PL_numeric_standard; 1349 1350 # ifdef HAS_POSIX_2008_LOCALE 1351 1352 PL_underlying_numeric_obj = newlocale(LC_NUMERIC_MASK, 1353 PL_numeric_name, 1354 PL_underlying_numeric_obj); 1355 1356 #endif 1357 1358 if (DEBUG_L_TEST || debug_initialization) { 1359 PerlIO_printf(Perl_debug_log, "Called new_numeric with %s, PL_numeric_name=%s\n", newnum, PL_numeric_name); 1360 } 1361 1362 /* Keep LC_NUMERIC in the C locale. This is for XS modules, so they don't 1363 * have to worry about the radix being a non-dot. (Core operations that 1364 * need the underlying locale change to it temporarily). */ 1365 if (PL_numeric_standard) { 1366 set_numeric_radix(0); 1367 } 1368 else { 1369 set_numeric_standard(); 1370 } 1371 1372 #endif /* USE_LOCALE_NUMERIC */ 1373 1374 } 1375 1376 void 1377 Perl_set_numeric_standard(pTHX) 1378 { 1379 1380 #ifdef USE_LOCALE_NUMERIC 1381 1382 /* Toggle the LC_NUMERIC locale to C. Most code should use the macros like 1383 * SET_NUMERIC_STANDARD() in perl.h instead of calling this directly. The 1384 * macro avoids calling this routine if toggling isn't necessary according 1385 * to our records (which could be wrong if some XS code has changed the 1386 * locale behind our back) */ 1387 1388 # ifdef DEBUGGING 1389 1390 if (DEBUG_L_TEST || debug_initialization) { 1391 PerlIO_printf(Perl_debug_log, 1392 "Setting LC_NUMERIC locale to standard C\n"); 1393 } 1394 1395 # endif 1396 1397 do_setlocale_c(LC_NUMERIC, "C"); 1398 PL_numeric_standard = TRUE; 1399 PL_numeric_underlying = PL_numeric_underlying_is_standard; 1400 set_numeric_radix(0); 1401 1402 #endif /* USE_LOCALE_NUMERIC */ 1403 1404 } 1405 1406 void 1407 Perl_set_numeric_underlying(pTHX) 1408 { 1409 1410 #ifdef USE_LOCALE_NUMERIC 1411 1412 /* Toggle the LC_NUMERIC locale to the current underlying default. Most 1413 * code should use the macros like SET_NUMERIC_UNDERLYING() in perl.h 1414 * instead of calling this directly. The macro avoids calling this routine 1415 * if toggling isn't necessary according to our records (which could be 1416 * wrong if some XS code has changed the locale behind our back) */ 1417 1418 # ifdef DEBUGGING 1419 1420 if (DEBUG_L_TEST || debug_initialization) { 1421 PerlIO_printf(Perl_debug_log, 1422 "Setting LC_NUMERIC locale to %s\n", 1423 PL_numeric_name); 1424 } 1425 1426 # endif 1427 1428 do_setlocale_c(LC_NUMERIC, PL_numeric_name); 1429 PL_numeric_standard = PL_numeric_underlying_is_standard; 1430 PL_numeric_underlying = TRUE; 1431 set_numeric_radix(! PL_numeric_standard); 1432 1433 #endif /* USE_LOCALE_NUMERIC */ 1434 1435 } 1436 1437 /* 1438 * Set up for a new ctype locale. 1439 */ 1440 STATIC void 1441 S_new_ctype(pTHX_ const char *newctype) 1442 { 1443 1444 #ifndef USE_LOCALE_CTYPE 1445 1446 PERL_UNUSED_ARG(newctype); 1447 PERL_UNUSED_CONTEXT; 1448 1449 #else 1450 1451 /* Called after each libc setlocale() call affecting LC_CTYPE, to tell 1452 * core Perl this and that 'newctype' is the name of the new locale. 1453 * 1454 * This function sets up the folding arrays for all 256 bytes, assuming 1455 * that tofold() is tolc() since fold case is not a concept in POSIX, 1456 * 1457 * Any code changing the locale (outside this file) should use 1458 * Perl_setlocale or POSIX::setlocale, which call this function. Therefore 1459 * this function should be called directly only from this file and from 1460 * POSIX::setlocale() */ 1461 1462 unsigned int i; 1463 1464 /* Don't check for problems if we are suppressing the warnings */ 1465 bool check_for_problems = ckWARN_d(WARN_LOCALE) || UNLIKELY(DEBUG_L_TEST); 1466 bool maybe_utf8_turkic = FALSE; 1467 1468 PERL_ARGS_ASSERT_NEW_CTYPE; 1469 1470 /* We will replace any bad locale warning with 1) nothing if the new one is 1471 * ok; or 2) a new warning for the bad new locale */ 1472 if (PL_warn_locale) { 1473 SvREFCNT_dec_NN(PL_warn_locale); 1474 PL_warn_locale = NULL; 1475 } 1476 1477 PL_in_utf8_CTYPE_locale = _is_cur_LC_category_utf8(LC_CTYPE); 1478 1479 /* A UTF-8 locale gets standard rules. But note that code still has to 1480 * handle this specially because of the three problematic code points */ 1481 if (PL_in_utf8_CTYPE_locale) { 1482 Copy(PL_fold_latin1, PL_fold_locale, 256, U8); 1483 1484 /* UTF-8 locales can have special handling for 'I' and 'i' if they are 1485 * Turkic. Make sure these two are the only anomalies. (We don't use 1486 * towupper and towlower because they aren't in C89.) */ 1487 1488 #if defined(HAS_TOWUPPER) && defined (HAS_TOWLOWER) 1489 1490 if (towupper('i') == 0x130 && towlower('I') == 0x131) { 1491 1492 #else 1493 1494 if (toupper('i') == 'i' && tolower('I') == 'I') { 1495 1496 #endif 1497 check_for_problems = TRUE; 1498 maybe_utf8_turkic = TRUE; 1499 } 1500 } 1501 1502 /* We don't populate the other lists if a UTF-8 locale, but do check that 1503 * everything works as expected, unless checking turned off */ 1504 if (check_for_problems || ! PL_in_utf8_CTYPE_locale) { 1505 /* Assume enough space for every character being bad. 4 spaces each 1506 * for the 94 printable characters that are output like "'x' "; and 5 1507 * spaces each for "'\\' ", "'\t' ", and "'\n' "; plus a terminating 1508 * NUL */ 1509 char bad_chars_list[ (94 * 4) + (3 * 5) + 1 ] = { '\0' }; 1510 bool multi_byte_locale = FALSE; /* Assume is a single-byte locale 1511 to start */ 1512 unsigned int bad_count = 0; /* Count of bad characters */ 1513 1514 for (i = 0; i < 256; i++) { 1515 if (! PL_in_utf8_CTYPE_locale) { 1516 if (isupper(i)) 1517 PL_fold_locale[i] = (U8) tolower(i); 1518 else if (islower(i)) 1519 PL_fold_locale[i] = (U8) toupper(i); 1520 else 1521 PL_fold_locale[i] = (U8) i; 1522 } 1523 1524 /* If checking for locale problems, see if the native ASCII-range 1525 * printables plus \n and \t are in their expected categories in 1526 * the new locale. If not, this could mean big trouble, upending 1527 * Perl's and most programs' assumptions, like having a 1528 * metacharacter with special meaning become a \w. Fortunately, 1529 * it's very rare to find locales that aren't supersets of ASCII 1530 * nowadays. It isn't a problem for most controls to be changed 1531 * into something else; we check only \n and \t, though perhaps \r 1532 * could be an issue as well. */ 1533 if ( check_for_problems 1534 && (isGRAPH_A(i) || isBLANK_A(i) || i == '\n')) 1535 { 1536 bool is_bad = FALSE; 1537 char name[4] = { '\0' }; 1538 1539 /* Convert the name into a string */ 1540 if (isGRAPH_A(i)) { 1541 name[0] = i; 1542 name[1] = '\0'; 1543 } 1544 else if (i == '\n') { 1545 my_strlcpy(name, "\\n", sizeof(name)); 1546 } 1547 else if (i == '\t') { 1548 my_strlcpy(name, "\\t", sizeof(name)); 1549 } 1550 else { 1551 assert(i == ' '); 1552 my_strlcpy(name, "' '", sizeof(name)); 1553 } 1554 1555 /* Check each possibe class */ 1556 if (UNLIKELY(cBOOL(isalnum(i)) != cBOOL(isALPHANUMERIC_A(i)))) { 1557 is_bad = TRUE; 1558 DEBUG_L(PerlIO_printf(Perl_debug_log, 1559 "isalnum('%s') unexpectedly is %d\n", 1560 name, cBOOL(isalnum(i)))); 1561 } 1562 if (UNLIKELY(cBOOL(isalpha(i)) != cBOOL(isALPHA_A(i)))) { 1563 is_bad = TRUE; 1564 DEBUG_L(PerlIO_printf(Perl_debug_log, 1565 "isalpha('%s') unexpectedly is %d\n", 1566 name, cBOOL(isalpha(i)))); 1567 } 1568 if (UNLIKELY(cBOOL(isdigit(i)) != cBOOL(isDIGIT_A(i)))) { 1569 is_bad = TRUE; 1570 DEBUG_L(PerlIO_printf(Perl_debug_log, 1571 "isdigit('%s') unexpectedly is %d\n", 1572 name, cBOOL(isdigit(i)))); 1573 } 1574 if (UNLIKELY(cBOOL(isgraph(i)) != cBOOL(isGRAPH_A(i)))) { 1575 is_bad = TRUE; 1576 DEBUG_L(PerlIO_printf(Perl_debug_log, 1577 "isgraph('%s') unexpectedly is %d\n", 1578 name, cBOOL(isgraph(i)))); 1579 } 1580 if (UNLIKELY(cBOOL(islower(i)) != cBOOL(isLOWER_A(i)))) { 1581 is_bad = TRUE; 1582 DEBUG_L(PerlIO_printf(Perl_debug_log, 1583 "islower('%s') unexpectedly is %d\n", 1584 name, cBOOL(islower(i)))); 1585 } 1586 if (UNLIKELY(cBOOL(isprint(i)) != cBOOL(isPRINT_A(i)))) { 1587 is_bad = TRUE; 1588 DEBUG_L(PerlIO_printf(Perl_debug_log, 1589 "isprint('%s') unexpectedly is %d\n", 1590 name, cBOOL(isprint(i)))); 1591 } 1592 if (UNLIKELY(cBOOL(ispunct(i)) != cBOOL(isPUNCT_A(i)))) { 1593 is_bad = TRUE; 1594 DEBUG_L(PerlIO_printf(Perl_debug_log, 1595 "ispunct('%s') unexpectedly is %d\n", 1596 name, cBOOL(ispunct(i)))); 1597 } 1598 if (UNLIKELY(cBOOL(isspace(i)) != cBOOL(isSPACE_A(i)))) { 1599 is_bad = TRUE; 1600 DEBUG_L(PerlIO_printf(Perl_debug_log, 1601 "isspace('%s') unexpectedly is %d\n", 1602 name, cBOOL(isspace(i)))); 1603 } 1604 if (UNLIKELY(cBOOL(isupper(i)) != cBOOL(isUPPER_A(i)))) { 1605 is_bad = TRUE; 1606 DEBUG_L(PerlIO_printf(Perl_debug_log, 1607 "isupper('%s') unexpectedly is %d\n", 1608 name, cBOOL(isupper(i)))); 1609 } 1610 if (UNLIKELY(cBOOL(isxdigit(i))!= cBOOL(isXDIGIT_A(i)))) { 1611 is_bad = TRUE; 1612 DEBUG_L(PerlIO_printf(Perl_debug_log, 1613 "isxdigit('%s') unexpectedly is %d\n", 1614 name, cBOOL(isxdigit(i)))); 1615 } 1616 if (UNLIKELY(tolower(i) != (int) toLOWER_A(i))) { 1617 is_bad = TRUE; 1618 DEBUG_L(PerlIO_printf(Perl_debug_log, 1619 "tolower('%s')=0x%x instead of the expected 0x%x\n", 1620 name, tolower(i), (int) toLOWER_A(i))); 1621 } 1622 if (UNLIKELY(toupper(i) != (int) toUPPER_A(i))) { 1623 is_bad = TRUE; 1624 DEBUG_L(PerlIO_printf(Perl_debug_log, 1625 "toupper('%s')=0x%x instead of the expected 0x%x\n", 1626 name, toupper(i), (int) toUPPER_A(i))); 1627 } 1628 if (UNLIKELY((i == '\n' && ! isCNTRL_LC(i)))) { 1629 is_bad = TRUE; 1630 DEBUG_L(PerlIO_printf(Perl_debug_log, 1631 "'\\n' (=%02X) is not a control\n", (int) i)); 1632 } 1633 1634 /* Add to the list; Separate multiple entries with a blank */ 1635 if (is_bad) { 1636 if (bad_count) { 1637 my_strlcat(bad_chars_list, " ", sizeof(bad_chars_list)); 1638 } 1639 my_strlcat(bad_chars_list, name, sizeof(bad_chars_list)); 1640 bad_count++; 1641 } 1642 } 1643 } 1644 1645 if (bad_count == 2 && maybe_utf8_turkic) { 1646 bad_count = 0; 1647 *bad_chars_list = '\0'; 1648 PL_fold_locale['I'] = 'I'; 1649 PL_fold_locale['i'] = 'i'; 1650 PL_in_utf8_turkic_locale = TRUE; 1651 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s:%d: %s is turkic\n", 1652 __FILE__, __LINE__, newctype)); 1653 } 1654 else { 1655 PL_in_utf8_turkic_locale = FALSE; 1656 } 1657 1658 # ifdef MB_CUR_MAX 1659 1660 /* We only handle single-byte locales (outside of UTF-8 ones; so if 1661 * this locale requires more than one byte, there are going to be 1662 * problems. */ 1663 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 1664 "%s:%d: check_for_problems=%d, MB_CUR_MAX=%d\n", 1665 __FILE__, __LINE__, check_for_problems, (int) MB_CUR_MAX)); 1666 1667 if ( check_for_problems && MB_CUR_MAX > 1 1668 && ! PL_in_utf8_CTYPE_locale 1669 1670 /* Some platforms return MB_CUR_MAX > 1 for even the "C" 1671 * locale. Just assume that the implementation for them (plus 1672 * for POSIX) is correct and the > 1 value is spurious. (Since 1673 * these are specially handled to never be considered UTF-8 1674 * locales, as long as this is the only problem, everything 1675 * should work fine */ 1676 && strNE(newctype, "C") && strNE(newctype, "POSIX")) 1677 { 1678 multi_byte_locale = TRUE; 1679 } 1680 1681 # endif 1682 1683 /* If we found problems and we want them output, do so */ 1684 if ( (UNLIKELY(bad_count) || UNLIKELY(multi_byte_locale)) 1685 && (LIKELY(ckWARN_d(WARN_LOCALE)) || UNLIKELY(DEBUG_L_TEST))) 1686 { 1687 if (UNLIKELY(bad_count) && PL_in_utf8_CTYPE_locale) { 1688 PL_warn_locale = Perl_newSVpvf(aTHX_ 1689 "Locale '%s' contains (at least) the following characters" 1690 " which have\nunexpected meanings: %s\nThe Perl program" 1691 " will use the expected meanings", 1692 newctype, bad_chars_list); 1693 } 1694 else { 1695 PL_warn_locale = Perl_newSVpvf(aTHX_ 1696 "Locale '%s' may not work well.%s%s%s\n", 1697 newctype, 1698 (multi_byte_locale) 1699 ? " Some characters in it are not recognized by" 1700 " Perl." 1701 : "", 1702 (bad_count) 1703 ? "\nThe following characters (and maybe others)" 1704 " may not have the same meaning as the Perl" 1705 " program expects:\n" 1706 : "", 1707 (bad_count) 1708 ? bad_chars_list 1709 : "" 1710 ); 1711 } 1712 1713 # ifdef HAS_NL_LANGINFO 1714 1715 Perl_sv_catpvf(aTHX_ PL_warn_locale, "; codeset=%s", 1716 /* parameter FALSE is a don't care here */ 1717 my_nl_langinfo(CODESET, FALSE)); 1718 1719 # endif 1720 1721 Perl_sv_catpvf(aTHX_ PL_warn_locale, "\n"); 1722 1723 /* If we are actually in the scope of the locale or are debugging, 1724 * output the message now. If not in that scope, we save the 1725 * message to be output at the first operation using this locale, 1726 * if that actually happens. Most programs don't use locales, so 1727 * they are immune to bad ones. */ 1728 if (IN_LC(LC_CTYPE) || UNLIKELY(DEBUG_L_TEST)) { 1729 1730 /* The '0' below suppresses a bogus gcc compiler warning */ 1731 Perl_warner(aTHX_ packWARN(WARN_LOCALE), SvPVX(PL_warn_locale), 0); 1732 1733 if (IN_LC(LC_CTYPE)) { 1734 SvREFCNT_dec_NN(PL_warn_locale); 1735 PL_warn_locale = NULL; 1736 } 1737 } 1738 } 1739 } 1740 1741 #endif /* USE_LOCALE_CTYPE */ 1742 1743 } 1744 1745 void 1746 Perl__warn_problematic_locale() 1747 { 1748 1749 #ifdef USE_LOCALE_CTYPE 1750 1751 dTHX; 1752 1753 /* Internal-to-core function that outputs the message in PL_warn_locale, 1754 * and then NULLS it. Should be called only through the macro 1755 * _CHECK_AND_WARN_PROBLEMATIC_LOCALE */ 1756 1757 if (PL_warn_locale) { 1758 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), 1759 SvPVX(PL_warn_locale), 1760 0 /* dummy to avoid compiler warning */ ); 1761 SvREFCNT_dec_NN(PL_warn_locale); 1762 PL_warn_locale = NULL; 1763 } 1764 1765 #endif 1766 1767 } 1768 1769 STATIC void 1770 S_new_collate(pTHX_ const char *newcoll) 1771 { 1772 1773 #ifndef USE_LOCALE_COLLATE 1774 1775 PERL_UNUSED_ARG(newcoll); 1776 PERL_UNUSED_CONTEXT; 1777 1778 #else 1779 1780 /* Called after each libc setlocale() call affecting LC_COLLATE, to tell 1781 * core Perl this and that 'newcoll' is the name of the new locale. 1782 * 1783 * The design of locale collation is that every locale change is given an 1784 * index 'PL_collation_ix'. The first time a string particpates in an 1785 * operation that requires collation while locale collation is active, it 1786 * is given PERL_MAGIC_collxfrm magic (via sv_collxfrm_flags()). That 1787 * magic includes the collation index, and the transformation of the string 1788 * by strxfrm(), q.v. That transformation is used when doing comparisons, 1789 * instead of the string itself. If a string changes, the magic is 1790 * cleared. The next time the locale changes, the index is incremented, 1791 * and so we know during a comparison that the transformation is not 1792 * necessarily still valid, and so is recomputed. Note that if the locale 1793 * changes enough times, the index could wrap (a U32), and it is possible 1794 * that a transformation would improperly be considered valid, leading to 1795 * an unlikely bug */ 1796 1797 if (! newcoll) { 1798 if (PL_collation_name) { 1799 ++PL_collation_ix; 1800 Safefree(PL_collation_name); 1801 PL_collation_name = NULL; 1802 } 1803 PL_collation_standard = TRUE; 1804 is_standard_collation: 1805 PL_collxfrm_base = 0; 1806 PL_collxfrm_mult = 2; 1807 PL_in_utf8_COLLATE_locale = FALSE; 1808 PL_strxfrm_NUL_replacement = '\0'; 1809 PL_strxfrm_max_cp = 0; 1810 return; 1811 } 1812 1813 /* If this is not the same locale as currently, set the new one up */ 1814 if (! PL_collation_name || strNE(PL_collation_name, newcoll)) { 1815 ++PL_collation_ix; 1816 Safefree(PL_collation_name); 1817 PL_collation_name = stdize_locale(savepv(newcoll)); 1818 PL_collation_standard = isNAME_C_OR_POSIX(newcoll); 1819 if (PL_collation_standard) { 1820 goto is_standard_collation; 1821 } 1822 1823 PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE); 1824 PL_strxfrm_NUL_replacement = '\0'; 1825 PL_strxfrm_max_cp = 0; 1826 1827 /* A locale collation definition includes primary, secondary, tertiary, 1828 * etc. weights for each character. To sort, the primary weights are 1829 * used, and only if they compare equal, then the secondary weights are 1830 * used, and only if they compare equal, then the tertiary, etc. 1831 * 1832 * strxfrm() works by taking the input string, say ABC, and creating an 1833 * output transformed string consisting of first the primary weights, 1834 * A¹B¹C¹ followed by the secondary ones, A²B²C²; and then the 1835 * tertiary, etc, yielding A¹B¹C¹ A²B²C² A³B³C³ .... Some characters 1836 * may not have weights at every level. In our example, let's say B 1837 * doesn't have a tertiary weight, and A doesn't have a secondary 1838 * weight. The constructed string is then going to be 1839 * A¹B¹C¹ B²C² A³C³ .... 1840 * This has the desired effect that strcmp() will look at the secondary 1841 * or tertiary weights only if the strings compare equal at all higher 1842 * priority weights. The spaces shown here, like in 1843 * "A¹B¹C¹ A²B²C² " 1844 * are not just for readability. In the general case, these must 1845 * actually be bytes, which we will call here 'separator weights'; and 1846 * they must be smaller than any other weight value, but since these 1847 * are C strings, only the terminating one can be a NUL (some 1848 * implementations may include a non-NUL separator weight just before 1849 * the NUL). Implementations tend to reserve 01 for the separator 1850 * weights. They are needed so that a shorter string's secondary 1851 * weights won't be misconstrued as primary weights of a longer string, 1852 * etc. By making them smaller than any other weight, the shorter 1853 * string will sort first. (Actually, if all secondary weights are 1854 * smaller than all primary ones, there is no need for a separator 1855 * weight between those two levels, etc.) 1856 * 1857 * The length of the transformed string is roughly a linear function of 1858 * the input string. It's not exactly linear because some characters 1859 * don't have weights at all levels. When we call strxfrm() we have to 1860 * allocate some memory to hold the transformed string. The 1861 * calculations below try to find coefficients 'm' and 'b' for this 1862 * locale so that m*x + b equals how much space we need, given the size 1863 * of the input string in 'x'. If we calculate too small, we increase 1864 * the size as needed, and call strxfrm() again, but it is better to 1865 * get it right the first time to avoid wasted expensive string 1866 * transformations. */ 1867 1868 { 1869 /* We use the string below to find how long the tranformation of it 1870 * is. Almost all locales are supersets of ASCII, or at least the 1871 * ASCII letters. We use all of them, half upper half lower, 1872 * because if we used fewer, we might hit just the ones that are 1873 * outliers in a particular locale. Most of the strings being 1874 * collated will contain a preponderance of letters, and even if 1875 * they are above-ASCII, they are likely to have the same number of 1876 * weight levels as the ASCII ones. It turns out that digits tend 1877 * to have fewer levels, and some punctuation has more, but those 1878 * are relatively sparse in text, and khw believes this gives a 1879 * reasonable result, but it could be changed if experience so 1880 * dictates. */ 1881 const char longer[] = "ABCDEFGHIJKLMnopqrstuvwxyz"; 1882 char * x_longer; /* Transformed 'longer' */ 1883 Size_t x_len_longer; /* Length of 'x_longer' */ 1884 1885 char * x_shorter; /* We also transform a substring of 'longer' */ 1886 Size_t x_len_shorter; 1887 1888 /* _mem_collxfrm() is used get the transformation (though here we 1889 * are interested only in its length). It is used because it has 1890 * the intelligence to handle all cases, but to work, it needs some 1891 * values of 'm' and 'b' to get it started. For the purposes of 1892 * this calculation we use a very conservative estimate of 'm' and 1893 * 'b'. This assumes a weight can be multiple bytes, enough to 1894 * hold any UV on the platform, and there are 5 levels, 4 weight 1895 * bytes, and a trailing NUL. */ 1896 PL_collxfrm_base = 5; 1897 PL_collxfrm_mult = 5 * sizeof(UV); 1898 1899 /* Find out how long the transformation really is */ 1900 x_longer = _mem_collxfrm(longer, 1901 sizeof(longer) - 1, 1902 &x_len_longer, 1903 1904 /* We avoid converting to UTF-8 in the 1905 * called function by telling it the 1906 * string is in UTF-8 if the locale is a 1907 * UTF-8 one. Since the string passed 1908 * here is invariant under UTF-8, we can 1909 * claim it's UTF-8 even though it isn't. 1910 * */ 1911 PL_in_utf8_COLLATE_locale); 1912 Safefree(x_longer); 1913 1914 /* Find out how long the transformation of a substring of 'longer' 1915 * is. Together the lengths of these transformations are 1916 * sufficient to calculate 'm' and 'b'. The substring is all of 1917 * 'longer' except the first character. This minimizes the chances 1918 * of being swayed by outliers */ 1919 x_shorter = _mem_collxfrm(longer + 1, 1920 sizeof(longer) - 2, 1921 &x_len_shorter, 1922 PL_in_utf8_COLLATE_locale); 1923 Safefree(x_shorter); 1924 1925 /* If the results are nonsensical for this simple test, the whole 1926 * locale definition is suspect. Mark it so that locale collation 1927 * is not active at all for it. XXX Should we warn? */ 1928 if ( x_len_shorter == 0 1929 || x_len_longer == 0 1930 || x_len_shorter >= x_len_longer) 1931 { 1932 PL_collxfrm_mult = 0; 1933 PL_collxfrm_base = 0; 1934 } 1935 else { 1936 SSize_t base; /* Temporary */ 1937 1938 /* We have both: m * strlen(longer) + b = x_len_longer 1939 * m * strlen(shorter) + b = x_len_shorter; 1940 * subtracting yields: 1941 * m * (strlen(longer) - strlen(shorter)) 1942 * = x_len_longer - x_len_shorter 1943 * But we have set things up so that 'shorter' is 1 byte smaller 1944 * than 'longer'. Hence: 1945 * m = x_len_longer - x_len_shorter 1946 * 1947 * But if something went wrong, make sure the multiplier is at 1948 * least 1. 1949 */ 1950 if (x_len_longer > x_len_shorter) { 1951 PL_collxfrm_mult = (STRLEN) x_len_longer - x_len_shorter; 1952 } 1953 else { 1954 PL_collxfrm_mult = 1; 1955 } 1956 1957 /* mx + b = len 1958 * so: b = len - mx 1959 * but in case something has gone wrong, make sure it is 1960 * non-negative */ 1961 base = x_len_longer - PL_collxfrm_mult * (sizeof(longer) - 1); 1962 if (base < 0) { 1963 base = 0; 1964 } 1965 1966 /* Add 1 for the trailing NUL */ 1967 PL_collxfrm_base = base + 1; 1968 } 1969 1970 # ifdef DEBUGGING 1971 1972 if (DEBUG_L_TEST || debug_initialization) { 1973 PerlIO_printf(Perl_debug_log, 1974 "%s:%d: ?UTF-8 locale=%d; x_len_shorter=%zu, " 1975 "x_len_longer=%zu," 1976 " collate multipler=%zu, collate base=%zu\n", 1977 __FILE__, __LINE__, 1978 PL_in_utf8_COLLATE_locale, 1979 x_len_shorter, x_len_longer, 1980 PL_collxfrm_mult, PL_collxfrm_base); 1981 } 1982 # endif 1983 1984 } 1985 } 1986 1987 #endif /* USE_LOCALE_COLLATE */ 1988 1989 } 1990 1991 #endif 1992 1993 #ifdef WIN32 1994 1995 #define USE_WSETLOCALE 1996 1997 #ifdef USE_WSETLOCALE 1998 1999 STATIC char * 2000 S_wrap_wsetlocale(pTHX_ int category, const char *locale) { 2001 wchar_t *wlocale; 2002 wchar_t *wresult; 2003 char *result; 2004 2005 if (locale) { 2006 int req_size = 2007 MultiByteToWideChar(CP_UTF8, 0, locale, -1, NULL, 0); 2008 2009 if (!req_size) { 2010 errno = EINVAL; 2011 return NULL; 2012 } 2013 2014 Newx(wlocale, req_size, wchar_t); 2015 if (!MultiByteToWideChar(CP_UTF8, 0, locale, -1, wlocale, req_size)) { 2016 Safefree(wlocale); 2017 errno = EINVAL; 2018 return NULL; 2019 } 2020 } 2021 else { 2022 wlocale = NULL; 2023 } 2024 wresult = _wsetlocale(category, wlocale); 2025 Safefree(wlocale); 2026 if (wresult) { 2027 int req_size = 2028 WideCharToMultiByte(CP_UTF8, 0, wresult, -1, NULL, 0, NULL, NULL); 2029 Newx(result, req_size, char); 2030 SAVEFREEPV(result); /* is there something better we can do here? */ 2031 if (!WideCharToMultiByte(CP_UTF8, 0, wresult, -1, 2032 result, req_size, NULL, NULL)) { 2033 errno = EINVAL; 2034 return NULL; 2035 } 2036 } 2037 else { 2038 result = NULL; 2039 } 2040 2041 return result; 2042 } 2043 2044 #endif 2045 2046 STATIC char * 2047 S_win32_setlocale(pTHX_ int category, const char* locale) 2048 { 2049 /* This, for Windows, emulates POSIX setlocale() behavior. There is no 2050 * difference between the two unless the input locale is "", which normally 2051 * means on Windows to get the machine default, which is set via the 2052 * computer's "Regional and Language Options" (or its current equivalent). 2053 * In POSIX, it instead means to find the locale from the user's 2054 * environment. This routine changes the Windows behavior to first look in 2055 * the environment, and, if anything is found, use that instead of going to 2056 * the machine default. If there is no environment override, the machine 2057 * default is used, by calling the real setlocale() with "". 2058 * 2059 * The POSIX behavior is to use the LC_ALL variable if set; otherwise to 2060 * use the particular category's variable if set; otherwise to use the LANG 2061 * variable. */ 2062 2063 bool override_LC_ALL = FALSE; 2064 char * result; 2065 unsigned int i; 2066 2067 if (locale && strEQ(locale, "")) { 2068 2069 # ifdef LC_ALL 2070 2071 locale = PerlEnv_getenv("LC_ALL"); 2072 if (! locale) { 2073 if (category == LC_ALL) { 2074 override_LC_ALL = TRUE; 2075 } 2076 else { 2077 2078 # endif 2079 2080 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { 2081 if (category == categories[i]) { 2082 locale = PerlEnv_getenv(category_names[i]); 2083 goto found_locale; 2084 } 2085 } 2086 2087 locale = PerlEnv_getenv("LANG"); 2088 if (! locale) { 2089 locale = ""; 2090 } 2091 2092 found_locale: ; 2093 2094 # ifdef LC_ALL 2095 2096 } 2097 } 2098 2099 # endif 2100 2101 } 2102 2103 #ifdef USE_WSETLOCALE 2104 result = S_wrap_wsetlocale(aTHX_ category, locale); 2105 #else 2106 result = setlocale(category, locale); 2107 #endif 2108 DEBUG_L(STMT_START { 2109 dSAVE_ERRNO; 2110 PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", __FILE__, __LINE__, 2111 setlocale_debug_string(category, locale, result)); 2112 RESTORE_ERRNO; 2113 } STMT_END); 2114 2115 if (! override_LC_ALL) { 2116 return result; 2117 } 2118 2119 /* Here the input category was LC_ALL, and we have set it to what is in the 2120 * LANG variable or the system default if there is no LANG. But these have 2121 * lower priority than the other LC_foo variables, so override it for each 2122 * one that is set. (If they are set to "", it means to use the same thing 2123 * we just set LC_ALL to, so can skip) */ 2124 2125 for (i = 0; i < LC_ALL_INDEX; i++) { 2126 result = PerlEnv_getenv(category_names[i]); 2127 if (result && strNE(result, "")) { 2128 #ifdef USE_WSETLOCALE 2129 S_wrap_wsetlocale(aTHX_ categories[i], result); 2130 #else 2131 setlocale(categories[i], result); 2132 #endif 2133 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", 2134 __FILE__, __LINE__, 2135 setlocale_debug_string(categories[i], result, "not captured"))); 2136 } 2137 } 2138 2139 result = setlocale(LC_ALL, NULL); 2140 DEBUG_L(STMT_START { 2141 dSAVE_ERRNO; 2142 PerlIO_printf(Perl_debug_log, "%s:%d: %s\n", 2143 __FILE__, __LINE__, 2144 setlocale_debug_string(LC_ALL, NULL, result)); 2145 RESTORE_ERRNO; 2146 } STMT_END); 2147 2148 return result; 2149 } 2150 2151 #endif 2152 2153 /* 2154 =for apidoc Perl_setlocale 2155 2156 This is an (almost) drop-in replacement for the system L<C<setlocale(3)>>, 2157 taking the same parameters, and returning the same information, except that it 2158 returns the correct underlying C<LC_NUMERIC> locale. Regular C<setlocale> will 2159 instead return C<C> if the underlying locale has a non-dot decimal point 2160 character, or a non-empty thousands separator for displaying floating point 2161 numbers. This is because perl keeps that locale category such that it has a 2162 dot and empty separator, changing the locale briefly during the operations 2163 where the underlying one is required. C<Perl_setlocale> knows about this, and 2164 compensates; regular C<setlocale> doesn't. 2165 2166 Another reason it isn't completely a drop-in replacement is that it is 2167 declared to return S<C<const char *>>, whereas the system setlocale omits the 2168 C<const> (presumably because its API was specified long ago, and can't be 2169 updated; it is illegal to change the information C<setlocale> returns; doing 2170 so leads to segfaults.) 2171 2172 Finally, C<Perl_setlocale> works under all circumstances, whereas plain 2173 C<setlocale> can be completely ineffective on some platforms under some 2174 configurations. 2175 2176 C<Perl_setlocale> should not be used to change the locale except on systems 2177 where the predefined variable C<${^SAFE_LOCALES}> is 1. On some such systems, 2178 the system C<setlocale()> is ineffective, returning the wrong information, and 2179 failing to actually change the locale. C<Perl_setlocale>, however works 2180 properly in all circumstances. 2181 2182 The return points to a per-thread static buffer, which is overwritten the next 2183 time C<Perl_setlocale> is called from the same thread. 2184 2185 =cut 2186 2187 */ 2188 2189 const char * 2190 Perl_setlocale(const int category, const char * locale) 2191 { 2192 /* This wraps POSIX::setlocale() */ 2193 2194 #ifndef USE_LOCALE 2195 2196 PERL_UNUSED_ARG(category); 2197 PERL_UNUSED_ARG(locale); 2198 2199 return "C"; 2200 2201 #else 2202 2203 const char * retval; 2204 const char * newlocale; 2205 dSAVEDERRNO; 2206 dTHX; 2207 DECLARATION_FOR_LC_NUMERIC_MANIPULATION; 2208 2209 #ifdef USE_LOCALE_NUMERIC 2210 2211 /* A NULL locale means only query what the current one is. We have the 2212 * LC_NUMERIC name saved, because we are normally switched into the C 2213 * (or equivalent) locale for it. For an LC_ALL query, switch back to get 2214 * the correct results. All other categories don't require special 2215 * handling */ 2216 if (locale == NULL) { 2217 if (category == LC_NUMERIC) { 2218 2219 /* We don't have to copy this return value, as it is a per-thread 2220 * variable, and won't change until a future setlocale */ 2221 return PL_numeric_name; 2222 } 2223 2224 # ifdef LC_ALL 2225 2226 else if (category == LC_ALL) { 2227 STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); 2228 } 2229 2230 # endif 2231 2232 } 2233 2234 #endif 2235 2236 retval = save_to_buffer(do_setlocale_r(category, locale), 2237 &PL_setlocale_buf, &PL_setlocale_bufsize, 0); 2238 SAVE_ERRNO; 2239 2240 #if defined(USE_LOCALE_NUMERIC) && defined(LC_ALL) 2241 2242 if (locale == NULL && category == LC_ALL) { 2243 RESTORE_LC_NUMERIC(); 2244 } 2245 2246 #endif 2247 2248 DEBUG_L(PerlIO_printf(Perl_debug_log, 2249 "%s:%d: %s\n", __FILE__, __LINE__, 2250 setlocale_debug_string(category, locale, retval))); 2251 2252 RESTORE_ERRNO; 2253 2254 if (! retval) { 2255 return NULL; 2256 } 2257 2258 /* If locale == NULL, we are just querying the state */ 2259 if (locale == NULL) { 2260 return retval; 2261 } 2262 2263 /* Now that have switched locales, we have to update our records to 2264 * correspond. */ 2265 2266 switch (category) { 2267 2268 #ifdef USE_LOCALE_CTYPE 2269 2270 case LC_CTYPE: 2271 new_ctype(retval); 2272 break; 2273 2274 #endif 2275 #ifdef USE_LOCALE_COLLATE 2276 2277 case LC_COLLATE: 2278 new_collate(retval); 2279 break; 2280 2281 #endif 2282 #ifdef USE_LOCALE_NUMERIC 2283 2284 case LC_NUMERIC: 2285 new_numeric(retval); 2286 break; 2287 2288 #endif 2289 #ifdef LC_ALL 2290 2291 case LC_ALL: 2292 2293 /* LC_ALL updates all the things we care about. The values may not 2294 * be the same as 'retval', as the locale "" may have set things 2295 * individually */ 2296 2297 # ifdef USE_LOCALE_CTYPE 2298 2299 newlocale = savepv(do_setlocale_c(LC_CTYPE, NULL)); 2300 new_ctype(newlocale); 2301 Safefree(newlocale); 2302 2303 # endif /* USE_LOCALE_CTYPE */ 2304 # ifdef USE_LOCALE_COLLATE 2305 2306 newlocale = savepv(do_setlocale_c(LC_COLLATE, NULL)); 2307 new_collate(newlocale); 2308 Safefree(newlocale); 2309 2310 # endif 2311 # ifdef USE_LOCALE_NUMERIC 2312 2313 newlocale = savepv(do_setlocale_c(LC_NUMERIC, NULL)); 2314 new_numeric(newlocale); 2315 Safefree(newlocale); 2316 2317 # endif /* USE_LOCALE_NUMERIC */ 2318 #endif /* LC_ALL */ 2319 2320 default: 2321 break; 2322 } 2323 2324 return retval; 2325 2326 #endif 2327 2328 } 2329 2330 PERL_STATIC_INLINE const char * 2331 S_save_to_buffer(const char * string, char **buf, Size_t *buf_size, const Size_t offset) 2332 { 2333 /* Copy the NUL-terminated 'string' to 'buf' + 'offset'. 'buf' has size 'buf_size', 2334 * growing it if necessary */ 2335 2336 Size_t string_size; 2337 2338 PERL_ARGS_ASSERT_SAVE_TO_BUFFER; 2339 2340 if (! string) { 2341 return NULL; 2342 } 2343 2344 string_size = strlen(string) + offset + 1; 2345 2346 if (*buf_size == 0) { 2347 Newx(*buf, string_size, char); 2348 *buf_size = string_size; 2349 } 2350 else if (string_size > *buf_size) { 2351 Renew(*buf, string_size, char); 2352 *buf_size = string_size; 2353 } 2354 2355 Copy(string, *buf + offset, string_size - offset, char); 2356 return *buf; 2357 } 2358 2359 /* 2360 2361 =for apidoc Perl_langinfo 2362 2363 This is an (almost) drop-in replacement for the system C<L<nl_langinfo(3)>>, 2364 taking the same C<item> parameter values, and returning the same information. 2365 But it is more thread-safe than regular C<nl_langinfo()>, and hides the quirks 2366 of Perl's locale handling from your code, and can be used on systems that lack 2367 a native C<nl_langinfo>. 2368 2369 Expanding on these: 2370 2371 =over 2372 2373 =item * 2374 2375 The reason it isn't quite a drop-in replacement is actually an advantage. The 2376 only difference is that it returns S<C<const char *>>, whereas plain 2377 C<nl_langinfo()> returns S<C<char *>>, but you are (only by documentation) 2378 forbidden to write into the buffer. By declaring this C<const>, the compiler 2379 enforces this restriction, so if it is violated, you know at compilation time, 2380 rather than getting segfaults at runtime. 2381 2382 =item * 2383 2384 It delivers the correct results for the C<RADIXCHAR> and C<THOUSEP> items, 2385 without you having to write extra code. The reason for the extra code would be 2386 because these are from the C<LC_NUMERIC> locale category, which is normally 2387 kept set by Perl so that the radix is a dot, and the separator is the empty 2388 string, no matter what the underlying locale is supposed to be, and so to get 2389 the expected results, you have to temporarily toggle into the underlying 2390 locale, and later toggle back. (You could use plain C<nl_langinfo> and 2391 C<L</STORE_LC_NUMERIC_FORCE_TO_UNDERLYING>> for this but then you wouldn't get 2392 the other advantages of C<Perl_langinfo()>; not keeping C<LC_NUMERIC> in the C 2393 (or equivalent) locale would break a lot of CPAN, which is expecting the radix 2394 (decimal point) character to be a dot.) 2395 2396 =item * 2397 2398 The system function it replaces can have its static return buffer trashed, 2399 not only by a subsequent call to that function, but by a C<freelocale>, 2400 C<setlocale>, or other locale change. The returned buffer of this function is 2401 not changed until the next call to it, so the buffer is never in a trashed 2402 state. 2403 2404 =item * 2405 2406 Its return buffer is per-thread, so it also is never overwritten by a call to 2407 this function from another thread; unlike the function it replaces. 2408 2409 =item * 2410 2411 But most importantly, it works on systems that don't have C<nl_langinfo>, such 2412 as Windows, hence makes your code more portable. Of the fifty-some possible 2413 items specified by the POSIX 2008 standard, 2414 L<http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/langinfo.h.html>, 2415 only one is completely unimplemented, though on non-Windows platforms, another 2416 significant one is also not implemented). It uses various techniques to 2417 recover the other items, including calling C<L<localeconv(3)>>, and 2418 C<L<strftime(3)>>, both of which are specified in C89, so should be always be 2419 available. Later C<strftime()> versions have additional capabilities; C<""> is 2420 returned for those not available on your system. 2421 2422 It is important to note that when called with an item that is recovered by 2423 using C<localeconv>, the buffer from any previous explicit call to 2424 C<localeconv> will be overwritten. This means you must save that buffer's 2425 contents if you need to access them after a call to this function. (But note 2426 that you might not want to be using C<localeconv()> directly anyway, because of 2427 issues like the ones listed in the second item of this list (above) for 2428 C<RADIXCHAR> and C<THOUSEP>. You can use the methods given in L<perlcall> to 2429 call L<POSIX/localeconv> and avoid all the issues, but then you have a hash to 2430 unpack). 2431 2432 The details for those items which may deviate from what this emulation returns 2433 and what a native C<nl_langinfo()> would return are specified in 2434 L<I18N::Langinfo>. 2435 2436 =back 2437 2438 When using C<Perl_langinfo> on systems that don't have a native 2439 C<nl_langinfo()>, you must 2440 2441 #include "perl_langinfo.h" 2442 2443 before the C<perl.h> C<#include>. You can replace your C<langinfo.h> 2444 C<#include> with this one. (Doing it this way keeps out the symbols that plain 2445 C<langinfo.h> would try to import into the namespace for code that doesn't need 2446 it.) 2447 2448 The original impetus for C<Perl_langinfo()> was so that code that needs to 2449 find out the current currency symbol, floating point radix character, or digit 2450 grouping separator can use, on all systems, the simpler and more 2451 thread-friendly C<nl_langinfo> API instead of C<L<localeconv(3)>> which is a 2452 pain to make thread-friendly. For other fields returned by C<localeconv>, it 2453 is better to use the methods given in L<perlcall> to call 2454 L<C<POSIX::localeconv()>|POSIX/localeconv>, which is thread-friendly. 2455 2456 =cut 2457 2458 */ 2459 2460 const char * 2461 #ifdef HAS_NL_LANGINFO 2462 Perl_langinfo(const nl_item item) 2463 #else 2464 Perl_langinfo(const int item) 2465 #endif 2466 { 2467 return my_nl_langinfo(item, TRUE); 2468 } 2469 2470 STATIC const char * 2471 #ifdef HAS_NL_LANGINFO 2472 S_my_nl_langinfo(const nl_item item, bool toggle) 2473 #else 2474 S_my_nl_langinfo(const int item, bool toggle) 2475 #endif 2476 { 2477 dTHX; 2478 const char * retval; 2479 2480 #ifdef USE_LOCALE_NUMERIC 2481 2482 /* We only need to toggle into the underlying LC_NUMERIC locale for these 2483 * two items, and only if not already there */ 2484 if (toggle && (( item != RADIXCHAR && item != THOUSEP) 2485 || PL_numeric_underlying)) 2486 2487 #endif /* No toggling needed if not using LC_NUMERIC */ 2488 2489 toggle = FALSE; 2490 2491 #if defined(HAS_NL_LANGINFO) /* nl_langinfo() is available. */ 2492 # if ! defined(HAS_THREAD_SAFE_NL_LANGINFO_L) \ 2493 || ! defined(HAS_POSIX_2008_LOCALE) 2494 2495 /* Here, use plain nl_langinfo(), switching to the underlying LC_NUMERIC 2496 * for those items dependent on it. This must be copied to a buffer before 2497 * switching back, as some systems destroy the buffer when setlocale() is 2498 * called */ 2499 2500 { 2501 DECLARATION_FOR_LC_NUMERIC_MANIPULATION; 2502 2503 if (toggle) { 2504 STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); 2505 } 2506 2507 /* Prevent interference from another thread executing this code 2508 * section. */ 2509 NL_LANGINFO_LOCK; 2510 2511 /* Copy to a per-thread buffer, which is also one that won't be 2512 * destroyed by a subsequent setlocale(), such as the 2513 * RESTORE_LC_NUMERIC may do just below. */ 2514 retval = save_to_buffer(nl_langinfo(item), 2515 &PL_langinfo_buf, &PL_langinfo_bufsize, 0); 2516 NL_LANGINFO_UNLOCK; 2517 2518 if (toggle) { 2519 RESTORE_LC_NUMERIC(); 2520 } 2521 } 2522 2523 # else /* Use nl_langinfo_l(), avoiding both a mutex and changing the locale */ 2524 2525 { 2526 bool do_free = FALSE; 2527 locale_t cur = uselocale((locale_t) 0); 2528 2529 if (cur == LC_GLOBAL_LOCALE) { 2530 cur = duplocale(LC_GLOBAL_LOCALE); 2531 do_free = TRUE; 2532 } 2533 2534 # ifdef USE_LOCALE_NUMERIC 2535 2536 if (toggle) { 2537 if (PL_underlying_numeric_obj) { 2538 cur = PL_underlying_numeric_obj; 2539 } 2540 else { 2541 cur = newlocale(LC_NUMERIC_MASK, PL_numeric_name, cur); 2542 do_free = TRUE; 2543 } 2544 } 2545 2546 # endif 2547 2548 /* We have to save it to a buffer, because the freelocale() just below 2549 * can invalidate the internal one */ 2550 retval = save_to_buffer(nl_langinfo_l(item, cur), 2551 &PL_langinfo_buf, &PL_langinfo_bufsize, 0); 2552 2553 if (do_free) { 2554 freelocale(cur); 2555 } 2556 } 2557 2558 # endif 2559 2560 if (strEQ(retval, "")) { 2561 if (item == YESSTR) { 2562 return "yes"; 2563 } 2564 if (item == NOSTR) { 2565 return "no"; 2566 } 2567 } 2568 2569 return retval; 2570 2571 #else /* Below, emulate nl_langinfo as best we can */ 2572 2573 { 2574 2575 # ifdef HAS_LOCALECONV 2576 2577 const struct lconv* lc; 2578 const char * temp; 2579 DECLARATION_FOR_LC_NUMERIC_MANIPULATION; 2580 2581 # ifdef TS_W32_BROKEN_LOCALECONV 2582 2583 const char * save_global; 2584 const char * save_thread; 2585 int needed_size; 2586 char * ptr; 2587 char * e; 2588 char * item_start; 2589 2590 # endif 2591 # endif 2592 # ifdef HAS_STRFTIME 2593 2594 struct tm tm; 2595 bool return_format = FALSE; /* Return the %format, not the value */ 2596 const char * format; 2597 2598 # endif 2599 2600 /* We copy the results to a per-thread buffer, even if not 2601 * multi-threaded. This is in part to simplify this code, and partly 2602 * because we need a buffer anyway for strftime(), and partly because a 2603 * call of localeconv() could otherwise wipe out the buffer, and the 2604 * programmer would not be expecting this, as this is a nl_langinfo() 2605 * substitute after all, so s/he might be thinking their localeconv() 2606 * is safe until another localeconv() call. */ 2607 2608 switch (item) { 2609 Size_t len; 2610 2611 /* This is unimplemented */ 2612 case ERA: /* For use with strftime() %E modifier */ 2613 2614 default: 2615 return ""; 2616 2617 /* We use only an English set, since we don't know any more */ 2618 case YESEXPR: return "^[+1yY]"; 2619 case YESSTR: return "yes"; 2620 case NOEXPR: return "^[-0nN]"; 2621 case NOSTR: return "no"; 2622 2623 case CODESET: 2624 2625 # ifndef WIN32 2626 2627 /* On non-windows, this is unimplemented, in part because of 2628 * inconsistencies between vendors. The Darwin native 2629 * nl_langinfo() implementation simply looks at everything past 2630 * any dot in the name, but that doesn't work for other 2631 * vendors. Many Linux locales that don't have UTF-8 in their 2632 * names really are UTF-8, for example; z/OS locales that do 2633 * have UTF-8 in their names, aren't really UTF-8 */ 2634 return ""; 2635 2636 # else 2637 2638 { /* But on Windows, the name does seem to be consistent, so 2639 use that. */ 2640 const char * p; 2641 const char * first; 2642 Size_t offset = 0; 2643 const char * name = my_setlocale(LC_CTYPE, NULL); 2644 2645 if (isNAME_C_OR_POSIX(name)) { 2646 return "ANSI_X3.4-1968"; 2647 } 2648 2649 /* Find the dot in the locale name */ 2650 first = (const char *) strchr(name, '.'); 2651 if (! first) { 2652 first = name; 2653 goto has_nondigit; 2654 } 2655 2656 /* Look at everything past the dot */ 2657 first++; 2658 p = first; 2659 2660 while (*p) { 2661 if (! isDIGIT(*p)) { 2662 goto has_nondigit; 2663 } 2664 2665 p++; 2666 } 2667 2668 /* Here everything past the dot is a digit. Treat it as a 2669 * code page */ 2670 retval = save_to_buffer("CP", &PL_langinfo_buf, 2671 &PL_langinfo_bufsize, 0); 2672 offset = STRLENs("CP"); 2673 2674 has_nondigit: 2675 2676 retval = save_to_buffer(first, &PL_langinfo_buf, 2677 &PL_langinfo_bufsize, offset); 2678 } 2679 2680 break; 2681 2682 # endif 2683 # ifdef HAS_LOCALECONV 2684 2685 case CRNCYSTR: 2686 2687 /* We don't bother with localeconv_l() because any system that 2688 * has it is likely to also have nl_langinfo() */ 2689 2690 LOCALECONV_LOCK; /* Prevent interference with other threads 2691 using localeconv() */ 2692 2693 # ifdef TS_W32_BROKEN_LOCALECONV 2694 2695 /* This is a workaround for a Windows bug prior to VS 15. 2696 * What we do here is, while locked, switch to the global 2697 * locale so localeconv() works; then switch back just before 2698 * the unlock. This can screw things up if some thread is 2699 * already using the global locale while assuming no other is. 2700 * A different workaround would be to call GetCurrencyFormat on 2701 * a known value, and parse it; patches welcome 2702 * 2703 * We have to use LC_ALL instead of LC_MONETARY because of 2704 * another bug in Windows */ 2705 2706 save_thread = savepv(my_setlocale(LC_ALL, NULL)); 2707 _configthreadlocale(_DISABLE_PER_THREAD_LOCALE); 2708 save_global= savepv(my_setlocale(LC_ALL, NULL)); 2709 my_setlocale(LC_ALL, save_thread); 2710 2711 # endif 2712 2713 lc = localeconv(); 2714 if ( ! lc 2715 || ! lc->currency_symbol 2716 || strEQ("", lc->currency_symbol)) 2717 { 2718 LOCALECONV_UNLOCK; 2719 return ""; 2720 } 2721 2722 /* Leave the first spot empty to be filled in below */ 2723 retval = save_to_buffer(lc->currency_symbol, &PL_langinfo_buf, 2724 &PL_langinfo_bufsize, 1); 2725 if (lc->mon_decimal_point && strEQ(lc->mon_decimal_point, "")) 2726 { /* khw couldn't figure out how the localedef specifications 2727 would show that the $ should replace the radix; this is 2728 just a guess as to how it might work.*/ 2729 PL_langinfo_buf[0] = '.'; 2730 } 2731 else if (lc->p_cs_precedes) { 2732 PL_langinfo_buf[0] = '-'; 2733 } 2734 else { 2735 PL_langinfo_buf[0] = '+'; 2736 } 2737 2738 # ifdef TS_W32_BROKEN_LOCALECONV 2739 2740 my_setlocale(LC_ALL, save_global); 2741 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); 2742 my_setlocale(LC_ALL, save_thread); 2743 Safefree(save_global); 2744 Safefree(save_thread); 2745 2746 # endif 2747 2748 LOCALECONV_UNLOCK; 2749 break; 2750 2751 # ifdef TS_W32_BROKEN_LOCALECONV 2752 2753 case RADIXCHAR: 2754 2755 /* For this, we output a known simple floating point number to 2756 * a buffer, and parse it, looking for the radix */ 2757 2758 if (toggle) { 2759 STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); 2760 } 2761 2762 if (PL_langinfo_bufsize < 10) { 2763 PL_langinfo_bufsize = 10; 2764 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char); 2765 } 2766 2767 needed_size = my_snprintf(PL_langinfo_buf, PL_langinfo_bufsize, 2768 "%.1f", 1.5); 2769 if (needed_size >= (int) PL_langinfo_bufsize) { 2770 PL_langinfo_bufsize = needed_size + 1; 2771 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char); 2772 needed_size = my_snprintf(PL_langinfo_buf, PL_langinfo_bufsize, 2773 "%.1f", 1.5); 2774 assert(needed_size < (int) PL_langinfo_bufsize); 2775 } 2776 2777 ptr = PL_langinfo_buf; 2778 e = PL_langinfo_buf + PL_langinfo_bufsize; 2779 2780 /* Find the '1' */ 2781 while (ptr < e && *ptr != '1') { 2782 ptr++; 2783 } 2784 ptr++; 2785 2786 /* Find the '5' */ 2787 item_start = ptr; 2788 while (ptr < e && *ptr != '5') { 2789 ptr++; 2790 } 2791 2792 /* Everything in between is the radix string */ 2793 if (ptr >= e) { 2794 PL_langinfo_buf[0] = '?'; 2795 PL_langinfo_buf[1] = '\0'; 2796 } 2797 else { 2798 *ptr = '\0'; 2799 Move(item_start, PL_langinfo_buf, ptr - PL_langinfo_buf, char); 2800 } 2801 2802 if (toggle) { 2803 RESTORE_LC_NUMERIC(); 2804 } 2805 2806 retval = PL_langinfo_buf; 2807 break; 2808 2809 # else 2810 2811 case RADIXCHAR: /* No special handling needed */ 2812 2813 # endif 2814 2815 case THOUSEP: 2816 2817 if (toggle) { 2818 STORE_LC_NUMERIC_FORCE_TO_UNDERLYING(); 2819 } 2820 2821 LOCALECONV_LOCK; /* Prevent interference with other threads 2822 using localeconv() */ 2823 2824 # ifdef TS_W32_BROKEN_LOCALECONV 2825 2826 /* This should only be for the thousands separator. A 2827 * different work around would be to use GetNumberFormat on a 2828 * known value and parse the result to find the separator */ 2829 save_thread = savepv(my_setlocale(LC_ALL, NULL)); 2830 _configthreadlocale(_DISABLE_PER_THREAD_LOCALE); 2831 save_global = savepv(my_setlocale(LC_ALL, NULL)); 2832 my_setlocale(LC_ALL, save_thread); 2833 # if 0 2834 /* This is the start of code that for broken Windows replaces 2835 * the above and below code, and instead calls 2836 * GetNumberFormat() and then would parse that to find the 2837 * thousands separator. It needs to handle UTF-16 vs -8 2838 * issues. */ 2839 2840 needed_size = GetNumberFormatEx(PL_numeric_name, 0, "1234.5", NULL, PL_langinfo_buf, PL_langinfo_bufsize); 2841 DEBUG_L(PerlIO_printf(Perl_debug_log, 2842 "%s: %d: return from GetNumber, count=%d, val=%s\n", 2843 __FILE__, __LINE__, needed_size, PL_langinfo_buf)); 2844 2845 # endif 2846 # endif 2847 2848 lc = localeconv(); 2849 if (! lc) { 2850 temp = ""; 2851 } 2852 else { 2853 temp = (item == RADIXCHAR) 2854 ? lc->decimal_point 2855 : lc->thousands_sep; 2856 if (! temp) { 2857 temp = ""; 2858 } 2859 } 2860 2861 retval = save_to_buffer(temp, &PL_langinfo_buf, 2862 &PL_langinfo_bufsize, 0); 2863 2864 # ifdef TS_W32_BROKEN_LOCALECONV 2865 2866 my_setlocale(LC_ALL, save_global); 2867 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); 2868 my_setlocale(LC_ALL, save_thread); 2869 Safefree(save_global); 2870 Safefree(save_thread); 2871 2872 # endif 2873 2874 LOCALECONV_UNLOCK; 2875 2876 if (toggle) { 2877 RESTORE_LC_NUMERIC(); 2878 } 2879 2880 break; 2881 2882 # endif 2883 # ifdef HAS_STRFTIME 2884 2885 /* These are defined by C89, so we assume that strftime supports 2886 * them, and so are returned unconditionally; they may not be what 2887 * the locale actually says, but should give good enough results 2888 * for someone using them as formats (as opposed to trying to parse 2889 * them to figure out what the locale says). The other format 2890 * items are actually tested to verify they work on the platform */ 2891 case D_FMT: return "%x"; 2892 case T_FMT: return "%X"; 2893 case D_T_FMT: return "%c"; 2894 2895 /* These formats are only available in later strfmtime's */ 2896 case ERA_D_FMT: case ERA_T_FMT: case ERA_D_T_FMT: case T_FMT_AMPM: 2897 2898 /* The rest can be gotten from most versions of strftime(). */ 2899 case ABDAY_1: case ABDAY_2: case ABDAY_3: 2900 case ABDAY_4: case ABDAY_5: case ABDAY_6: case ABDAY_7: 2901 case ALT_DIGITS: 2902 case AM_STR: case PM_STR: 2903 case ABMON_1: case ABMON_2: case ABMON_3: case ABMON_4: 2904 case ABMON_5: case ABMON_6: case ABMON_7: case ABMON_8: 2905 case ABMON_9: case ABMON_10: case ABMON_11: case ABMON_12: 2906 case DAY_1: case DAY_2: case DAY_3: case DAY_4: 2907 case DAY_5: case DAY_6: case DAY_7: 2908 case MON_1: case MON_2: case MON_3: case MON_4: 2909 case MON_5: case MON_6: case MON_7: case MON_8: 2910 case MON_9: case MON_10: case MON_11: case MON_12: 2911 2912 init_tm(&tm); /* Precaution against core dumps */ 2913 tm.tm_sec = 30; 2914 tm.tm_min = 30; 2915 tm.tm_hour = 6; 2916 tm.tm_year = 2017 - 1900; 2917 tm.tm_wday = 0; 2918 tm.tm_mon = 0; 2919 2920 GCC_DIAG_IGNORE_STMT(-Wimplicit-fallthrough); 2921 2922 switch (item) { 2923 default: 2924 Perl_croak(aTHX_ 2925 "panic: %s: %d: switch case: %d problem", 2926 __FILE__, __LINE__, item); 2927 NOT_REACHED; /* NOTREACHED */ 2928 2929 case PM_STR: tm.tm_hour = 18; 2930 case AM_STR: 2931 format = "%p"; 2932 break; 2933 2934 case ABDAY_7: tm.tm_wday++; 2935 case ABDAY_6: tm.tm_wday++; 2936 case ABDAY_5: tm.tm_wday++; 2937 case ABDAY_4: tm.tm_wday++; 2938 case ABDAY_3: tm.tm_wday++; 2939 case ABDAY_2: tm.tm_wday++; 2940 case ABDAY_1: 2941 format = "%a"; 2942 break; 2943 2944 case DAY_7: tm.tm_wday++; 2945 case DAY_6: tm.tm_wday++; 2946 case DAY_5: tm.tm_wday++; 2947 case DAY_4: tm.tm_wday++; 2948 case DAY_3: tm.tm_wday++; 2949 case DAY_2: tm.tm_wday++; 2950 case DAY_1: 2951 format = "%A"; 2952 break; 2953 2954 case ABMON_12: tm.tm_mon++; 2955 case ABMON_11: tm.tm_mon++; 2956 case ABMON_10: tm.tm_mon++; 2957 case ABMON_9: tm.tm_mon++; 2958 case ABMON_8: tm.tm_mon++; 2959 case ABMON_7: tm.tm_mon++; 2960 case ABMON_6: tm.tm_mon++; 2961 case ABMON_5: tm.tm_mon++; 2962 case ABMON_4: tm.tm_mon++; 2963 case ABMON_3: tm.tm_mon++; 2964 case ABMON_2: tm.tm_mon++; 2965 case ABMON_1: 2966 format = "%b"; 2967 break; 2968 2969 case MON_12: tm.tm_mon++; 2970 case MON_11: tm.tm_mon++; 2971 case MON_10: tm.tm_mon++; 2972 case MON_9: tm.tm_mon++; 2973 case MON_8: tm.tm_mon++; 2974 case MON_7: tm.tm_mon++; 2975 case MON_6: tm.tm_mon++; 2976 case MON_5: tm.tm_mon++; 2977 case MON_4: tm.tm_mon++; 2978 case MON_3: tm.tm_mon++; 2979 case MON_2: tm.tm_mon++; 2980 case MON_1: 2981 format = "%B"; 2982 break; 2983 2984 case T_FMT_AMPM: 2985 format = "%r"; 2986 return_format = TRUE; 2987 break; 2988 2989 case ERA_D_FMT: 2990 format = "%Ex"; 2991 return_format = TRUE; 2992 break; 2993 2994 case ERA_T_FMT: 2995 format = "%EX"; 2996 return_format = TRUE; 2997 break; 2998 2999 case ERA_D_T_FMT: 3000 format = "%Ec"; 3001 return_format = TRUE; 3002 break; 3003 3004 case ALT_DIGITS: 3005 tm.tm_wday = 0; 3006 format = "%Ow"; /* Find the alternate digit for 0 */ 3007 break; 3008 } 3009 3010 GCC_DIAG_RESTORE_STMT; 3011 3012 /* We can't use my_strftime() because it doesn't look at 3013 * tm_wday */ 3014 while (0 == strftime(PL_langinfo_buf, PL_langinfo_bufsize, 3015 format, &tm)) 3016 { 3017 /* A zero return means one of: 3018 * a) there wasn't enough space in PL_langinfo_buf 3019 * b) the format, like a plain %p, returns empty 3020 * c) it was an illegal format, though some 3021 * implementations of strftime will just return the 3022 * illegal format as a plain character sequence. 3023 * 3024 * To quickly test for case 'b)', try again but precede 3025 * the format with a plain character. If that result is 3026 * still empty, the problem is either 'a)' or 'c)' */ 3027 3028 Size_t format_size = strlen(format) + 1; 3029 Size_t mod_size = format_size + 1; 3030 char * mod_format; 3031 char * temp_result; 3032 3033 Newx(mod_format, mod_size, char); 3034 Newx(temp_result, PL_langinfo_bufsize, char); 3035 *mod_format = ' '; 3036 my_strlcpy(mod_format + 1, format, mod_size); 3037 len = strftime(temp_result, 3038 PL_langinfo_bufsize, 3039 mod_format, &tm); 3040 Safefree(mod_format); 3041 Safefree(temp_result); 3042 3043 /* If 'len' is non-zero, it means that we had a case like 3044 * %p which means the current locale doesn't use a.m. or 3045 * p.m., and that is valid */ 3046 if (len == 0) { 3047 3048 /* Here, still didn't work. If we get well beyond a 3049 * reasonable size, bail out to prevent an infinite 3050 * loop. */ 3051 3052 if (PL_langinfo_bufsize > 100 * format_size) { 3053 *PL_langinfo_buf = '\0'; 3054 } 3055 else { 3056 /* Double the buffer size to retry; Add 1 in case 3057 * original was 0, so we aren't stuck at 0. */ 3058 PL_langinfo_bufsize *= 2; 3059 PL_langinfo_bufsize++; 3060 Renew(PL_langinfo_buf, PL_langinfo_bufsize, char); 3061 continue; 3062 } 3063 } 3064 3065 break; 3066 } 3067 3068 /* Here, we got a result. 3069 * 3070 * If the item is 'ALT_DIGITS', PL_langinfo_buf contains the 3071 * alternate format for wday 0. If the value is the same as 3072 * the normal 0, there isn't an alternate, so clear the buffer. 3073 * */ 3074 if ( item == ALT_DIGITS 3075 && strEQ(PL_langinfo_buf, "0")) 3076 { 3077 *PL_langinfo_buf = '\0'; 3078 } 3079 3080 /* ALT_DIGITS is problematic. Experiments on it showed that 3081 * strftime() did not always work properly when going from 3082 * alt-9 to alt-10. Only a few locales have this item defined, 3083 * and in all of them on Linux that khw was able to find, 3084 * nl_langinfo() merely returned the alt-0 character, possibly 3085 * doubled. Most Unicode digits are in blocks of 10 3086 * consecutive code points, so that is sufficient information 3087 * for those scripts, as we can infer alt-1, alt-2, .... But 3088 * for a Japanese locale, a CJK ideographic 0 is returned, and 3089 * the CJK digits are not in code point order, so you can't 3090 * really infer anything. The localedef for this locale did 3091 * specify the succeeding digits, so that strftime() works 3092 * properly on them, without needing to infer anything. But 3093 * the nl_langinfo() return did not give sufficient information 3094 * for the caller to understand what's going on. So until 3095 * there is evidence that it should work differently, this 3096 * returns the alt-0 string for ALT_DIGITS. 3097 * 3098 * wday was chosen because its range is all a single digit. 3099 * Things like tm_sec have two digits as the minimum: '00' */ 3100 3101 retval = PL_langinfo_buf; 3102 3103 /* If to return the format, not the value, overwrite the buffer 3104 * with it. But some strftime()s will keep the original format 3105 * if illegal, so change those to "" */ 3106 if (return_format) { 3107 if (strEQ(PL_langinfo_buf, format)) { 3108 *PL_langinfo_buf = '\0'; 3109 } 3110 else { 3111 retval = save_to_buffer(format, &PL_langinfo_buf, 3112 &PL_langinfo_bufsize, 0); 3113 } 3114 } 3115 3116 break; 3117 3118 # endif 3119 3120 } 3121 } 3122 3123 return retval; 3124 3125 #endif 3126 3127 } 3128 3129 /* 3130 * Initialize locale awareness. 3131 */ 3132 int 3133 Perl_init_i18nl10n(pTHX_ int printwarn) 3134 { 3135 /* printwarn is 3136 * 3137 * 0 if not to output warning when setup locale is bad 3138 * 1 if to output warning based on value of PERL_BADLANG 3139 * >1 if to output regardless of PERL_BADLANG 3140 * 3141 * returns 3142 * 1 = set ok or not applicable, 3143 * 0 = fallback to a locale of lower priority 3144 * -1 = fallback to all locales failed, not even to the C locale 3145 * 3146 * Under -DDEBUGGING, if the environment variable PERL_DEBUG_LOCALE_INIT is 3147 * set, debugging information is output. 3148 * 3149 * This looks more complicated than it is, mainly due to the #ifdefs and 3150 * error handling. 3151 * 3152 * Besides some asserts, data structure initialization, and specific 3153 * platform complications, this routine is effectively just two things. 3154 * 3155 * a) setlocale(LC_ALL, ""); 3156 * 3157 * which sets LC_ALL to the values in the current environment. 3158 * 3159 * And for each individual category 'foo' whose value we care about: 3160 * 3161 * b) save_foo = setlocale(LC_foo, NULL); handle_foo(save_foo); 3162 * 3163 * (We don't tend to care about categories like LC_PAPER, for example.) 3164 * 3165 * But there are complications. On systems without LC_ALL, it emulates 3166 * step a) by looping through all the categories, and doing 3167 * 3168 * setlocale(LC_foo, ""); 3169 * 3170 * on each. 3171 * 3172 * And it has to deal with if this is an embedded perl, whose locale 3173 * doesn't come from the environment, but has been set up by the caller. 3174 * This is pretty simply handled: the "" in the setlocale calls is not a 3175 * string constant, but a variable which is set to NULL in the embedded 3176 * case. 3177 * 3178 * But the major complication is handling failure and doing fallback. 3179 * There is an array, trial_locales, the elements of which are looped over 3180 * until the locale is successfully set. The array is initialized with 3181 * just one element, for 3182 * setlocale(LC_ALL, $NULL_or_empty) 3183 * If that works, as it almost always does, there's no more elements and 3184 * the loop iterates just the once. Otherwise elements are added for each 3185 * of the environment variables that POSIX dictates should control the 3186 * program, in priority order, with a final one being "C". The loop is 3187 * repeated until the first one succeeds. If all fail, we limp along with 3188 * whatever state we got to. If there is no LC_ALL, an inner loop is run 3189 * through all categories (making things look complex). 3190 * 3191 * A further complication is that Windows has an additional fallback, the 3192 * user-default ANSI code page obtained from the operating system. This is 3193 * added as yet another loop iteration, just before the final "C" 3194 * 3195 * On Ultrix, the locale MUST come from the environment, so there is 3196 * preliminary code to set it. I (khw) am not sure that it is necessary, 3197 * and that this couldn't be folded into the loop, but barring any real 3198 * platforms to test on, it's staying as-is 3199 */ 3200 3201 int ok = 1; 3202 3203 #ifndef USE_LOCALE 3204 3205 PERL_UNUSED_ARG(printwarn); 3206 3207 #else /* USE_LOCALE */ 3208 # ifdef __GLIBC__ 3209 3210 const char * const language = PerlEnv_getenv("LANGUAGE"); 3211 3212 # endif 3213 3214 /* NULL uses the existing already set up locale */ 3215 const char * const setlocale_init = (PerlEnv_getenv("PERL_SKIP_LOCALE_INIT")) 3216 ? NULL 3217 : ""; 3218 const char* trial_locales[5]; /* 5 = 1 each for "", LC_ALL, LANG, "", C */ 3219 unsigned int trial_locales_count; 3220 const char * const lc_all = PerlEnv_getenv("LC_ALL"); 3221 const char * const lang = PerlEnv_getenv("LANG"); 3222 bool setlocale_failure = FALSE; 3223 unsigned int i; 3224 3225 /* A later getenv() could zap this, so only use here */ 3226 const char * const bad_lang_use_once = PerlEnv_getenv("PERL_BADLANG"); 3227 3228 const bool locwarn = (printwarn > 1 3229 || ( printwarn 3230 && ( ! bad_lang_use_once 3231 || ( 3232 /* disallow with "" or "0" */ 3233 *bad_lang_use_once 3234 && strNE("0", bad_lang_use_once))))); 3235 3236 /* setlocale() return vals; not copied so must be looked at immediately */ 3237 const char * sl_result[NOMINAL_LC_ALL_INDEX + 1]; 3238 3239 /* current locale for given category; should have been copied so aren't 3240 * volatile */ 3241 const char * curlocales[NOMINAL_LC_ALL_INDEX + 1]; 3242 3243 # ifdef WIN32 3244 3245 /* In some systems you can find out the system default locale 3246 * and use that as the fallback locale. */ 3247 # define SYSTEM_DEFAULT_LOCALE 3248 # endif 3249 # ifdef SYSTEM_DEFAULT_LOCALE 3250 3251 const char *system_default_locale = NULL; 3252 3253 # endif 3254 3255 # ifndef DEBUGGING 3256 # define DEBUG_LOCALE_INIT(a,b,c) 3257 # else 3258 3259 DEBUG_INITIALIZATION_set(cBOOL(PerlEnv_getenv("PERL_DEBUG_LOCALE_INIT"))); 3260 3261 # define DEBUG_LOCALE_INIT(category, locale, result) \ 3262 STMT_START { \ 3263 if (debug_initialization) { \ 3264 PerlIO_printf(Perl_debug_log, \ 3265 "%s:%d: %s\n", \ 3266 __FILE__, __LINE__, \ 3267 setlocale_debug_string(category, \ 3268 locale, \ 3269 result)); \ 3270 } \ 3271 } STMT_END 3272 3273 /* Make sure the parallel arrays are properly set up */ 3274 # ifdef USE_LOCALE_NUMERIC 3275 assert(categories[LC_NUMERIC_INDEX] == LC_NUMERIC); 3276 assert(strEQ(category_names[LC_NUMERIC_INDEX], "LC_NUMERIC")); 3277 # ifdef USE_POSIX_2008_LOCALE 3278 assert(category_masks[LC_NUMERIC_INDEX] == LC_NUMERIC_MASK); 3279 # endif 3280 # endif 3281 # ifdef USE_LOCALE_CTYPE 3282 assert(categories[LC_CTYPE_INDEX] == LC_CTYPE); 3283 assert(strEQ(category_names[LC_CTYPE_INDEX], "LC_CTYPE")); 3284 # ifdef USE_POSIX_2008_LOCALE 3285 assert(category_masks[LC_CTYPE_INDEX] == LC_CTYPE_MASK); 3286 # endif 3287 # endif 3288 # ifdef USE_LOCALE_COLLATE 3289 assert(categories[LC_COLLATE_INDEX] == LC_COLLATE); 3290 assert(strEQ(category_names[LC_COLLATE_INDEX], "LC_COLLATE")); 3291 # ifdef USE_POSIX_2008_LOCALE 3292 assert(category_masks[LC_COLLATE_INDEX] == LC_COLLATE_MASK); 3293 # endif 3294 # endif 3295 # ifdef USE_LOCALE_TIME 3296 assert(categories[LC_TIME_INDEX] == LC_TIME); 3297 assert(strEQ(category_names[LC_TIME_INDEX], "LC_TIME")); 3298 # ifdef USE_POSIX_2008_LOCALE 3299 assert(category_masks[LC_TIME_INDEX] == LC_TIME_MASK); 3300 # endif 3301 # endif 3302 # ifdef USE_LOCALE_MESSAGES 3303 assert(categories[LC_MESSAGES_INDEX] == LC_MESSAGES); 3304 assert(strEQ(category_names[LC_MESSAGES_INDEX], "LC_MESSAGES")); 3305 # ifdef USE_POSIX_2008_LOCALE 3306 assert(category_masks[LC_MESSAGES_INDEX] == LC_MESSAGES_MASK); 3307 # endif 3308 # endif 3309 # ifdef USE_LOCALE_MONETARY 3310 assert(categories[LC_MONETARY_INDEX] == LC_MONETARY); 3311 assert(strEQ(category_names[LC_MONETARY_INDEX], "LC_MONETARY")); 3312 # ifdef USE_POSIX_2008_LOCALE 3313 assert(category_masks[LC_MONETARY_INDEX] == LC_MONETARY_MASK); 3314 # endif 3315 # endif 3316 # ifdef USE_LOCALE_ADDRESS 3317 assert(categories[LC_ADDRESS_INDEX] == LC_ADDRESS); 3318 assert(strEQ(category_names[LC_ADDRESS_INDEX], "LC_ADDRESS")); 3319 # ifdef USE_POSIX_2008_LOCALE 3320 assert(category_masks[LC_ADDRESS_INDEX] == LC_ADDRESS_MASK); 3321 # endif 3322 # endif 3323 # ifdef USE_LOCALE_IDENTIFICATION 3324 assert(categories[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION); 3325 assert(strEQ(category_names[LC_IDENTIFICATION_INDEX], "LC_IDENTIFICATION")); 3326 # ifdef USE_POSIX_2008_LOCALE 3327 assert(category_masks[LC_IDENTIFICATION_INDEX] == LC_IDENTIFICATION_MASK); 3328 # endif 3329 # endif 3330 # ifdef USE_LOCALE_MEASUREMENT 3331 assert(categories[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT); 3332 assert(strEQ(category_names[LC_MEASUREMENT_INDEX], "LC_MEASUREMENT")); 3333 # ifdef USE_POSIX_2008_LOCALE 3334 assert(category_masks[LC_MEASUREMENT_INDEX] == LC_MEASUREMENT_MASK); 3335 # endif 3336 # endif 3337 # ifdef USE_LOCALE_PAPER 3338 assert(categories[LC_PAPER_INDEX] == LC_PAPER); 3339 assert(strEQ(category_names[LC_PAPER_INDEX], "LC_PAPER")); 3340 # ifdef USE_POSIX_2008_LOCALE 3341 assert(category_masks[LC_PAPER_INDEX] == LC_PAPER_MASK); 3342 # endif 3343 # endif 3344 # ifdef USE_LOCALE_TELEPHONE 3345 assert(categories[LC_TELEPHONE_INDEX] == LC_TELEPHONE); 3346 assert(strEQ(category_names[LC_TELEPHONE_INDEX], "LC_TELEPHONE")); 3347 # ifdef USE_POSIX_2008_LOCALE 3348 assert(category_masks[LC_TELEPHONE_INDEX] == LC_TELEPHONE_MASK); 3349 # endif 3350 # endif 3351 # ifdef USE_LOCALE_SYNTAX 3352 assert(categories[LC_SYNTAX_INDEX] == LC_SYNTAX); 3353 assert(strEQ(category_names[LC_SYNTAX_INDEX], "LC_SYNTAX")); 3354 # ifdef USE_POSIX_2008_LOCALE 3355 assert(category_masks[LC_SYNTAX_INDEX] == LC_SYNTAX_MASK); 3356 # endif 3357 # endif 3358 # ifdef USE_LOCALE_TOD 3359 assert(categories[LC_TOD_INDEX] == LC_TOD); 3360 assert(strEQ(category_names[LC_TOD_INDEX], "LC_TOD")); 3361 # ifdef USE_POSIX_2008_LOCALE 3362 assert(category_masks[LC_TOD_INDEX] == LC_TOD_MASK); 3363 # endif 3364 # endif 3365 # ifdef LC_ALL 3366 assert(categories[LC_ALL_INDEX] == LC_ALL); 3367 assert(strEQ(category_names[LC_ALL_INDEX], "LC_ALL")); 3368 assert(NOMINAL_LC_ALL_INDEX == LC_ALL_INDEX); 3369 # ifdef USE_POSIX_2008_LOCALE 3370 assert(category_masks[LC_ALL_INDEX] == LC_ALL_MASK); 3371 # endif 3372 # endif 3373 # endif /* DEBUGGING */ 3374 3375 /* Initialize the per-thread mbrFOO() state variables. See POSIX.xs for 3376 * why these particular incantations are used. */ 3377 #ifdef HAS_MBRLEN 3378 memzero(&PL_mbrlen_ps, sizeof(PL_mbrlen_ps)); 3379 #endif 3380 #ifdef HAS_MBRTOWC 3381 memzero(&PL_mbrtowc_ps, sizeof(PL_mbrtowc_ps)); 3382 #endif 3383 #ifdef HAS_WCTOMBR 3384 wcrtomb(NULL, L'\0', &PL_wcrtomb_ps); 3385 #endif 3386 3387 /* Initialize the cache of the program's UTF-8ness for the always known 3388 * locales C and POSIX */ 3389 my_strlcpy(PL_locale_utf8ness, C_and_POSIX_utf8ness, 3390 sizeof(PL_locale_utf8ness)); 3391 3392 /* See https://github.com/Perl/perl5/issues/17824 */ 3393 Zero(curlocales, NOMINAL_LC_ALL_INDEX, char *); 3394 3395 # ifdef USE_THREAD_SAFE_LOCALE 3396 # ifdef WIN32 3397 3398 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); 3399 3400 # endif 3401 # endif 3402 # ifdef USE_POSIX_2008_LOCALE 3403 3404 PL_C_locale_obj = newlocale(LC_ALL_MASK, "C", (locale_t) 0); 3405 if (! PL_C_locale_obj) { 3406 Perl_croak_nocontext( 3407 "panic: Cannot create POSIX 2008 C locale object; errno=%d", errno); 3408 } 3409 if (DEBUG_Lv_TEST || debug_initialization) { 3410 PerlIO_printf(Perl_debug_log, "%s:%d: created C object %p\n", __FILE__, __LINE__, PL_C_locale_obj); 3411 } 3412 3413 # endif 3414 3415 # ifdef USE_LOCALE_NUMERIC 3416 3417 PL_numeric_radix_sv = newSVpvs("."); 3418 3419 # endif 3420 3421 # if defined(USE_POSIX_2008_LOCALE) && ! defined(HAS_QUERYLOCALE) 3422 3423 /* Initialize our records. If we have POSIX 2008, we have LC_ALL */ 3424 do_setlocale_c(LC_ALL, my_setlocale(LC_ALL, NULL)); 3425 3426 # endif 3427 # ifdef LOCALE_ENVIRON_REQUIRED 3428 3429 /* 3430 * Ultrix setlocale(..., "") fails if there are no environment 3431 * variables from which to get a locale name. 3432 */ 3433 3434 # ifndef LC_ALL 3435 # error Ultrix without LC_ALL not implemented 3436 # else 3437 3438 { 3439 bool done = FALSE; 3440 if (lang) { 3441 sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, setlocale_init); 3442 DEBUG_LOCALE_INIT(LC_ALL, setlocale_init, sl_result[LC_ALL_INDEX]); 3443 if (sl_result[LC_ALL_INDEX]) 3444 done = TRUE; 3445 else 3446 setlocale_failure = TRUE; 3447 } 3448 if (! setlocale_failure) { 3449 const char * locale_param; 3450 for (i = 0; i < LC_ALL_INDEX; i++) { 3451 locale_param = (! done && (lang || PerlEnv_getenv(category_names[i]))) 3452 ? setlocale_init 3453 : NULL; 3454 sl_result[i] = do_setlocale_r(categories[i], locale_param); 3455 if (! sl_result[i]) { 3456 setlocale_failure = TRUE; 3457 } 3458 DEBUG_LOCALE_INIT(categories[i], locale_param, sl_result[i]); 3459 } 3460 } 3461 } 3462 3463 # endif /* LC_ALL */ 3464 # endif /* LOCALE_ENVIRON_REQUIRED */ 3465 3466 /* We try each locale in the list until we get one that works, or exhaust 3467 * the list. Normally the loop is executed just once. But if setting the 3468 * locale fails, inside the loop we add fallback trials to the array and so 3469 * will execute the loop multiple times */ 3470 trial_locales[0] = setlocale_init; 3471 trial_locales_count = 1; 3472 3473 for (i= 0; i < trial_locales_count; i++) { 3474 const char * trial_locale = trial_locales[i]; 3475 3476 if (i > 0) { 3477 3478 /* XXX This is to preserve old behavior for LOCALE_ENVIRON_REQUIRED 3479 * when i==0, but I (khw) don't think that behavior makes much 3480 * sense */ 3481 setlocale_failure = FALSE; 3482 3483 # ifdef SYSTEM_DEFAULT_LOCALE 3484 # ifdef WIN32 /* Note that assumes Win32 has LC_ALL */ 3485 3486 /* On Windows machines, an entry of "" after the 0th means to use 3487 * the system default locale, which we now proceed to get. */ 3488 if (strEQ(trial_locale, "")) { 3489 unsigned int j; 3490 3491 /* Note that this may change the locale, but we are going to do 3492 * that anyway just below */ 3493 system_default_locale = do_setlocale_c(LC_ALL, ""); 3494 DEBUG_LOCALE_INIT(LC_ALL, "", system_default_locale); 3495 3496 /* Skip if invalid or if it's already on the list of locales to 3497 * try */ 3498 if (! system_default_locale) { 3499 goto next_iteration; 3500 } 3501 for (j = 0; j < trial_locales_count; j++) { 3502 if (strEQ(system_default_locale, trial_locales[j])) { 3503 goto next_iteration; 3504 } 3505 } 3506 3507 trial_locale = system_default_locale; 3508 } 3509 # else 3510 # error SYSTEM_DEFAULT_LOCALE only implemented for Win32 3511 # endif 3512 # endif /* SYSTEM_DEFAULT_LOCALE */ 3513 3514 } /* For i > 0 */ 3515 3516 # ifdef LC_ALL 3517 3518 sl_result[LC_ALL_INDEX] = do_setlocale_c(LC_ALL, trial_locale); 3519 DEBUG_LOCALE_INIT(LC_ALL, trial_locale, sl_result[LC_ALL_INDEX]); 3520 if (! sl_result[LC_ALL_INDEX]) { 3521 setlocale_failure = TRUE; 3522 } 3523 else { 3524 /* Since LC_ALL succeeded, it should have changed all the other 3525 * categories it can to its value; so we massage things so that the 3526 * setlocales below just return their category's current values. 3527 * This adequately handles the case in NetBSD where LC_COLLATE may 3528 * not be defined for a locale, and setting it individually will 3529 * fail, whereas setting LC_ALL succeeds, leaving LC_COLLATE set to 3530 * the POSIX locale. */ 3531 trial_locale = NULL; 3532 } 3533 3534 # endif /* LC_ALL */ 3535 3536 if (! setlocale_failure) { 3537 unsigned int j; 3538 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) { 3539 curlocales[j] 3540 = savepv(do_setlocale_r(categories[j], trial_locale)); 3541 if (! curlocales[j]) { 3542 setlocale_failure = TRUE; 3543 } 3544 DEBUG_LOCALE_INIT(categories[j], trial_locale, curlocales[j]); 3545 } 3546 3547 if (LIKELY(! setlocale_failure)) { /* All succeeded */ 3548 break; /* Exit trial_locales loop */ 3549 } 3550 } 3551 3552 /* Here, something failed; will need to try a fallback. */ 3553 ok = 0; 3554 3555 if (i == 0) { 3556 unsigned int j; 3557 3558 if (locwarn) { /* Output failure info only on the first one */ 3559 3560 # ifdef LC_ALL 3561 3562 PerlIO_printf(Perl_error_log, 3563 "perl: warning: Setting locale failed.\n"); 3564 3565 # else /* !LC_ALL */ 3566 3567 PerlIO_printf(Perl_error_log, 3568 "perl: warning: Setting locale failed for the categories:\n\t"); 3569 3570 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) { 3571 if (! curlocales[j]) { 3572 PerlIO_printf(Perl_error_log, category_names[j]); 3573 } 3574 else { 3575 Safefree(curlocales[j]); 3576 } 3577 } 3578 3579 # endif /* LC_ALL */ 3580 3581 PerlIO_printf(Perl_error_log, 3582 "perl: warning: Please check that your locale settings:\n"); 3583 3584 # ifdef __GLIBC__ 3585 3586 PerlIO_printf(Perl_error_log, 3587 "\tLANGUAGE = %c%s%c,\n", 3588 language ? '"' : '(', 3589 language ? language : "unset", 3590 language ? '"' : ')'); 3591 # endif 3592 3593 PerlIO_printf(Perl_error_log, 3594 "\tLC_ALL = %c%s%c,\n", 3595 lc_all ? '"' : '(', 3596 lc_all ? lc_all : "unset", 3597 lc_all ? '"' : ')'); 3598 3599 # if defined(USE_ENVIRON_ARRAY) 3600 3601 { 3602 char **e; 3603 3604 /* Look through the environment for any variables of the 3605 * form qr/ ^ LC_ [A-Z]+ = /x, except LC_ALL which was 3606 * already handled above. These are assumed to be locale 3607 * settings. Output them and their values. */ 3608 for (e = environ; *e; e++) { 3609 const STRLEN prefix_len = sizeof("LC_") - 1; 3610 STRLEN uppers_len; 3611 3612 if ( strBEGINs(*e, "LC_") 3613 && ! strBEGINs(*e, "LC_ALL=") 3614 && (uppers_len = strspn(*e + prefix_len, 3615 "ABCDEFGHIJKLMNOPQRSTUVWXYZ")) 3616 && ((*e)[prefix_len + uppers_len] == '=')) 3617 { 3618 PerlIO_printf(Perl_error_log, "\t%.*s = \"%s\",\n", 3619 (int) (prefix_len + uppers_len), *e, 3620 *e + prefix_len + uppers_len + 1); 3621 } 3622 } 3623 } 3624 3625 # else 3626 3627 PerlIO_printf(Perl_error_log, 3628 "\t(possibly more locale environment variables)\n"); 3629 3630 # endif 3631 3632 PerlIO_printf(Perl_error_log, 3633 "\tLANG = %c%s%c\n", 3634 lang ? '"' : '(', 3635 lang ? lang : "unset", 3636 lang ? '"' : ')'); 3637 3638 PerlIO_printf(Perl_error_log, 3639 " are supported and installed on your system.\n"); 3640 } 3641 3642 /* Calculate what fallback locales to try. We have avoided this 3643 * until we have to, because failure is quite unlikely. This will 3644 * usually change the upper bound of the loop we are in. 3645 * 3646 * Since the system's default way of setting the locale has not 3647 * found one that works, We use Perl's defined ordering: LC_ALL, 3648 * LANG, and the C locale. We don't try the same locale twice, so 3649 * don't add to the list if already there. (On POSIX systems, the 3650 * LC_ALL element will likely be a repeat of the 0th element "", 3651 * but there's no harm done by doing it explicitly. 3652 * 3653 * Note that this tries the LC_ALL environment variable even on 3654 * systems which have no LC_ALL locale setting. This may or may 3655 * not have been originally intentional, but there's no real need 3656 * to change the behavior. */ 3657 if (lc_all) { 3658 for (j = 0; j < trial_locales_count; j++) { 3659 if (strEQ(lc_all, trial_locales[j])) { 3660 goto done_lc_all; 3661 } 3662 } 3663 trial_locales[trial_locales_count++] = lc_all; 3664 } 3665 done_lc_all: 3666 3667 if (lang) { 3668 for (j = 0; j < trial_locales_count; j++) { 3669 if (strEQ(lang, trial_locales[j])) { 3670 goto done_lang; 3671 } 3672 } 3673 trial_locales[trial_locales_count++] = lang; 3674 } 3675 done_lang: 3676 3677 # if defined(WIN32) && defined(LC_ALL) 3678 3679 /* For Windows, we also try the system default locale before "C". 3680 * (If there exists a Windows without LC_ALL we skip this because 3681 * it gets too complicated. For those, the "C" is the next 3682 * fallback possibility). The "" is the same as the 0th element of 3683 * the array, but the code at the loop above knows to treat it 3684 * differently when not the 0th */ 3685 trial_locales[trial_locales_count++] = ""; 3686 3687 # endif 3688 3689 for (j = 0; j < trial_locales_count; j++) { 3690 if (strEQ("C", trial_locales[j])) { 3691 goto done_C; 3692 } 3693 } 3694 trial_locales[trial_locales_count++] = "C"; 3695 3696 done_C: ; 3697 } /* end of first time through the loop */ 3698 3699 # ifdef WIN32 3700 3701 next_iteration: ; 3702 3703 # endif 3704 3705 } /* end of looping through the trial locales */ 3706 3707 if (ok < 1) { /* If we tried to fallback */ 3708 const char* msg; 3709 if (! setlocale_failure) { /* fallback succeeded */ 3710 msg = "Falling back to"; 3711 } 3712 else { /* fallback failed */ 3713 unsigned int j; 3714 3715 /* We dropped off the end of the loop, so have to decrement i to 3716 * get back to the value the last time through */ 3717 i--; 3718 3719 ok = -1; 3720 msg = "Failed to fall back to"; 3721 3722 /* To continue, we should use whatever values we've got */ 3723 3724 for (j = 0; j < NOMINAL_LC_ALL_INDEX; j++) { 3725 Safefree(curlocales[j]); 3726 curlocales[j] = savepv(do_setlocale_r(categories[j], NULL)); 3727 DEBUG_LOCALE_INIT(categories[j], NULL, curlocales[j]); 3728 } 3729 } 3730 3731 if (locwarn) { 3732 const char * description; 3733 const char * name = ""; 3734 if (strEQ(trial_locales[i], "C")) { 3735 description = "the standard locale"; 3736 name = "C"; 3737 } 3738 3739 # ifdef SYSTEM_DEFAULT_LOCALE 3740 3741 else if (strEQ(trial_locales[i], "")) { 3742 description = "the system default locale"; 3743 if (system_default_locale) { 3744 name = system_default_locale; 3745 } 3746 } 3747 3748 # endif /* SYSTEM_DEFAULT_LOCALE */ 3749 3750 else { 3751 description = "a fallback locale"; 3752 name = trial_locales[i]; 3753 } 3754 if (name && strNE(name, "")) { 3755 PerlIO_printf(Perl_error_log, 3756 "perl: warning: %s %s (\"%s\").\n", msg, description, name); 3757 } 3758 else { 3759 PerlIO_printf(Perl_error_log, 3760 "perl: warning: %s %s.\n", msg, description); 3761 } 3762 } 3763 } /* End of tried to fallback */ 3764 3765 /* Done with finding the locales; update our records */ 3766 3767 # ifdef USE_LOCALE_CTYPE 3768 3769 new_ctype(curlocales[LC_CTYPE_INDEX]); 3770 3771 # endif 3772 # ifdef USE_LOCALE_COLLATE 3773 3774 new_collate(curlocales[LC_COLLATE_INDEX]); 3775 3776 # endif 3777 # ifdef USE_LOCALE_NUMERIC 3778 3779 new_numeric(curlocales[LC_NUMERIC_INDEX]); 3780 3781 # endif 3782 3783 for (i = 0; i < NOMINAL_LC_ALL_INDEX; i++) { 3784 3785 # if defined(USE_ITHREADS) && ! defined(USE_THREAD_SAFE_LOCALE) 3786 3787 /* This caches whether each category's locale is UTF-8 or not. This 3788 * may involve changing the locale. It is ok to do this at 3789 * initialization time before any threads have started, but not later 3790 * unless thread-safe operations are used. 3791 * Caching means that if the program heeds our dictate not to change 3792 * locales in threaded applications, this data will remain valid, and 3793 * it may get queried without having to change locales. If the 3794 * environment is such that all categories have the same locale, this 3795 * isn't needed, as the code will not change the locale; but this 3796 * handles the uncommon case where the environment has disparate 3797 * locales for the categories */ 3798 (void) _is_cur_LC_category_utf8(categories[i]); 3799 3800 # endif 3801 3802 Safefree(curlocales[i]); 3803 } 3804 3805 # if defined(USE_PERLIO) && defined(USE_LOCALE_CTYPE) 3806 3807 /* Set PL_utf8locale to TRUE if using PerlIO _and_ the current LC_CTYPE 3808 * locale is UTF-8. The call to new_ctype() just above has already 3809 * calculated the latter value and saved it in PL_in_utf8_CTYPE_locale. If 3810 * both PL_utf8locale and PL_unicode (set by -C or by $ENV{PERL_UNICODE}) 3811 * are true, perl.c:S_parse_body() will turn on the PerlIO :utf8 layer on 3812 * STDIN, STDOUT, STDERR, _and_ the default open discipline. */ 3813 PL_utf8locale = PL_in_utf8_CTYPE_locale; 3814 3815 /* Set PL_unicode to $ENV{PERL_UNICODE} if using PerlIO. 3816 This is an alternative to using the -C command line switch 3817 (the -C if present will override this). */ 3818 { 3819 const char *p = PerlEnv_getenv("PERL_UNICODE"); 3820 PL_unicode = p ? parse_unicode_opts(&p) : 0; 3821 if (PL_unicode & PERL_UNICODE_UTF8CACHEASSERT_FLAG) 3822 PL_utf8cache = -1; 3823 } 3824 3825 # endif 3826 #endif /* USE_LOCALE */ 3827 #ifdef DEBUGGING 3828 3829 /* So won't continue to output stuff */ 3830 DEBUG_INITIALIZATION_set(FALSE); 3831 3832 #endif 3833 3834 return ok; 3835 } 3836 3837 #ifdef USE_LOCALE_COLLATE 3838 3839 char * 3840 Perl__mem_collxfrm(pTHX_ const char *input_string, 3841 STRLEN len, /* Length of 'input_string' */ 3842 STRLEN *xlen, /* Set to length of returned string 3843 (not including the collation index 3844 prefix) */ 3845 bool utf8 /* Is the input in UTF-8? */ 3846 ) 3847 { 3848 3849 /* _mem_collxfrm() is a bit like strxfrm() but with two important 3850 * differences. First, it handles embedded NULs. Second, it allocates a bit 3851 * more memory than needed for the transformed data itself. The real 3852 * transformed data begins at offset COLLXFRM_HDR_LEN. *xlen is set to 3853 * the length of that, and doesn't include the collation index size. 3854 * Please see sv_collxfrm() to see how this is used. */ 3855 3856 #define COLLXFRM_HDR_LEN sizeof(PL_collation_ix) 3857 3858 char * s = (char *) input_string; 3859 STRLEN s_strlen = strlen(input_string); 3860 char *xbuf = NULL; 3861 STRLEN xAlloc; /* xalloc is a reserved word in VC */ 3862 STRLEN length_in_chars; 3863 bool first_time = TRUE; /* Cleared after first loop iteration */ 3864 3865 PERL_ARGS_ASSERT__MEM_COLLXFRM; 3866 3867 /* Must be NUL-terminated */ 3868 assert(*(input_string + len) == '\0'); 3869 3870 /* If this locale has defective collation, skip */ 3871 if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) { 3872 DEBUG_L(PerlIO_printf(Perl_debug_log, 3873 "_mem_collxfrm: locale's collation is defective\n")); 3874 goto bad; 3875 } 3876 3877 /* Replace any embedded NULs with the control that sorts before any others. 3878 * This will give as good as possible results on strings that don't 3879 * otherwise contain that character, but otherwise there may be 3880 * less-than-perfect results with that character and NUL. This is 3881 * unavoidable unless we replace strxfrm with our own implementation. */ 3882 if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded 3883 NUL */ 3884 char * e = s + len; 3885 char * sans_nuls; 3886 STRLEN sans_nuls_len; 3887 int try_non_controls; 3888 char this_replacement_char[] = "?\0"; /* Room for a two-byte string, 3889 making sure 2nd byte is NUL. 3890 */ 3891 STRLEN this_replacement_len; 3892 3893 /* If we don't know what non-NUL control character sorts lowest for 3894 * this locale, find it */ 3895 if (PL_strxfrm_NUL_replacement == '\0') { 3896 int j; 3897 char * cur_min_x = NULL; /* The min_char's xfrm, (except it also 3898 includes the collation index 3899 prefixed. */ 3900 3901 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n")); 3902 3903 /* Unlikely, but it may be that no control will work to replace 3904 * NUL, in which case we instead look for any character. Controls 3905 * are preferred because collation order is, in general, context 3906 * sensitive, with adjoining characters affecting the order, and 3907 * controls are less likely to have such interactions, allowing the 3908 * NUL-replacement to stand on its own. (Another way to look at it 3909 * is to imagine what would happen if the NUL were replaced by a 3910 * combining character; it wouldn't work out all that well.) */ 3911 for (try_non_controls = 0; 3912 try_non_controls < 2; 3913 try_non_controls++) 3914 { 3915 /* Look through all legal code points (NUL isn't) */ 3916 for (j = 1; j < 256; j++) { 3917 char * x; /* j's xfrm plus collation index */ 3918 STRLEN x_len; /* length of 'x' */ 3919 STRLEN trial_len = 1; 3920 char cur_source[] = { '\0', '\0' }; 3921 3922 /* Skip non-controls the first time through the loop. The 3923 * controls in a UTF-8 locale are the L1 ones */ 3924 if (! try_non_controls && (PL_in_utf8_COLLATE_locale) 3925 ? ! isCNTRL_L1(j) 3926 : ! isCNTRL_LC(j)) 3927 { 3928 continue; 3929 } 3930 3931 /* Create a 1-char string of the current code point */ 3932 cur_source[0] = (char) j; 3933 3934 /* Then transform it */ 3935 x = _mem_collxfrm(cur_source, trial_len, &x_len, 3936 0 /* The string is not in UTF-8 */); 3937 3938 /* Ignore any character that didn't successfully transform. 3939 * */ 3940 if (! x) { 3941 continue; 3942 } 3943 3944 /* If this character's transformation is lower than 3945 * the current lowest, this one becomes the lowest */ 3946 if ( cur_min_x == NULL 3947 || strLT(x + COLLXFRM_HDR_LEN, 3948 cur_min_x + COLLXFRM_HDR_LEN)) 3949 { 3950 PL_strxfrm_NUL_replacement = j; 3951 Safefree(cur_min_x); 3952 cur_min_x = x; 3953 } 3954 else { 3955 Safefree(x); 3956 } 3957 } /* end of loop through all 255 characters */ 3958 3959 /* Stop looking if found */ 3960 if (cur_min_x) { 3961 break; 3962 } 3963 3964 /* Unlikely, but possible, if there aren't any controls that 3965 * work in the locale, repeat the loop, looking for any 3966 * character that works */ 3967 DEBUG_L(PerlIO_printf(Perl_debug_log, 3968 "_mem_collxfrm: No control worked. Trying non-controls\n")); 3969 } /* End of loop to try first the controls, then any char */ 3970 3971 if (! cur_min_x) { 3972 DEBUG_L(PerlIO_printf(Perl_debug_log, 3973 "_mem_collxfrm: Couldn't find any character to replace" 3974 " embedded NULs in locale %s with", PL_collation_name)); 3975 goto bad; 3976 } 3977 3978 DEBUG_L(PerlIO_printf(Perl_debug_log, 3979 "_mem_collxfrm: Replacing embedded NULs in locale %s with " 3980 "0x%02X\n", PL_collation_name, PL_strxfrm_NUL_replacement)); 3981 3982 Safefree(cur_min_x); 3983 } /* End of determining the character that is to replace NULs */ 3984 3985 /* If the replacement is variant under UTF-8, it must match the 3986 * UTF8-ness of the original */ 3987 if ( ! UVCHR_IS_INVARIANT(PL_strxfrm_NUL_replacement) && utf8) { 3988 this_replacement_char[0] = 3989 UTF8_EIGHT_BIT_HI(PL_strxfrm_NUL_replacement); 3990 this_replacement_char[1] = 3991 UTF8_EIGHT_BIT_LO(PL_strxfrm_NUL_replacement); 3992 this_replacement_len = 2; 3993 } 3994 else { 3995 this_replacement_char[0] = PL_strxfrm_NUL_replacement; 3996 /* this_replacement_char[1] = '\0' was done at initialization */ 3997 this_replacement_len = 1; 3998 } 3999 4000 /* The worst case length for the replaced string would be if every 4001 * character in it is NUL. Multiply that by the length of each 4002 * replacement, and allow for a trailing NUL */ 4003 sans_nuls_len = (len * this_replacement_len) + 1; 4004 Newx(sans_nuls, sans_nuls_len, char); 4005 *sans_nuls = '\0'; 4006 4007 /* Replace each NUL with the lowest collating control. Loop until have 4008 * exhausted all the NULs */ 4009 while (s + s_strlen < e) { 4010 my_strlcat(sans_nuls, s, sans_nuls_len); 4011 4012 /* Do the actual replacement */ 4013 my_strlcat(sans_nuls, this_replacement_char, sans_nuls_len); 4014 4015 /* Move past the input NUL */ 4016 s += s_strlen + 1; 4017 s_strlen = strlen(s); 4018 } 4019 4020 /* And add anything that trails the final NUL */ 4021 my_strlcat(sans_nuls, s, sans_nuls_len); 4022 4023 /* Switch so below we transform this modified string */ 4024 s = sans_nuls; 4025 len = strlen(s); 4026 } /* End of replacing NULs */ 4027 4028 /* Make sure the UTF8ness of the string and locale match */ 4029 if (utf8 != PL_in_utf8_COLLATE_locale) { 4030 /* XXX convert above Unicode to 10FFFF? */ 4031 const char * const t = s; /* Temporary so we can later find where the 4032 input was */ 4033 4034 /* Here they don't match. Change the string's to be what the locale is 4035 * expecting */ 4036 4037 if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */ 4038 s = (char *) bytes_to_utf8((const U8 *) s, &len); 4039 utf8 = TRUE; 4040 } 4041 else { /* locale is not UTF-8; but input is; downgrade the input */ 4042 4043 s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8); 4044 4045 /* If the downgrade was successful we are done, but if the input 4046 * contains things that require UTF-8 to represent, have to do 4047 * damage control ... */ 4048 if (UNLIKELY(utf8)) { 4049 4050 /* What we do is construct a non-UTF-8 string with 4051 * 1) the characters representable by a single byte converted 4052 * to be so (if necessary); 4053 * 2) and the rest converted to collate the same as the 4054 * highest collating representable character. That makes 4055 * them collate at the end. This is similar to how we 4056 * handle embedded NULs, but we use the highest collating 4057 * code point instead of the smallest. Like the NUL case, 4058 * this isn't perfect, but is the best we can reasonably 4059 * do. Every above-255 code point will sort the same as 4060 * the highest-sorting 0-255 code point. If that code 4061 * point can combine in a sequence with some other code 4062 * points for weight calculations, us changing something to 4063 * be it can adversely affect the results. But in most 4064 * cases, it should work reasonably. And note that this is 4065 * really an illegal situation: using code points above 255 4066 * on a locale where only 0-255 are valid. If two strings 4067 * sort entirely equal, then the sort order for the 4068 * above-255 code points will be in code point order. */ 4069 4070 utf8 = FALSE; 4071 4072 /* If we haven't calculated the code point with the maximum 4073 * collating order for this locale, do so now */ 4074 if (! PL_strxfrm_max_cp) { 4075 int j; 4076 4077 /* The current transformed string that collates the 4078 * highest (except it also includes the prefixed collation 4079 * index. */ 4080 char * cur_max_x = NULL; 4081 4082 /* Look through all legal code points (NUL isn't) */ 4083 for (j = 1; j < 256; j++) { 4084 char * x; 4085 STRLEN x_len; 4086 char cur_source[] = { '\0', '\0' }; 4087 4088 /* Create a 1-char string of the current code point */ 4089 cur_source[0] = (char) j; 4090 4091 /* Then transform it */ 4092 x = _mem_collxfrm(cur_source, 1, &x_len, FALSE); 4093 4094 /* If something went wrong (which it shouldn't), just 4095 * ignore this code point */ 4096 if (! x) { 4097 continue; 4098 } 4099 4100 /* If this character's transformation is higher than 4101 * the current highest, this one becomes the highest */ 4102 if ( cur_max_x == NULL 4103 || strGT(x + COLLXFRM_HDR_LEN, 4104 cur_max_x + COLLXFRM_HDR_LEN)) 4105 { 4106 PL_strxfrm_max_cp = j; 4107 Safefree(cur_max_x); 4108 cur_max_x = x; 4109 } 4110 else { 4111 Safefree(x); 4112 } 4113 } 4114 4115 if (! cur_max_x) { 4116 DEBUG_L(PerlIO_printf(Perl_debug_log, 4117 "_mem_collxfrm: Couldn't find any character to" 4118 " replace above-Latin1 chars in locale %s with", 4119 PL_collation_name)); 4120 goto bad; 4121 } 4122 4123 DEBUG_L(PerlIO_printf(Perl_debug_log, 4124 "_mem_collxfrm: highest 1-byte collating character" 4125 " in locale %s is 0x%02X\n", 4126 PL_collation_name, 4127 PL_strxfrm_max_cp)); 4128 4129 Safefree(cur_max_x); 4130 } 4131 4132 /* Here we know which legal code point collates the highest. 4133 * We are ready to construct the non-UTF-8 string. The length 4134 * will be at least 1 byte smaller than the input string 4135 * (because we changed at least one 2-byte character into a 4136 * single byte), but that is eaten up by the trailing NUL */ 4137 Newx(s, len, char); 4138 4139 { 4140 STRLEN i; 4141 STRLEN d= 0; 4142 char * e = (char *) t + len; 4143 4144 for (i = 0; i < len; i+= UTF8SKIP(t + i)) { 4145 U8 cur_char = t[i]; 4146 if (UTF8_IS_INVARIANT(cur_char)) { 4147 s[d++] = cur_char; 4148 } 4149 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) { 4150 s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]); 4151 } 4152 else { /* Replace illegal cp with highest collating 4153 one */ 4154 s[d++] = PL_strxfrm_max_cp; 4155 } 4156 } 4157 s[d++] = '\0'; 4158 Renew(s, d, char); /* Free up unused space */ 4159 } 4160 } 4161 } 4162 4163 /* Here, we have constructed a modified version of the input. It could 4164 * be that we already had a modified copy before we did this version. 4165 * If so, that copy is no longer needed */ 4166 if (t != input_string) { 4167 Safefree(t); 4168 } 4169 } 4170 4171 length_in_chars = (utf8) 4172 ? utf8_length((U8 *) s, (U8 *) s + len) 4173 : len; 4174 4175 /* The first element in the output is the collation id, used by 4176 * sv_collxfrm(); then comes the space for the transformed string. The 4177 * equation should give us a good estimate as to how much is needed */ 4178 xAlloc = COLLXFRM_HDR_LEN 4179 + PL_collxfrm_base 4180 + (PL_collxfrm_mult * length_in_chars); 4181 Newx(xbuf, xAlloc, char); 4182 if (UNLIKELY(! xbuf)) { 4183 DEBUG_L(PerlIO_printf(Perl_debug_log, 4184 "_mem_collxfrm: Couldn't malloc %zu bytes\n", xAlloc)); 4185 goto bad; 4186 } 4187 4188 /* Store the collation id */ 4189 *(U32*)xbuf = PL_collation_ix; 4190 4191 /* Then the transformation of the input. We loop until successful, or we 4192 * give up */ 4193 for (;;) { 4194 4195 *xlen = strxfrm(xbuf + COLLXFRM_HDR_LEN, s, xAlloc - COLLXFRM_HDR_LEN); 4196 4197 /* If the transformed string occupies less space than we told strxfrm() 4198 * was available, it means it successfully transformed the whole 4199 * string. */ 4200 if (*xlen < xAlloc - COLLXFRM_HDR_LEN) { 4201 4202 /* Some systems include a trailing NUL in the returned length. 4203 * Ignore it, using a loop in case multiple trailing NULs are 4204 * returned. */ 4205 while ( (*xlen) > 0 4206 && *(xbuf + COLLXFRM_HDR_LEN + (*xlen) - 1) == '\0') 4207 { 4208 (*xlen)--; 4209 } 4210 4211 /* If the first try didn't get it, it means our prediction was low. 4212 * Modify the coefficients so that we predict a larger value in any 4213 * future transformations */ 4214 if (! first_time) { 4215 STRLEN needed = *xlen + 1; /* +1 For trailing NUL */ 4216 STRLEN computed_guess = PL_collxfrm_base 4217 + (PL_collxfrm_mult * length_in_chars); 4218 4219 /* On zero-length input, just keep current slope instead of 4220 * dividing by 0 */ 4221 const STRLEN new_m = (length_in_chars != 0) 4222 ? needed / length_in_chars 4223 : PL_collxfrm_mult; 4224 4225 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 4226 "%s: %d: initial size of %zu bytes for a length " 4227 "%zu string was insufficient, %zu needed\n", 4228 __FILE__, __LINE__, 4229 computed_guess, length_in_chars, needed)); 4230 4231 /* If slope increased, use it, but discard this result for 4232 * length 1 strings, as we can't be sure that it's a real slope 4233 * change */ 4234 if (length_in_chars > 1 && new_m > PL_collxfrm_mult) { 4235 4236 # ifdef DEBUGGING 4237 4238 STRLEN old_m = PL_collxfrm_mult; 4239 STRLEN old_b = PL_collxfrm_base; 4240 4241 # endif 4242 4243 PL_collxfrm_mult = new_m; 4244 PL_collxfrm_base = 1; /* +1 For trailing NUL */ 4245 computed_guess = PL_collxfrm_base 4246 + (PL_collxfrm_mult * length_in_chars); 4247 if (computed_guess < needed) { 4248 PL_collxfrm_base += needed - computed_guess; 4249 } 4250 4251 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 4252 "%s: %d: slope is now %zu; was %zu, base " 4253 "is now %zu; was %zu\n", 4254 __FILE__, __LINE__, 4255 PL_collxfrm_mult, old_m, 4256 PL_collxfrm_base, old_b)); 4257 } 4258 else { /* Slope didn't change, but 'b' did */ 4259 const STRLEN new_b = needed 4260 - computed_guess 4261 + PL_collxfrm_base; 4262 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 4263 "%s: %d: base is now %zu; was %zu\n", 4264 __FILE__, __LINE__, 4265 new_b, PL_collxfrm_base)); 4266 PL_collxfrm_base = new_b; 4267 } 4268 } 4269 4270 break; 4271 } 4272 4273 if (UNLIKELY(*xlen >= PERL_INT_MAX)) { 4274 DEBUG_L(PerlIO_printf(Perl_debug_log, 4275 "_mem_collxfrm: Needed %zu bytes, max permissible is %u\n", 4276 *xlen, PERL_INT_MAX)); 4277 goto bad; 4278 } 4279 4280 /* A well-behaved strxfrm() returns exactly how much space it needs 4281 * (usually not including the trailing NUL) when it fails due to not 4282 * enough space being provided. Assume that this is the case unless 4283 * it's been proven otherwise */ 4284 if (LIKELY(PL_strxfrm_is_behaved) && first_time) { 4285 xAlloc = *xlen + COLLXFRM_HDR_LEN + 1; 4286 } 4287 else { /* Here, either: 4288 * 1) The strxfrm() has previously shown bad behavior; or 4289 * 2) It isn't the first time through the loop, which means 4290 * that the strxfrm() is now showing bad behavior, because 4291 * we gave it what it said was needed in the previous 4292 * iteration, and it came back saying it needed still more. 4293 * (Many versions of cygwin fit this. When the buffer size 4294 * isn't sufficient, they return the input size instead of 4295 * how much is needed.) 4296 * Increase the buffer size by a fixed percentage and try again. 4297 * */ 4298 xAlloc += (xAlloc / 4) + 1; 4299 PL_strxfrm_is_behaved = FALSE; 4300 4301 # ifdef DEBUGGING 4302 4303 if (DEBUG_Lv_TEST || debug_initialization) { 4304 PerlIO_printf(Perl_debug_log, 4305 "_mem_collxfrm required more space than previously calculated" 4306 " for locale %s, trying again with new guess=%zu+%zu\n", 4307 PL_collation_name, COLLXFRM_HDR_LEN, 4308 xAlloc - COLLXFRM_HDR_LEN); 4309 } 4310 4311 # endif 4312 4313 } 4314 4315 Renew(xbuf, xAlloc, char); 4316 if (UNLIKELY(! xbuf)) { 4317 DEBUG_L(PerlIO_printf(Perl_debug_log, 4318 "_mem_collxfrm: Couldn't realloc %zu bytes\n", xAlloc)); 4319 goto bad; 4320 } 4321 4322 first_time = FALSE; 4323 } 4324 4325 4326 # ifdef DEBUGGING 4327 4328 if (DEBUG_Lv_TEST || debug_initialization) { 4329 4330 print_collxfrm_input_and_return(s, s + len, xlen, utf8); 4331 PerlIO_printf(Perl_debug_log, "Its xfrm is:"); 4332 PerlIO_printf(Perl_debug_log, "%s\n", 4333 _byte_dump_string((U8 *) xbuf + COLLXFRM_HDR_LEN, 4334 *xlen, 1)); 4335 } 4336 4337 # endif 4338 4339 /* Free up unneeded space; retain ehough for trailing NUL */ 4340 Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char); 4341 4342 if (s != input_string) { 4343 Safefree(s); 4344 } 4345 4346 return xbuf; 4347 4348 bad: 4349 4350 # ifdef DEBUGGING 4351 4352 if (DEBUG_Lv_TEST || debug_initialization) { 4353 print_collxfrm_input_and_return(s, s + len, NULL, utf8); 4354 } 4355 4356 # endif 4357 4358 Safefree(xbuf); 4359 if (s != input_string) { 4360 Safefree(s); 4361 } 4362 *xlen = 0; 4363 4364 return NULL; 4365 } 4366 4367 # ifdef DEBUGGING 4368 4369 STATIC void 4370 S_print_collxfrm_input_and_return(pTHX_ 4371 const char * const s, 4372 const char * const e, 4373 const STRLEN * const xlen, 4374 const bool is_utf8) 4375 { 4376 4377 PERL_ARGS_ASSERT_PRINT_COLLXFRM_INPUT_AND_RETURN; 4378 4379 PerlIO_printf(Perl_debug_log, "_mem_collxfrm[%" UVuf "]: returning ", 4380 (UV)PL_collation_ix); 4381 if (xlen) { 4382 PerlIO_printf(Perl_debug_log, "%zu", *xlen); 4383 } 4384 else { 4385 PerlIO_printf(Perl_debug_log, "NULL"); 4386 } 4387 PerlIO_printf(Perl_debug_log, " for locale '%s', string='", 4388 PL_collation_name); 4389 print_bytes_for_locale(s, e, is_utf8); 4390 4391 PerlIO_printf(Perl_debug_log, "'\n"); 4392 } 4393 4394 # endif /* DEBUGGING */ 4395 #endif /* USE_LOCALE_COLLATE */ 4396 #ifdef USE_LOCALE 4397 # ifdef DEBUGGING 4398 4399 STATIC void 4400 S_print_bytes_for_locale(pTHX_ 4401 const char * const s, 4402 const char * const e, 4403 const bool is_utf8) 4404 { 4405 const char * t = s; 4406 bool prev_was_printable = TRUE; 4407 bool first_time = TRUE; 4408 4409 PERL_ARGS_ASSERT_PRINT_BYTES_FOR_LOCALE; 4410 4411 while (t < e) { 4412 UV cp = (is_utf8) 4413 ? utf8_to_uvchr_buf((U8 *) t, e, NULL) 4414 : * (U8 *) t; 4415 if (isPRINT(cp)) { 4416 if (! prev_was_printable) { 4417 PerlIO_printf(Perl_debug_log, " "); 4418 } 4419 PerlIO_printf(Perl_debug_log, "%c", (U8) cp); 4420 prev_was_printable = TRUE; 4421 } 4422 else { 4423 if (! first_time) { 4424 PerlIO_printf(Perl_debug_log, " "); 4425 } 4426 PerlIO_printf(Perl_debug_log, "%02" UVXf, cp); 4427 prev_was_printable = FALSE; 4428 } 4429 t += (is_utf8) ? UTF8SKIP(t) : 1; 4430 first_time = FALSE; 4431 } 4432 } 4433 4434 # endif /* #ifdef DEBUGGING */ 4435 4436 STATIC const char * 4437 S_switch_category_locale_to_template(pTHX_ const int switch_category, const int template_category, const char * template_locale) 4438 { 4439 /* Changes the locale for LC_'switch_category" to that of 4440 * LC_'template_category', if they aren't already the same. If not NULL, 4441 * 'template_locale' is the locale that 'template_category' is in. 4442 * 4443 * Returns a copy of the name of the original locale for 'switch_category' 4444 * so can be switched back to with the companion function 4445 * restore_switched_locale(), (NULL if no restoral is necessary.) */ 4446 4447 char * restore_to_locale = NULL; 4448 4449 if (switch_category == template_category) { /* No changes needed */ 4450 return NULL; 4451 } 4452 4453 /* Find the original locale of the category we may need to change, so that 4454 * it can be restored to later */ 4455 restore_to_locale = stdize_locale(savepv(do_setlocale_r(switch_category, 4456 NULL))); 4457 if (! restore_to_locale) { 4458 Perl_croak(aTHX_ 4459 "panic: %s: %d: Could not find current %s locale, errno=%d\n", 4460 __FILE__, __LINE__, category_name(switch_category), errno); 4461 } 4462 4463 /* If the locale of the template category wasn't passed in, find it now */ 4464 if (template_locale == NULL) { 4465 template_locale = do_setlocale_r(template_category, NULL); 4466 if (! template_locale) { 4467 Perl_croak(aTHX_ 4468 "panic: %s: %d: Could not find current %s locale, errno=%d\n", 4469 __FILE__, __LINE__, category_name(template_category), errno); 4470 } 4471 } 4472 4473 /* It the locales are the same, there's nothing to do */ 4474 if (strEQ(restore_to_locale, template_locale)) { 4475 Safefree(restore_to_locale); 4476 4477 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale unchanged as %s\n", 4478 category_name(switch_category), template_locale)); 4479 4480 return NULL; 4481 } 4482 4483 /* Finally, change the locale to the template one */ 4484 if (! do_setlocale_r(switch_category, template_locale)) { 4485 Perl_croak(aTHX_ 4486 "panic: %s: %d: Could not change %s locale to %s, errno=%d\n", 4487 __FILE__, __LINE__, category_name(switch_category), 4488 template_locale, errno); 4489 } 4490 4491 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s locale switched to %s\n", 4492 category_name(switch_category), template_locale)); 4493 4494 return restore_to_locale; 4495 } 4496 4497 STATIC void 4498 S_restore_switched_locale(pTHX_ const int category, const char * const original_locale) 4499 { 4500 /* Restores the locale for LC_'category' to 'original_locale' (which is a 4501 * copy that will be freed by this function), or do nothing if the latter 4502 * parameter is NULL */ 4503 4504 if (original_locale == NULL) { 4505 return; 4506 } 4507 4508 if (! do_setlocale_r(category, original_locale)) { 4509 Perl_croak(aTHX_ 4510 "panic: %s: %d: setlocale %s restore to %s failed, errno=%d\n", 4511 __FILE__, __LINE__, 4512 category_name(category), original_locale, errno); 4513 } 4514 4515 Safefree(original_locale); 4516 } 4517 4518 /* is_cur_LC_category_utf8 uses a small char buffer to avoid malloc/free */ 4519 #define CUR_LC_BUFFER_SIZE 64 4520 4521 bool 4522 Perl__is_cur_LC_category_utf8(pTHX_ int category) 4523 { 4524 /* Returns TRUE if the current locale for 'category' is UTF-8; FALSE 4525 * otherwise. 'category' may not be LC_ALL. If the platform doesn't have 4526 * nl_langinfo(), nor MB_CUR_MAX, this employs a heuristic, which hence 4527 * could give the wrong result. The result will very likely be correct for 4528 * languages that have commonly used non-ASCII characters, but for notably 4529 * English, it comes down to if the locale's name ends in something like 4530 * "UTF-8". It errs on the side of not being a UTF-8 locale. 4531 * 4532 * If the platform is early C89, not containing mbtowc(), or we are 4533 * compiled to not pay attention to LC_CTYPE, this employs heuristics. 4534 * These work very well for non-Latin locales or those whose currency 4535 * symbol isn't a '$' nor plain ASCII text. But without LC_CTYPE and at 4536 * least MB_CUR_MAX, English locales with an ASCII currency symbol depend 4537 * on the name containing UTF-8 or not. */ 4538 4539 /* Name of current locale corresponding to the input category */ 4540 const char *save_input_locale = NULL; 4541 4542 bool is_utf8 = FALSE; /* The return value */ 4543 4544 /* The variables below are for the cache of previous lookups using this 4545 * function. The cache is a C string, described at the definition for 4546 * 'C_and_POSIX_utf8ness'. 4547 * 4548 * The first part of the cache is fixed, for the C and POSIX locales. The 4549 * varying part starts just after them. */ 4550 char * utf8ness_cache = PL_locale_utf8ness + STRLENs(C_and_POSIX_utf8ness); 4551 4552 Size_t utf8ness_cache_size; /* Size of the varying portion */ 4553 Size_t input_name_len; /* Length in bytes of save_input_locale */ 4554 Size_t input_name_len_with_overhead; /* plus extra chars used to store 4555 the name in the cache */ 4556 char * delimited; /* The name plus the delimiters used to store 4557 it in the cache */ 4558 char buffer[CUR_LC_BUFFER_SIZE]; /* small buffer */ 4559 char * name_pos; /* position of 'delimited' in the cache, or 0 4560 if not there */ 4561 4562 4563 # ifdef LC_ALL 4564 4565 assert(category != LC_ALL); 4566 4567 # endif 4568 4569 /* Get the desired category's locale */ 4570 save_input_locale = stdize_locale(savepv(do_setlocale_r(category, NULL))); 4571 if (! save_input_locale) { 4572 Perl_croak(aTHX_ 4573 "panic: %s: %d: Could not find current %s locale, errno=%d\n", 4574 __FILE__, __LINE__, category_name(category), errno); 4575 } 4576 4577 DEBUG_L(PerlIO_printf(Perl_debug_log, 4578 "Current locale for %s is %s\n", 4579 category_name(category), save_input_locale)); 4580 4581 input_name_len = strlen(save_input_locale); 4582 4583 /* In our cache, each name is accompanied by two delimiters and a single 4584 * utf8ness digit */ 4585 input_name_len_with_overhead = input_name_len + 3; 4586 4587 if ( input_name_len_with_overhead <= CUR_LC_BUFFER_SIZE ) { 4588 /* we can use the buffer, avoid a malloc */ 4589 delimited = buffer; 4590 } else { /* need a malloc */ 4591 /* Allocate and populate space for a copy of the name surrounded by the 4592 * delimiters */ 4593 Newx(delimited, input_name_len_with_overhead, char); 4594 } 4595 4596 delimited[0] = UTF8NESS_SEP[0]; 4597 Copy(save_input_locale, delimited + 1, input_name_len, char); 4598 delimited[input_name_len+1] = UTF8NESS_PREFIX[0]; 4599 delimited[input_name_len+2] = '\0'; 4600 4601 /* And see if that is in the cache */ 4602 name_pos = instr(PL_locale_utf8ness, delimited); 4603 if (name_pos) { 4604 is_utf8 = *(name_pos + input_name_len_with_overhead - 1) - '0'; 4605 4606 # ifdef DEBUGGING 4607 4608 if (DEBUG_Lv_TEST || debug_initialization) { 4609 PerlIO_printf(Perl_debug_log, "UTF8ness for locale %s=%d, \n", 4610 save_input_locale, is_utf8); 4611 } 4612 4613 # endif 4614 4615 /* And, if not already in that position, move it to the beginning of 4616 * the non-constant portion of the list, since it is the most recently 4617 * used. (We don't have to worry about overflow, since just moving 4618 * existing names around) */ 4619 if (name_pos > utf8ness_cache) { 4620 Move(utf8ness_cache, 4621 utf8ness_cache + input_name_len_with_overhead, 4622 name_pos - utf8ness_cache, char); 4623 Copy(delimited, 4624 utf8ness_cache, 4625 input_name_len_with_overhead - 1, char); 4626 utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0'; 4627 } 4628 4629 /* free only when not using the buffer */ 4630 if ( delimited != buffer ) Safefree(delimited); 4631 Safefree(save_input_locale); 4632 return is_utf8; 4633 } 4634 4635 /* Here we don't have stored the utf8ness for the input locale. We have to 4636 * calculate it */ 4637 4638 # if defined(USE_LOCALE_CTYPE) \ 4639 && ( defined(HAS_NL_LANGINFO) \ 4640 || (defined(HAS_MBTOWC) || defined(HAS_MBRTOWC))) 4641 4642 { 4643 const char *original_ctype_locale 4644 = switch_category_locale_to_template(LC_CTYPE, 4645 category, 4646 save_input_locale); 4647 4648 /* Here the current LC_CTYPE is set to the locale of the category whose 4649 * information is desired. This means that nl_langinfo() and mbtowc() 4650 * should give the correct results */ 4651 4652 # ifdef MB_CUR_MAX /* But we can potentially rule out UTF-8ness, avoiding 4653 calling the functions if we have this */ 4654 4655 /* Standard UTF-8 needs at least 4 bytes to represent the maximum 4656 * Unicode code point. */ 4657 4658 DEBUG_L(PerlIO_printf(Perl_debug_log, "%s: %d: MB_CUR_MAX=%d\n", 4659 __FILE__, __LINE__, (int) MB_CUR_MAX)); 4660 if ((unsigned) MB_CUR_MAX < STRLENs(MAX_UNICODE_UTF8)) { 4661 is_utf8 = FALSE; 4662 restore_switched_locale(LC_CTYPE, original_ctype_locale); 4663 goto finish_and_return; 4664 } 4665 4666 # endif 4667 # if defined(HAS_NL_LANGINFO) 4668 4669 { /* The task is easiest if the platform has this POSIX 2001 function. 4670 Except on some platforms it can wrongly return "", so have to have 4671 a fallback. And it can return that it's UTF-8, even if there are 4672 variances from that. For example, Turkish locales may use the 4673 alternate dotted I rules, and sometimes it appears to be a 4674 defective locale definition. XXX We should probably check for 4675 these in the Latin1 range and warn (but on glibc, requires 4676 iswalnum() etc. due to their not handling 80-FF correctly */ 4677 const char *codeset = my_nl_langinfo(CODESET, FALSE); 4678 /* FALSE => already in dest locale */ 4679 4680 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 4681 "\tnllanginfo returned CODESET '%s'\n", codeset)); 4682 4683 if (codeset && strNE(codeset, "")) { 4684 4685 /* If the implementation of foldEQ() somehow were 4686 * to change to not go byte-by-byte, this could 4687 * read past end of string, as only one length is 4688 * checked. But currently, a premature NUL will 4689 * compare false, and it will stop there */ 4690 is_utf8 = cBOOL( foldEQ(codeset, STR_WITH_LEN("UTF-8")) 4691 || foldEQ(codeset, STR_WITH_LEN("UTF8"))); 4692 4693 DEBUG_L(PerlIO_printf(Perl_debug_log, 4694 "\tnllanginfo returned CODESET '%s'; ?UTF8 locale=%d\n", 4695 codeset, is_utf8)); 4696 restore_switched_locale(LC_CTYPE, original_ctype_locale); 4697 goto finish_and_return; 4698 } 4699 } 4700 4701 # endif 4702 # if defined(HAS_MBTOWC) || defined(HAS_MBRTOWC) 4703 /* We can see if this is a UTF-8-like locale if have mbtowc(). It was a 4704 * late adder to C89, so very likely to have it. However, testing has 4705 * shown that, like nl_langinfo() above, there are locales that are not 4706 * strictly UTF-8 that this will return that they are */ 4707 4708 { 4709 wchar_t wc; 4710 int len; 4711 dSAVEDERRNO; 4712 4713 # if defined(HAS_MBRTOWC) && defined(USE_ITHREADS) 4714 4715 mbstate_t ps; 4716 4717 # endif 4718 4719 /* mbrtowc() and mbtowc() convert a byte string to a wide 4720 * character. Feed a byte string to one of them and check that the 4721 * result is the expected Unicode code point */ 4722 4723 # if defined(HAS_MBRTOWC) && defined(USE_ITHREADS) 4724 /* Prefer this function if available, as it's reentrant */ 4725 4726 memzero(&ps, sizeof(ps));; 4727 PERL_UNUSED_RESULT(mbrtowc(&wc, NULL, 0, &ps)); /* Reset any shift 4728 state */ 4729 SETERRNO(0, 0); 4730 len = mbrtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8), &ps); 4731 SAVE_ERRNO; 4732 4733 # else 4734 4735 MBTOWC_LOCK; 4736 PERL_UNUSED_RESULT(mbtowc(&wc, NULL, 0));/* Reset any shift state */ 4737 SETERRNO(0, 0); 4738 len = mbtowc(&wc, STR_WITH_LEN(REPLACEMENT_CHARACTER_UTF8)); 4739 SAVE_ERRNO; 4740 MBTOWC_UNLOCK; 4741 4742 # endif 4743 4744 RESTORE_ERRNO; 4745 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 4746 "\treturn from mbtowc; len=%d; code_point=%x; errno=%d\n", 4747 len, (unsigned int) wc, GET_ERRNO)); 4748 4749 is_utf8 = cBOOL( len == STRLENs(REPLACEMENT_CHARACTER_UTF8) 4750 && wc == (wchar_t) UNICODE_REPLACEMENT); 4751 } 4752 4753 # endif 4754 4755 restore_switched_locale(LC_CTYPE, original_ctype_locale); 4756 goto finish_and_return; 4757 } 4758 4759 # else 4760 4761 /* Here, we must have a C89 compiler that doesn't have mbtowc(). Next 4762 * try looking at the currency symbol to see if it disambiguates 4763 * things. Often that will be in the native script, and if the symbol 4764 * isn't in UTF-8, we know that the locale isn't. If it is non-ASCII 4765 * UTF-8, we infer that the locale is too, as the odds of a non-UTF8 4766 * string being valid UTF-8 are quite small */ 4767 4768 # ifdef USE_LOCALE_MONETARY 4769 4770 /* If have LC_MONETARY, we can look at the currency symbol. Often that 4771 * will be in the native script. We do this one first because there is 4772 * just one string to examine, so potentially avoids work */ 4773 4774 { 4775 const char *original_monetary_locale 4776 = switch_category_locale_to_template(LC_MONETARY, 4777 category, 4778 save_input_locale); 4779 bool only_ascii = FALSE; 4780 const U8 * currency_string 4781 = (const U8 *) my_nl_langinfo(CRNCYSTR, FALSE); 4782 /* 2nd param not relevant for this item */ 4783 const U8 * first_variant; 4784 4785 assert( *currency_string == '-' 4786 || *currency_string == '+' 4787 || *currency_string == '.'); 4788 4789 currency_string++; 4790 4791 if (is_utf8_invariant_string_loc(currency_string, 0, &first_variant)) 4792 { 4793 DEBUG_L(PerlIO_printf(Perl_debug_log, "Couldn't get currency symbol for %s, or contains only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); 4794 only_ascii = TRUE; 4795 } 4796 else { 4797 is_utf8 = is_strict_utf8_string(first_variant, 0); 4798 } 4799 4800 restore_switched_locale(LC_MONETARY, original_monetary_locale); 4801 4802 if (! only_ascii) { 4803 4804 /* It isn't a UTF-8 locale if the symbol is not legal UTF-8; 4805 * otherwise assume the locale is UTF-8 if and only if the symbol 4806 * is non-ascii UTF-8. */ 4807 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?Currency symbol for %s is UTF-8=%d\n", 4808 save_input_locale, is_utf8)); 4809 goto finish_and_return; 4810 } 4811 } 4812 4813 # endif /* USE_LOCALE_MONETARY */ 4814 # if defined(HAS_STRFTIME) && defined(USE_LOCALE_TIME) 4815 4816 /* Still haven't found a non-ASCII string to disambiguate UTF-8 or not. Try 4817 * the names of the months and weekdays, timezone, and am/pm indicator */ 4818 { 4819 const char *original_time_locale 4820 = switch_category_locale_to_template(LC_TIME, 4821 category, 4822 save_input_locale); 4823 int hour = 10; 4824 bool is_dst = FALSE; 4825 int dom = 1; 4826 int month = 0; 4827 int i; 4828 char * formatted_time; 4829 4830 /* Here the current LC_TIME is set to the locale of the category 4831 * whose information is desired. Look at all the days of the week and 4832 * month names, and the timezone and am/pm indicator for UTF-8 variant 4833 * characters. The first such a one found will tell us if the locale 4834 * is UTF-8 or not */ 4835 4836 for (i = 0; i < 7 + 12; i++) { /* 7 days; 12 months */ 4837 formatted_time = my_strftime("%A %B %Z %p", 4838 0, 0, hour, dom, month, 2012 - 1900, 0, 0, is_dst); 4839 if ( ! formatted_time 4840 || is_utf8_invariant_string((U8 *) formatted_time, 0)) 4841 { 4842 4843 /* Here, we didn't find a non-ASCII. Try the next time through 4844 * with the complemented dst and am/pm, and try with the next 4845 * weekday. After we have gotten all weekdays, try the next 4846 * month */ 4847 is_dst = ! is_dst; 4848 hour = (hour + 12) % 24; 4849 dom++; 4850 if (i > 6) { 4851 month++; 4852 } 4853 continue; 4854 } 4855 4856 /* Here, we have a non-ASCII. Return TRUE is it is valid UTF8; 4857 * false otherwise. But first, restore LC_TIME to its original 4858 * locale if we changed it */ 4859 restore_switched_locale(LC_TIME, original_time_locale); 4860 4861 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?time-related strings for %s are UTF-8=%d\n", 4862 save_input_locale, 4863 is_utf8_string((U8 *) formatted_time, 0))); 4864 is_utf8 = is_utf8_string((U8 *) formatted_time, 0); 4865 goto finish_and_return; 4866 } 4867 4868 /* Falling off the end of the loop indicates all the names were just 4869 * ASCII. Go on to the next test. If we changed it, restore LC_TIME 4870 * to its original locale */ 4871 restore_switched_locale(LC_TIME, original_time_locale); 4872 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "All time-related words for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); 4873 } 4874 4875 # endif 4876 4877 # if 0 && defined(USE_LOCALE_MESSAGES) && defined(HAS_SYS_ERRLIST) 4878 4879 /* This code is ifdefd out because it was found to not be necessary in testing 4880 * on our dromedary test machine, which has over 700 locales. There, this 4881 * added no value to looking at the currency symbol and the time strings. I 4882 * left it in so as to avoid rewriting it if real-world experience indicates 4883 * that dromedary is an outlier. Essentially, instead of returning abpve if we 4884 * haven't found illegal utf8, we continue on and examine all the strerror() 4885 * messages on the platform for utf8ness. If all are ASCII, we still don't 4886 * know the answer; but otherwise we have a pretty good indication of the 4887 * utf8ness. The reason this doesn't help much is that the messages may not 4888 * have been translated into the locale. The currency symbol and time strings 4889 * are much more likely to have been translated. */ 4890 { 4891 int e; 4892 bool non_ascii = FALSE; 4893 const char *original_messages_locale 4894 = switch_category_locale_to_template(LC_MESSAGES, 4895 category, 4896 save_input_locale); 4897 const char * errmsg = NULL; 4898 4899 /* Here the current LC_MESSAGES is set to the locale of the category 4900 * whose information is desired. Look through all the messages. We 4901 * can't use Strerror() here because it may expand to code that 4902 * segfaults in miniperl */ 4903 4904 for (e = 0; e <= sys_nerr; e++) { 4905 errno = 0; 4906 errmsg = sys_errlist[e]; 4907 if (errno || !errmsg) { 4908 break; 4909 } 4910 errmsg = savepv(errmsg); 4911 if (! is_utf8_invariant_string((U8 *) errmsg, 0)) { 4912 non_ascii = TRUE; 4913 is_utf8 = is_utf8_string((U8 *) errmsg, 0); 4914 break; 4915 } 4916 } 4917 Safefree(errmsg); 4918 4919 restore_switched_locale(LC_MESSAGES, original_messages_locale); 4920 4921 if (non_ascii) { 4922 4923 /* Any non-UTF-8 message means not a UTF-8 locale; if all are valid, 4924 * any non-ascii means it is one; otherwise we assume it isn't */ 4925 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "\t?error messages for %s are UTF-8=%d\n", 4926 save_input_locale, 4927 is_utf8)); 4928 goto finish_and_return; 4929 } 4930 4931 DEBUG_L(PerlIO_printf(Perl_debug_log, "All error messages for %s contain only ASCII; can't use for determining if UTF-8 locale\n", save_input_locale)); 4932 } 4933 4934 # endif 4935 # ifndef EBCDIC /* On os390, even if the name ends with "UTF-8', it isn't a 4936 UTF-8 locale */ 4937 4938 /* As a last resort, look at the locale name to see if it matches 4939 * qr/UTF -? * 8 /ix, or some other common locale names. This "name", the 4940 * return of setlocale(), is actually defined to be opaque, so we can't 4941 * really rely on the absence of various substrings in the name to indicate 4942 * its UTF-8ness, but if it has UTF8 in the name, it is extremely likely to 4943 * be a UTF-8 locale. Similarly for the other common names */ 4944 4945 { 4946 const Size_t final_pos = strlen(save_input_locale) - 1; 4947 4948 if (final_pos >= 3) { 4949 const char *name = save_input_locale; 4950 4951 /* Find next 'U' or 'u' and look from there */ 4952 while ((name += strcspn(name, "Uu") + 1) 4953 <= save_input_locale + final_pos - 2) 4954 { 4955 if ( isALPHA_FOLD_NE(*name, 't') 4956 || isALPHA_FOLD_NE(*(name + 1), 'f')) 4957 { 4958 continue; 4959 } 4960 name += 2; 4961 if (*(name) == '-') { 4962 if ((name > save_input_locale + final_pos - 1)) { 4963 break; 4964 } 4965 name++; 4966 } 4967 if (*(name) == '8') { 4968 DEBUG_L(PerlIO_printf(Perl_debug_log, 4969 "Locale %s ends with UTF-8 in name\n", 4970 save_input_locale)); 4971 is_utf8 = TRUE; 4972 goto finish_and_return; 4973 } 4974 } 4975 DEBUG_L(PerlIO_printf(Perl_debug_log, 4976 "Locale %s doesn't end with UTF-8 in name\n", 4977 save_input_locale)); 4978 } 4979 4980 # ifdef WIN32 4981 4982 /* http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756.aspx */ 4983 if (memENDs(save_input_locale, final_pos, "65001")) { 4984 DEBUG_L(PerlIO_printf(Perl_debug_log, 4985 "Locale %s ends with 65001 in name, is UTF-8 locale\n", 4986 save_input_locale)); 4987 is_utf8 = TRUE; 4988 goto finish_and_return; 4989 } 4990 4991 # endif 4992 } 4993 # endif 4994 4995 /* Other common encodings are the ISO 8859 series, which aren't UTF-8. But 4996 * since we are about to return FALSE anyway, there is no point in doing 4997 * this extra work */ 4998 4999 # if 0 5000 if (instr(save_input_locale, "8859")) { 5001 DEBUG_L(PerlIO_printf(Perl_debug_log, 5002 "Locale %s has 8859 in name, not UTF-8 locale\n", 5003 save_input_locale)); 5004 is_utf8 = FALSE; 5005 goto finish_and_return; 5006 } 5007 # endif 5008 5009 DEBUG_L(PerlIO_printf(Perl_debug_log, 5010 "Assuming locale %s is not a UTF-8 locale\n", 5011 save_input_locale)); 5012 is_utf8 = FALSE; 5013 5014 # endif /* the code that is compiled when no modern LC_CTYPE */ 5015 5016 finish_and_return: 5017 5018 /* Cache this result so we don't have to go through all this next time. */ 5019 utf8ness_cache_size = sizeof(PL_locale_utf8ness) 5020 - (utf8ness_cache - PL_locale_utf8ness); 5021 5022 /* But we can't save it if it is too large for the total space available */ 5023 if (LIKELY(input_name_len_with_overhead < utf8ness_cache_size)) { 5024 Size_t utf8ness_cache_len = strlen(utf8ness_cache); 5025 5026 /* Here it can fit, but we may need to clear out the oldest cached 5027 * result(s) to do so. Check */ 5028 if (utf8ness_cache_len + input_name_len_with_overhead 5029 >= utf8ness_cache_size) 5030 { 5031 /* Here we have to clear something out to make room for this. 5032 * Start looking at the rightmost place where it could fit and find 5033 * the beginning of the entry that extends past that. */ 5034 char * cutoff = (char *) my_memrchr(utf8ness_cache, 5035 UTF8NESS_SEP[0], 5036 utf8ness_cache_size 5037 - input_name_len_with_overhead); 5038 5039 assert(cutoff); 5040 assert(cutoff >= utf8ness_cache); 5041 5042 /* This and all subsequent entries must be removed */ 5043 *cutoff = '\0'; 5044 utf8ness_cache_len = strlen(utf8ness_cache); 5045 } 5046 5047 /* Make space for the new entry */ 5048 Move(utf8ness_cache, 5049 utf8ness_cache + input_name_len_with_overhead, 5050 utf8ness_cache_len + 1 /* Incl. trailing NUL */, char); 5051 5052 /* And insert it */ 5053 Copy(delimited, utf8ness_cache, input_name_len_with_overhead - 1, char); 5054 utf8ness_cache[input_name_len_with_overhead - 1] = is_utf8 + '0'; 5055 5056 if ((PL_locale_utf8ness[strlen(PL_locale_utf8ness)-1] & ~1) != '0') { 5057 Perl_croak(aTHX_ 5058 "panic: %s: %d: Corrupt utf8ness_cache=%s\nlen=%zu," 5059 " inserted_name=%s, its_len=%zu\n", 5060 __FILE__, __LINE__, 5061 PL_locale_utf8ness, strlen(PL_locale_utf8ness), 5062 delimited, input_name_len_with_overhead); 5063 } 5064 } 5065 5066 # ifdef DEBUGGING 5067 5068 if (DEBUG_Lv_TEST) { 5069 const char * s = PL_locale_utf8ness; 5070 5071 /* Audit the structure */ 5072 while (s < PL_locale_utf8ness + strlen(PL_locale_utf8ness)) { 5073 const char *e; 5074 5075 if (*s != UTF8NESS_SEP[0]) { 5076 Perl_croak(aTHX_ 5077 "panic: %s: %d: Corrupt utf8ness_cache: missing" 5078 " separator %.*s<-- HERE %s\n", 5079 __FILE__, __LINE__, 5080 (int) (s - PL_locale_utf8ness), PL_locale_utf8ness, 5081 s); 5082 } 5083 s++; 5084 e = strchr(s, UTF8NESS_PREFIX[0]); 5085 if (! e) { 5086 e = PL_locale_utf8ness + strlen(PL_locale_utf8ness); 5087 Perl_croak(aTHX_ 5088 "panic: %s: %d: Corrupt utf8ness_cache: missing" 5089 " separator %.*s<-- HERE %s\n", 5090 __FILE__, __LINE__, 5091 (int) (e - PL_locale_utf8ness), PL_locale_utf8ness, 5092 e); 5093 } 5094 e++; 5095 if (*e != '0' && *e != '1') { 5096 Perl_croak(aTHX_ 5097 "panic: %s: %d: Corrupt utf8ness_cache: utf8ness" 5098 " must be [01] %.*s<-- HERE %s\n", 5099 __FILE__, __LINE__, 5100 (int) (e + 1 - PL_locale_utf8ness), 5101 PL_locale_utf8ness, e + 1); 5102 } 5103 if (ninstr(PL_locale_utf8ness, s, s-1, e)) { 5104 Perl_croak(aTHX_ 5105 "panic: %s: %d: Corrupt utf8ness_cache: entry" 5106 " has duplicate %.*s<-- HERE %s\n", 5107 __FILE__, __LINE__, 5108 (int) (e - PL_locale_utf8ness), PL_locale_utf8ness, 5109 e); 5110 } 5111 s = e + 1; 5112 } 5113 } 5114 5115 if (DEBUG_Lv_TEST || debug_initialization) { 5116 5117 PerlIO_printf(Perl_debug_log, 5118 "PL_locale_utf8ness is now %s; returning %d\n", 5119 PL_locale_utf8ness, is_utf8); 5120 } 5121 5122 # endif 5123 5124 /* free only when not using the buffer */ 5125 if ( delimited != buffer ) Safefree(delimited); 5126 Safefree(save_input_locale); 5127 return is_utf8; 5128 } 5129 5130 #endif 5131 5132 bool 5133 Perl__is_in_locale_category(pTHX_ const bool compiling, const int category) 5134 { 5135 /* Internal function which returns if we are in the scope of a pragma that 5136 * enables the locale category 'category'. 'compiling' should indicate if 5137 * this is during the compilation phase (TRUE) or not (FALSE). */ 5138 5139 const COP * const cop = (compiling) ? &PL_compiling : PL_curcop; 5140 5141 SV *these_categories = cop_hints_fetch_pvs(cop, "locale", 0); 5142 if (! these_categories || these_categories == &PL_sv_placeholder) { 5143 return FALSE; 5144 } 5145 5146 /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get 5147 * a valid unsigned */ 5148 assert(category >= -1); 5149 return cBOOL(SvUV(these_categories) & (1U << (category + 1))); 5150 } 5151 5152 char * 5153 Perl_my_strerror(pTHX_ const int errnum) 5154 { 5155 /* Returns a mortalized copy of the text of the error message associated 5156 * with 'errnum'. It uses the current locale's text unless the platform 5157 * doesn't have the LC_MESSAGES category or we are not being called from 5158 * within the scope of 'use locale'. In the former case, it uses whatever 5159 * strerror returns; in the latter case it uses the text from the C locale. 5160 * 5161 * The function just calls strerror(), but temporarily switches, if needed, 5162 * to the C locale */ 5163 5164 char *errstr; 5165 5166 #ifndef USE_LOCALE_MESSAGES 5167 5168 /* If platform doesn't have messages category, we don't do any switching to 5169 * the C locale; we just use whatever strerror() returns */ 5170 5171 errstr = savepv(Strerror(errnum)); 5172 5173 #else /* Has locale messages */ 5174 5175 const bool within_locale_scope = IN_LC(LC_MESSAGES); 5176 5177 # ifndef USE_ITHREADS 5178 5179 /* This function is trivial without threads. */ 5180 if (within_locale_scope) { 5181 errstr = savepv(strerror(errnum)); 5182 } 5183 else { 5184 const char * save_locale = savepv(do_setlocale_c(LC_MESSAGES, NULL)); 5185 5186 do_setlocale_c(LC_MESSAGES, "C"); 5187 errstr = savepv(strerror(errnum)); 5188 do_setlocale_c(LC_MESSAGES, save_locale); 5189 Safefree(save_locale); 5190 } 5191 5192 # elif defined(USE_POSIX_2008_LOCALE) \ 5193 && defined(HAS_STRERROR_L) 5194 5195 /* This function is also trivial if we don't have to worry about thread 5196 * safety and have strerror_l(), as it handles the switch of locales so we 5197 * don't have to deal with that. We don't have to worry about thread 5198 * safety if strerror_r() is also available. Both it and strerror_l() are 5199 * thread-safe. Plain strerror() isn't thread safe. But on threaded 5200 * builds when strerror_r() is available, the apparent call to strerror() 5201 * below is actually a macro that behind-the-scenes calls strerror_r(). */ 5202 5203 # ifdef HAS_STRERROR_R 5204 5205 if (within_locale_scope) { 5206 errstr = savepv(strerror(errnum)); 5207 } 5208 else { 5209 errstr = savepv(strerror_l(errnum, PL_C_locale_obj)); 5210 } 5211 5212 # else 5213 5214 /* Here we have strerror_l(), but not strerror_r() and we are on a 5215 * threaded-build. We use strerror_l() for everything, constructing a 5216 * locale to pass to it if necessary */ 5217 5218 bool do_free = FALSE; 5219 locale_t locale_to_use; 5220 5221 if (within_locale_scope) { 5222 locale_to_use = uselocale((locale_t) 0); 5223 if (locale_to_use == LC_GLOBAL_LOCALE) { 5224 locale_to_use = duplocale(LC_GLOBAL_LOCALE); 5225 do_free = TRUE; 5226 } 5227 } 5228 else { /* Use C locale if not within 'use locale' scope */ 5229 locale_to_use = PL_C_locale_obj; 5230 } 5231 5232 errstr = savepv(strerror_l(errnum, locale_to_use)); 5233 5234 if (do_free) { 5235 freelocale(locale_to_use); 5236 } 5237 5238 # endif 5239 # else /* Doesn't have strerror_l() */ 5240 5241 const char * save_locale = NULL; 5242 bool locale_is_C = FALSE; 5243 5244 /* We have a critical section to prevent another thread from executing this 5245 * same code at the same time. (On thread-safe perls, the LOCK is a 5246 * no-op.) Since this is the only place in core that changes LC_MESSAGES 5247 * (unless the user has called setlocale(), this works to prevent races. */ 5248 SETLOCALE_LOCK; 5249 5250 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 5251 "my_strerror called with errnum %d\n", errnum)); 5252 if (! within_locale_scope) { 5253 save_locale = do_setlocale_c(LC_MESSAGES, NULL); 5254 if (! save_locale) { 5255 SETLOCALE_UNLOCK; 5256 Perl_croak(aTHX_ 5257 "panic: %s: %d: Could not find current LC_MESSAGES locale," 5258 " errno=%d\n", __FILE__, __LINE__, errno); 5259 } 5260 else { 5261 locale_is_C = isNAME_C_OR_POSIX(save_locale); 5262 5263 /* Switch to the C locale if not already in it */ 5264 if (! locale_is_C) { 5265 5266 /* The setlocale() just below likely will zap 'save_locale', so 5267 * create a copy. */ 5268 save_locale = savepv(save_locale); 5269 if (! do_setlocale_c(LC_MESSAGES, "C")) { 5270 5271 /* If, for some reason, the locale change failed, we 5272 * soldier on as best as possible under the circumstances, 5273 * using the current locale, and clear save_locale, so we 5274 * don't try to change back. On z/0S, all setlocale() 5275 * calls fail after you've created a thread. This is their 5276 * way of making sure the entire process is always a single 5277 * locale. This means that 'use locale' is always in place 5278 * for messages under these circumstances. */ 5279 Safefree(save_locale); 5280 save_locale = NULL; 5281 } 5282 } 5283 } 5284 } /* end of ! within_locale_scope */ 5285 else { 5286 DEBUG_Lv(PerlIO_printf(Perl_debug_log, "%s: %d: WITHIN locale scope\n", 5287 __FILE__, __LINE__)); 5288 } 5289 5290 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 5291 "Any locale change has been done; about to call Strerror\n")); 5292 errstr = savepv(Strerror(errnum)); 5293 5294 if (! within_locale_scope) { 5295 if (save_locale && ! locale_is_C) { 5296 if (! do_setlocale_c(LC_MESSAGES, save_locale)) { 5297 SETLOCALE_UNLOCK; 5298 Perl_croak(aTHX_ 5299 "panic: %s: %d: setlocale restore to '%s' failed, errno=%d\n", 5300 __FILE__, __LINE__, save_locale, errno); 5301 } 5302 Safefree(save_locale); 5303 } 5304 } 5305 5306 SETLOCALE_UNLOCK; 5307 5308 # endif /* End of doesn't have strerror_l */ 5309 # ifdef DEBUGGING 5310 5311 if (DEBUG_Lv_TEST) { 5312 PerlIO_printf(Perl_debug_log, "Strerror returned; saving a copy: '"); 5313 print_bytes_for_locale(errstr, errstr + strlen(errstr), 0); 5314 PerlIO_printf(Perl_debug_log, "'\n"); 5315 } 5316 5317 # endif 5318 #endif /* End of does have locale messages */ 5319 5320 SAVEFREEPV(errstr); 5321 return errstr; 5322 } 5323 5324 /* 5325 5326 =for apidoc switch_to_global_locale 5327 5328 On systems without locale support, or on typical single-threaded builds, or on 5329 platforms that do not support per-thread locale operations, this function does 5330 nothing. On such systems that do have locale support, only a locale global to 5331 the whole program is available. 5332 5333 On multi-threaded builds on systems that do have per-thread locale operations, 5334 this function converts the thread it is running in to use the global locale. 5335 This is for code that has not yet or cannot be updated to handle multi-threaded 5336 locale operation. As long as only a single thread is so-converted, everything 5337 works fine, as all the other threads continue to ignore the global one, so only 5338 this thread looks at it. 5339 5340 However, on Windows systems this isn't quite true prior to Visual Studio 15, 5341 at which point Microsoft fixed a bug. A race can occur if you use the 5342 following operations on earlier Windows platforms: 5343 5344 =over 5345 5346 =item L<POSIX::localeconv|POSIX/localeconv> 5347 5348 =item L<I18N::Langinfo>, items C<CRNCYSTR> and C<THOUSEP> 5349 5350 =item L<perlapi/Perl_langinfo>, items C<CRNCYSTR> and C<THOUSEP> 5351 5352 =back 5353 5354 The first item is not fixable (except by upgrading to a later Visual Studio 5355 release), but it would be possible to work around the latter two items by using 5356 the Windows API functions C<GetNumberFormat> and C<GetCurrencyFormat>; patches 5357 welcome. 5358 5359 Without this function call, threads that use the L<C<setlocale(3)>> system 5360 function will not work properly, as all the locale-sensitive functions will 5361 look at the per-thread locale, and C<setlocale> will have no effect on this 5362 thread. 5363 5364 Perl code should convert to either call 5365 L<C<Perl_setlocale>|perlapi/Perl_setlocale> (which is a drop-in for the system 5366 C<setlocale>) or use the methods given in L<perlcall> to call 5367 L<C<POSIX::setlocale>|POSIX/setlocale>. Either one will transparently properly 5368 handle all cases of single- vs multi-thread, POSIX 2008-supported or not. 5369 5370 Non-Perl libraries, such as C<gtk>, that call the system C<setlocale> can 5371 continue to work if this function is called before transferring control to the 5372 library. 5373 5374 Upon return from the code that needs to use the global locale, 5375 L<C<sync_locale()>|perlapi/sync_locale> should be called to restore the safe 5376 multi-thread operation. 5377 5378 =cut 5379 */ 5380 5381 void 5382 Perl_switch_to_global_locale() 5383 { 5384 5385 #ifdef USE_THREAD_SAFE_LOCALE 5386 # ifdef WIN32 5387 5388 _configthreadlocale(_DISABLE_PER_THREAD_LOCALE); 5389 5390 # else 5391 # ifdef HAS_QUERYLOCALE 5392 5393 setlocale(LC_ALL, querylocale(LC_ALL_MASK, uselocale((locale_t) 0))); 5394 5395 # else 5396 5397 { 5398 unsigned int i; 5399 5400 for (i = 0; i < LC_ALL_INDEX; i++) { 5401 setlocale(categories[i], do_setlocale_r(categories[i], NULL)); 5402 } 5403 } 5404 5405 # endif 5406 5407 uselocale(LC_GLOBAL_LOCALE); 5408 5409 # endif 5410 #endif 5411 5412 } 5413 5414 /* 5415 5416 =for apidoc sync_locale 5417 5418 L<C<Perl_setlocale>|perlapi/Perl_setlocale> can be used at any time to query or 5419 change the locale (though changing the locale is antisocial and dangerous on 5420 multi-threaded systems that don't have multi-thread safe locale operations. 5421 (See L<perllocale/Multi-threaded operation>). Using the system 5422 L<C<setlocale(3)>> should be avoided. Nevertheless, certain non-Perl libraries 5423 called from XS, such as C<Gtk> do so, and this can't be changed. When the 5424 locale is changed by XS code that didn't use 5425 L<C<Perl_setlocale>|perlapi/Perl_setlocale>, Perl needs to be told that the 5426 locale has changed. Use this function to do so, before returning to Perl. 5427 5428 The return value is a boolean: TRUE if the global locale at the time of call 5429 was in effect; and FALSE if a per-thread locale was in effect. This can be 5430 used by the caller that needs to restore things as-they-were to decide whether 5431 or not to call 5432 L<C<Perl_switch_to_global_locale>|perlapi/switch_to_global_locale>. 5433 5434 =cut 5435 */ 5436 5437 bool 5438 Perl_sync_locale() 5439 { 5440 5441 #ifndef USE_LOCALE 5442 5443 return TRUE; 5444 5445 #else 5446 5447 const char * newlocale; 5448 dTHX; 5449 5450 # ifdef USE_POSIX_2008_LOCALE 5451 5452 bool was_in_global_locale = FALSE; 5453 locale_t cur_obj = uselocale((locale_t) 0); 5454 5455 /* On Windows, unless the foreign code has turned off the thread-safe 5456 * locale setting, any plain setlocale() will have affected what we see, so 5457 * no need to worry. Otherwise, If the foreign code has done a plain 5458 * setlocale(), it will only affect the global locale on POSIX systems, but 5459 * will affect the */ 5460 if (cur_obj == LC_GLOBAL_LOCALE) { 5461 5462 # ifdef HAS_QUERY_LOCALE 5463 5464 do_setlocale_c(LC_ALL, setlocale(LC_ALL, NULL)); 5465 5466 # else 5467 5468 unsigned int i; 5469 5470 /* We can't trust that we can read the LC_ALL format on the 5471 * platform, so do them individually */ 5472 for (i = 0; i < LC_ALL_INDEX; i++) { 5473 do_setlocale_r(categories[i], setlocale(categories[i], NULL)); 5474 } 5475 5476 # endif 5477 5478 was_in_global_locale = TRUE; 5479 } 5480 5481 # else 5482 5483 bool was_in_global_locale = TRUE; 5484 5485 # endif 5486 # ifdef USE_LOCALE_CTYPE 5487 5488 newlocale = savepv(do_setlocale_c(LC_CTYPE, NULL)); 5489 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 5490 "%s:%d: %s\n", __FILE__, __LINE__, 5491 setlocale_debug_string(LC_CTYPE, NULL, newlocale))); 5492 new_ctype(newlocale); 5493 Safefree(newlocale); 5494 5495 # endif /* USE_LOCALE_CTYPE */ 5496 # ifdef USE_LOCALE_COLLATE 5497 5498 newlocale = savepv(do_setlocale_c(LC_COLLATE, NULL)); 5499 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 5500 "%s:%d: %s\n", __FILE__, __LINE__, 5501 setlocale_debug_string(LC_COLLATE, NULL, newlocale))); 5502 new_collate(newlocale); 5503 Safefree(newlocale); 5504 5505 # endif 5506 # ifdef USE_LOCALE_NUMERIC 5507 5508 newlocale = savepv(do_setlocale_c(LC_NUMERIC, NULL)); 5509 DEBUG_Lv(PerlIO_printf(Perl_debug_log, 5510 "%s:%d: %s\n", __FILE__, __LINE__, 5511 setlocale_debug_string(LC_NUMERIC, NULL, newlocale))); 5512 new_numeric(newlocale); 5513 Safefree(newlocale); 5514 5515 # endif /* USE_LOCALE_NUMERIC */ 5516 5517 return was_in_global_locale; 5518 5519 #endif 5520 5521 } 5522 5523 #if defined(DEBUGGING) && defined(USE_LOCALE) 5524 5525 STATIC char * 5526 S_setlocale_debug_string(const int category, /* category number, 5527 like LC_ALL */ 5528 const char* const locale, /* locale name */ 5529 5530 /* return value from setlocale() when attempting to 5531 * set 'category' to 'locale' */ 5532 const char* const retval) 5533 { 5534 /* Returns a pointer to a NUL-terminated string in static storage with 5535 * added text about the info passed in. This is not thread safe and will 5536 * be overwritten by the next call, so this should be used just to 5537 * formulate a string to immediately print or savepv() on. */ 5538 5539 static char ret[256]; 5540 5541 my_strlcpy(ret, "setlocale(", sizeof(ret)); 5542 my_strlcat(ret, category_name(category), sizeof(ret)); 5543 my_strlcat(ret, ", ", sizeof(ret)); 5544 5545 if (locale) { 5546 my_strlcat(ret, "\"", sizeof(ret)); 5547 my_strlcat(ret, locale, sizeof(ret)); 5548 my_strlcat(ret, "\"", sizeof(ret)); 5549 } 5550 else { 5551 my_strlcat(ret, "NULL", sizeof(ret)); 5552 } 5553 5554 my_strlcat(ret, ") returned ", sizeof(ret)); 5555 5556 if (retval) { 5557 my_strlcat(ret, "\"", sizeof(ret)); 5558 my_strlcat(ret, retval, sizeof(ret)); 5559 my_strlcat(ret, "\"", sizeof(ret)); 5560 } 5561 else { 5562 my_strlcat(ret, "NULL", sizeof(ret)); 5563 } 5564 5565 assert(strlen(ret) < sizeof(ret)); 5566 5567 return ret; 5568 } 5569 5570 #endif 5571 5572 void 5573 Perl_thread_locale_init() 5574 { 5575 /* Called from a thread on startup*/ 5576 5577 #ifdef USE_THREAD_SAFE_LOCALE 5578 5579 dTHX_DEBUGGING; 5580 5581 /* C starts the new thread in the global C locale. If we are thread-safe, 5582 * we want to not be in the global locale */ 5583 5584 DEBUG_L(PerlIO_printf(Perl_debug_log, 5585 "%s:%d: new thread, initial locale is %s; calling setlocale\n", 5586 __FILE__, __LINE__, setlocale(LC_ALL, NULL))); 5587 5588 # ifdef WIN32 5589 5590 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE); 5591 5592 # else 5593 5594 Perl_setlocale(LC_ALL, "C"); 5595 5596 # endif 5597 #endif 5598 5599 } 5600 5601 void 5602 Perl_thread_locale_term() 5603 { 5604 /* Called from a thread as it gets ready to terminate */ 5605 5606 #ifdef USE_THREAD_SAFE_LOCALE 5607 5608 /* C starts the new thread in the global C locale. If we are thread-safe, 5609 * we want to not be in the global locale */ 5610 5611 # ifndef WIN32 5612 5613 { /* Free up */ 5614 locale_t cur_obj = uselocale(LC_GLOBAL_LOCALE); 5615 if (cur_obj != LC_GLOBAL_LOCALE && cur_obj != PL_C_locale_obj) { 5616 freelocale(cur_obj); 5617 } 5618 } 5619 5620 # endif 5621 #endif 5622 5623 } 5624 5625 /* 5626 * ex: set ts=8 sts=4 sw=4 et: 5627 */ 5628