1 /* locale information 2 3 Copyright 2016-2020 Free Software Foundation, Inc. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program; if not, write to the Free Software 17 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 18 02110-1301, USA. */ 19 20 /* Written by Paul Eggert. */ 21 22 #include <config.h> 23 24 #include <localeinfo.h> 25 26 #include <verify.h> 27 28 #include <limits.h> 29 #include <locale.h> 30 #include <stdlib.h> 31 #include <string.h> 32 #include <wctype.h> 33 34 /* The sbclen implementation relies on this. */ 35 verify (MB_LEN_MAX <= SCHAR_MAX); 36 37 /* Return true if the locale uses UTF-8. */ 38 39 static bool 40 is_using_utf8 (void) 41 { 42 wchar_t wc; 43 mbstate_t mbs = {0}; 44 return mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100; 45 } 46 47 /* Return true if the locale is compatible enough with the C locale so 48 that the locale is single-byte, bytes are in collating-sequence 49 order, and there are no multi-character collating elements. */ 50 51 static bool 52 using_simple_locale (bool multibyte) 53 { 54 /* The native character set is known to be compatible with 55 the C locale. The following test isn't perfect, but it's good 56 enough in practice, as only ASCII and EBCDIC are in common use 57 and this test correctly accepts ASCII and rejects EBCDIC. */ 58 enum { native_c_charset = 59 ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12 60 && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35 61 && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41 62 && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46 63 && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59 64 && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65 65 && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94 66 && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124 67 && '}' == 125 && '~' == 126) 68 }; 69 70 if (!native_c_charset || multibyte) 71 return false; 72 73 /* As a heuristic, use strcoll to compare native character order. 74 If this agrees with byte order the locale should be simple. 75 This heuristic should work for all known practical locales, 76 although it would be invalid for artificially-constructed locales 77 where the native order is the collating-sequence order but there 78 are multi-character collating elements. */ 79 for (int i = 0; i < UCHAR_MAX; i++) 80 if (0 <= strcoll (((char []) {i, 0}), ((char []) {i + 1, 0}))) 81 return false; 82 83 return true; 84 } 85 86 /* Initialize *LOCALEINFO from the current locale. */ 87 88 void 89 init_localeinfo (struct localeinfo *localeinfo) 90 { 91 localeinfo->multibyte = MB_CUR_MAX > 1; 92 localeinfo->simple = using_simple_locale (localeinfo->multibyte); 93 localeinfo->using_utf8 = is_using_utf8 (); 94 95 for (int i = CHAR_MIN; i <= CHAR_MAX; i++) 96 { 97 char c = i; 98 unsigned char uc = i; 99 mbstate_t s = {0}; 100 wchar_t wc; 101 size_t len = mbrtowc (&wc, &c, 1, &s); 102 localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len; 103 localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF; 104 } 105 } 106 107 /* The set of wchar_t values C such that there's a useful locale 108 somewhere where C != towupper (C) && C != towlower (towupper (C)). 109 For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because 110 towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and 111 towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */ 112 static short const lonesome_lower[] = 113 { 114 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345, 115 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1, 116 117 /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase 118 counterpart in locales predating Unicode 4.0.0 (April 2003). */ 119 0x03F2, 120 121 0x03F5, 0x1E9B, 0x1FBE, 122 }; 123 124 /* Verify that the worst case fits. This is 1 for towupper, 1 for 125 towlower, and 1 for each entry in LONESOME_LOWER. */ 126 verify (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower 127 <= CASE_FOLDED_BUFSIZE); 128 129 /* Find the characters equal to C after case-folding, other than C 130 itself, and store them into FOLDED. Return the number of characters 131 stored; this is zero if C is WEOF. */ 132 133 int 134 case_folded_counterparts (wint_t c, wchar_t folded[CASE_FOLDED_BUFSIZE]) 135 { 136 int i; 137 int n = 0; 138 wint_t uc = towupper (c); 139 wint_t lc = towlower (uc); 140 if (uc != c) 141 folded[n++] = uc; 142 if (lc != uc && lc != c && towupper (lc) == uc) 143 folded[n++] = lc; 144 for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++) 145 { 146 wint_t li = lonesome_lower[i]; 147 if (li != lc && li != uc && li != c && towupper (li) == uc) 148 folded[n++] = li; 149 } 150 return n; 151 } 152