1 /* searchutils.c - helper subroutines for grep's matchers. 2 Copyright 1992, 1998, 2000, 2007, 2009-2015 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 3, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 17 02110-1301, USA. */ 18 19 #include <config.h> 20 21 #define SEARCH_INLINE _GL_EXTERN_INLINE 22 #define SYSTEM_INLINE _GL_EXTERN_INLINE 23 #include "search.h" 24 25 #include <assert.h> 26 27 #define NCHAR (UCHAR_MAX + 1) 28 29 size_t mbclen_cache[NCHAR]; 30 31 void 32 kwsinit (kwset_t *kwset) 33 { 34 static char trans[NCHAR]; 35 int i; 36 37 if (match_icase && MB_CUR_MAX == 1) 38 { 39 for (i = 0; i < NCHAR; ++i) 40 trans[i] = toupper (i); 41 42 *kwset = kwsalloc (trans); 43 } 44 else 45 *kwset = kwsalloc (NULL); 46 47 if (!*kwset) 48 xalloc_die (); 49 } 50 51 /* Convert BEG, an *N-byte string, to uppercase, and write the 52 NUL-terminated result into malloc'd storage. Upon success, set *N 53 to the length (in bytes) of the resulting string (not including the 54 trailing NUL byte), and return a pointer to the uppercase string. 55 Upon memory allocation failure, exit. *N must be positive. 56 57 Although this function returns a pointer to malloc'd storage, 58 the caller must not free it, since this function retains a pointer 59 to the buffer and reuses it on any subsequent call. As a consequence, 60 this function is not thread-safe. 61 62 When each character in the uppercase result string has the same length 63 as the corresponding character in the input string, set *LEN_MAP_P 64 to NULL. Otherwise, set it to a malloc'd buffer (like the returned 65 buffer, this must not be freed by caller) of the same length as the 66 result string. (*LEN_MAP_P)[J] is the change in byte-length of the 67 character in BEG that formed byte J of the result as it was converted to 68 uppercase. It is usually zero. For lowercase Turkish dotless I it 69 is -1, since the lowercase input occupies two bytes, while the 70 uppercase output occupies only one byte. For lowercase I in the 71 tr_TR.utf8 locale, it is 1 because the uppercase Turkish dotted I 72 is one byte longer than the original. When that happens, we have two 73 or more slots in *LEN_MAP_P for each such character. We store the 74 difference in the first one and 0's in any remaining slots. 75 76 This map is used by the caller to convert offset,length pairs that 77 reference the uppercase result to numbers that refer to the matched 78 part of the original buffer. */ 79 80 char * 81 mbtoupper (const char *beg, size_t *n, mb_len_map_t **len_map_p) 82 { 83 static char *out; 84 static mb_len_map_t *len_map; 85 static size_t outalloc; 86 size_t outlen, mb_cur_max; 87 mbstate_t is, os; 88 const char *end; 89 char *p; 90 mb_len_map_t *m; 91 bool lengths_differ = false; 92 93 if (*n > outalloc || outalloc == 0) 94 { 95 outalloc = MAX (1, *n); 96 out = xrealloc (out, outalloc); 97 len_map = xrealloc (len_map, outalloc); 98 } 99 100 /* appease clang-2.6 */ 101 assert (out); 102 assert (len_map); 103 if (*n == 0) 104 return out; 105 106 memset (&is, 0, sizeof (is)); 107 memset (&os, 0, sizeof (os)); 108 end = beg + *n; 109 110 mb_cur_max = MB_CUR_MAX; 111 p = out; 112 m = len_map; 113 outlen = 0; 114 while (beg < end) 115 { 116 wchar_t wc; 117 size_t mbclen = mbrtowc (&wc, beg, end - beg, &is); 118 #ifdef __CYGWIN__ 119 /* Handle a UTF-8 sequence for a character beyond the base plane. 120 Cygwin's wchar_t is UTF-16, as in the underlying OS. This 121 results in surrogate pairs which need some extra attention. */ 122 wint_t wci = 0; 123 if (mbclen == 3 && (wc & 0xdc00) == 0xd800) 124 { 125 /* We got the start of a 4 byte UTF-8 sequence. This is returned 126 as a UTF-16 surrogate pair. The first call to mbrtowc returned 3 127 and wc has been set to a high surrogate value, now we're going 128 to fetch the matching low surrogate. This second call to mbrtowc 129 is supposed to return 1 to complete the 4 byte UTF-8 sequence. */ 130 wchar_t wc_2; 131 size_t mbclen_2 = mbrtowc (&wc_2, beg + mbclen, end - beg - mbclen, 132 &is); 133 if (mbclen_2 == 1 && (wc_2 & 0xdc00) == 0xdc00) 134 { 135 /* Match. Convert this to a 4 byte wint_t which constitutes 136 a 32-bit UTF-32 value. */ 137 wci = ( (((wint_t) (wc - 0xd800)) << 10) 138 | ((wint_t) (wc_2 - 0xdc00))) 139 + 0x10000; 140 ++mbclen; 141 } 142 else 143 { 144 /* Invalid UTF-8 sequence. */ 145 mbclen = (size_t) -1; 146 } 147 } 148 #endif 149 if (outlen + mb_cur_max >= outalloc) 150 { 151 size_t dm = m - len_map; 152 out = x2nrealloc (out, &outalloc, 1); 153 len_map = xrealloc (len_map, outalloc); 154 p = out + outlen; 155 m = len_map + dm; 156 } 157 158 if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) 159 { 160 /* An invalid sequence, or a truncated multi-octet character. 161 We treat it as a single-octet character. */ 162 *m++ = 0; 163 *p++ = *beg++; 164 outlen++; 165 memset (&is, 0, sizeof (is)); 166 memset (&os, 0, sizeof (os)); 167 } 168 else 169 { 170 size_t ombclen; 171 beg += mbclen; 172 #ifdef __CYGWIN__ 173 /* Handle Unicode characters beyond the base plane. */ 174 if (mbclen == 4) 175 { 176 /* towupper, taking wint_t (4 bytes), handles UCS-4 values. */ 177 wci = towupper (wci); 178 if (wci >= 0x10000) 179 { 180 wci -= 0x10000; 181 wc = (wci >> 10) | 0xd800; 182 /* No need to check the return value. When reading the 183 high surrogate, the return value will be 0 and only the 184 mbstate indicates that we're in the middle of reading a 185 surrogate pair. The next wcrtomb call reading the low 186 surrogate will then return 4 and reset the mbstate. */ 187 wcrtomb (p, wc, &os); 188 wc = (wci & 0x3ff) | 0xdc00; 189 } 190 else 191 { 192 wc = (wchar_t) wci; 193 } 194 ombclen = wcrtomb (p, wc, &os); 195 } 196 else 197 #endif 198 ombclen = wcrtomb (p, towupper (wc), &os); 199 *m = mbclen - ombclen; 200 memset (m + 1, 0, ombclen - 1); 201 m += ombclen; 202 p += ombclen; 203 outlen += ombclen; 204 lengths_differ |= (mbclen != ombclen); 205 } 206 } 207 208 *len_map_p = lengths_differ ? len_map : NULL; 209 *n = p - out; 210 *p = 0; 211 return out; 212 } 213 214 /* Initialize a cache of mbrlen values for each of its 1-byte inputs. */ 215 void 216 build_mbclen_cache (void) 217 { 218 int i; 219 220 for (i = CHAR_MIN; i <= CHAR_MAX; ++i) 221 { 222 char c = i; 223 unsigned char uc = i; 224 mbstate_t mbs = { 0 }; 225 size_t len = mbrlen (&c, 1, &mbs); 226 mbclen_cache[uc] = len ? len : 1; 227 } 228 } 229 230 /* In the buffer *MB_START, return the number of bytes needed to go 231 back from CUR to the previous boundary, where a "boundary" is the 232 start of a multibyte character or is an error-encoding byte. The 233 buffer ends at END (i.e., one past the address of the buffer's last 234 byte). If CUR is already at a boundary, return 0. If *MB_START is 235 greater than or equal to CUR, return the negative value CUR - *MB_START. 236 237 When returning zero, set *MB_START to CUR. When returning a 238 positive value, set *MB_START to the next boundary after CUR, or to 239 END if there is no such boundary. When returning a negative value, 240 leave *MB_START alone. */ 241 ptrdiff_t 242 mb_goback (char const **mb_start, char const *cur, char const *end) 243 { 244 const char *p = *mb_start; 245 const char *p0 = p; 246 mbstate_t cur_state; 247 248 memset (&cur_state, 0, sizeof cur_state); 249 250 while (p < cur) 251 { 252 size_t clen = mb_clen (p, end - p, &cur_state); 253 254 if ((size_t) -2 <= clen) 255 { 256 /* An invalid sequence, or a truncated multibyte character. 257 Treat it as a single byte character. */ 258 clen = 1; 259 memset (&cur_state, 0, sizeof cur_state); 260 } 261 p0 = p; 262 p += clen; 263 } 264 265 *mb_start = p; 266 return p == cur ? 0 : cur - p0; 267 } 268 269 /* In the buffer BUF, return the wide character that is encoded just 270 before CUR. The buffer ends at END. Return WEOF if there is no 271 wide character just before CUR. */ 272 wint_t 273 mb_prev_wc (char const *buf, char const *cur, char const *end) 274 { 275 if (cur == buf) 276 return WEOF; 277 char const *p = buf; 278 cur--; 279 cur -= mb_goback (&p, cur, end); 280 return mb_next_wc (cur, end); 281 } 282 283 /* Return the wide character that is encoded at CUR. The buffer ends 284 at END. Return WEOF if there is no wide character encoded at CUR. */ 285 wint_t 286 mb_next_wc (char const *cur, char const *end) 287 { 288 wchar_t wc; 289 mbstate_t mbs = { 0 }; 290 return (end - cur != 0 && mbrtowc (&wc, cur, end - cur, &mbs) < (size_t) -2 291 ? wc : WEOF); 292 } 293