1 /* searchutils.c - helper subroutines for grep's matchers. 2 Copyright 1992, 1998, 2000, 2007, 2009-2014 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 3, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 17 02110-1301, USA. */ 18 19 #include <config.h> 20 #include <assert.h> 21 #include "search.h" 22 23 #define NCHAR (UCHAR_MAX + 1) 24 25 static size_t mbclen_cache[NCHAR]; 26 27 void 28 kwsinit (kwset_t *kwset) 29 { 30 static char trans[NCHAR]; 31 int i; 32 33 if (match_icase && MB_CUR_MAX == 1) 34 { 35 for (i = 0; i < NCHAR; ++i) 36 trans[i] = toupper (i); 37 38 *kwset = kwsalloc (trans); 39 } 40 else 41 *kwset = kwsalloc (NULL); 42 43 if (!*kwset) 44 xalloc_die (); 45 } 46 47 /* Convert BEG, an *N-byte string, to uppercase, and write the 48 NUL-terminated result into malloc'd storage. Upon success, set *N 49 to the length (in bytes) of the resulting string (not including the 50 trailing NUL byte), and return a pointer to the uppercase string. 51 Upon memory allocation failure, exit. *N must be positive. 52 53 Although this function returns a pointer to malloc'd storage, 54 the caller must not free it, since this function retains a pointer 55 to the buffer and reuses it on any subsequent call. As a consequence, 56 this function is not thread-safe. 

   When each character in the uppercase result string has the same length
   as the corresponding character in the input string, set *LEN_MAP_P
   to NULL.  Otherwise, set it to a malloc'd buffer (like the returned
   buffer, this must not be freed by caller) of the same length as the
   result string.  (*LEN_MAP_P)[J] is the byte length of the character
   in BEG that formed byte J of the result, minus the byte length of
   that character after conversion to uppercase.  It is usually zero.
   For lowercase Turkish dotless I it is 1, since the lowercase input
   occupies two bytes, while the uppercase output occupies only one
   byte.  For lowercase I in the tr_TR.utf8 locale, it is -1 because
   the uppercase Turkish dotted I is one byte longer than the original.
   When that happens, we have two or more slots in *LEN_MAP_P for each
   such character.  We store the difference in the first one and 0's
   in any remaining slots.

   This map is used by the caller to convert offset,length pairs that
   reference the uppercase result to numbers that refer to the matched
   part of the original buffer.
*/ 75 76 char * 77 mbtoupper (const char *beg, size_t *n, mb_len_map_t **len_map_p) 78 { 79 static char *out; 80 static mb_len_map_t *len_map; 81 static size_t outalloc; 82 size_t outlen, mb_cur_max; 83 mbstate_t is, os; 84 const char *end; 85 char *p; 86 mb_len_map_t *m; 87 bool lengths_differ = false; 88 89 if (*n > outalloc || outalloc == 0) 90 { 91 outalloc = MAX (1, *n); 92 out = xrealloc (out, outalloc); 93 len_map = xrealloc (len_map, outalloc); 94 } 95 96 /* appease clang-2.6 */ 97 assert (out); 98 assert (len_map); 99 if (*n == 0) 100 return out; 101 102 memset (&is, 0, sizeof (is)); 103 memset (&os, 0, sizeof (os)); 104 end = beg + *n; 105 106 mb_cur_max = MB_CUR_MAX; 107 p = out; 108 m = len_map; 109 outlen = 0; 110 while (beg < end) 111 { 112 wchar_t wc; 113 size_t mbclen = mbrtowc (&wc, beg, end - beg, &is); 114 #ifdef __CYGWIN__ 115 /* Handle a UTF-8 sequence for a character beyond the base plane. 116 Cygwin's wchar_t is UTF-16, as in the underlying OS. This 117 results in surrogate pairs which need some extra attention. */ 118 wint_t wci = 0; 119 if (mbclen == 3 && (wc & 0xdc00) == 0xd800) 120 { 121 /* We got the start of a 4 byte UTF-8 sequence. This is returned 122 as a UTF-16 surrogate pair. The first call to mbrtowc returned 3 123 and wc has been set to a high surrogate value, now we're going 124 to fetch the matching low surrogate. This second call to mbrtowc 125 is supposed to return 1 to complete the 4 byte UTF-8 sequence. */ 126 wchar_t wc_2; 127 size_t mbclen_2 = mbrtowc (&wc_2, beg + mbclen, end - beg - mbclen, 128 &is); 129 if (mbclen_2 == 1 && (wc_2 & 0xdc00) == 0xdc00) 130 { 131 /* Match. Convert this to a 4 byte wint_t which constitutes 132 a 32-bit UTF-32 value. */ 133 wci = ( (((wint_t) (wc - 0xd800)) << 10) 134 | ((wint_t) (wc_2 - 0xdc00))) 135 + 0x10000; 136 ++mbclen; 137 } 138 else 139 { 140 /* Invalid UTF-8 sequence. 
*/ 141 mbclen = (size_t) -1; 142 } 143 } 144 #endif 145 if (outlen + mb_cur_max >= outalloc) 146 { 147 size_t dm = m - len_map; 148 out = x2nrealloc (out, &outalloc, 1); 149 len_map = xrealloc (len_map, outalloc); 150 p = out + outlen; 151 m = len_map + dm; 152 } 153 154 if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) 155 { 156 /* An invalid sequence, or a truncated multi-octet character. 157 We treat it as a single-octet character. */ 158 *m++ = 0; 159 *p++ = *beg++; 160 outlen++; 161 memset (&is, 0, sizeof (is)); 162 memset (&os, 0, sizeof (os)); 163 } 164 else 165 { 166 size_t ombclen; 167 beg += mbclen; 168 #ifdef __CYGWIN__ 169 /* Handle Unicode characters beyond the base plane. */ 170 if (mbclen == 4) 171 { 172 /* towupper, taking wint_t (4 bytes), handles UCS-4 values. */ 173 wci = towupper (wci); 174 if (wci >= 0x10000) 175 { 176 wci -= 0x10000; 177 wc = (wci >> 10) | 0xd800; 178 /* No need to check the return value. When reading the 179 high surrogate, the return value will be 0 and only the 180 mbstate indicates that we're in the middle of reading a 181 surrogate pair. The next wcrtomb call reading the low 182 surrogate will then return 4 and reset the mbstate. */ 183 wcrtomb (p, wc, &os); 184 wc = (wci & 0x3ff) | 0xdc00; 185 } 186 else 187 { 188 wc = (wchar_t) wci; 189 } 190 ombclen = wcrtomb (p, wc, &os); 191 } 192 else 193 #endif 194 ombclen = wcrtomb (p, towupper (wc), &os); 195 *m = mbclen - ombclen; 196 memset (m + 1, 0, ombclen - 1); 197 m += ombclen; 198 p += ombclen; 199 outlen += ombclen; 200 lengths_differ |= (mbclen != ombclen); 201 } 202 } 203 204 *len_map_p = lengths_differ ? len_map : NULL; 205 *n = p - out; 206 *p = 0; 207 return out; 208 } 209 210 /* Initialize a cache of mbrlen values for each of its 1-byte inputs. 
*/ 211 void 212 build_mbclen_cache (void) 213 { 214 int i; 215 216 for (i = CHAR_MIN; i <= CHAR_MAX; ++i) 217 { 218 char c = i; 219 unsigned char uc = i; 220 mbstate_t mbs = { 0 }; 221 mbclen_cache[uc] = mbrlen (&c, 1, &mbs); 222 } 223 } 224 225 /* In the buffer *MB_START, return the number of bytes needed to go 226 back from CUR to the previous boundary, where a "boundary" is the 227 start of a multibyte character or is an error-encoding byte. The 228 buffer ends at END (i.e., one past the address of the buffer's last 229 byte). If CUR is already at a boundary, return 0. If *MB_START is 230 greater than or equal to CUR, return the negative value CUR - *MB_START. 231 232 When returning zero, set *MB_START to CUR. When returning a 233 positive value, set *MB_START to the next boundary after CUR, or to 234 END if there is no such boundary. When returning a negative value, 235 leave *MB_START alone. */ 236 ptrdiff_t 237 mb_goback (char const **mb_start, char const *cur, char const *end) 238 { 239 const char *p = *mb_start; 240 const char *p0 = p; 241 mbstate_t cur_state; 242 243 memset (&cur_state, 0, sizeof cur_state); 244 245 while (p < cur) 246 { 247 size_t mbclen = mbclen_cache[to_uchar (*p)]; 248 249 if (mbclen == (size_t) -2) 250 mbclen = mbrlen (p, end - p, &cur_state); 251 252 if (! (0 < mbclen && mbclen < (size_t) -2)) 253 { 254 /* An invalid sequence, or a truncated multibyte character, or 255 a null wide character. Treat it as a single byte character. */ 256 mbclen = 1; 257 memset (&cur_state, 0, sizeof cur_state); 258 } 259 p0 = p; 260 p += mbclen; 261 } 262 263 *mb_start = p; 264 return p == cur ? 0 : cur - p0; 265 } 266 267 /* In the buffer BUF, return the wide character that is encoded just 268 before CUR. The buffer ends at END. Return WEOF if there is no 269 wide character just before CUR. 
*/
wint_t
mb_prev_wc (char const *buf, char const *cur, char const *end)
{
  char const *scan_from = buf;
  char const *last_byte;

  /* Nothing precedes the start of the buffer.  */
  if (buf == cur)
    return WEOF;

  /* Take the byte just before CUR, back up from it by the distance
     mb_goback reports, and decode from there.  */
  last_byte = cur - 1;
  return mb_next_wc (last_byte - mb_goback (&scan_from, last_byte, end), end);
}

/* Decode and return the wide character that starts at CUR, looking no
   further than END (one past the last byte of the buffer).  Return
   WEOF if mbrtowc reports an invalid or incomplete sequence there.  */
wint_t
mb_next_wc (char const *cur, char const *end)
{
  wchar_t decoded;
  mbstate_t state = { 0 };
  size_t len = mbrtowc (&decoded, cur, end - cur, &state);

  /* (size_t) -1 and (size_t) -2 are mbrtowc's failure results.  */
  if (len == (size_t) -1 || len == (size_t) -2)
    return WEOF;
  return decoded;
}