1 /* searchutils.c - helper subroutines for grep's matchers. 2 Copyright 1992, 1998, 2000, 2007, 2009-2012 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 3, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 17 02110-1301, USA. */ 18 19 #include <config.h> 20 #include <assert.h> 21 #include "search.h" 22 23 #define NCHAR (UCHAR_MAX + 1) 24 25 void 26 kwsinit (kwset_t *kwset) 27 { 28 static char trans[NCHAR]; 29 int i; 30 31 if (match_icase && MB_CUR_MAX == 1) 32 { 33 for (i = 0; i < NCHAR; ++i) 34 trans[i] = tolower (i); 35 36 *kwset = kwsalloc (trans); 37 } 38 else 39 *kwset = kwsalloc (NULL); 40 41 if (!*kwset) 42 xalloc_die (); 43 } 44 45 #if MBS_SUPPORT 46 /* Convert the *N-byte string, BEG, to lower-case, and write the 47 NUL-terminated result into malloc'd storage. Upon success, set *N 48 to the length (in bytes) of the resulting string (not including the 49 trailing NUL byte), and return a pointer to the lower-case string. 50 Upon memory allocation failure, this function exits. 51 Note that on input, *N must be larger than zero. 52 53 Note that while this function returns a pointer to malloc'd storage, 54 the caller must not free it, since this function retains a pointer 55 to the buffer and reuses it on any subsequent call. As a consequence, 56 this function is not thread-safe. 

   When each character in the lower-case result string has the same length
   as the corresponding character in the input string, set *LEN_MAP_P
   to NULL.  Otherwise, set it to a malloc'd buffer (like the returned
   buffer, this must not be freed by caller) of the same length as the
   result string.  (*LEN_MAP_P)[J] is the byte length of the character in
   BEG that formed byte J of the result, minus the byte length of its
   lower-case form.  It is usually zero.  For the upper-case Turkish
   I-with-dot it is 1, since the upper-case character occupies two bytes,
   while the lower-case one occupies only one byte.  For the
   Turkish-I-without-dot in the tr_TR.utf8 locale, it is -1 because the
   lower-case representation is one byte longer than the original.  When
   the lower-case representation occupies two or more bytes, we have two
   or more slots in *LEN_MAP_P for that character.  We store the
   difference in the first one and 0's in any remaining slots.

   This map is used by the caller to convert offset,length pairs that
   reference the lower-case result to numbers that refer to the matched
   part of the original buffer.
*/ 75 76 char * 77 mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p) 78 { 79 static char *out; 80 static mb_len_map_t *len_map; 81 static size_t outalloc; 82 size_t outlen, mb_cur_max; 83 mbstate_t is, os; 84 const char *end; 85 char *p; 86 mb_len_map_t *m; 87 bool lengths_differ = false; 88 89 if (*n > outalloc || outalloc == 0) 90 { 91 outalloc = MAX(1, *n); 92 out = xrealloc (out, outalloc); 93 len_map = xrealloc (len_map, outalloc); 94 } 95 96 /* appease clang-2.6 */ 97 assert (out); 98 assert (len_map); 99 if (*n == 0) 100 return out; 101 102 memset (&is, 0, sizeof (is)); 103 memset (&os, 0, sizeof (os)); 104 end = beg + *n; 105 106 mb_cur_max = MB_CUR_MAX; 107 p = out; 108 m = len_map; 109 outlen = 0; 110 while (beg < end) 111 { 112 wchar_t wc; 113 size_t mbclen = mbrtowc (&wc, beg, end - beg, &is); 114 if (outlen + mb_cur_max >= outalloc) 115 { 116 size_t dm = m - len_map; 117 out = x2nrealloc (out, &outalloc, 1); 118 len_map = xrealloc (len_map, outalloc); 119 p = out + outlen; 120 m = len_map + dm; 121 } 122 123 if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) 124 { 125 /* An invalid sequence, or a truncated multi-octet character. 126 We treat it as a single-octet character. */ 127 *m++ = 0; 128 *p++ = *beg++; 129 outlen++; 130 memset (&is, 0, sizeof (is)); 131 memset (&os, 0, sizeof (os)); 132 } 133 else 134 { 135 beg += mbclen; 136 size_t ombclen = wcrtomb (p, towlower ((wint_t) wc), &os); 137 *m = mbclen - ombclen; 138 memset (m + 1, 0, ombclen - 1); 139 m += ombclen; 140 p += ombclen; 141 outlen += ombclen; 142 lengths_differ |= (mbclen != ombclen); 143 } 144 } 145 146 *len_map_p = lengths_differ ? len_map : NULL; 147 *n = p - out; 148 *p = 0; 149 return out; 150 } 151 152 153 bool 154 is_mb_middle (const char **good, const char *buf, const char *end, 155 size_t match_len) 156 { 157 const char *p = *good; 158 const char *prev = p; 159 mbstate_t cur_state; 160 161 /* TODO: can be optimized for UTF-8. 
*/ 162 memset(&cur_state, 0, sizeof(mbstate_t)); 163 while (p < buf) 164 { 165 size_t mbclen = mbrlen(p, end - p, &cur_state); 166 167 /* Store the beginning of the previous complete multibyte character. */ 168 if (mbclen != (size_t) -2) 169 prev = p; 170 171 if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) 172 { 173 /* An invalid sequence, or a truncated multibyte character. 174 We treat it as a single byte character. */ 175 mbclen = 1; 176 memset(&cur_state, 0, sizeof cur_state); 177 } 178 p += mbclen; 179 } 180 181 *good = prev; 182 183 if (p > buf) 184 return true; 185 186 /* P == BUF here. */ 187 return 0 < match_len && match_len < mbrlen (p, end - p, &cur_state); 188 } 189 #endif /* MBS_SUPPORT */ 190