1 /* kwsearch.c - searching subroutines using kwset for grep. 2 Copyright 1992, 1998, 2000, 2007, 2009-2020 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 3, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 17 02110-1301, USA. */ 18 19 /* Written August 1992 by Mike Haertel. */ 20 21 #include <config.h> 22 #include "search.h" 23 24 /* A compiled -F pattern list. */ 25 26 struct kwsearch 27 { 28 /* The kwset for this pattern list. */ 29 kwset_t kwset; 30 31 /* The number of user-specified patterns. This is less than 32 'kwswords (kwset)' when some extra one-character words have been 33 appended, one for each troublesome character that will require a 34 DFA search. */ 35 ptrdiff_t words; 36 37 /* The user's pattern and its size in bytes. */ 38 char *pattern; 39 size_t size; 40 41 /* The user's pattern compiled as a regular expression, 42 or null if it has not been compiled. */ 43 void *re; 44 }; 45 46 /* Compile the -F style PATTERN, containing SIZE bytes. Return a 47 description of the compiled pattern. */ 48 49 void * 50 Fcompile (char *pattern, size_t size, reg_syntax_t ignored) 51 { 52 kwset_t kwset; 53 ptrdiff_t total = size; 54 char *buf = NULL; 55 size_t bufalloc = 0; 56 57 kwset = kwsinit (true); 58 59 char const *p = pattern; 60 do 61 { 62 ptrdiff_t len; 63 char const *sep = memchr (p, '\n', total); 64 if (sep) 65 { 66 len = sep - p; 67 sep++; 68 total -= (len + 1); 69 } 70 else 71 { 72 len = total; 73 total = 0; 74 } 75 76 if (match_lines) 77 { 78 if (eolbyte == '\n' && pattern < p && sep) 79 p--; 80 else 81 { 82 if (bufalloc < len + 2) 83 { 84 free (buf); 85 bufalloc = len + 2; 86 buf = x2realloc (NULL, &bufalloc); 87 buf[0] = eolbyte; 88 } 89 memcpy (buf + 1, p, len); 90 buf[len + 1] = eolbyte; 91 p = buf; 92 } 93 len += 2; 94 } 95 kwsincr (kwset, p, len); 96 97 p = sep; 98 } 99 while (p); 100 101 free (buf); 102 ptrdiff_t words = kwswords (kwset); 103 104 if (match_icase) 105 { 106 /* For each pattern character C that has a case folded 107 counterpart F that is multibyte and so cannot easily be 108 implemented via translating a single byte, append a pattern 109 containing just F. That way, if the data contains F, the 110 matcher can fall back on DFA. For example, if C is 'i' and 111 the locale is en_US.utf8, append a pattern containing just 112 the character U+0131 (LATIN SMALL LETTER DOTLESS I), so that 113 Fexecute will use a DFA if the data contain U+0131. */ 114 mbstate_t mbs = { 0 }; 115 char checked[NCHAR] = {0,}; 116 for (p = pattern; p < pattern + size; p++) 117 { 118 unsigned char c = *p; 119 if (checked[c]) 120 continue; 121 checked[c] = true; 122 123 wint_t wc = localeinfo.sbctowc[c]; 124 wchar_t folded[CASE_FOLDED_BUFSIZE]; 125 126 for (int i = case_folded_counterparts (wc, folded); 0 <= --i; ) 127 { 128 char s[MB_LEN_MAX]; 129 int nbytes = wcrtomb (s, folded[i], &mbs); 130 if (1 < nbytes) 131 kwsincr (kwset, s, nbytes); 132 } 133 } 134 } 135 136 kwsprep (kwset); 137 138 struct kwsearch *kwsearch = xmalloc (sizeof *kwsearch); 139 kwsearch->kwset = kwset; 140 kwsearch->words = words; 141 kwsearch->pattern = pattern; 142 kwsearch->size = size; 143 kwsearch->re = NULL; 144 return kwsearch; 145 } 146 147 /* Use the compiled pattern VCP to search the buffer BUF of size SIZE. 148 If found, return the offset of the first match and store its 149 size into *MATCH_SIZE. If not found, return SIZE_MAX. 150 If START_PTR is nonnull, start searching there. */ 151 size_t 152 Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size, 153 char const *start_ptr) 154 { 155 char const *beg, *end, *mb_start; 156 ptrdiff_t len; 157 char eol = eolbyte; 158 struct kwsmatch kwsmatch; 159 size_t ret_val; 160 bool mb_check; 161 bool longest; 162 struct kwsearch *kwsearch = vcp; 163 kwset_t kwset = kwsearch->kwset; 164 size_t mbclen; 165 166 if (match_lines) 167 mb_check = longest = false; 168 else 169 { 170 mb_check = localeinfo.multibyte & !localeinfo.using_utf8; 171 longest = mb_check | !!start_ptr | match_words; 172 } 173 174 for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++) 175 { 176 ptrdiff_t offset = kwsexec (kwset, beg - match_lines, 177 buf + size - beg + match_lines, &kwsmatch, 178 longest); 179 if (offset < 0) 180 break; 181 len = kwsmatch.size[0] - 2 * match_lines; 182 183 if (kwsearch->words <= kwsmatch.index) 184 { 185 /* The data contain a multibyte character that matches 186 some pattern character that is a case folded counterpart. 187 Since the kwset code cannot handle this case, fall back 188 on the DFA code, which can. */ 189 if (! kwsearch->re) 190 { 191 fgrep_to_grep_pattern (&kwsearch->pattern, &kwsearch->size); 192 kwsearch->re = GEAcompile (kwsearch->pattern, kwsearch->size, 193 RE_SYNTAX_GREP); 194 } 195 return EGexecute (kwsearch->re, buf, size, match_size, start_ptr); 196 } 197 198 mbclen = 0; 199 if (mb_check 200 && mb_goback (&mb_start, &mbclen, beg + offset, buf + size) != 0) 201 { 202 /* We have matched a single byte that is not at the beginning of a 203 multibyte character. mb_goback has advanced MB_START past that 204 multibyte character. Now, we want to position BEG so that the 205 next kwsexec search starts there. Thus, to compensate for the 206 for-loop's BEG++, above, subtract one here. This code is 207 unusually hard to reach, and exceptionally, let's show how to 208 trigger it here: 209 210 printf '\203AA\n'|LC_ALL=ja_JP.SHIFT_JIS src/grep -F A 211 212 That assumes the named locale is installed. 213 Note that your system's shift-JIS locale may have a different 214 name, possibly including "sjis". */ 215 beg = mb_start - 1; 216 continue; 217 } 218 beg += offset; 219 if (!!start_ptr & !match_words) 220 goto success_in_beg_and_len; 221 if (match_lines) 222 { 223 len += start_ptr == NULL; 224 goto success_in_beg_and_len; 225 } 226 if (! match_words) 227 goto success; 228 229 /* We need a preceding mb_start pointer. Use the beginning of line 230 if there is a preceding newline. */ 231 if (mbclen == 0) 232 { 233 char const *nl = memrchr (mb_start, eol, beg - mb_start); 234 if (nl) 235 mb_start = nl + 1; 236 } 237 238 /* Succeed if neither the preceding nor the following character is a 239 word constituent. If the preceding is not, yet the following 240 character IS a word constituent, keep trying with shorter matches. */ 241 if (mbclen > 0 242 ? ! wordchar_next (beg - mbclen, buf + size) 243 : ! wordchar_prev (mb_start, beg, buf + size)) 244 for (;;) 245 { 246 if (! wordchar_next (beg + len, buf + size)) 247 { 248 if (start_ptr) 249 goto success_in_beg_and_len; 250 else 251 goto success; 252 } 253 if (!start_ptr && !localeinfo.multibyte) 254 { 255 if (! kwsearch->re) 256 { 257 fgrep_to_grep_pattern (&kwsearch->pattern, &kwsearch->size); 258 kwsearch->re = GEAcompile (kwsearch->pattern, 259 kwsearch->size, 260 RE_SYNTAX_GREP); 261 } 262 end = memchr (beg + len, eol, (buf + size) - (beg + len)); 263 end = end ? end + 1 : buf + size; 264 if (EGexecute (kwsearch->re, beg, end - beg, match_size, NULL) 265 != (size_t) -1) 266 goto success_match_words; 267 beg = end - 1; 268 break; 269 } 270 if (!len) 271 break; 272 offset = kwsexec (kwset, beg, --len, &kwsmatch, true); 273 if (offset != 0) 274 break; 275 len = kwsmatch.size[0]; 276 } 277 278 /* No word match was found at BEG. Skip past word constituents, 279 since they cannot precede the next match and not skipping 280 them could make things much slower. */ 281 beg += wordchars_size (beg, buf + size); 282 mb_start = beg; 283 } /* for (beg in buf) */ 284 285 return -1; 286 287 success: 288 end = memchr (beg + len, eol, (buf + size) - (beg + len)); 289 end = end ? end + 1 : buf + size; 290 success_match_words: 291 beg = memrchr (buf, eol, beg - buf); 292 beg = beg ? beg + 1 : buf; 293 len = end - beg; 294 success_in_beg_and_len:; 295 size_t off = beg - buf; 296 297 *match_size = len; 298 ret_val = off; 299 return ret_val; 300 } 301