1 /* kwsearch.c - searching subroutines using kwset for grep.
2 Copyright 1992, 1998, 2000, 2007, 2009-2020 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17 02110-1301, USA. */
18
19 /* Written August 1992 by Mike Haertel. */
20
21 #include <config.h>
22 #include "search.h"
23
24 /* A compiled -F pattern list. */
25
26 struct kwsearch
27 {
28 /* The kwset for this pattern list. */
29 kwset_t kwset;
30
31 /* The number of user-specified patterns. This is less than
32 'kwswords (kwset)' when some extra one-character words have been
33 appended, one for each troublesome character that will require a
34 DFA search. */
35 ptrdiff_t words;
36
37 /* The user's pattern and its size in bytes. */
38 char *pattern;
39 size_t size;
40
41 /* The user's pattern compiled as a regular expression,
42 or null if it has not been compiled. */
43 void *re;
44 };
45
46 /* Compile the -F style PATTERN, containing SIZE bytes. Return a
47 description of the compiled pattern. */
48
49 void *
Fcompile(char * pattern,size_t size,reg_syntax_t ignored)50 Fcompile (char *pattern, size_t size, reg_syntax_t ignored)
51 {
52 kwset_t kwset;
53 ptrdiff_t total = size;
54 char *buf = NULL;
55 size_t bufalloc = 0;
56
57 kwset = kwsinit (true);
58
59 char const *p = pattern;
60 do
61 {
62 ptrdiff_t len;
63 char const *sep = memchr (p, '\n', total);
64 if (sep)
65 {
66 len = sep - p;
67 sep++;
68 total -= (len + 1);
69 }
70 else
71 {
72 len = total;
73 total = 0;
74 }
75
76 if (match_lines)
77 {
78 if (eolbyte == '\n' && pattern < p && sep)
79 p--;
80 else
81 {
82 if (bufalloc < len + 2)
83 {
84 free (buf);
85 bufalloc = len + 2;
86 buf = x2realloc (NULL, &bufalloc);
87 buf[0] = eolbyte;
88 }
89 memcpy (buf + 1, p, len);
90 buf[len + 1] = eolbyte;
91 p = buf;
92 }
93 len += 2;
94 }
95 kwsincr (kwset, p, len);
96
97 p = sep;
98 }
99 while (p);
100
101 free (buf);
102 ptrdiff_t words = kwswords (kwset);
103
104 if (match_icase)
105 {
106 /* For each pattern character C that has a case folded
107 counterpart F that is multibyte and so cannot easily be
108 implemented via translating a single byte, append a pattern
109 containing just F. That way, if the data contains F, the
110 matcher can fall back on DFA. For example, if C is 'i' and
111 the locale is en_US.utf8, append a pattern containing just
112 the character U+0131 (LATIN SMALL LETTER DOTLESS I), so that
113 Fexecute will use a DFA if the data contain U+0131. */
114 mbstate_t mbs = { 0 };
115 char checked[NCHAR] = {0,};
116 for (p = pattern; p < pattern + size; p++)
117 {
118 unsigned char c = *p;
119 if (checked[c])
120 continue;
121 checked[c] = true;
122
123 wint_t wc = localeinfo.sbctowc[c];
124 wchar_t folded[CASE_FOLDED_BUFSIZE];
125
126 for (int i = case_folded_counterparts (wc, folded); 0 <= --i; )
127 {
128 char s[MB_LEN_MAX];
129 int nbytes = wcrtomb (s, folded[i], &mbs);
130 if (1 < nbytes)
131 kwsincr (kwset, s, nbytes);
132 }
133 }
134 }
135
136 kwsprep (kwset);
137
138 struct kwsearch *kwsearch = xmalloc (sizeof *kwsearch);
139 kwsearch->kwset = kwset;
140 kwsearch->words = words;
141 kwsearch->pattern = pattern;
142 kwsearch->size = size;
143 kwsearch->re = NULL;
144 return kwsearch;
145 }
146
147 /* Use the compiled pattern VCP to search the buffer BUF of size SIZE.
148 If found, return the offset of the first match and store its
149 size into *MATCH_SIZE. If not found, return SIZE_MAX.
150 If START_PTR is nonnull, start searching there. */
151 size_t
Fexecute(void * vcp,char const * buf,size_t size,size_t * match_size,char const * start_ptr)152 Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size,
153 char const *start_ptr)
154 {
155 char const *beg, *end, *mb_start;
156 ptrdiff_t len;
157 char eol = eolbyte;
158 struct kwsmatch kwsmatch;
159 size_t ret_val;
160 bool mb_check;
161 bool longest;
162 struct kwsearch *kwsearch = vcp;
163 kwset_t kwset = kwsearch->kwset;
164 size_t mbclen;
165
166 if (match_lines)
167 mb_check = longest = false;
168 else
169 {
170 mb_check = localeinfo.multibyte & !localeinfo.using_utf8;
171 longest = mb_check | !!start_ptr | match_words;
172 }
173
174 for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
175 {
176 ptrdiff_t offset = kwsexec (kwset, beg - match_lines,
177 buf + size - beg + match_lines, &kwsmatch,
178 longest);
179 if (offset < 0)
180 break;
181 len = kwsmatch.size[0] - 2 * match_lines;
182
183 if (kwsearch->words <= kwsmatch.index)
184 {
185 /* The data contain a multibyte character that matches
186 some pattern character that is a case folded counterpart.
187 Since the kwset code cannot handle this case, fall back
188 on the DFA code, which can. */
189 if (! kwsearch->re)
190 {
191 fgrep_to_grep_pattern (&kwsearch->pattern, &kwsearch->size);
192 kwsearch->re = GEAcompile (kwsearch->pattern, kwsearch->size,
193 RE_SYNTAX_GREP);
194 }
195 return EGexecute (kwsearch->re, buf, size, match_size, start_ptr);
196 }
197
198 mbclen = 0;
199 if (mb_check
200 && mb_goback (&mb_start, &mbclen, beg + offset, buf + size) != 0)
201 {
202 /* We have matched a single byte that is not at the beginning of a
203 multibyte character. mb_goback has advanced MB_START past that
204 multibyte character. Now, we want to position BEG so that the
205 next kwsexec search starts there. Thus, to compensate for the
206 for-loop's BEG++, above, subtract one here. This code is
207 unusually hard to reach, and exceptionally, let's show how to
208 trigger it here:
209
210 printf '\203AA\n'|LC_ALL=ja_JP.SHIFT_JIS src/grep -F A
211
212 That assumes the named locale is installed.
213 Note that your system's shift-JIS locale may have a different
214 name, possibly including "sjis". */
215 beg = mb_start - 1;
216 continue;
217 }
218 beg += offset;
219 if (!!start_ptr & !match_words)
220 goto success_in_beg_and_len;
221 if (match_lines)
222 {
223 len += start_ptr == NULL;
224 goto success_in_beg_and_len;
225 }
226 if (! match_words)
227 goto success;
228
229 /* We need a preceding mb_start pointer. Use the beginning of line
230 if there is a preceding newline. */
231 if (mbclen == 0)
232 {
233 char const *nl = memrchr (mb_start, eol, beg - mb_start);
234 if (nl)
235 mb_start = nl + 1;
236 }
237
238 /* Succeed if neither the preceding nor the following character is a
239 word constituent. If the preceding is not, yet the following
240 character IS a word constituent, keep trying with shorter matches. */
241 if (mbclen > 0
242 ? ! wordchar_next (beg - mbclen, buf + size)
243 : ! wordchar_prev (mb_start, beg, buf + size))
244 for (;;)
245 {
246 if (! wordchar_next (beg + len, buf + size))
247 {
248 if (start_ptr)
249 goto success_in_beg_and_len;
250 else
251 goto success;
252 }
253 if (!start_ptr && !localeinfo.multibyte)
254 {
255 if (! kwsearch->re)
256 {
257 fgrep_to_grep_pattern (&kwsearch->pattern, &kwsearch->size);
258 kwsearch->re = GEAcompile (kwsearch->pattern,
259 kwsearch->size,
260 RE_SYNTAX_GREP);
261 }
262 end = memchr (beg + len, eol, (buf + size) - (beg + len));
263 end = end ? end + 1 : buf + size;
264 if (EGexecute (kwsearch->re, beg, end - beg, match_size, NULL)
265 != (size_t) -1)
266 goto success_match_words;
267 beg = end - 1;
268 break;
269 }
270 if (!len)
271 break;
272 offset = kwsexec (kwset, beg, --len, &kwsmatch, true);
273 if (offset != 0)
274 break;
275 len = kwsmatch.size[0];
276 }
277
278 /* No word match was found at BEG. Skip past word constituents,
279 since they cannot precede the next match and not skipping
280 them could make things much slower. */
281 beg += wordchars_size (beg, buf + size);
282 mb_start = beg;
283 } /* for (beg in buf) */
284
285 return -1;
286
287 success:
288 end = memchr (beg + len, eol, (buf + size) - (beg + len));
289 end = end ? end + 1 : buf + size;
290 success_match_words:
291 beg = memrchr (buf, eol, beg - buf);
292 beg = beg ? beg + 1 : buf;
293 len = end - beg;
294 success_in_beg_and_len:;
295 size_t off = beg - buf;
296
297 *match_size = len;
298 ret_val = off;
299 return ret_val;
300 }
301