195b7b453SJohn Marino /* kwsearch.c - searching subroutines using kwset for grep.
2*09d4459fSDaniel Fojt Copyright 1992, 1998, 2000, 2007, 2009-2020 Free Software Foundation, Inc.
395b7b453SJohn Marino
495b7b453SJohn Marino This program is free software; you can redistribute it and/or modify
595b7b453SJohn Marino it under the terms of the GNU General Public License as published by
695b7b453SJohn Marino the Free Software Foundation; either version 3, or (at your option)
795b7b453SJohn Marino any later version.
895b7b453SJohn Marino
995b7b453SJohn Marino This program is distributed in the hope that it will be useful,
1095b7b453SJohn Marino but WITHOUT ANY WARRANTY; without even the implied warranty of
1195b7b453SJohn Marino MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1295b7b453SJohn Marino GNU General Public License for more details.
1395b7b453SJohn Marino
1495b7b453SJohn Marino You should have received a copy of the GNU General Public License
1595b7b453SJohn Marino along with this program; if not, write to the Free Software
1695b7b453SJohn Marino Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
1795b7b453SJohn Marino 02110-1301, USA. */
1895b7b453SJohn Marino
1995b7b453SJohn Marino /* Written August 1992 by Mike Haertel. */
2095b7b453SJohn Marino
2195b7b453SJohn Marino #include <config.h>
2295b7b453SJohn Marino #include "search.h"
2395b7b453SJohn Marino
24*09d4459fSDaniel Fojt /* A compiled -F pattern list. */
25*09d4459fSDaniel Fojt
26*09d4459fSDaniel Fojt struct kwsearch
27680a9cb8SJohn Marino {
28*09d4459fSDaniel Fojt /* The kwset for this pattern list. */
29*09d4459fSDaniel Fojt kwset_t kwset;
3095b7b453SJohn Marino
31*09d4459fSDaniel Fojt /* The number of user-specified patterns. This is less than
32*09d4459fSDaniel Fojt 'kwswords (kwset)' when some extra one-character words have been
33*09d4459fSDaniel Fojt appended, one for each troublesome character that will require a
34*09d4459fSDaniel Fojt DFA search. */
35*09d4459fSDaniel Fojt ptrdiff_t words;
3695b7b453SJohn Marino
37*09d4459fSDaniel Fojt /* The user's pattern and its size in bytes. */
38*09d4459fSDaniel Fojt char *pattern;
39*09d4459fSDaniel Fojt size_t size;
40*09d4459fSDaniel Fojt
41*09d4459fSDaniel Fojt /* The user's pattern compiled as a regular expression,
42*09d4459fSDaniel Fojt or null if it has not been compiled. */
43*09d4459fSDaniel Fojt void *re;
44*09d4459fSDaniel Fojt };
45*09d4459fSDaniel Fojt
46*09d4459fSDaniel Fojt /* Compile the -F style PATTERN, containing SIZE bytes. Return a
47*09d4459fSDaniel Fojt description of the compiled pattern. */
48*09d4459fSDaniel Fojt
49*09d4459fSDaniel Fojt void *
Fcompile(char * pattern,size_t size,reg_syntax_t ignored)50*09d4459fSDaniel Fojt Fcompile (char *pattern, size_t size, reg_syntax_t ignored)
5195b7b453SJohn Marino {
52*09d4459fSDaniel Fojt kwset_t kwset;
53*09d4459fSDaniel Fojt ptrdiff_t total = size;
54*09d4459fSDaniel Fojt char *buf = NULL;
55*09d4459fSDaniel Fojt size_t bufalloc = 0;
5695b7b453SJohn Marino
57*09d4459fSDaniel Fojt kwset = kwsinit (true);
5895b7b453SJohn Marino
59dc7c36e4SJohn Marino char const *p = pattern;
6095b7b453SJohn Marino do
6195b7b453SJohn Marino {
62*09d4459fSDaniel Fojt ptrdiff_t len;
63680a9cb8SJohn Marino char const *sep = memchr (p, '\n', total);
64680a9cb8SJohn Marino if (sep)
6595b7b453SJohn Marino {
66680a9cb8SJohn Marino len = sep - p;
67680a9cb8SJohn Marino sep++;
68680a9cb8SJohn Marino total -= (len + 1);
6995b7b453SJohn Marino }
70680a9cb8SJohn Marino else
7195b7b453SJohn Marino {
72680a9cb8SJohn Marino len = total;
73680a9cb8SJohn Marino total = 0;
7495b7b453SJohn Marino }
7595b7b453SJohn Marino
76680a9cb8SJohn Marino if (match_lines)
77680a9cb8SJohn Marino {
78*09d4459fSDaniel Fojt if (eolbyte == '\n' && pattern < p && sep)
79*09d4459fSDaniel Fojt p--;
80*09d4459fSDaniel Fojt else
81*09d4459fSDaniel Fojt {
82*09d4459fSDaniel Fojt if (bufalloc < len + 2)
83*09d4459fSDaniel Fojt {
84*09d4459fSDaniel Fojt free (buf);
85*09d4459fSDaniel Fojt bufalloc = len + 2;
86*09d4459fSDaniel Fojt buf = x2realloc (NULL, &bufalloc);
87680a9cb8SJohn Marino buf[0] = eolbyte;
88*09d4459fSDaniel Fojt }
89680a9cb8SJohn Marino memcpy (buf + 1, p, len);
90680a9cb8SJohn Marino buf[len + 1] = eolbyte;
91680a9cb8SJohn Marino p = buf;
92*09d4459fSDaniel Fojt }
93680a9cb8SJohn Marino len += 2;
9495b7b453SJohn Marino }
95680a9cb8SJohn Marino kwsincr (kwset, p, len);
9695b7b453SJohn Marino
97680a9cb8SJohn Marino p = sep;
98680a9cb8SJohn Marino }
99680a9cb8SJohn Marino while (p);
100680a9cb8SJohn Marino
101*09d4459fSDaniel Fojt free (buf);
102*09d4459fSDaniel Fojt ptrdiff_t words = kwswords (kwset);
103*09d4459fSDaniel Fojt
104*09d4459fSDaniel Fojt if (match_icase)
105*09d4459fSDaniel Fojt {
106*09d4459fSDaniel Fojt /* For each pattern character C that has a case folded
107*09d4459fSDaniel Fojt counterpart F that is multibyte and so cannot easily be
108*09d4459fSDaniel Fojt implemented via translating a single byte, append a pattern
109*09d4459fSDaniel Fojt containing just F. That way, if the data contains F, the
110*09d4459fSDaniel Fojt matcher can fall back on DFA. For example, if C is 'i' and
111*09d4459fSDaniel Fojt the locale is en_US.utf8, append a pattern containing just
112*09d4459fSDaniel Fojt the character U+0131 (LATIN SMALL LETTER DOTLESS I), so that
113*09d4459fSDaniel Fojt Fexecute will use a DFA if the data contain U+0131. */
114*09d4459fSDaniel Fojt mbstate_t mbs = { 0 };
115*09d4459fSDaniel Fojt char checked[NCHAR] = {0,};
116*09d4459fSDaniel Fojt for (p = pattern; p < pattern + size; p++)
117*09d4459fSDaniel Fojt {
118*09d4459fSDaniel Fojt unsigned char c = *p;
119*09d4459fSDaniel Fojt if (checked[c])
120*09d4459fSDaniel Fojt continue;
121*09d4459fSDaniel Fojt checked[c] = true;
122*09d4459fSDaniel Fojt
123*09d4459fSDaniel Fojt wint_t wc = localeinfo.sbctowc[c];
124*09d4459fSDaniel Fojt wchar_t folded[CASE_FOLDED_BUFSIZE];
125*09d4459fSDaniel Fojt
126*09d4459fSDaniel Fojt for (int i = case_folded_counterparts (wc, folded); 0 <= --i; )
127*09d4459fSDaniel Fojt {
128*09d4459fSDaniel Fojt char s[MB_LEN_MAX];
129*09d4459fSDaniel Fojt int nbytes = wcrtomb (s, folded[i], &mbs);
130*09d4459fSDaniel Fojt if (1 < nbytes)
131*09d4459fSDaniel Fojt kwsincr (kwset, s, nbytes);
132*09d4459fSDaniel Fojt }
133*09d4459fSDaniel Fojt }
134680a9cb8SJohn Marino }
135680a9cb8SJohn Marino
136*09d4459fSDaniel Fojt kwsprep (kwset);
137*09d4459fSDaniel Fojt
138*09d4459fSDaniel Fojt struct kwsearch *kwsearch = xmalloc (sizeof *kwsearch);
139*09d4459fSDaniel Fojt kwsearch->kwset = kwset;
140*09d4459fSDaniel Fojt kwsearch->words = words;
141*09d4459fSDaniel Fojt kwsearch->pattern = pattern;
142*09d4459fSDaniel Fojt kwsearch->size = size;
143*09d4459fSDaniel Fojt kwsearch->re = NULL;
144*09d4459fSDaniel Fojt return kwsearch;
145*09d4459fSDaniel Fojt }
146*09d4459fSDaniel Fojt
147*09d4459fSDaniel Fojt /* Use the compiled pattern VCP to search the buffer BUF of size SIZE.
148*09d4459fSDaniel Fojt If found, return the offset of the first match and store its
149*09d4459fSDaniel Fojt size into *MATCH_SIZE. If not found, return SIZE_MAX.
150*09d4459fSDaniel Fojt If START_PTR is nonnull, start searching there. */
15195b7b453SJohn Marino size_t
Fexecute(void * vcp,char const * buf,size_t size,size_t * match_size,char const * start_ptr)152*09d4459fSDaniel Fojt Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size,
15395b7b453SJohn Marino char const *start_ptr)
15495b7b453SJohn Marino {
155*09d4459fSDaniel Fojt char const *beg, *end, *mb_start;
156*09d4459fSDaniel Fojt ptrdiff_t len;
15795b7b453SJohn Marino char eol = eolbyte;
15895b7b453SJohn Marino struct kwsmatch kwsmatch;
15995b7b453SJohn Marino size_t ret_val;
160*09d4459fSDaniel Fojt bool mb_check;
161*09d4459fSDaniel Fojt bool longest;
162*09d4459fSDaniel Fojt struct kwsearch *kwsearch = vcp;
163*09d4459fSDaniel Fojt kwset_t kwset = kwsearch->kwset;
164*09d4459fSDaniel Fojt size_t mbclen;
165*09d4459fSDaniel Fojt
166*09d4459fSDaniel Fojt if (match_lines)
167*09d4459fSDaniel Fojt mb_check = longest = false;
168*09d4459fSDaniel Fojt else
169*09d4459fSDaniel Fojt {
170*09d4459fSDaniel Fojt mb_check = localeinfo.multibyte & !localeinfo.using_utf8;
171*09d4459fSDaniel Fojt longest = mb_check | !!start_ptr | match_words;
172*09d4459fSDaniel Fojt }
17395b7b453SJohn Marino
17495b7b453SJohn Marino for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
17595b7b453SJohn Marino {
176*09d4459fSDaniel Fojt ptrdiff_t offset = kwsexec (kwset, beg - match_lines,
177*09d4459fSDaniel Fojt buf + size - beg + match_lines, &kwsmatch,
178*09d4459fSDaniel Fojt longest);
179*09d4459fSDaniel Fojt if (offset < 0)
180*09d4459fSDaniel Fojt break;
181dc7c36e4SJohn Marino len = kwsmatch.size[0] - 2 * match_lines;
182*09d4459fSDaniel Fojt
183*09d4459fSDaniel Fojt if (kwsearch->words <= kwsmatch.index)
184*09d4459fSDaniel Fojt {
185*09d4459fSDaniel Fojt /* The data contain a multibyte character that matches
186*09d4459fSDaniel Fojt some pattern character that is a case folded counterpart.
187*09d4459fSDaniel Fojt Since the kwset code cannot handle this case, fall back
188*09d4459fSDaniel Fojt on the DFA code, which can. */
189*09d4459fSDaniel Fojt if (! kwsearch->re)
190*09d4459fSDaniel Fojt {
191*09d4459fSDaniel Fojt fgrep_to_grep_pattern (&kwsearch->pattern, &kwsearch->size);
192*09d4459fSDaniel Fojt kwsearch->re = GEAcompile (kwsearch->pattern, kwsearch->size,
193*09d4459fSDaniel Fojt RE_SYNTAX_GREP);
194*09d4459fSDaniel Fojt }
195*09d4459fSDaniel Fojt return EGexecute (kwsearch->re, buf, size, match_size, start_ptr);
196*09d4459fSDaniel Fojt }
197*09d4459fSDaniel Fojt
198*09d4459fSDaniel Fojt mbclen = 0;
199*09d4459fSDaniel Fojt if (mb_check
200*09d4459fSDaniel Fojt && mb_goback (&mb_start, &mbclen, beg + offset, buf + size) != 0)
20195b7b453SJohn Marino {
202dc7c36e4SJohn Marino /* We have matched a single byte that is not at the beginning of a
203dc7c36e4SJohn Marino multibyte character. mb_goback has advanced MB_START past that
204dc7c36e4SJohn Marino multibyte character. Now, we want to position BEG so that the
205dc7c36e4SJohn Marino next kwsexec search starts there. Thus, to compensate for the
206dc7c36e4SJohn Marino for-loop's BEG++, above, subtract one here. This code is
207dc7c36e4SJohn Marino unusually hard to reach, and exceptionally, let's show how to
208dc7c36e4SJohn Marino trigger it here:
209dc7c36e4SJohn Marino
210dc7c36e4SJohn Marino printf '\203AA\n'|LC_ALL=ja_JP.SHIFT_JIS src/grep -F A
211dc7c36e4SJohn Marino
212dc7c36e4SJohn Marino That assumes the named locale is installed.
213dc7c36e4SJohn Marino Note that your system's shift-JIS locale may have a different
214dc7c36e4SJohn Marino name, possibly including "sjis". */
215dc7c36e4SJohn Marino beg = mb_start - 1;
21695b7b453SJohn Marino continue;
21795b7b453SJohn Marino }
21895b7b453SJohn Marino beg += offset;
219*09d4459fSDaniel Fojt if (!!start_ptr & !match_words)
22095b7b453SJohn Marino goto success_in_beg_and_len;
22195b7b453SJohn Marino if (match_lines)
222dc7c36e4SJohn Marino {
223dc7c36e4SJohn Marino len += start_ptr == NULL;
224680a9cb8SJohn Marino goto success_in_beg_and_len;
225dc7c36e4SJohn Marino }
226*09d4459fSDaniel Fojt if (! match_words)
227*09d4459fSDaniel Fojt goto success;
228*09d4459fSDaniel Fojt
229*09d4459fSDaniel Fojt /* We need a preceding mb_start pointer. Use the beginning of line
230*09d4459fSDaniel Fojt if there is a preceding newline. */
231*09d4459fSDaniel Fojt if (mbclen == 0)
23295b7b453SJohn Marino {
233*09d4459fSDaniel Fojt char const *nl = memrchr (mb_start, eol, beg - mb_start);
234*09d4459fSDaniel Fojt if (nl)
235*09d4459fSDaniel Fojt mb_start = nl + 1;
236*09d4459fSDaniel Fojt }
237*09d4459fSDaniel Fojt
238*09d4459fSDaniel Fojt /* Succeed if neither the preceding nor the following character is a
239*09d4459fSDaniel Fojt word constituent. If the preceding is not, yet the following
240*09d4459fSDaniel Fojt character IS a word constituent, keep trying with shorter matches. */
241*09d4459fSDaniel Fojt if (mbclen > 0
242*09d4459fSDaniel Fojt ? ! wordchar_next (beg - mbclen, buf + size)
243*09d4459fSDaniel Fojt : ! wordchar_prev (mb_start, beg, buf + size))
244*09d4459fSDaniel Fojt for (;;)
245*09d4459fSDaniel Fojt {
246*09d4459fSDaniel Fojt if (! wordchar_next (beg + len, buf + size))
247*09d4459fSDaniel Fojt {
248*09d4459fSDaniel Fojt if (start_ptr)
249*09d4459fSDaniel Fojt goto success_in_beg_and_len;
250*09d4459fSDaniel Fojt else
251*09d4459fSDaniel Fojt goto success;
252*09d4459fSDaniel Fojt }
253*09d4459fSDaniel Fojt if (!start_ptr && !localeinfo.multibyte)
254*09d4459fSDaniel Fojt {
255*09d4459fSDaniel Fojt if (! kwsearch->re)
256*09d4459fSDaniel Fojt {
257*09d4459fSDaniel Fojt fgrep_to_grep_pattern (&kwsearch->pattern, &kwsearch->size);
258*09d4459fSDaniel Fojt kwsearch->re = GEAcompile (kwsearch->pattern,
259*09d4459fSDaniel Fojt kwsearch->size,
260*09d4459fSDaniel Fojt RE_SYNTAX_GREP);
261*09d4459fSDaniel Fojt }
262*09d4459fSDaniel Fojt end = memchr (beg + len, eol, (buf + size) - (beg + len));
263*09d4459fSDaniel Fojt end = end ? end + 1 : buf + size;
264*09d4459fSDaniel Fojt if (EGexecute (kwsearch->re, beg, end - beg, match_size, NULL)
265*09d4459fSDaniel Fojt != (size_t) -1)
266*09d4459fSDaniel Fojt goto success_match_words;
267*09d4459fSDaniel Fojt beg = end - 1;
26895b7b453SJohn Marino break;
269*09d4459fSDaniel Fojt }
27095b7b453SJohn Marino if (!len)
27195b7b453SJohn Marino break;
272*09d4459fSDaniel Fojt offset = kwsexec (kwset, beg, --len, &kwsmatch, true);
273*09d4459fSDaniel Fojt if (offset != 0)
27495b7b453SJohn Marino break;
27595b7b453SJohn Marino len = kwsmatch.size[0];
27695b7b453SJohn Marino }
277*09d4459fSDaniel Fojt
278*09d4459fSDaniel Fojt /* No word match was found at BEG. Skip past word constituents,
279*09d4459fSDaniel Fojt since they cannot precede the next match and not skipping
280*09d4459fSDaniel Fojt them could make things much slower. */
281*09d4459fSDaniel Fojt beg += wordchars_size (beg, buf + size);
282*09d4459fSDaniel Fojt mb_start = beg;
28395b7b453SJohn Marino } /* for (beg in buf) */
28495b7b453SJohn Marino
285680a9cb8SJohn Marino return -1;
28695b7b453SJohn Marino
28795b7b453SJohn Marino success:
288dc7c36e4SJohn Marino end = memchr (beg + len, eol, (buf + size) - (beg + len));
289dc7c36e4SJohn Marino end = end ? end + 1 : buf + size;
290*09d4459fSDaniel Fojt success_match_words:
291dc7c36e4SJohn Marino beg = memrchr (buf, eol, beg - buf);
292dc7c36e4SJohn Marino beg = beg ? beg + 1 : buf;
29395b7b453SJohn Marino len = end - beg;
294a8597f6cSJohn Marino success_in_beg_and_len:;
295a8597f6cSJohn Marino size_t off = beg - buf;
296a8597f6cSJohn Marino
29795b7b453SJohn Marino *match_size = len;
298a8597f6cSJohn Marino ret_val = off;
29995b7b453SJohn Marino return ret_val;
30095b7b453SJohn Marino }
301