xref: /dragonfly/contrib/grep/src/kwsearch.c (revision 09d4459f)
195b7b453SJohn Marino /* kwsearch.c - searching subroutines using kwset for grep.
2*09d4459fSDaniel Fojt    Copyright 1992, 1998, 2000, 2007, 2009-2020 Free Software Foundation, Inc.
395b7b453SJohn Marino 
495b7b453SJohn Marino    This program is free software; you can redistribute it and/or modify
595b7b453SJohn Marino    it under the terms of the GNU General Public License as published by
695b7b453SJohn Marino    the Free Software Foundation; either version 3, or (at your option)
795b7b453SJohn Marino    any later version.
895b7b453SJohn Marino 
995b7b453SJohn Marino    This program is distributed in the hope that it will be useful,
1095b7b453SJohn Marino    but WITHOUT ANY WARRANTY; without even the implied warranty of
1195b7b453SJohn Marino    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1295b7b453SJohn Marino    GNU General Public License for more details.
1395b7b453SJohn Marino 
1495b7b453SJohn Marino    You should have received a copy of the GNU General Public License
1595b7b453SJohn Marino    along with this program; if not, write to the Free Software
1695b7b453SJohn Marino    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
1795b7b453SJohn Marino    02110-1301, USA.  */
1895b7b453SJohn Marino 
1995b7b453SJohn Marino /* Written August 1992 by Mike Haertel. */
2095b7b453SJohn Marino 
2195b7b453SJohn Marino #include <config.h>
2295b7b453SJohn Marino #include "search.h"
2395b7b453SJohn Marino 
24*09d4459fSDaniel Fojt /* A compiled -F pattern list.  */
25*09d4459fSDaniel Fojt 
26*09d4459fSDaniel Fojt struct kwsearch
27680a9cb8SJohn Marino {
28*09d4459fSDaniel Fojt   /* The kwset for this pattern list.  */
29*09d4459fSDaniel Fojt   kwset_t kwset;
3095b7b453SJohn Marino 
31*09d4459fSDaniel Fojt   /* The number of user-specified patterns.  This is less than
32*09d4459fSDaniel Fojt      'kwswords (kwset)' when some extra one-character words have been
33*09d4459fSDaniel Fojt      appended, one for each troublesome character that will require a
34*09d4459fSDaniel Fojt      DFA search.  */
35*09d4459fSDaniel Fojt   ptrdiff_t words;
3695b7b453SJohn Marino 
37*09d4459fSDaniel Fojt   /* The user's pattern and its size in bytes.  */
38*09d4459fSDaniel Fojt   char *pattern;
39*09d4459fSDaniel Fojt   size_t size;
40*09d4459fSDaniel Fojt 
41*09d4459fSDaniel Fojt   /* The user's pattern compiled as a regular expression,
42*09d4459fSDaniel Fojt      or null if it has not been compiled.  */
43*09d4459fSDaniel Fojt   void *re;
44*09d4459fSDaniel Fojt };
45*09d4459fSDaniel Fojt 
46*09d4459fSDaniel Fojt /* Compile the -F style PATTERN, containing SIZE bytes.  Return a
47*09d4459fSDaniel Fojt    description of the compiled pattern.  */
48*09d4459fSDaniel Fojt 
49*09d4459fSDaniel Fojt void *
Fcompile(char * pattern,size_t size,reg_syntax_t ignored)50*09d4459fSDaniel Fojt Fcompile (char *pattern, size_t size, reg_syntax_t ignored)
5195b7b453SJohn Marino {
52*09d4459fSDaniel Fojt   kwset_t kwset;
53*09d4459fSDaniel Fojt   ptrdiff_t total = size;
54*09d4459fSDaniel Fojt   char *buf = NULL;
55*09d4459fSDaniel Fojt   size_t bufalloc = 0;
5695b7b453SJohn Marino 
57*09d4459fSDaniel Fojt   kwset = kwsinit (true);
5895b7b453SJohn Marino 
59dc7c36e4SJohn Marino   char const *p = pattern;
6095b7b453SJohn Marino   do
6195b7b453SJohn Marino     {
62*09d4459fSDaniel Fojt       ptrdiff_t len;
63680a9cb8SJohn Marino       char const *sep = memchr (p, '\n', total);
64680a9cb8SJohn Marino       if (sep)
6595b7b453SJohn Marino         {
66680a9cb8SJohn Marino           len = sep - p;
67680a9cb8SJohn Marino           sep++;
68680a9cb8SJohn Marino           total -= (len + 1);
6995b7b453SJohn Marino         }
70680a9cb8SJohn Marino       else
7195b7b453SJohn Marino         {
72680a9cb8SJohn Marino           len = total;
73680a9cb8SJohn Marino           total = 0;
7495b7b453SJohn Marino         }
7595b7b453SJohn Marino 
76680a9cb8SJohn Marino       if (match_lines)
77680a9cb8SJohn Marino         {
78*09d4459fSDaniel Fojt           if (eolbyte == '\n' && pattern < p && sep)
79*09d4459fSDaniel Fojt             p--;
80*09d4459fSDaniel Fojt           else
81*09d4459fSDaniel Fojt             {
82*09d4459fSDaniel Fojt               if (bufalloc < len + 2)
83*09d4459fSDaniel Fojt                 {
84*09d4459fSDaniel Fojt                   free (buf);
85*09d4459fSDaniel Fojt                   bufalloc = len + 2;
86*09d4459fSDaniel Fojt                   buf = x2realloc (NULL, &bufalloc);
87680a9cb8SJohn Marino                   buf[0] = eolbyte;
88*09d4459fSDaniel Fojt                 }
89680a9cb8SJohn Marino               memcpy (buf + 1, p, len);
90680a9cb8SJohn Marino               buf[len + 1] = eolbyte;
91680a9cb8SJohn Marino               p = buf;
92*09d4459fSDaniel Fojt             }
93680a9cb8SJohn Marino           len += 2;
9495b7b453SJohn Marino         }
95680a9cb8SJohn Marino       kwsincr (kwset, p, len);
9695b7b453SJohn Marino 
97680a9cb8SJohn Marino       p = sep;
98680a9cb8SJohn Marino     }
99680a9cb8SJohn Marino   while (p);
100680a9cb8SJohn Marino 
101*09d4459fSDaniel Fojt   free (buf);
102*09d4459fSDaniel Fojt   ptrdiff_t words = kwswords (kwset);
103*09d4459fSDaniel Fojt 
104*09d4459fSDaniel Fojt   if (match_icase)
105*09d4459fSDaniel Fojt     {
106*09d4459fSDaniel Fojt       /* For each pattern character C that has a case folded
107*09d4459fSDaniel Fojt          counterpart F that is multibyte and so cannot easily be
108*09d4459fSDaniel Fojt          implemented via translating a single byte, append a pattern
109*09d4459fSDaniel Fojt          containing just F.  That way, if the data contains F, the
110*09d4459fSDaniel Fojt          matcher can fall back on DFA.  For example, if C is 'i' and
111*09d4459fSDaniel Fojt          the locale is en_US.utf8, append a pattern containing just
112*09d4459fSDaniel Fojt          the character U+0131 (LATIN SMALL LETTER DOTLESS I), so that
113*09d4459fSDaniel Fojt          Fexecute will use a DFA if the data contain U+0131.  */
114*09d4459fSDaniel Fojt       mbstate_t mbs = { 0 };
115*09d4459fSDaniel Fojt       char checked[NCHAR] = {0,};
116*09d4459fSDaniel Fojt       for (p = pattern; p < pattern + size; p++)
117*09d4459fSDaniel Fojt         {
118*09d4459fSDaniel Fojt           unsigned char c = *p;
119*09d4459fSDaniel Fojt           if (checked[c])
120*09d4459fSDaniel Fojt             continue;
121*09d4459fSDaniel Fojt           checked[c] = true;
122*09d4459fSDaniel Fojt 
123*09d4459fSDaniel Fojt           wint_t wc = localeinfo.sbctowc[c];
124*09d4459fSDaniel Fojt           wchar_t folded[CASE_FOLDED_BUFSIZE];
125*09d4459fSDaniel Fojt 
126*09d4459fSDaniel Fojt           for (int i = case_folded_counterparts (wc, folded); 0 <= --i; )
127*09d4459fSDaniel Fojt             {
128*09d4459fSDaniel Fojt               char s[MB_LEN_MAX];
129*09d4459fSDaniel Fojt               int nbytes = wcrtomb (s, folded[i], &mbs);
130*09d4459fSDaniel Fojt               if (1 < nbytes)
131*09d4459fSDaniel Fojt                 kwsincr (kwset, s, nbytes);
132*09d4459fSDaniel Fojt             }
133*09d4459fSDaniel Fojt         }
134680a9cb8SJohn Marino     }
135680a9cb8SJohn Marino 
136*09d4459fSDaniel Fojt   kwsprep (kwset);
137*09d4459fSDaniel Fojt 
138*09d4459fSDaniel Fojt   struct kwsearch *kwsearch = xmalloc (sizeof *kwsearch);
139*09d4459fSDaniel Fojt   kwsearch->kwset = kwset;
140*09d4459fSDaniel Fojt   kwsearch->words = words;
141*09d4459fSDaniel Fojt   kwsearch->pattern = pattern;
142*09d4459fSDaniel Fojt   kwsearch->size = size;
143*09d4459fSDaniel Fojt   kwsearch->re = NULL;
144*09d4459fSDaniel Fojt   return kwsearch;
145*09d4459fSDaniel Fojt }
146*09d4459fSDaniel Fojt 
147*09d4459fSDaniel Fojt /* Use the compiled pattern VCP to search the buffer BUF of size SIZE.
148*09d4459fSDaniel Fojt    If found, return the offset of the first match and store its
149*09d4459fSDaniel Fojt    size into *MATCH_SIZE.  If not found, return SIZE_MAX.
150*09d4459fSDaniel Fojt    If START_PTR is nonnull, start searching there.  */
15195b7b453SJohn Marino size_t
Fexecute(void * vcp,char const * buf,size_t size,size_t * match_size,char const * start_ptr)152*09d4459fSDaniel Fojt Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size,
15395b7b453SJohn Marino           char const *start_ptr)
15495b7b453SJohn Marino {
155*09d4459fSDaniel Fojt   char const *beg, *end, *mb_start;
156*09d4459fSDaniel Fojt   ptrdiff_t len;
15795b7b453SJohn Marino   char eol = eolbyte;
15895b7b453SJohn Marino   struct kwsmatch kwsmatch;
15995b7b453SJohn Marino   size_t ret_val;
160*09d4459fSDaniel Fojt   bool mb_check;
161*09d4459fSDaniel Fojt   bool longest;
162*09d4459fSDaniel Fojt   struct kwsearch *kwsearch = vcp;
163*09d4459fSDaniel Fojt   kwset_t kwset = kwsearch->kwset;
164*09d4459fSDaniel Fojt   size_t mbclen;
165*09d4459fSDaniel Fojt 
166*09d4459fSDaniel Fojt   if (match_lines)
167*09d4459fSDaniel Fojt     mb_check = longest = false;
168*09d4459fSDaniel Fojt   else
169*09d4459fSDaniel Fojt     {
170*09d4459fSDaniel Fojt       mb_check = localeinfo.multibyte & !localeinfo.using_utf8;
171*09d4459fSDaniel Fojt       longest = mb_check | !!start_ptr | match_words;
172*09d4459fSDaniel Fojt     }
17395b7b453SJohn Marino 
17495b7b453SJohn Marino   for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
17595b7b453SJohn Marino     {
176*09d4459fSDaniel Fojt       ptrdiff_t offset = kwsexec (kwset, beg - match_lines,
177*09d4459fSDaniel Fojt                                   buf + size - beg + match_lines, &kwsmatch,
178*09d4459fSDaniel Fojt                                   longest);
179*09d4459fSDaniel Fojt       if (offset < 0)
180*09d4459fSDaniel Fojt         break;
181dc7c36e4SJohn Marino       len = kwsmatch.size[0] - 2 * match_lines;
182*09d4459fSDaniel Fojt 
183*09d4459fSDaniel Fojt       if (kwsearch->words <= kwsmatch.index)
184*09d4459fSDaniel Fojt         {
185*09d4459fSDaniel Fojt           /* The data contain a multibyte character that matches
186*09d4459fSDaniel Fojt              some pattern character that is a case folded counterpart.
187*09d4459fSDaniel Fojt              Since the kwset code cannot handle this case, fall back
188*09d4459fSDaniel Fojt              on the DFA code, which can.  */
189*09d4459fSDaniel Fojt           if (! kwsearch->re)
190*09d4459fSDaniel Fojt             {
191*09d4459fSDaniel Fojt               fgrep_to_grep_pattern (&kwsearch->pattern, &kwsearch->size);
192*09d4459fSDaniel Fojt               kwsearch->re = GEAcompile (kwsearch->pattern, kwsearch->size,
193*09d4459fSDaniel Fojt                                          RE_SYNTAX_GREP);
194*09d4459fSDaniel Fojt             }
195*09d4459fSDaniel Fojt           return EGexecute (kwsearch->re, buf, size, match_size, start_ptr);
196*09d4459fSDaniel Fojt         }
197*09d4459fSDaniel Fojt 
198*09d4459fSDaniel Fojt       mbclen = 0;
199*09d4459fSDaniel Fojt       if (mb_check
200*09d4459fSDaniel Fojt           && mb_goback (&mb_start, &mbclen, beg + offset, buf + size) != 0)
20195b7b453SJohn Marino         {
202dc7c36e4SJohn Marino           /* We have matched a single byte that is not at the beginning of a
203dc7c36e4SJohn Marino              multibyte character.  mb_goback has advanced MB_START past that
204dc7c36e4SJohn Marino              multibyte character.  Now, we want to position BEG so that the
205dc7c36e4SJohn Marino              next kwsexec search starts there.  Thus, to compensate for the
206dc7c36e4SJohn Marino              for-loop's BEG++, above, subtract one here.  This code is
207dc7c36e4SJohn Marino              unusually hard to reach, and exceptionally, let's show how to
208dc7c36e4SJohn Marino              trigger it here:
209dc7c36e4SJohn Marino 
210dc7c36e4SJohn Marino                printf '\203AA\n'|LC_ALL=ja_JP.SHIFT_JIS src/grep -F A
211dc7c36e4SJohn Marino 
212dc7c36e4SJohn Marino              That assumes the named locale is installed.
213dc7c36e4SJohn Marino              Note that your system's shift-JIS locale may have a different
214dc7c36e4SJohn Marino              name, possibly including "sjis".  */
215dc7c36e4SJohn Marino           beg = mb_start - 1;
21695b7b453SJohn Marino           continue;
21795b7b453SJohn Marino         }
21895b7b453SJohn Marino       beg += offset;
219*09d4459fSDaniel Fojt       if (!!start_ptr & !match_words)
22095b7b453SJohn Marino         goto success_in_beg_and_len;
22195b7b453SJohn Marino       if (match_lines)
222dc7c36e4SJohn Marino         {
223dc7c36e4SJohn Marino           len += start_ptr == NULL;
224680a9cb8SJohn Marino           goto success_in_beg_and_len;
225dc7c36e4SJohn Marino         }
226*09d4459fSDaniel Fojt       if (! match_words)
227*09d4459fSDaniel Fojt         goto success;
228*09d4459fSDaniel Fojt 
229*09d4459fSDaniel Fojt       /* We need a preceding mb_start pointer.  Use the beginning of line
230*09d4459fSDaniel Fojt          if there is a preceding newline.  */
231*09d4459fSDaniel Fojt       if (mbclen == 0)
23295b7b453SJohn Marino         {
233*09d4459fSDaniel Fojt           char const *nl = memrchr (mb_start, eol, beg - mb_start);
234*09d4459fSDaniel Fojt           if (nl)
235*09d4459fSDaniel Fojt             mb_start = nl + 1;
236*09d4459fSDaniel Fojt         }
237*09d4459fSDaniel Fojt 
238*09d4459fSDaniel Fojt       /* Succeed if neither the preceding nor the following character is a
239*09d4459fSDaniel Fojt          word constituent.  If the preceding is not, yet the following
240*09d4459fSDaniel Fojt          character IS a word constituent, keep trying with shorter matches.  */
241*09d4459fSDaniel Fojt       if (mbclen > 0
242*09d4459fSDaniel Fojt           ? ! wordchar_next (beg - mbclen, buf + size)
243*09d4459fSDaniel Fojt           : ! wordchar_prev (mb_start, beg, buf + size))
244*09d4459fSDaniel Fojt         for (;;)
245*09d4459fSDaniel Fojt           {
246*09d4459fSDaniel Fojt             if (! wordchar_next (beg + len, buf + size))
247*09d4459fSDaniel Fojt               {
248*09d4459fSDaniel Fojt                 if (start_ptr)
249*09d4459fSDaniel Fojt                   goto success_in_beg_and_len;
250*09d4459fSDaniel Fojt                 else
251*09d4459fSDaniel Fojt                   goto success;
252*09d4459fSDaniel Fojt               }
253*09d4459fSDaniel Fojt             if (!start_ptr && !localeinfo.multibyte)
254*09d4459fSDaniel Fojt               {
255*09d4459fSDaniel Fojt                 if (! kwsearch->re)
256*09d4459fSDaniel Fojt                   {
257*09d4459fSDaniel Fojt                     fgrep_to_grep_pattern (&kwsearch->pattern, &kwsearch->size);
258*09d4459fSDaniel Fojt                     kwsearch->re = GEAcompile (kwsearch->pattern,
259*09d4459fSDaniel Fojt                                                kwsearch->size,
260*09d4459fSDaniel Fojt                                                RE_SYNTAX_GREP);
261*09d4459fSDaniel Fojt                   }
262*09d4459fSDaniel Fojt                 end = memchr (beg + len, eol, (buf + size) - (beg + len));
263*09d4459fSDaniel Fojt                 end = end ? end + 1 : buf + size;
264*09d4459fSDaniel Fojt                 if (EGexecute (kwsearch->re, beg, end - beg, match_size, NULL)
265*09d4459fSDaniel Fojt                     != (size_t) -1)
266*09d4459fSDaniel Fojt                   goto success_match_words;
267*09d4459fSDaniel Fojt                 beg = end - 1;
26895b7b453SJohn Marino                 break;
269*09d4459fSDaniel Fojt               }
27095b7b453SJohn Marino             if (!len)
27195b7b453SJohn Marino               break;
272*09d4459fSDaniel Fojt             offset = kwsexec (kwset, beg, --len, &kwsmatch, true);
273*09d4459fSDaniel Fojt             if (offset != 0)
27495b7b453SJohn Marino               break;
27595b7b453SJohn Marino             len = kwsmatch.size[0];
27695b7b453SJohn Marino           }
277*09d4459fSDaniel Fojt 
278*09d4459fSDaniel Fojt       /* No word match was found at BEG.  Skip past word constituents,
279*09d4459fSDaniel Fojt          since they cannot precede the next match and not skipping
280*09d4459fSDaniel Fojt          them could make things much slower.  */
281*09d4459fSDaniel Fojt       beg += wordchars_size (beg, buf + size);
282*09d4459fSDaniel Fojt       mb_start = beg;
28395b7b453SJohn Marino     } /* for (beg in buf) */
28495b7b453SJohn Marino 
285680a9cb8SJohn Marino   return -1;
28695b7b453SJohn Marino 
28795b7b453SJohn Marino  success:
288dc7c36e4SJohn Marino   end = memchr (beg + len, eol, (buf + size) - (beg + len));
289dc7c36e4SJohn Marino   end = end ? end + 1 : buf + size;
290*09d4459fSDaniel Fojt  success_match_words:
291dc7c36e4SJohn Marino   beg = memrchr (buf, eol, beg - buf);
292dc7c36e4SJohn Marino   beg = beg ? beg + 1 : buf;
29395b7b453SJohn Marino   len = end - beg;
294a8597f6cSJohn Marino  success_in_beg_and_len:;
295a8597f6cSJohn Marino   size_t off = beg - buf;
296a8597f6cSJohn Marino 
29795b7b453SJohn Marino   *match_size = len;
298a8597f6cSJohn Marino   ret_val = off;
29995b7b453SJohn Marino   return ret_val;
30095b7b453SJohn Marino }
301