xref: /dragonfly/contrib/grep/src/searchutils.c (revision ae071d8d)
1 /* searchutils.c - helper subroutines for grep's matchers.
2    Copyright 1992, 1998, 2000, 2007, 2009-2012 Free Software Foundation, Inc.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; either version 3, or (at your option)
7    any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program; if not, write to the Free Software
16    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17    02110-1301, USA.  */
18 
19 #include <config.h>
20 #include <assert.h>
21 #include "search.h"
22 
23 #define NCHAR (UCHAR_MAX + 1)
24 
25 void
26 kwsinit (kwset_t *kwset)
27 {
28   static char trans[NCHAR];
29   int i;
30 
31   if (match_icase && MB_CUR_MAX == 1)
32     {
33       for (i = 0; i < NCHAR; ++i)
34         trans[i] = tolower (i);
35 
36       *kwset = kwsalloc (trans);
37     }
38   else
39     *kwset = kwsalloc (NULL);
40 
41   if (!*kwset)
42     xalloc_die ();
43 }
44 
45 #if MBS_SUPPORT
46 /* Convert the *N-byte string, BEG, to lower-case, and write the
47    NUL-terminated result into malloc'd storage.  Upon success, set *N
48    to the length (in bytes) of the resulting string (not including the
49    trailing NUL byte), and return a pointer to the lower-case string.
50    Upon memory allocation failure, this function exits.
51    Note that on input, *N must be larger than zero.
52 
53    Note that while this function returns a pointer to malloc'd storage,
54    the caller must not free it, since this function retains a pointer
55    to the buffer and reuses it on any subsequent call.  As a consequence,
56    this function is not thread-safe.
57 
58    When each character in the lower-case result string has the same length
59    as the corresponding character in the input string, set *LEN_MAP_P
60    to NULL.  Otherwise, set it to a malloc'd buffer (like the returned
61    buffer, this must not be freed by caller) of the same length as the
62    result string.  (*LEN_MAP_P)[J] is the change in byte-length of the
63    character in BEG that formed byte J of the result as it was converted to
64    lower-case.  It is usually zero.  For the upper-case Turkish I-with-dot
65    it is -1, since the upper-case character occupies two bytes, while the
66    lower-case one occupies only one byte.  For the Turkish-I-without-dot
67    in the tr_TR.utf8 locale, it is 1 because the lower-case representation
68    is one byte longer than the original.  When that happens, we have two
69    or more slots in *LEN_MAP_P for each such character.  We store the
70    difference in the first one and 0's in any remaining slots.
71 
72    This map is used by the caller to convert offset,length pairs that
73    reference the lower-case result to numbers that refer to the matched
74    part of the original buffer.  */
75 
76 char *
77 mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p)
78 {
79   static char *out;
80   static mb_len_map_t *len_map;
81   static size_t outalloc;
82   size_t outlen, mb_cur_max;
83   mbstate_t is, os;
84   const char *end;
85   char *p;
86   mb_len_map_t *m;
87   bool lengths_differ = false;
88 
89   if (*n > outalloc || outalloc == 0)
90     {
91       outalloc = MAX(1, *n);
92       out = xrealloc (out, outalloc);
93       len_map = xrealloc (len_map, outalloc);
94     }
95 
96   /* appease clang-2.6 */
97   assert (out);
98   assert (len_map);
99   if (*n == 0)
100     return out;
101 
102   memset (&is, 0, sizeof (is));
103   memset (&os, 0, sizeof (os));
104   end = beg + *n;
105 
106   mb_cur_max = MB_CUR_MAX;
107   p = out;
108   m = len_map;
109   outlen = 0;
110   while (beg < end)
111     {
112       wchar_t wc;
113       size_t mbclen = mbrtowc (&wc, beg, end - beg, &is);
114       if (outlen + mb_cur_max >= outalloc)
115         {
116           size_t dm = m - len_map;
117           out = x2nrealloc (out, &outalloc, 1);
118           len_map = xrealloc (len_map, outalloc);
119           p = out + outlen;
120           m = len_map + dm;
121         }
122 
123       if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
124         {
125           /* An invalid sequence, or a truncated multi-octet character.
126              We treat it as a single-octet character.  */
127           *m++ = 0;
128           *p++ = *beg++;
129           outlen++;
130           memset (&is, 0, sizeof (is));
131           memset (&os, 0, sizeof (os));
132         }
133       else
134         {
135           beg += mbclen;
136           size_t ombclen = wcrtomb (p, towlower ((wint_t) wc), &os);
137           *m = mbclen - ombclen;
138           memset (m + 1, 0, ombclen - 1);
139           m += ombclen;
140           p += ombclen;
141           outlen += ombclen;
142           lengths_differ |= (mbclen != ombclen);
143         }
144     }
145 
146   *len_map_p = lengths_differ ? len_map : NULL;
147   *n = p - out;
148   *p = 0;
149   return out;
150 }
151 
152 
/* Walk forward from *GOOD, one multibyte character at a time, until
   reaching or passing BUF; END bounds how far mbrlen may look.

   On return *GOOD is the start of the last position visited at which
   mbrlen did not report a truncated character ((size_t) -2), or is
   left unchanged if no position before BUF was visited.

   Return true if the walk stepped past BUF, i.e. BUF lies inside a
   multibyte character.  Otherwise (the walk stopped exactly at BUF)
   return true only if MATCH_LEN is nonzero and smaller than the value
   mbrlen reports for the character starting at BUF.  */
bool
is_mb_middle (const char **good, const char *buf, const char *end,
              size_t match_len)
{
  const char *cursor = *good;
  const char *last_start = cursor;
  mbstate_t state;

  /* TODO: can be optimized for UTF-8.  */
  memset (&state, 0, sizeof state);

  for (; cursor < buf; )
    {
      size_t len = mbrlen (cursor, end - cursor, &state);
      bool truncated = (len == (size_t) -2);

      /* Remember where the most recent non-truncated character began.  */
      if (!truncated)
        last_start = cursor;

      if (truncated || len == (size_t) -1 || len == 0)
        {
          /* Invalid or truncated sequence (or a NUL byte): step over a
             single byte and restart from the initial shift state.  */
          len = 1;
          memset (&state, 0, sizeof state);
        }

      cursor += len;
    }

  *good = last_start;

  if (buf < cursor)
    return true;

  /* CURSOR == BUF here.  */
  if (match_len == 0)
    return false;
  return match_len < mbrlen (cursor, end - cursor, &state);
}
189 #endif /* MBS_SUPPORT */
190