xref: /dragonfly/contrib/grep/src/searchutils.c (revision c69bf40f)
1 /* searchutils.c - helper subroutines for grep's matchers.
2    Copyright 1992, 1998, 2000, 2007, 2009-2015 Free Software Foundation, Inc.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; either version 3, or (at your option)
7    any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program; if not, write to the Free Software
16    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17    02110-1301, USA.  */
18 
19 #include <config.h>
20 
21 #define SEARCH_INLINE _GL_EXTERN_INLINE
22 #define SYSTEM_INLINE _GL_EXTERN_INLINE
23 #include "search.h"
24 
25 #include <assert.h>
26 
27 #define NCHAR (UCHAR_MAX + 1)
28 
29 size_t mbclen_cache[NCHAR];
30 
31 void
32 kwsinit (kwset_t *kwset)
33 {
34   static char trans[NCHAR];
35   int i;
36 
37   if (match_icase && MB_CUR_MAX == 1)
38     {
39       for (i = 0; i < NCHAR; ++i)
40         trans[i] = toupper (i);
41 
42       *kwset = kwsalloc (trans);
43     }
44   else
45     *kwset = kwsalloc (NULL);
46 
47   if (!*kwset)
48     xalloc_die ();
49 }
50 
51 /* Convert BEG, an *N-byte string, to uppercase, and write the
52    NUL-terminated result into malloc'd storage.  Upon success, set *N
53    to the length (in bytes) of the resulting string (not including the
54    trailing NUL byte), and return a pointer to the uppercase string.
55    Upon memory allocation failure, exit.  *N must be positive.
56 
57    Although this function returns a pointer to malloc'd storage,
58    the caller must not free it, since this function retains a pointer
59    to the buffer and reuses it on any subsequent call.  As a consequence,
60    this function is not thread-safe.
61 
62    When each character in the uppercase result string has the same length
63    as the corresponding character in the input string, set *LEN_MAP_P
64    to NULL.  Otherwise, set it to a malloc'd buffer (like the returned
65    buffer, this must not be freed by caller) of the same length as the
66    result string.  (*LEN_MAP_P)[J] is the change in byte-length of the
67    character in BEG that formed byte J of the result as it was converted to
68    uppercase.  It is usually zero.  For lowercase Turkish dotless I it
69    is -1, since the lowercase input occupies two bytes, while the
70    uppercase output occupies only one byte.  For lowercase I in the
71    tr_TR.utf8 locale, it is 1 because the uppercase Turkish dotted I
72    is one byte longer than the original.  When that happens, we have two
73    or more slots in *LEN_MAP_P for each such character.  We store the
74    difference in the first one and 0's in any remaining slots.
75 
76    This map is used by the caller to convert offset,length pairs that
77    reference the uppercase result to numbers that refer to the matched
78    part of the original buffer.  */
79 
80 char *
81 mbtoupper (const char *beg, size_t *n, mb_len_map_t **len_map_p)
82 {
83   static char *out;
84   static mb_len_map_t *len_map;
85   static size_t outalloc;
86   size_t outlen, mb_cur_max;
87   mbstate_t is, os;
88   const char *end;
89   char *p;
90   mb_len_map_t *m;
91   bool lengths_differ = false;
92 
93   if (*n > outalloc || outalloc == 0)
94     {
95       outalloc = MAX (1, *n);
96       out = xrealloc (out, outalloc);
97       len_map = xrealloc (len_map, outalloc);
98     }
99 
100   /* appease clang-2.6 */
101   assert (out);
102   assert (len_map);
103   if (*n == 0)
104     return out;
105 
106   memset (&is, 0, sizeof (is));
107   memset (&os, 0, sizeof (os));
108   end = beg + *n;
109 
110   mb_cur_max = MB_CUR_MAX;
111   p = out;
112   m = len_map;
113   outlen = 0;
114   while (beg < end)
115     {
116       wchar_t wc;
117       size_t mbclen = mbrtowc (&wc, beg, end - beg, &is);
118 #ifdef __CYGWIN__
119       /* Handle a UTF-8 sequence for a character beyond the base plane.
120          Cygwin's wchar_t is UTF-16, as in the underlying OS.  This
121          results in surrogate pairs which need some extra attention.  */
122       wint_t wci = 0;
123       if (mbclen == 3 && (wc & 0xdc00) == 0xd800)
124         {
125           /* We got the start of a 4 byte UTF-8 sequence.  This is returned
126              as a UTF-16 surrogate pair.  The first call to mbrtowc returned 3
127              and wc has been set to a high surrogate value, now we're going
128              to fetch the matching low surrogate.  This second call to mbrtowc
129              is supposed to return 1 to complete the 4 byte UTF-8 sequence.  */
130           wchar_t wc_2;
131           size_t mbclen_2 = mbrtowc (&wc_2, beg + mbclen, end - beg - mbclen,
132                                      &is);
133           if (mbclen_2 == 1 && (wc_2 & 0xdc00) == 0xdc00)
134             {
135               /* Match.  Convert this to a 4 byte wint_t which constitutes
136                  a 32-bit UTF-32 value.  */
137               wci = ( (((wint_t) (wc - 0xd800)) << 10)
138                      | ((wint_t) (wc_2 - 0xdc00)))
139                     + 0x10000;
140               ++mbclen;
141             }
142           else
143             {
144               /* Invalid UTF-8 sequence.  */
145               mbclen = (size_t) -1;
146             }
147         }
148 #endif
149       if (outlen + mb_cur_max >= outalloc)
150         {
151           size_t dm = m - len_map;
152           out = x2nrealloc (out, &outalloc, 1);
153           len_map = xrealloc (len_map, outalloc);
154           p = out + outlen;
155           m = len_map + dm;
156         }
157 
158       if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
159         {
160           /* An invalid sequence, or a truncated multi-octet character.
161              We treat it as a single-octet character.  */
162           *m++ = 0;
163           *p++ = *beg++;
164           outlen++;
165           memset (&is, 0, sizeof (is));
166           memset (&os, 0, sizeof (os));
167         }
168       else
169         {
170           size_t ombclen;
171           beg += mbclen;
172 #ifdef __CYGWIN__
173           /* Handle Unicode characters beyond the base plane.  */
174           if (mbclen == 4)
175             {
176               /* towupper, taking wint_t (4 bytes), handles UCS-4 values.  */
177               wci = towupper (wci);
178               if (wci >= 0x10000)
179                 {
180                   wci -= 0x10000;
181                   wc = (wci >> 10) | 0xd800;
182                   /* No need to check the return value.  When reading the
183                      high surrogate, the return value will be 0 and only the
184                      mbstate indicates that we're in the middle of reading a
185                      surrogate pair.  The next wcrtomb call reading the low
186                      surrogate will then return 4 and reset the mbstate.  */
187                   wcrtomb (p, wc, &os);
188                   wc = (wci & 0x3ff) | 0xdc00;
189                 }
190               else
191                 {
192                   wc = (wchar_t) wci;
193                 }
194               ombclen = wcrtomb (p, wc, &os);
195             }
196           else
197 #endif
198           ombclen = wcrtomb (p, towupper (wc), &os);
199           *m = mbclen - ombclen;
200           memset (m + 1, 0, ombclen - 1);
201           m += ombclen;
202           p += ombclen;
203           outlen += ombclen;
204           lengths_differ |= (mbclen != ombclen);
205         }
206     }
207 
208   *len_map_p = lengths_differ ? len_map : NULL;
209   *n = p - out;
210   *p = 0;
211   return out;
212 }
213 
214 /* Initialize a cache of mbrlen values for each of its 1-byte inputs.  */
215 void
216 build_mbclen_cache (void)
217 {
218   int i;
219 
220   for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
221     {
222       char c = i;
223       unsigned char uc = i;
224       mbstate_t mbs = { 0 };
225       size_t len = mbrlen (&c, 1, &mbs);
226       mbclen_cache[uc] = len ? len : 1;
227     }
228 }
229 
/* Scan forward, character by character, from *MB_START until the scan
   position reaches or passes CUR, and report how far CUR lies beyond
   the start of the character that contains it.  END is one past the
   last byte of the buffer.

   A byte for which mb_clen reports an invalid or truncated sequence
   counts as a character of length one, and the conversion state is
   reset after it.

   Return 0 if the scan lands exactly on CUR (CUR is a character
   boundary).  Otherwise return CUR minus the start of the last
   character the scan stepped over; this is positive when CUR is in
   the middle of a character.  If *MB_START is already at or past CUR
   no scanning happens, and the result is CUR - *MB_START (zero or
   negative).

   On return *MB_START holds the final scan position: CUR itself when
   returning 0, the first boundary past CUR when returning a positive
   value, and its original value when no scanning happened.  */
ptrdiff_t
mb_goback (char const **mb_start, char const *cur, char const *end)
{
  char const *scan = *mb_start;
  char const *char_start = scan;
  mbstate_t state;

  memset (&state, 0, sizeof state);

  for (; scan < cur; )
    {
      size_t step = mb_clen (scan, end - scan, &state);

      if (step == (size_t) -1 || step == (size_t) -2)
        {
          /* Not a valid complete character here: step over one byte
             and start afresh.  */
          step = 1;
          memset (&state, 0, sizeof state);
        }

      char_start = scan;
      scan += step;
    }

  *mb_start = scan;

  if (scan == cur)
    return 0;
  return cur - char_start;
}
268 
/* Return the wide character encoded just before CUR in the buffer
   that starts at BUF and ends at END.  Return WEOF when CUR is at the
   very start of the buffer, or when mb_next_wc finds no wide character
   at the position located.

   The byte immediately before CUR is taken as the candidate position;
   mb_goback, scanning from BUF, then tells how many bytes to step back
   from there to reach the start of the character containing it.  */
wint_t
mb_prev_wc (char const *buf, char const *cur, char const *end)
{
  char const *scan_from = buf;
  char const *last_byte;

  if (cur == buf)
    return WEOF;

  last_byte = cur - 1;
  return mb_next_wc (last_byte - mb_goback (&scan_from, last_byte, end),
                     end);
}
282 
/* Decode and return the wide character that starts at CUR, looking at
   no more than the bytes up to END.  Return WEOF when CUR equals END,
   or when mbrtowc reports an invalid or incomplete sequence there.
   Decoding always starts from an initial conversion state.  */
wint_t
mb_next_wc (char const *cur, char const *end)
{
  wchar_t decoded;
  mbstate_t state = { 0 };
  wint_t result = WEOF;

  if (end - cur != 0
      && mbrtowc (&decoded, cur, end - cur, &state) < (size_t) -2)
    result = decoded;

  return result;
}
292 }
293