1 /* searchutils.c - helper subroutines for grep's matchers.
2 Copyright 1992, 1998, 2000, 2007, 2009-2020 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17 02110-1301, USA. */
18
#include <config.h>

#define SEARCH_INLINE _GL_EXTERN_INLINE
#define SYSTEM_INLINE _GL_EXTERN_INLINE
#include "search.h"

#include <limits.h>
24
/* Per-byte lookup table of word constituents.  For each byte B,
   sbwordchar[B] is true if B is a single-byte character that is a
   word constituent, and is false otherwise.  The table is filled in
   by wordinit; being static, every entry is false until then.  */
static bool sbwordchar[NCHAR];
28
/* Return true if -w treats the wide character WC as a word
   constituent, i.e., if WC is an underscore or is alphanumeric
   according to iswalnum.  */
static bool
wordchar (wint_t wc)
{
  if (wc == L'_')
    return true;
  return iswalnum (wc) != 0;
}
35
36 void
wordinit(void)37 wordinit (void)
38 {
39 for (int i = 0; i < NCHAR; i++)
40 sbwordchar[i] = wordchar (localeinfo.sbctowc[i]);
41 }
42
43 kwset_t
kwsinit(bool mb_trans)44 kwsinit (bool mb_trans)
45 {
46 char *trans = NULL;
47
48 if (match_icase && (MB_CUR_MAX == 1 || mb_trans))
49 {
50 trans = xmalloc (NCHAR);
51 if (MB_CUR_MAX == 1)
52 for (int i = 0; i < NCHAR; i++)
53 trans[i] = toupper (i);
54 else
55 for (int i = 0; i < NCHAR; i++)
56 {
57 wint_t wc = localeinfo.sbctowc[i];
58 wint_t uwc = towupper (wc);
59 if (uwc != wc)
60 {
61 mbstate_t mbs = { 0 };
62 size_t len = wcrtomb (&trans[i], uwc, &mbs);
63 if (len != 1)
64 abort ();
65 }
66 else
67 trans[i] = i;
68 }
69 }
70
71 return kwsalloc (trans);
72 }
73
74 /* In the buffer *MB_START, return the number of bytes needed to go
75 back from CUR to the previous boundary, where a "boundary" is the
76 start of a multibyte character or is an error-encoding byte. The
77 buffer ends at END (i.e., one past the address of the buffer's last
78 byte). If CUR is already at a boundary, return 0. If CUR is no
79 larger than *MB_START, return CUR - *MB_START without modifying
80 *MB_START or *MBCLEN.
81
82 When returning zero, set *MB_START to CUR. When returning a
83 positive value, set *MB_START to the next boundary after CUR,
84 or to END if there is no such boundary, and set *MBCLEN to the
85 length of the preceding character. */
86 ptrdiff_t
mb_goback(char const ** mb_start,size_t * mbclen,char const * cur,char const * end)87 mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
88 char const *end)
89 {
90 const char *p = *mb_start;
91 const char *p0 = p;
92 size_t clen;
93
94 if (cur <= p)
95 return cur - p;
96
97 if (localeinfo.using_utf8)
98 {
99 p = cur;
100 clen = 1;
101
102 if (cur < end && (*cur & 0xc0) == 0x80)
103 for (int i = 1; i <= 3; i++)
104 if ((cur[-i] & 0xc0) != 0x80)
105 {
106 mbstate_t mbs = { 0 };
107 clen = mb_clen (cur - i, end - (cur - i), &mbs);
108 if (i < clen && clen < (size_t) -2)
109 {
110 p0 = cur - i;
111 p = p0 + clen;
112 }
113 break;
114 }
115 }
116 else
117 {
118 mbstate_t mbs = { 0 };
119 do
120 {
121 clen = mb_clen (p, end - p, &mbs);
122
123 if ((size_t) -2 <= clen)
124 {
125 /* An invalid sequence, or a truncated multibyte character.
126 Treat it as a single byte character. */
127 clen = 1;
128 memset (&mbs, 0, sizeof mbs);
129 }
130 p0 = p;
131 p += clen;
132 }
133 while (p < cur);
134 }
135
136 *mb_start = p;
137 if (mbclen)
138 *mbclen = clen;
139 return p == cur ? 0 : cur - p0;
140 }
141
142 /* Examine the start of BUF (which goes to END) for word constituents.
143 If COUNTALL, examine as many as possible; otherwise, examine at most one.
144 Return the total number of bytes in the examined characters. */
145 static size_t
wordchars_count(char const * buf,char const * end,bool countall)146 wordchars_count (char const *buf, char const *end, bool countall)
147 {
148 size_t n = 0;
149 mbstate_t mbs = { 0 };
150 while (n < end - buf)
151 {
152 unsigned char b = buf[n];
153 if (sbwordchar[b])
154 n++;
155 else if (localeinfo.sbclen[b] != -2)
156 break;
157 else
158 {
159 wchar_t wc = 0;
160 size_t wcbytes = mbrtowc (&wc, buf + n, end - buf - n, &mbs);
161 if (!wordchar (wc))
162 break;
163 n += wcbytes + !wcbytes;
164 }
165 if (!countall)
166 break;
167 }
168 return n;
169 }
170
/* Return the size in bytes of the longest prefix of BUF that consists
   solely of word constituents.  BUF extends to END.  */
size_t
wordchars_size (char const *buf, char const *end)
{
  size_t prefix_size = wordchars_count (buf, end, true);
  return prefix_size;
}
179
/* Return the number of bytes in the word constituent at the start of
   BUF, or zero if BUF does not start with one.  BUF extends to END.  */
size_t
wordchar_next (char const *buf, char const *end)
{
  size_t first_size = wordchars_count (buf, end, false);
  return first_size;
}
187
188 /* In the buffer BUF, return nonzero if the character whose encoding
189 contains the byte before CUR is a word constituent. The buffer
190 ends at END. */
191 size_t
wordchar_prev(char const * buf,char const * cur,char const * end)192 wordchar_prev (char const *buf, char const *cur, char const *end)
193 {
194 if (buf == cur)
195 return 0;
196 unsigned char b = *--cur;
197 if (! localeinfo.multibyte
198 || (localeinfo.using_utf8 && localeinfo.sbclen[b] != -2))
199 return sbwordchar[b];
200 char const *p = buf;
201 cur -= mb_goback (&p, NULL, cur, end);
202 return wordchar_next (cur, end);
203 }
204