1 /* searchutils.c - helper subroutines for grep's matchers. 2 Copyright 1992, 1998, 2000, 2007, 2009-2020 Free Software Foundation, Inc. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation; either version 3, or (at your option) 7 any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program; if not, write to the Free Software 16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 17 02110-1301, USA. */ 18 19 #include <config.h> 20 21 #define SEARCH_INLINE _GL_EXTERN_INLINE 22 #define SYSTEM_INLINE _GL_EXTERN_INLINE 23 #include "search.h" 24 25 /* For each byte B, sbwordchar[B] is true if B is a single-byte 26 character that is a word constituent, and is false otherwise. */ 27 static bool sbwordchar[NCHAR]; 28 29 /* Whether -w considers WC to be a word constituent. */ 30 static bool 31 wordchar (wint_t wc) 32 { 33 return wc == L'_' || iswalnum (wc); 34 } 35 36 void 37 wordinit (void) 38 { 39 for (int i = 0; i < NCHAR; i++) 40 sbwordchar[i] = wordchar (localeinfo.sbctowc[i]); 41 } 42 43 kwset_t 44 kwsinit (bool mb_trans) 45 { 46 char *trans = NULL; 47 48 if (match_icase && (MB_CUR_MAX == 1 || mb_trans)) 49 { 50 trans = xmalloc (NCHAR); 51 if (MB_CUR_MAX == 1) 52 for (int i = 0; i < NCHAR; i++) 53 trans[i] = toupper (i); 54 else 55 for (int i = 0; i < NCHAR; i++) 56 { 57 wint_t wc = localeinfo.sbctowc[i]; 58 wint_t uwc = towupper (wc); 59 if (uwc != wc) 60 { 61 mbstate_t mbs = { 0 }; 62 size_t len = wcrtomb (&trans[i], uwc, &mbs); 63 if (len != 1) 64 abort (); 65 } 66 else 67 trans[i] = i; 68 } 69 } 70 71 return kwsalloc (trans); 72 } 73 74 /* In the buffer *MB_START, return the number of bytes needed to go 75 back from CUR to the previous boundary, where a "boundary" is the 76 start of a multibyte character or is an error-encoding byte. The 77 buffer ends at END (i.e., one past the address of the buffer's last 78 byte). If CUR is already at a boundary, return 0. If CUR is no 79 larger than *MB_START, return CUR - *MB_START without modifying 80 *MB_START or *MBCLEN. 81 82 When returning zero, set *MB_START to CUR. When returning a 83 positive value, set *MB_START to the next boundary after CUR, 84 or to END if there is no such boundary, and set *MBCLEN to the 85 length of the preceding character. */ 86 ptrdiff_t 87 mb_goback (char const **mb_start, size_t *mbclen, char const *cur, 88 char const *end) 89 { 90 const char *p = *mb_start; 91 const char *p0 = p; 92 size_t clen; 93 94 if (cur <= p) 95 return cur - p; 96 97 if (localeinfo.using_utf8) 98 { 99 p = cur; 100 clen = 1; 101 102 if (cur < end && (*cur & 0xc0) == 0x80) 103 for (int i = 1; i <= 3; i++) 104 if ((cur[-i] & 0xc0) != 0x80) 105 { 106 mbstate_t mbs = { 0 }; 107 clen = mb_clen (cur - i, end - (cur - i), &mbs); 108 if (i < clen && clen < (size_t) -2) 109 { 110 p0 = cur - i; 111 p = p0 + clen; 112 } 113 break; 114 } 115 } 116 else 117 { 118 mbstate_t mbs = { 0 }; 119 do 120 { 121 clen = mb_clen (p, end - p, &mbs); 122 123 if ((size_t) -2 <= clen) 124 { 125 /* An invalid sequence, or a truncated multibyte character. 126 Treat it as a single byte character. */ 127 clen = 1; 128 memset (&mbs, 0, sizeof mbs); 129 } 130 p0 = p; 131 p += clen; 132 } 133 while (p < cur); 134 } 135 136 *mb_start = p; 137 if (mbclen) 138 *mbclen = clen; 139 return p == cur ? 0 : cur - p0; 140 } 141 142 /* Examine the start of BUF (which goes to END) for word constituents. 143 If COUNTALL, examine as many as possible; otherwise, examine at most one. 144 Return the total number of bytes in the examined characters. */ 145 static size_t 146 wordchars_count (char const *buf, char const *end, bool countall) 147 { 148 size_t n = 0; 149 mbstate_t mbs = { 0 }; 150 while (n < end - buf) 151 { 152 unsigned char b = buf[n]; 153 if (sbwordchar[b]) 154 n++; 155 else if (localeinfo.sbclen[b] != -2) 156 break; 157 else 158 { 159 wchar_t wc = 0; 160 size_t wcbytes = mbrtowc (&wc, buf + n, end - buf - n, &mbs); 161 if (!wordchar (wc)) 162 break; 163 n += wcbytes + !wcbytes; 164 } 165 if (!countall) 166 break; 167 } 168 return n; 169 } 170 171 /* Examine the start of BUF for the longest prefix containing just 172 word constituents. Return the total number of bytes in the prefix. 173 The buffer ends at END. */ 174 size_t 175 wordchars_size (char const *buf, char const *end) 176 { 177 return wordchars_count (buf, end, true); 178 } 179 180 /* If BUF starts with a word constituent, return the number of bytes 181 used to represent it; otherwise, return zero. The buffer ends at END. */ 182 size_t 183 wordchar_next (char const *buf, char const *end) 184 { 185 return wordchars_count (buf, end, false); 186 } 187 188 /* In the buffer BUF, return nonzero if the character whose encoding 189 contains the byte before CUR is a word constituent. The buffer 190 ends at END. */ 191 size_t 192 wordchar_prev (char const *buf, char const *cur, char const *end) 193 { 194 if (buf == cur) 195 return 0; 196 unsigned char b = *--cur; 197 if (! localeinfo.multibyte 198 || (localeinfo.using_utf8 && localeinfo.sbclen[b] != -2)) 199 return sbwordchar[b]; 200 char const *p = buf; 201 cur -= mb_goback (&p, NULL, cur, end); 202 return wordchar_next (cur, end); 203 } 204