1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Extended regular expression matching and search library.
4 Copyright (C) 2002-2019 Free Software Foundation, Inc.
5 This file is part of the GNU C Library.
6 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
7
8 The GNU C Library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU General Public
10 License as published by the Free Software Foundation; either
11 version 3 of the License, or (at your option) any later version.
12
13 The GNU C Library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
17
18 You should have received a copy of the GNU General Public
19 License along with the GNU C Library; if not, see
20 <https://www.gnu.org/licenses/>. */
21
22 static void re_string_construct_common (const char *str, Idx len,
23 re_string_t *pstr,
24 RE_TRANSLATE_TYPE trans, bool icase,
25 const re_dfa_t *dfa);
26 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
27 const re_node_set *nodes,
28 re_hashval_t hash);
29 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
30 const re_node_set *nodes,
31 unsigned int context,
32 re_hashval_t hash);
33 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
34 Idx new_buf_len);
35 #ifdef RE_ENABLE_I18N
36 static void build_wcs_buffer (re_string_t *pstr);
37 static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
38 #endif /* RE_ENABLE_I18N */
39 static void build_upper_buffer (re_string_t *pstr);
40 static void re_string_translate_buffer (re_string_t *pstr);
41 static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
42 int eflags) __attribute__ ((pure));
43
44 /* Functions for string operation. */
45
46 /* This function allocate the buffers. It is necessary to call
47 re_string_reconstruct before using the object. */
48
49 static reg_errcode_t
50 __attribute_warn_unused_result__
re_string_allocate(re_string_t * pstr,const char * str,Idx len,Idx init_len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)51 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
52 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
53 {
54 reg_errcode_t ret;
55 Idx init_buf_len;
56
57 /* Ensure at least one character fits into the buffers. */
58 if (init_len < dfa->mb_cur_max)
59 init_len = dfa->mb_cur_max;
60 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
61 re_string_construct_common (str, len, pstr, trans, icase, dfa);
62
63 ret = re_string_realloc_buffers (pstr, init_buf_len);
64 if (__glibc_unlikely (ret != REG_NOERROR))
65 return ret;
66
67 pstr->word_char = dfa->word_char;
68 pstr->word_ops_used = dfa->word_ops_used;
69 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
70 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
71 pstr->valid_raw_len = pstr->valid_len;
72 return REG_NOERROR;
73 }
74
75 /* This function allocate the buffers, and initialize them. */
76
77 static reg_errcode_t
78 __attribute_warn_unused_result__
re_string_construct(re_string_t * pstr,const char * str,Idx len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)79 re_string_construct (re_string_t *pstr, const char *str, Idx len,
80 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
81 {
82 reg_errcode_t ret;
83 memset (pstr, '\0', sizeof (re_string_t));
84 re_string_construct_common (str, len, pstr, trans, icase, dfa);
85
86 if (len > 0)
87 {
88 ret = re_string_realloc_buffers (pstr, len + 1);
89 if (__glibc_unlikely (ret != REG_NOERROR))
90 return ret;
91 }
92 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
93
94 if (icase)
95 {
96 #ifdef RE_ENABLE_I18N
97 if (dfa->mb_cur_max > 1)
98 {
99 while (1)
100 {
101 ret = build_wcs_upper_buffer (pstr);
102 if (__glibc_unlikely (ret != REG_NOERROR))
103 return ret;
104 if (pstr->valid_raw_len >= len)
105 break;
106 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
107 break;
108 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
109 if (__glibc_unlikely (ret != REG_NOERROR))
110 return ret;
111 }
112 }
113 else
114 #endif /* RE_ENABLE_I18N */
115 build_upper_buffer (pstr);
116 }
117 else
118 {
119 #ifdef RE_ENABLE_I18N
120 if (dfa->mb_cur_max > 1)
121 build_wcs_buffer (pstr);
122 else
123 #endif /* RE_ENABLE_I18N */
124 {
125 if (trans != NULL)
126 re_string_translate_buffer (pstr);
127 else
128 {
129 pstr->valid_len = pstr->bufs_len;
130 pstr->valid_raw_len = pstr->bufs_len;
131 }
132 }
133 }
134
135 return REG_NOERROR;
136 }
137
138 /* Helper functions for re_string_allocate, and re_string_construct. */
139
140 static reg_errcode_t
141 __attribute_warn_unused_result__
re_string_realloc_buffers(re_string_t * pstr,Idx new_buf_len)142 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
143 {
144 #ifdef RE_ENABLE_I18N
145 if (pstr->mb_cur_max > 1)
146 {
147 wint_t *new_wcs;
148
149 /* Avoid overflow in realloc. */
150 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
151 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
152 < new_buf_len))
153 return REG_ESPACE;
154
155 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
156 if (__glibc_unlikely (new_wcs == NULL))
157 return REG_ESPACE;
158 pstr->wcs = new_wcs;
159 if (pstr->offsets != NULL)
160 {
161 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
162 if (__glibc_unlikely (new_offsets == NULL))
163 return REG_ESPACE;
164 pstr->offsets = new_offsets;
165 }
166 }
167 #endif /* RE_ENABLE_I18N */
168 if (pstr->mbs_allocated)
169 {
170 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
171 new_buf_len);
172 if (__glibc_unlikely (new_mbs == NULL))
173 return REG_ESPACE;
174 pstr->mbs = new_mbs;
175 }
176 pstr->bufs_len = new_buf_len;
177 return REG_NOERROR;
178 }
179
180
181 static void
re_string_construct_common(const char * str,Idx len,re_string_t * pstr,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)182 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
183 RE_TRANSLATE_TYPE trans, bool icase,
184 const re_dfa_t *dfa)
185 {
186 pstr->raw_mbs = (const unsigned char *) str;
187 pstr->len = len;
188 pstr->raw_len = len;
189 pstr->trans = trans;
190 pstr->icase = icase;
191 pstr->mbs_allocated = (trans != NULL || icase);
192 pstr->mb_cur_max = dfa->mb_cur_max;
193 pstr->is_utf8 = dfa->is_utf8;
194 pstr->map_notascii = dfa->map_notascii;
195 pstr->stop = pstr->len;
196 pstr->raw_stop = pstr->stop;
197 }
198
199 #ifdef RE_ENABLE_I18N
200
201 /* Build wide character buffer PSTR->WCS.
202 If the byte sequence of the string are:
203 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
204 Then wide character buffer will be:
205 <wc1> , WEOF , <wc2> , WEOF , <wc3>
206 We use WEOF for padding, they indicate that the position isn't
207 a first byte of a multibyte character.
208
209 Note that this function assumes PSTR->VALID_LEN elements are already
210 built and starts from PSTR->VALID_LEN. */
211
212 static void
build_wcs_buffer(re_string_t * pstr)213 build_wcs_buffer (re_string_t *pstr)
214 {
215 #ifdef _LIBC
216 unsigned char buf[MB_LEN_MAX];
217 assert (MB_LEN_MAX >= pstr->mb_cur_max);
218 #else
219 unsigned char buf[64];
220 #endif
221 mbstate_t prev_st;
222 Idx byte_idx, end_idx, remain_len;
223 size_t mbclen;
224
225 /* Build the buffers from pstr->valid_len to either pstr->len or
226 pstr->bufs_len. */
227 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
228 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
229 {
230 wchar_t wc;
231 const char *p;
232
233 remain_len = end_idx - byte_idx;
234 prev_st = pstr->cur_state;
235 /* Apply the translation if we need. */
236 if (__glibc_unlikely (pstr->trans != NULL))
237 {
238 int i, ch;
239
240 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
241 {
242 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
243 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
244 }
245 p = (const char *) buf;
246 }
247 else
248 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
249 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
250 if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
251 || (mbclen == (size_t) -2
252 && pstr->bufs_len >= pstr->len)))
253 {
254 /* We treat these cases as a singlebyte character. */
255 mbclen = 1;
256 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
257 if (__glibc_unlikely (pstr->trans != NULL))
258 wc = pstr->trans[wc];
259 pstr->cur_state = prev_st;
260 }
261 else if (__glibc_unlikely (mbclen == (size_t) -2))
262 {
263 /* The buffer doesn't have enough space, finish to build. */
264 pstr->cur_state = prev_st;
265 break;
266 }
267
268 /* Write wide character and padding. */
269 pstr->wcs[byte_idx++] = wc;
270 /* Write paddings. */
271 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
272 pstr->wcs[byte_idx++] = WEOF;
273 }
274 pstr->valid_len = byte_idx;
275 pstr->valid_raw_len = byte_idx;
276 }
277
278 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
279 but for REG_ICASE. */
280
281 static reg_errcode_t
282 __attribute_warn_unused_result__
build_wcs_upper_buffer(re_string_t * pstr)283 build_wcs_upper_buffer (re_string_t *pstr)
284 {
285 mbstate_t prev_st;
286 Idx src_idx, byte_idx, end_idx, remain_len;
287 size_t mbclen;
288 #ifdef _LIBC
289 char buf[MB_LEN_MAX];
290 assert (MB_LEN_MAX >= pstr->mb_cur_max);
291 #else
292 char buf[64];
293 #endif
294
295 byte_idx = pstr->valid_len;
296 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
297
298 /* The following optimization assumes that ASCII characters can be
299 mapped to wide characters with a simple cast. */
300 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
301 {
302 while (byte_idx < end_idx)
303 {
304 wchar_t wc;
305
306 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
307 && mbsinit (&pstr->cur_state))
308 {
309 /* In case of a singlebyte character. */
310 pstr->mbs[byte_idx]
311 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
312 /* The next step uses the assumption that wchar_t is encoded
313 ASCII-safe: all ASCII values can be converted like this. */
314 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
315 ++byte_idx;
316 continue;
317 }
318
319 remain_len = end_idx - byte_idx;
320 prev_st = pstr->cur_state;
321 mbclen = __mbrtowc (&wc,
322 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
323 + byte_idx), remain_len, &pstr->cur_state);
324 if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
325 {
326 wchar_t wcu = __towupper (wc);
327 if (wcu != wc)
328 {
329 size_t mbcdlen;
330
331 mbcdlen = __wcrtomb (buf, wcu, &prev_st);
332 if (__glibc_likely (mbclen == mbcdlen))
333 memcpy (pstr->mbs + byte_idx, buf, mbclen);
334 else
335 {
336 src_idx = byte_idx;
337 goto offsets_needed;
338 }
339 }
340 else
341 memcpy (pstr->mbs + byte_idx,
342 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
343 pstr->wcs[byte_idx++] = wcu;
344 /* Write paddings. */
345 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
346 pstr->wcs[byte_idx++] = WEOF;
347 }
348 else if (mbclen == (size_t) -1 || mbclen == 0
349 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
350 {
351 /* It is an invalid character, an incomplete character
352 at the end of the string, or '\0'. Just use the byte. */
353 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
354 pstr->mbs[byte_idx] = ch;
355 /* And also cast it to wide char. */
356 pstr->wcs[byte_idx++] = (wchar_t) ch;
357 if (__glibc_unlikely (mbclen == (size_t) -1))
358 pstr->cur_state = prev_st;
359 }
360 else
361 {
362 /* The buffer doesn't have enough space, finish to build. */
363 pstr->cur_state = prev_st;
364 break;
365 }
366 }
367 pstr->valid_len = byte_idx;
368 pstr->valid_raw_len = byte_idx;
369 return REG_NOERROR;
370 }
371 else
372 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
373 {
374 wchar_t wc;
375 const char *p;
376 offsets_needed:
377 remain_len = end_idx - byte_idx;
378 prev_st = pstr->cur_state;
379 if (__glibc_unlikely (pstr->trans != NULL))
380 {
381 int i, ch;
382
383 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
384 {
385 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
386 buf[i] = pstr->trans[ch];
387 }
388 p = (const char *) buf;
389 }
390 else
391 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
392 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
393 if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
394 {
395 wchar_t wcu = __towupper (wc);
396 if (wcu != wc)
397 {
398 size_t mbcdlen;
399
400 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
401 if (__glibc_likely (mbclen == mbcdlen))
402 memcpy (pstr->mbs + byte_idx, buf, mbclen);
403 else if (mbcdlen != (size_t) -1)
404 {
405 size_t i;
406
407 if (byte_idx + mbcdlen > pstr->bufs_len)
408 {
409 pstr->cur_state = prev_st;
410 break;
411 }
412
413 if (pstr->offsets == NULL)
414 {
415 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
416
417 if (pstr->offsets == NULL)
418 return REG_ESPACE;
419 }
420 if (!pstr->offsets_needed)
421 {
422 for (i = 0; i < (size_t) byte_idx; ++i)
423 pstr->offsets[i] = i;
424 pstr->offsets_needed = 1;
425 }
426
427 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
428 pstr->wcs[byte_idx] = wcu;
429 pstr->offsets[byte_idx] = src_idx;
430 for (i = 1; i < mbcdlen; ++i)
431 {
432 pstr->offsets[byte_idx + i]
433 = src_idx + (i < mbclen ? i : mbclen - 1);
434 pstr->wcs[byte_idx + i] = WEOF;
435 }
436 pstr->len += mbcdlen - mbclen;
437 if (pstr->raw_stop > src_idx)
438 pstr->stop += mbcdlen - mbclen;
439 end_idx = (pstr->bufs_len > pstr->len)
440 ? pstr->len : pstr->bufs_len;
441 byte_idx += mbcdlen;
442 src_idx += mbclen;
443 continue;
444 }
445 else
446 memcpy (pstr->mbs + byte_idx, p, mbclen);
447 }
448 else
449 memcpy (pstr->mbs + byte_idx, p, mbclen);
450
451 if (__glibc_unlikely (pstr->offsets_needed != 0))
452 {
453 size_t i;
454 for (i = 0; i < mbclen; ++i)
455 pstr->offsets[byte_idx + i] = src_idx + i;
456 }
457 src_idx += mbclen;
458
459 pstr->wcs[byte_idx++] = wcu;
460 /* Write paddings. */
461 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
462 pstr->wcs[byte_idx++] = WEOF;
463 }
464 else if (mbclen == (size_t) -1 || mbclen == 0
465 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
466 {
467 /* It is an invalid character or '\0'. Just use the byte. */
468 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
469
470 if (__glibc_unlikely (pstr->trans != NULL))
471 ch = pstr->trans [ch];
472 pstr->mbs[byte_idx] = ch;
473
474 if (__glibc_unlikely (pstr->offsets_needed != 0))
475 pstr->offsets[byte_idx] = src_idx;
476 ++src_idx;
477
478 /* And also cast it to wide char. */
479 pstr->wcs[byte_idx++] = (wchar_t) ch;
480 if (__glibc_unlikely (mbclen == (size_t) -1))
481 pstr->cur_state = prev_st;
482 }
483 else
484 {
485 /* The buffer doesn't have enough space, finish to build. */
486 pstr->cur_state = prev_st;
487 break;
488 }
489 }
490 pstr->valid_len = byte_idx;
491 pstr->valid_raw_len = src_idx;
492 return REG_NOERROR;
493 }
494
495 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
496 Return the index. */
497
498 static Idx
re_string_skip_chars(re_string_t * pstr,Idx new_raw_idx,wint_t * last_wc)499 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
500 {
501 mbstate_t prev_st;
502 Idx rawbuf_idx;
503 size_t mbclen;
504 wint_t wc = WEOF;
505
506 /* Skip the characters which are not necessary to check. */
507 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
508 rawbuf_idx < new_raw_idx;)
509 {
510 wchar_t wc2;
511 Idx remain_len = pstr->raw_len - rawbuf_idx;
512 prev_st = pstr->cur_state;
513 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
514 remain_len, &pstr->cur_state);
515 if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
516 || mbclen == 0))
517 {
518 /* We treat these cases as a single byte character. */
519 if (mbclen == 0 || remain_len == 0)
520 wc = L'\0';
521 else
522 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
523 mbclen = 1;
524 pstr->cur_state = prev_st;
525 }
526 else
527 wc = wc2;
528 /* Then proceed the next character. */
529 rawbuf_idx += mbclen;
530 }
531 *last_wc = wc;
532 return rawbuf_idx;
533 }
534 #endif /* RE_ENABLE_I18N */
535
536 /* Build the buffer PSTR->MBS, and apply the translation if we need.
537 This function is used in case of REG_ICASE. */
538
539 static void
build_upper_buffer(re_string_t * pstr)540 build_upper_buffer (re_string_t *pstr)
541 {
542 Idx char_idx, end_idx;
543 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
544
545 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
546 {
547 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
548 if (__glibc_unlikely (pstr->trans != NULL))
549 ch = pstr->trans[ch];
550 pstr->mbs[char_idx] = toupper (ch);
551 }
552 pstr->valid_len = char_idx;
553 pstr->valid_raw_len = char_idx;
554 }
555
556 /* Apply TRANS to the buffer in PSTR. */
557
558 static void
re_string_translate_buffer(re_string_t * pstr)559 re_string_translate_buffer (re_string_t *pstr)
560 {
561 Idx buf_idx, end_idx;
562 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
563
564 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
565 {
566 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
567 pstr->mbs[buf_idx] = pstr->trans[ch];
568 }
569
570 pstr->valid_len = buf_idx;
571 pstr->valid_raw_len = buf_idx;
572 }
573
574 /* This function re-construct the buffers.
575 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
576 convert to upper case in case of REG_ICASE, apply translation. */
577
578 static reg_errcode_t
579 __attribute_warn_unused_result__
re_string_reconstruct(re_string_t * pstr,Idx idx,int eflags)580 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
581 {
582 Idx offset;
583
584 if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
585 offset = idx - pstr->raw_mbs_idx;
586 else
587 {
588 /* Reset buffer. */
589 #ifdef RE_ENABLE_I18N
590 if (pstr->mb_cur_max > 1)
591 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
592 #endif /* RE_ENABLE_I18N */
593 pstr->len = pstr->raw_len;
594 pstr->stop = pstr->raw_stop;
595 pstr->valid_len = 0;
596 pstr->raw_mbs_idx = 0;
597 pstr->valid_raw_len = 0;
598 pstr->offsets_needed = 0;
599 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
600 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
601 if (!pstr->mbs_allocated)
602 pstr->mbs = (unsigned char *) pstr->raw_mbs;
603 offset = idx;
604 }
605
606 if (__glibc_likely (offset != 0))
607 {
608 /* Should the already checked characters be kept? */
609 if (__glibc_likely (offset < pstr->valid_raw_len))
610 {
611 /* Yes, move them to the front of the buffer. */
612 #ifdef RE_ENABLE_I18N
613 if (__glibc_unlikely (pstr->offsets_needed))
614 {
615 Idx low = 0, high = pstr->valid_len, mid;
616 do
617 {
618 mid = (high + low) / 2;
619 if (pstr->offsets[mid] > offset)
620 high = mid;
621 else if (pstr->offsets[mid] < offset)
622 low = mid + 1;
623 else
624 break;
625 }
626 while (low < high);
627 if (pstr->offsets[mid] < offset)
628 ++mid;
629 pstr->tip_context = re_string_context_at (pstr, mid - 1,
630 eflags);
631 /* This can be quite complicated, so handle specially
632 only the common and easy case where the character with
633 different length representation of lower and upper
634 case is present at or after offset. */
635 if (pstr->valid_len > offset
636 && mid == offset && pstr->offsets[mid] == offset)
637 {
638 memmove (pstr->wcs, pstr->wcs + offset,
639 (pstr->valid_len - offset) * sizeof (wint_t));
640 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
641 pstr->valid_len -= offset;
642 pstr->valid_raw_len -= offset;
643 for (low = 0; low < pstr->valid_len; low++)
644 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
645 }
646 else
647 {
648 /* Otherwise, just find out how long the partial multibyte
649 character at offset is and fill it with WEOF/255. */
650 pstr->len = pstr->raw_len - idx + offset;
651 pstr->stop = pstr->raw_stop - idx + offset;
652 pstr->offsets_needed = 0;
653 while (mid > 0 && pstr->offsets[mid - 1] == offset)
654 --mid;
655 while (mid < pstr->valid_len)
656 if (pstr->wcs[mid] != WEOF)
657 break;
658 else
659 ++mid;
660 if (mid == pstr->valid_len)
661 pstr->valid_len = 0;
662 else
663 {
664 pstr->valid_len = pstr->offsets[mid] - offset;
665 if (pstr->valid_len)
666 {
667 for (low = 0; low < pstr->valid_len; ++low)
668 pstr->wcs[low] = WEOF;
669 memset (pstr->mbs, 255, pstr->valid_len);
670 }
671 }
672 pstr->valid_raw_len = pstr->valid_len;
673 }
674 }
675 else
676 #endif
677 {
678 pstr->tip_context = re_string_context_at (pstr, offset - 1,
679 eflags);
680 #ifdef RE_ENABLE_I18N
681 if (pstr->mb_cur_max > 1)
682 memmove (pstr->wcs, pstr->wcs + offset,
683 (pstr->valid_len - offset) * sizeof (wint_t));
684 #endif /* RE_ENABLE_I18N */
685 if (__glibc_unlikely (pstr->mbs_allocated))
686 memmove (pstr->mbs, pstr->mbs + offset,
687 pstr->valid_len - offset);
688 pstr->valid_len -= offset;
689 pstr->valid_raw_len -= offset;
690 #if defined DEBUG && DEBUG
691 assert (pstr->valid_len > 0);
692 #endif
693 }
694 }
695 else
696 {
697 #ifdef RE_ENABLE_I18N
698 /* No, skip all characters until IDX. */
699 Idx prev_valid_len = pstr->valid_len;
700
701 if (__glibc_unlikely (pstr->offsets_needed))
702 {
703 pstr->len = pstr->raw_len - idx + offset;
704 pstr->stop = pstr->raw_stop - idx + offset;
705 pstr->offsets_needed = 0;
706 }
707 #endif
708 pstr->valid_len = 0;
709 #ifdef RE_ENABLE_I18N
710 if (pstr->mb_cur_max > 1)
711 {
712 Idx wcs_idx;
713 wint_t wc = WEOF;
714
715 if (pstr->is_utf8)
716 {
717 const unsigned char *raw, *p, *end;
718
719 /* Special case UTF-8. Multi-byte chars start with any
720 byte other than 0x80 - 0xbf. */
721 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
722 end = raw + (offset - pstr->mb_cur_max);
723 if (end < pstr->raw_mbs)
724 end = pstr->raw_mbs;
725 p = raw + offset - 1;
726 #ifdef _LIBC
727 /* We know the wchar_t encoding is UCS4, so for the simple
728 case, ASCII characters, skip the conversion step. */
729 if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
730 {
731 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
732 /* pstr->valid_len = 0; */
733 wc = (wchar_t) *p;
734 }
735 else
736 #endif
737 for (; p >= end; --p)
738 if ((*p & 0xc0) != 0x80)
739 {
740 mbstate_t cur_state;
741 wchar_t wc2;
742 Idx mlen = raw + pstr->len - p;
743 unsigned char buf[6];
744 size_t mbclen;
745
746 const unsigned char *pp = p;
747 if (__glibc_unlikely (pstr->trans != NULL))
748 {
749 int i = mlen < 6 ? mlen : 6;
750 while (--i >= 0)
751 buf[i] = pstr->trans[p[i]];
752 pp = buf;
753 }
754 /* XXX Don't use mbrtowc, we know which conversion
755 to use (UTF-8 -> UCS4). */
756 memset (&cur_state, 0, sizeof (cur_state));
757 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
758 &cur_state);
759 if (raw + offset - p <= mbclen
760 && mbclen < (size_t) -2)
761 {
762 memset (&pstr->cur_state, '\0',
763 sizeof (mbstate_t));
764 pstr->valid_len = mbclen - (raw + offset - p);
765 wc = wc2;
766 }
767 break;
768 }
769 }
770
771 if (wc == WEOF)
772 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
773 if (wc == WEOF)
774 pstr->tip_context
775 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
776 else
777 pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
778 && IS_WIDE_WORD_CHAR (wc))
779 ? CONTEXT_WORD
780 : ((IS_WIDE_NEWLINE (wc)
781 && pstr->newline_anchor)
782 ? CONTEXT_NEWLINE : 0));
783 if (__glibc_unlikely (pstr->valid_len))
784 {
785 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
786 pstr->wcs[wcs_idx] = WEOF;
787 if (pstr->mbs_allocated)
788 memset (pstr->mbs, 255, pstr->valid_len);
789 }
790 pstr->valid_raw_len = pstr->valid_len;
791 }
792 else
793 #endif /* RE_ENABLE_I18N */
794 {
795 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
796 pstr->valid_raw_len = 0;
797 if (pstr->trans)
798 c = pstr->trans[c];
799 pstr->tip_context = (bitset_contain (pstr->word_char, c)
800 ? CONTEXT_WORD
801 : ((IS_NEWLINE (c) && pstr->newline_anchor)
802 ? CONTEXT_NEWLINE : 0));
803 }
804 }
805 if (!__glibc_unlikely (pstr->mbs_allocated))
806 pstr->mbs += offset;
807 }
808 pstr->raw_mbs_idx = idx;
809 pstr->len -= offset;
810 pstr->stop -= offset;
811
812 /* Then build the buffers. */
813 #ifdef RE_ENABLE_I18N
814 if (pstr->mb_cur_max > 1)
815 {
816 if (pstr->icase)
817 {
818 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
819 if (__glibc_unlikely (ret != REG_NOERROR))
820 return ret;
821 }
822 else
823 build_wcs_buffer (pstr);
824 }
825 else
826 #endif /* RE_ENABLE_I18N */
827 if (__glibc_unlikely (pstr->mbs_allocated))
828 {
829 if (pstr->icase)
830 build_upper_buffer (pstr);
831 else if (pstr->trans != NULL)
832 re_string_translate_buffer (pstr);
833 }
834 else
835 pstr->valid_len = pstr->len;
836
837 pstr->cur_idx = 0;
838 return REG_NOERROR;
839 }
840
841 static unsigned char
842 __attribute__ ((pure))
re_string_peek_byte_case(const re_string_t * pstr,Idx idx)843 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
844 {
845 int ch;
846 Idx off;
847
848 /* Handle the common (easiest) cases first. */
849 if (__glibc_likely (!pstr->mbs_allocated))
850 return re_string_peek_byte (pstr, idx);
851
852 #ifdef RE_ENABLE_I18N
853 if (pstr->mb_cur_max > 1
854 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
855 return re_string_peek_byte (pstr, idx);
856 #endif
857
858 off = pstr->cur_idx + idx;
859 #ifdef RE_ENABLE_I18N
860 if (pstr->offsets_needed)
861 off = pstr->offsets[off];
862 #endif
863
864 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
865
866 #ifdef RE_ENABLE_I18N
867 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
868 this function returns CAPITAL LETTER I instead of first byte of
869 DOTLESS SMALL LETTER I. The latter would confuse the parser,
870 since peek_byte_case doesn't advance cur_idx in any way. */
871 if (pstr->offsets_needed && !isascii (ch))
872 return re_string_peek_byte (pstr, idx);
873 #endif
874
875 return ch;
876 }
877
878 static unsigned char
re_string_fetch_byte_case(re_string_t * pstr)879 re_string_fetch_byte_case (re_string_t *pstr)
880 {
881 if (__glibc_likely (!pstr->mbs_allocated))
882 return re_string_fetch_byte (pstr);
883
884 #ifdef RE_ENABLE_I18N
885 if (pstr->offsets_needed)
886 {
887 Idx off;
888 int ch;
889
890 /* For tr_TR.UTF-8 [[:islower:]] there is
891 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
892 in that case the whole multi-byte character and return
893 the original letter. On the other side, with
894 [[: DOTLESS SMALL LETTER I return [[:I, as doing
895 anything else would complicate things too much. */
896
897 if (!re_string_first_byte (pstr, pstr->cur_idx))
898 return re_string_fetch_byte (pstr);
899
900 off = pstr->offsets[pstr->cur_idx];
901 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
902
903 if (! isascii (ch))
904 return re_string_fetch_byte (pstr);
905
906 re_string_skip_bytes (pstr,
907 re_string_char_size_at (pstr, pstr->cur_idx));
908 return ch;
909 }
910 #endif
911
912 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
913 }
914
915 static void
re_string_destruct(re_string_t * pstr)916 re_string_destruct (re_string_t *pstr)
917 {
918 #ifdef RE_ENABLE_I18N
919 re_free (pstr->wcs);
920 re_free (pstr->offsets);
921 #endif /* RE_ENABLE_I18N */
922 if (pstr->mbs_allocated)
923 re_free (pstr->mbs);
924 }
925
926 /* Return the context at IDX in INPUT. */
927
928 static unsigned int
re_string_context_at(const re_string_t * input,Idx idx,int eflags)929 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
930 {
931 int c;
932 if (__glibc_unlikely (idx < 0))
933 /* In this case, we use the value stored in input->tip_context,
934 since we can't know the character in input->mbs[-1] here. */
935 return input->tip_context;
936 if (__glibc_unlikely (idx == input->len))
937 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
938 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
939 #ifdef RE_ENABLE_I18N
940 if (input->mb_cur_max > 1)
941 {
942 wint_t wc;
943 Idx wc_idx = idx;
944 while(input->wcs[wc_idx] == WEOF)
945 {
946 #if defined DEBUG && DEBUG
947 /* It must not happen. */
948 assert (wc_idx >= 0);
949 #endif
950 --wc_idx;
951 if (wc_idx < 0)
952 return input->tip_context;
953 }
954 wc = input->wcs[wc_idx];
955 if (__glibc_unlikely (input->word_ops_used != 0)
956 && IS_WIDE_WORD_CHAR (wc))
957 return CONTEXT_WORD;
958 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
959 ? CONTEXT_NEWLINE : 0);
960 }
961 else
962 #endif
963 {
964 c = re_string_byte_at (input, idx);
965 if (bitset_contain (input->word_char, c))
966 return CONTEXT_WORD;
967 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
968 }
969 }
970
971 /* Functions for set operation. */
972
973 static reg_errcode_t
974 __attribute_warn_unused_result__
re_node_set_alloc(re_node_set * set,Idx size)975 re_node_set_alloc (re_node_set *set, Idx size)
976 {
977 set->alloc = size;
978 set->nelem = 0;
979 set->elems = re_malloc (Idx, size);
980 if (__glibc_unlikely (set->elems == NULL)
981 && (MALLOC_0_IS_NONNULL || size != 0))
982 return REG_ESPACE;
983 return REG_NOERROR;
984 }
985
986 static reg_errcode_t
987 __attribute_warn_unused_result__
re_node_set_init_1(re_node_set * set,Idx elem)988 re_node_set_init_1 (re_node_set *set, Idx elem)
989 {
990 set->alloc = 1;
991 set->nelem = 1;
992 set->elems = re_malloc (Idx, 1);
993 if (__glibc_unlikely (set->elems == NULL))
994 {
995 set->alloc = set->nelem = 0;
996 return REG_ESPACE;
997 }
998 set->elems[0] = elem;
999 return REG_NOERROR;
1000 }
1001
1002 static reg_errcode_t
1003 __attribute_warn_unused_result__
re_node_set_init_2(re_node_set * set,Idx elem1,Idx elem2)1004 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1005 {
1006 set->alloc = 2;
1007 set->elems = re_malloc (Idx, 2);
1008 if (__glibc_unlikely (set->elems == NULL))
1009 return REG_ESPACE;
1010 if (elem1 == elem2)
1011 {
1012 set->nelem = 1;
1013 set->elems[0] = elem1;
1014 }
1015 else
1016 {
1017 set->nelem = 2;
1018 if (elem1 < elem2)
1019 {
1020 set->elems[0] = elem1;
1021 set->elems[1] = elem2;
1022 }
1023 else
1024 {
1025 set->elems[0] = elem2;
1026 set->elems[1] = elem1;
1027 }
1028 }
1029 return REG_NOERROR;
1030 }
1031
1032 static reg_errcode_t
1033 __attribute_warn_unused_result__
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)1034 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1035 {
1036 dest->nelem = src->nelem;
1037 if (src->nelem > 0)
1038 {
1039 dest->alloc = dest->nelem;
1040 dest->elems = re_malloc (Idx, dest->alloc);
1041 if (__glibc_unlikely (dest->elems == NULL))
1042 {
1043 dest->alloc = dest->nelem = 0;
1044 return REG_ESPACE;
1045 }
1046 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1047 }
1048 else
1049 re_node_set_init_empty (dest);
1050 return REG_NOERROR;
1051 }
1052
1053 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1054 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1055 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1056
1057 static reg_errcode_t
1058 __attribute_warn_unused_result__
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1059 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1060 const re_node_set *src2)
1061 {
1062 Idx i1, i2, is, id, delta, sbase;
1063 if (src1->nelem == 0 || src2->nelem == 0)
1064 return REG_NOERROR;
1065
1066 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1067 conservative estimate. */
1068 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1069 {
1070 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1071 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1072 if (__glibc_unlikely (new_elems == NULL))
1073 return REG_ESPACE;
1074 dest->elems = new_elems;
1075 dest->alloc = new_alloc;
1076 }
1077
1078 /* Find the items in the intersection of SRC1 and SRC2, and copy
1079 into the top of DEST those that are not already in DEST itself. */
1080 sbase = dest->nelem + src1->nelem + src2->nelem;
1081 i1 = src1->nelem - 1;
1082 i2 = src2->nelem - 1;
1083 id = dest->nelem - 1;
1084 for (;;)
1085 {
1086 if (src1->elems[i1] == src2->elems[i2])
1087 {
1088 /* Try to find the item in DEST. Maybe we could binary search? */
1089 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1090 --id;
1091
1092 if (id < 0 || dest->elems[id] != src1->elems[i1])
1093 dest->elems[--sbase] = src1->elems[i1];
1094
1095 if (--i1 < 0 || --i2 < 0)
1096 break;
1097 }
1098
1099 /* Lower the highest of the two items. */
1100 else if (src1->elems[i1] < src2->elems[i2])
1101 {
1102 if (--i2 < 0)
1103 break;
1104 }
1105 else
1106 {
1107 if (--i1 < 0)
1108 break;
1109 }
1110 }
1111
1112 id = dest->nelem - 1;
1113 is = dest->nelem + src1->nelem + src2->nelem - 1;
1114 delta = is - sbase + 1;
1115
1116 /* Now copy. When DELTA becomes zero, the remaining
1117 DEST elements are already in place; this is more or
1118 less the same loop that is in re_node_set_merge. */
1119 dest->nelem += delta;
1120 if (delta > 0 && id >= 0)
1121 for (;;)
1122 {
1123 if (dest->elems[is] > dest->elems[id])
1124 {
1125 /* Copy from the top. */
1126 dest->elems[id + delta--] = dest->elems[is--];
1127 if (delta == 0)
1128 break;
1129 }
1130 else
1131 {
1132 /* Slide from the bottom. */
1133 dest->elems[id + delta] = dest->elems[id];
1134 if (--id < 0)
1135 break;
1136 }
1137 }
1138
1139 /* Copy remaining SRC elements. */
1140 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1141
1142 return REG_NOERROR;
1143 }
1144
1145 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1146 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1147
1148 static reg_errcode_t
1149 __attribute_warn_unused_result__
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1150 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1151 const re_node_set *src2)
1152 {
1153 Idx i1, i2, id;
1154 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1155 {
1156 dest->alloc = src1->nelem + src2->nelem;
1157 dest->elems = re_malloc (Idx, dest->alloc);
1158 if (__glibc_unlikely (dest->elems == NULL))
1159 return REG_ESPACE;
1160 }
1161 else
1162 {
1163 if (src1 != NULL && src1->nelem > 0)
1164 return re_node_set_init_copy (dest, src1);
1165 else if (src2 != NULL && src2->nelem > 0)
1166 return re_node_set_init_copy (dest, src2);
1167 else
1168 re_node_set_init_empty (dest);
1169 return REG_NOERROR;
1170 }
1171 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1172 {
1173 if (src1->elems[i1] > src2->elems[i2])
1174 {
1175 dest->elems[id++] = src2->elems[i2++];
1176 continue;
1177 }
1178 if (src1->elems[i1] == src2->elems[i2])
1179 ++i2;
1180 dest->elems[id++] = src1->elems[i1++];
1181 }
1182 if (i1 < src1->nelem)
1183 {
1184 memcpy (dest->elems + id, src1->elems + i1,
1185 (src1->nelem - i1) * sizeof (Idx));
1186 id += src1->nelem - i1;
1187 }
1188 else if (i2 < src2->nelem)
1189 {
1190 memcpy (dest->elems + id, src2->elems + i2,
1191 (src2->nelem - i2) * sizeof (Idx));
1192 id += src2->nelem - i2;
1193 }
1194 dest->nelem = id;
1195 return REG_NOERROR;
1196 }
1197
1198 /* Calculate the union set of the sets DEST and SRC. And store it to
1199 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1200
1201 static reg_errcode_t
1202 __attribute_warn_unused_result__
re_node_set_merge(re_node_set * dest,const re_node_set * src)1203 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1204 {
1205 Idx is, id, sbase, delta;
1206 if (src == NULL || src->nelem == 0)
1207 return REG_NOERROR;
1208 if (dest->alloc < 2 * src->nelem + dest->nelem)
1209 {
1210 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1211 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1212 if (__glibc_unlikely (new_buffer == NULL))
1213 return REG_ESPACE;
1214 dest->elems = new_buffer;
1215 dest->alloc = new_alloc;
1216 }
1217
1218 if (__glibc_unlikely (dest->nelem == 0))
1219 {
1220 dest->nelem = src->nelem;
1221 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1222 return REG_NOERROR;
1223 }
1224
1225 /* Copy into the top of DEST the items of SRC that are not
1226 found in DEST. Maybe we could binary search in DEST? */
1227 for (sbase = dest->nelem + 2 * src->nelem,
1228 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1229 {
1230 if (dest->elems[id] == src->elems[is])
1231 is--, id--;
1232 else if (dest->elems[id] < src->elems[is])
1233 dest->elems[--sbase] = src->elems[is--];
1234 else /* if (dest->elems[id] > src->elems[is]) */
1235 --id;
1236 }
1237
1238 if (is >= 0)
1239 {
1240 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1241 sbase -= is + 1;
1242 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1243 }
1244
1245 id = dest->nelem - 1;
1246 is = dest->nelem + 2 * src->nelem - 1;
1247 delta = is - sbase + 1;
1248 if (delta == 0)
1249 return REG_NOERROR;
1250
1251 /* Now copy. When DELTA becomes zero, the remaining
1252 DEST elements are already in place. */
1253 dest->nelem += delta;
1254 for (;;)
1255 {
1256 if (dest->elems[is] > dest->elems[id])
1257 {
1258 /* Copy from the top. */
1259 dest->elems[id + delta--] = dest->elems[is--];
1260 if (delta == 0)
1261 break;
1262 }
1263 else
1264 {
1265 /* Slide from the bottom. */
1266 dest->elems[id + delta] = dest->elems[id];
1267 if (--id < 0)
1268 {
1269 /* Copy remaining SRC elements. */
1270 memcpy (dest->elems, dest->elems + sbase,
1271 delta * sizeof (Idx));
1272 break;
1273 }
1274 }
1275 }
1276
1277 return REG_NOERROR;
1278 }
1279
1280 /* Insert the new element ELEM to the re_node_set* SET.
1281 SET should not already have ELEM.
1282 Return true if successful. */
1283
1284 static bool
1285 __attribute_warn_unused_result__
re_node_set_insert(re_node_set * set,Idx elem)1286 re_node_set_insert (re_node_set *set, Idx elem)
1287 {
1288 Idx idx;
1289 /* In case the set is empty. */
1290 if (set->alloc == 0)
1291 return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1292
1293 if (__glibc_unlikely (set->nelem) == 0)
1294 {
1295 /* We already guaranteed above that set->alloc != 0. */
1296 set->elems[0] = elem;
1297 ++set->nelem;
1298 return true;
1299 }
1300
1301 /* Realloc if we need. */
1302 if (set->alloc == set->nelem)
1303 {
1304 Idx *new_elems;
1305 set->alloc = set->alloc * 2;
1306 new_elems = re_realloc (set->elems, Idx, set->alloc);
1307 if (__glibc_unlikely (new_elems == NULL))
1308 return false;
1309 set->elems = new_elems;
1310 }
1311
1312 /* Move the elements which follows the new element. Test the
1313 first element separately to skip a check in the inner loop. */
1314 if (elem < set->elems[0])
1315 {
1316 idx = 0;
1317 for (idx = set->nelem; idx > 0; idx--)
1318 set->elems[idx] = set->elems[idx - 1];
1319 }
1320 else
1321 {
1322 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1323 set->elems[idx] = set->elems[idx - 1];
1324 }
1325
1326 /* Insert the new element. */
1327 set->elems[idx] = elem;
1328 ++set->nelem;
1329 return true;
1330 }
1331
1332 /* Insert the new element ELEM to the re_node_set* SET.
1333 SET should not already have any element greater than or equal to ELEM.
1334 Return true if successful. */
1335
1336 static bool
1337 __attribute_warn_unused_result__
re_node_set_insert_last(re_node_set * set,Idx elem)1338 re_node_set_insert_last (re_node_set *set, Idx elem)
1339 {
1340 /* Realloc if we need. */
1341 if (set->alloc == set->nelem)
1342 {
1343 Idx *new_elems;
1344 set->alloc = (set->alloc + 1) * 2;
1345 new_elems = re_realloc (set->elems, Idx, set->alloc);
1346 if (__glibc_unlikely (new_elems == NULL))
1347 return false;
1348 set->elems = new_elems;
1349 }
1350
1351 /* Insert the new element. */
1352 set->elems[set->nelem++] = elem;
1353 return true;
1354 }
1355
1356 /* Compare two node sets SET1 and SET2.
1357 Return true if SET1 and SET2 are equivalent. */
1358
1359 static bool
1360 __attribute__ ((pure))
re_node_set_compare(const re_node_set * set1,const re_node_set * set2)1361 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1362 {
1363 Idx i;
1364 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1365 return false;
1366 for (i = set1->nelem ; --i >= 0 ; )
1367 if (set1->elems[i] != set2->elems[i])
1368 return false;
1369 return true;
1370 }
1371
1372 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1373
1374 static Idx
1375 __attribute__ ((pure))
re_node_set_contains(const re_node_set * set,Idx elem)1376 re_node_set_contains (const re_node_set *set, Idx elem)
1377 {
1378 __re_size_t idx, right, mid;
1379 if (set->nelem <= 0)
1380 return 0;
1381
1382 /* Binary search the element. */
1383 idx = 0;
1384 right = set->nelem - 1;
1385 while (idx < right)
1386 {
1387 mid = (idx + right) / 2;
1388 if (set->elems[mid] < elem)
1389 idx = mid + 1;
1390 else
1391 right = mid;
1392 }
1393 return set->elems[idx] == elem ? idx + 1 : 0;
1394 }
1395
1396 static void
re_node_set_remove_at(re_node_set * set,Idx idx)1397 re_node_set_remove_at (re_node_set *set, Idx idx)
1398 {
1399 if (idx < 0 || idx >= set->nelem)
1400 return;
1401 --set->nelem;
1402 for (; idx < set->nelem; idx++)
1403 set->elems[idx] = set->elems[idx + 1];
1404 }
1405
1406
1407 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1408 Or return -1 if an error occurred. */
1409
1410 static Idx
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)1411 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1412 {
1413 if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1414 {
1415 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1416 Idx *new_nexts, *new_indices;
1417 re_node_set *new_edests, *new_eclosures;
1418 re_token_t *new_nodes;
1419
1420 /* Avoid overflows in realloc. */
1421 const size_t max_object_size = MAX (sizeof (re_token_t),
1422 MAX (sizeof (re_node_set),
1423 sizeof (Idx)));
1424 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1425 < new_nodes_alloc))
1426 return -1;
1427
1428 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1429 if (__glibc_unlikely (new_nodes == NULL))
1430 return -1;
1431 dfa->nodes = new_nodes;
1432 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1433 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1434 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1435 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1436 if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1437 || new_edests == NULL || new_eclosures == NULL))
1438 {
1439 re_free (new_nexts);
1440 re_free (new_indices);
1441 re_free (new_edests);
1442 re_free (new_eclosures);
1443 return -1;
1444 }
1445 dfa->nexts = new_nexts;
1446 dfa->org_indices = new_indices;
1447 dfa->edests = new_edests;
1448 dfa->eclosures = new_eclosures;
1449 dfa->nodes_alloc = new_nodes_alloc;
1450 }
1451 dfa->nodes[dfa->nodes_len] = token;
1452 dfa->nodes[dfa->nodes_len].constraint = 0;
1453 #ifdef RE_ENABLE_I18N
1454 dfa->nodes[dfa->nodes_len].accept_mb =
1455 ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1456 || token.type == COMPLEX_BRACKET);
1457 #endif
1458 dfa->nexts[dfa->nodes_len] = -1;
1459 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1460 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1461 return dfa->nodes_len++;
1462 }
1463
1464 static re_hashval_t
calc_state_hash(const re_node_set * nodes,unsigned int context)1465 calc_state_hash (const re_node_set *nodes, unsigned int context)
1466 {
1467 re_hashval_t hash = nodes->nelem + context;
1468 Idx i;
1469 for (i = 0 ; i < nodes->nelem ; i++)
1470 hash += nodes->elems[i];
1471 return hash;
1472 }
1473
1474 /* Search for the state whose node_set is equivalent to NODES.
1475 Return the pointer to the state, if we found it in the DFA.
1476 Otherwise create the new one and return it. In case of an error
1477 return NULL and set the error code in ERR.
1478 Note: - We assume NULL as the invalid state, then it is possible that
1479 return value is NULL and ERR is REG_NOERROR.
1480 - We never return non-NULL value in case of any errors, it is for
1481 optimization. */
1482
1483 static re_dfastate_t *
1484 __attribute_warn_unused_result__
re_acquire_state(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes)1485 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1486 const re_node_set *nodes)
1487 {
1488 re_hashval_t hash;
1489 re_dfastate_t *new_state;
1490 struct re_state_table_entry *spot;
1491 Idx i;
1492 #if defined GCC_LINT || defined lint
1493 /* Suppress bogus uninitialized-variable warnings. */
1494 *err = REG_NOERROR;
1495 #endif
1496 if (__glibc_unlikely (nodes->nelem == 0))
1497 {
1498 *err = REG_NOERROR;
1499 return NULL;
1500 }
1501 hash = calc_state_hash (nodes, 0);
1502 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1503
1504 for (i = 0 ; i < spot->num ; i++)
1505 {
1506 re_dfastate_t *state = spot->array[i];
1507 if (hash != state->hash)
1508 continue;
1509 if (re_node_set_compare (&state->nodes, nodes))
1510 return state;
1511 }
1512
1513 /* There are no appropriate state in the dfa, create the new one. */
1514 new_state = create_ci_newstate (dfa, nodes, hash);
1515 if (__glibc_unlikely (new_state == NULL))
1516 *err = REG_ESPACE;
1517
1518 return new_state;
1519 }
1520
1521 /* Search for the state whose node_set is equivalent to NODES and
1522 whose context is equivalent to CONTEXT.
1523 Return the pointer to the state, if we found it in the DFA.
1524 Otherwise create the new one and return it. In case of an error
1525 return NULL and set the error code in ERR.
1526 Note: - We assume NULL as the invalid state, then it is possible that
1527 return value is NULL and ERR is REG_NOERROR.
1528 - We never return non-NULL value in case of any errors, it is for
1529 optimization. */
1530
1531 static re_dfastate_t *
1532 __attribute_warn_unused_result__
re_acquire_state_context(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)1533 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1534 const re_node_set *nodes, unsigned int context)
1535 {
1536 re_hashval_t hash;
1537 re_dfastate_t *new_state;
1538 struct re_state_table_entry *spot;
1539 Idx i;
1540 #if defined GCC_LINT || defined lint
1541 /* Suppress bogus uninitialized-variable warnings. */
1542 *err = REG_NOERROR;
1543 #endif
1544 if (nodes->nelem == 0)
1545 {
1546 *err = REG_NOERROR;
1547 return NULL;
1548 }
1549 hash = calc_state_hash (nodes, context);
1550 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1551
1552 for (i = 0 ; i < spot->num ; i++)
1553 {
1554 re_dfastate_t *state = spot->array[i];
1555 if (state->hash == hash
1556 && state->context == context
1557 && re_node_set_compare (state->entrance_nodes, nodes))
1558 return state;
1559 }
1560 /* There are no appropriate state in 'dfa', create the new one. */
1561 new_state = create_cd_newstate (dfa, nodes, context, hash);
1562 if (__glibc_unlikely (new_state == NULL))
1563 *err = REG_ESPACE;
1564
1565 return new_state;
1566 }
1567
1568 /* Finish initialization of the new state NEWSTATE, and using its hash value
1569 HASH put in the appropriate bucket of DFA's state table. Return value
1570 indicates the error code if failed. */
1571
1572 static reg_errcode_t
1573 __attribute_warn_unused_result__
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,re_hashval_t hash)1574 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1575 re_hashval_t hash)
1576 {
1577 struct re_state_table_entry *spot;
1578 reg_errcode_t err;
1579 Idx i;
1580
1581 newstate->hash = hash;
1582 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1583 if (__glibc_unlikely (err != REG_NOERROR))
1584 return REG_ESPACE;
1585 for (i = 0; i < newstate->nodes.nelem; i++)
1586 {
1587 Idx elem = newstate->nodes.elems[i];
1588 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1589 if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1590 return REG_ESPACE;
1591 }
1592
1593 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1594 if (__glibc_unlikely (spot->alloc <= spot->num))
1595 {
1596 Idx new_alloc = 2 * spot->num + 2;
1597 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1598 new_alloc);
1599 if (__glibc_unlikely (new_array == NULL))
1600 return REG_ESPACE;
1601 spot->array = new_array;
1602 spot->alloc = new_alloc;
1603 }
1604 spot->array[spot->num++] = newstate;
1605 return REG_NOERROR;
1606 }
1607
1608 static void
free_state(re_dfastate_t * state)1609 free_state (re_dfastate_t *state)
1610 {
1611 re_node_set_free (&state->non_eps_nodes);
1612 re_node_set_free (&state->inveclosure);
1613 if (state->entrance_nodes != &state->nodes)
1614 {
1615 re_node_set_free (state->entrance_nodes);
1616 re_free (state->entrance_nodes);
1617 }
1618 re_node_set_free (&state->nodes);
1619 re_free (state->word_trtable);
1620 re_free (state->trtable);
1621 re_free (state);
1622 }
1623
1624 /* Create the new state which is independent of contexts.
1625 Return the new state if succeeded, otherwise return NULL. */
1626
1627 static re_dfastate_t *
1628 __attribute_warn_unused_result__
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,re_hashval_t hash)1629 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1630 re_hashval_t hash)
1631 {
1632 Idx i;
1633 reg_errcode_t err;
1634 re_dfastate_t *newstate;
1635
1636 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1637 if (__glibc_unlikely (newstate == NULL))
1638 return NULL;
1639 err = re_node_set_init_copy (&newstate->nodes, nodes);
1640 if (__glibc_unlikely (err != REG_NOERROR))
1641 {
1642 re_free (newstate);
1643 return NULL;
1644 }
1645
1646 newstate->entrance_nodes = &newstate->nodes;
1647 for (i = 0 ; i < nodes->nelem ; i++)
1648 {
1649 re_token_t *node = dfa->nodes + nodes->elems[i];
1650 re_token_type_t type = node->type;
1651 if (type == CHARACTER && !node->constraint)
1652 continue;
1653 #ifdef RE_ENABLE_I18N
1654 newstate->accept_mb |= node->accept_mb;
1655 #endif /* RE_ENABLE_I18N */
1656
1657 /* If the state has the halt node, the state is a halt state. */
1658 if (type == END_OF_RE)
1659 newstate->halt = 1;
1660 else if (type == OP_BACK_REF)
1661 newstate->has_backref = 1;
1662 else if (type == ANCHOR || node->constraint)
1663 newstate->has_constraint = 1;
1664 }
1665 err = register_state (dfa, newstate, hash);
1666 if (__glibc_unlikely (err != REG_NOERROR))
1667 {
1668 free_state (newstate);
1669 newstate = NULL;
1670 }
1671 return newstate;
1672 }
1673
1674 /* Create the new state which is depend on the context CONTEXT.
1675 Return the new state if succeeded, otherwise return NULL. */
1676
1677 static re_dfastate_t *
1678 __attribute_warn_unused_result__
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,re_hashval_t hash)1679 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1680 unsigned int context, re_hashval_t hash)
1681 {
1682 Idx i, nctx_nodes = 0;
1683 reg_errcode_t err;
1684 re_dfastate_t *newstate;
1685
1686 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1687 if (__glibc_unlikely (newstate == NULL))
1688 return NULL;
1689 err = re_node_set_init_copy (&newstate->nodes, nodes);
1690 if (__glibc_unlikely (err != REG_NOERROR))
1691 {
1692 re_free (newstate);
1693 return NULL;
1694 }
1695
1696 newstate->context = context;
1697 newstate->entrance_nodes = &newstate->nodes;
1698
1699 for (i = 0 ; i < nodes->nelem ; i++)
1700 {
1701 re_token_t *node = dfa->nodes + nodes->elems[i];
1702 re_token_type_t type = node->type;
1703 unsigned int constraint = node->constraint;
1704
1705 if (type == CHARACTER && !constraint)
1706 continue;
1707 #ifdef RE_ENABLE_I18N
1708 newstate->accept_mb |= node->accept_mb;
1709 #endif /* RE_ENABLE_I18N */
1710
1711 /* If the state has the halt node, the state is a halt state. */
1712 if (type == END_OF_RE)
1713 newstate->halt = 1;
1714 else if (type == OP_BACK_REF)
1715 newstate->has_backref = 1;
1716
1717 if (constraint)
1718 {
1719 if (newstate->entrance_nodes == &newstate->nodes)
1720 {
1721 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1722 if (__glibc_unlikely (newstate->entrance_nodes == NULL))
1723 {
1724 free_state (newstate);
1725 return NULL;
1726 }
1727 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1728 != REG_NOERROR)
1729 return NULL;
1730 nctx_nodes = 0;
1731 newstate->has_constraint = 1;
1732 }
1733
1734 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1735 {
1736 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1737 ++nctx_nodes;
1738 }
1739 }
1740 }
1741 err = register_state (dfa, newstate, hash);
1742 if (__glibc_unlikely (err != REG_NOERROR))
1743 {
1744 free_state (newstate);
1745 newstate = NULL;
1746 }
1747 return newstate;
1748 }
1749