1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 static void re_string_construct_common (const char *str, Idx len,
21 re_string_t *pstr,
22 RE_TRANSLATE_TYPE trans, bool icase,
23 const re_dfa_t *dfa) internal_function;
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 re_hashval_t hash) internal_function;
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
29 unsigned int context,
30 re_hashval_t hash) internal_function;
31
32 /* Functions for string operation. */
33
34 /* This function allocate the buffers. It is necessary to call
35 re_string_reconstruct before using the object. */
36
37 static reg_errcode_t
38 internal_function __attribute_warn_unused_result__
re_string_allocate(re_string_t * pstr,const char * str,Idx len,Idx init_len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)39 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
40 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
41 {
42 reg_errcode_t ret;
43 Idx init_buf_len;
44
45 /* Ensure at least one character fits into the buffers. */
46 if (init_len < dfa->mb_cur_max)
47 init_len = dfa->mb_cur_max;
48 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49 re_string_construct_common (str, len, pstr, trans, icase, dfa);
50
51 ret = re_string_realloc_buffers (pstr, init_buf_len);
52 if (BE (ret != REG_NOERROR, 0))
53 return ret;
54
55 pstr->word_char = dfa->word_char;
56 pstr->word_ops_used = dfa->word_ops_used;
57 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59 pstr->valid_raw_len = pstr->valid_len;
60 return REG_NOERROR;
61 }
62
63 /* This function allocate the buffers, and initialize them. */
64
65 static reg_errcode_t
66 internal_function __attribute_warn_unused_result__
re_string_construct(re_string_t * pstr,const char * str,Idx len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)67 re_string_construct (re_string_t *pstr, const char *str, Idx len,
68 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
69 {
70 reg_errcode_t ret;
71 memset (pstr, '\0', sizeof (re_string_t));
72 re_string_construct_common (str, len, pstr, trans, icase, dfa);
73
74 if (len > 0)
75 {
76 ret = re_string_realloc_buffers (pstr, len + 1);
77 if (BE (ret != REG_NOERROR, 0))
78 return ret;
79 }
80 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
81
82 if (icase)
83 {
84 #ifdef RE_ENABLE_I18N
85 if (dfa->mb_cur_max > 1)
86 {
87 while (1)
88 {
89 ret = build_wcs_upper_buffer (pstr);
90 if (BE (ret != REG_NOERROR, 0))
91 return ret;
92 if (pstr->valid_raw_len >= len)
93 break;
94 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
95 break;
96 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97 if (BE (ret != REG_NOERROR, 0))
98 return ret;
99 }
100 }
101 else
102 #endif /* RE_ENABLE_I18N */
103 build_upper_buffer (pstr);
104 }
105 else
106 {
107 #ifdef RE_ENABLE_I18N
108 if (dfa->mb_cur_max > 1)
109 build_wcs_buffer (pstr);
110 else
111 #endif /* RE_ENABLE_I18N */
112 {
113 if (trans != NULL)
114 re_string_translate_buffer (pstr);
115 else
116 {
117 pstr->valid_len = pstr->bufs_len;
118 pstr->valid_raw_len = pstr->bufs_len;
119 }
120 }
121 }
122
123 return REG_NOERROR;
124 }
125
126 /* Helper functions for re_string_allocate, and re_string_construct. */
127
128 static reg_errcode_t
129 internal_function __attribute_warn_unused_result__
re_string_realloc_buffers(re_string_t * pstr,Idx new_buf_len)130 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
131 {
132 #ifdef RE_ENABLE_I18N
133 if (pstr->mb_cur_max > 1)
134 {
135 wint_t *new_wcs;
136
137 /* Avoid overflow in realloc. */
138 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
139 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
140 return REG_ESPACE;
141
142 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
143 if (BE (new_wcs == NULL, 0))
144 return REG_ESPACE;
145 pstr->wcs = new_wcs;
146 if (pstr->offsets != NULL)
147 {
148 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
149 if (BE (new_offsets == NULL, 0))
150 return REG_ESPACE;
151 pstr->offsets = new_offsets;
152 }
153 }
154 #endif /* RE_ENABLE_I18N */
155 if (pstr->mbs_allocated)
156 {
157 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
158 new_buf_len);
159 if (BE (new_mbs == NULL, 0))
160 return REG_ESPACE;
161 pstr->mbs = new_mbs;
162 }
163 pstr->bufs_len = new_buf_len;
164 return REG_NOERROR;
165 }
166
167
168 static void
169 internal_function
re_string_construct_common(const char * str,Idx len,re_string_t * pstr,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)170 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
171 RE_TRANSLATE_TYPE trans, bool icase,
172 const re_dfa_t *dfa)
173 {
174 pstr->raw_mbs = (const unsigned char *) str;
175 pstr->len = len;
176 pstr->raw_len = len;
177 pstr->trans = trans;
178 pstr->icase = icase;
179 pstr->mbs_allocated = (trans != NULL || icase);
180 pstr->mb_cur_max = dfa->mb_cur_max;
181 pstr->is_utf8 = dfa->is_utf8;
182 pstr->map_notascii = dfa->map_notascii;
183 pstr->stop = pstr->len;
184 pstr->raw_stop = pstr->stop;
185 }
186
187 #ifdef RE_ENABLE_I18N
188
189 /* Build wide character buffer PSTR->WCS.
190 If the byte sequence of the string are:
191 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
192 Then wide character buffer will be:
193 <wc1> , WEOF , <wc2> , WEOF , <wc3>
194 We use WEOF for padding, they indicate that the position isn't
195 a first byte of a multibyte character.
196
197 Note that this function assumes PSTR->VALID_LEN elements are already
198 built and starts from PSTR->VALID_LEN. */
199
200 static void
201 internal_function
build_wcs_buffer(re_string_t * pstr)202 build_wcs_buffer (re_string_t *pstr)
203 {
204 #ifdef _LIBC
205 unsigned char buf[MB_LEN_MAX];
206 assert (MB_LEN_MAX >= pstr->mb_cur_max);
207 #else
208 unsigned char buf[64];
209 #endif
210 mbstate_t prev_st;
211 Idx byte_idx, end_idx, remain_len;
212 size_t mbclen;
213
214 /* Build the buffers from pstr->valid_len to either pstr->len or
215 pstr->bufs_len. */
216 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
217 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
218 {
219 wchar_t wc;
220 const char *p;
221
222 remain_len = end_idx - byte_idx;
223 prev_st = pstr->cur_state;
224 /* Apply the translation if we need. */
225 if (BE (pstr->trans != NULL, 0))
226 {
227 int i, ch;
228
229 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
230 {
231 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
232 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
233 }
234 p = (const char *) buf;
235 }
236 else
237 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
238 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
239 if (BE (mbclen == (size_t) -1 || mbclen == 0
240 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
241 {
242 /* We treat these cases as a singlebyte character. */
243 mbclen = 1;
244 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
245 if (BE (pstr->trans != NULL, 0))
246 wc = pstr->trans[wc];
247 pstr->cur_state = prev_st;
248 }
249 else if (BE (mbclen == (size_t) -2, 0))
250 {
251 /* The buffer doesn't have enough space, finish to build. */
252 pstr->cur_state = prev_st;
253 break;
254 }
255
256 /* Write wide character and padding. */
257 pstr->wcs[byte_idx++] = wc;
258 /* Write paddings. */
259 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
260 pstr->wcs[byte_idx++] = WEOF;
261 }
262 pstr->valid_len = byte_idx;
263 pstr->valid_raw_len = byte_idx;
264 }
265
266 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
267 but for REG_ICASE. */
268
269 static reg_errcode_t
270 internal_function __attribute_warn_unused_result__
build_wcs_upper_buffer(re_string_t * pstr)271 build_wcs_upper_buffer (re_string_t *pstr)
272 {
273 mbstate_t prev_st;
274 Idx src_idx, byte_idx, end_idx, remain_len;
275 size_t mbclen;
276 #ifdef _LIBC
277 char buf[MB_LEN_MAX];
278 assert (MB_LEN_MAX >= pstr->mb_cur_max);
279 #else
280 char buf[64];
281 #endif
282
283 byte_idx = pstr->valid_len;
284 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
285
286 /* The following optimization assumes that ASCII characters can be
287 mapped to wide characters with a simple cast. */
288 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
289 {
290 while (byte_idx < end_idx)
291 {
292 wchar_t wc;
293
294 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
295 && mbsinit (&pstr->cur_state))
296 {
297 /* In case of a singlebyte character. */
298 pstr->mbs[byte_idx]
299 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
300 /* The next step uses the assumption that wchar_t is encoded
301 ASCII-safe: all ASCII values can be converted like this. */
302 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
303 ++byte_idx;
304 continue;
305 }
306
307 remain_len = end_idx - byte_idx;
308 prev_st = pstr->cur_state;
309 mbclen = __mbrtowc (&wc,
310 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
311 + byte_idx), remain_len, &pstr->cur_state);
312 if (BE (mbclen < (size_t) -2, 1))
313 {
314 wchar_t wcu = __towupper (wc);
315 if (wcu != wc)
316 {
317 size_t mbcdlen;
318
319 mbcdlen = __wcrtomb (buf, wcu, &prev_st);
320 if (BE (mbclen == mbcdlen, 1))
321 memcpy (pstr->mbs + byte_idx, buf, mbclen);
322 else
323 {
324 src_idx = byte_idx;
325 goto offsets_needed;
326 }
327 }
328 else
329 memcpy (pstr->mbs + byte_idx,
330 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
331 pstr->wcs[byte_idx++] = wcu;
332 /* Write paddings. */
333 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
334 pstr->wcs[byte_idx++] = WEOF;
335 }
336 else if (mbclen == (size_t) -1 || mbclen == 0
337 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
338 {
339 /* It is an invalid character, an incomplete character
340 at the end of the string, or '\0'. Just use the byte. */
341 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
342 pstr->mbs[byte_idx] = ch;
343 /* And also cast it to wide char. */
344 pstr->wcs[byte_idx++] = (wchar_t) ch;
345 if (BE (mbclen == (size_t) -1, 0))
346 pstr->cur_state = prev_st;
347 }
348 else
349 {
350 /* The buffer doesn't have enough space, finish to build. */
351 pstr->cur_state = prev_st;
352 break;
353 }
354 }
355 pstr->valid_len = byte_idx;
356 pstr->valid_raw_len = byte_idx;
357 return REG_NOERROR;
358 }
359 else
360 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
361 {
362 wchar_t wc;
363 const char *p;
364 offsets_needed:
365 remain_len = end_idx - byte_idx;
366 prev_st = pstr->cur_state;
367 if (BE (pstr->trans != NULL, 0))
368 {
369 int i, ch;
370
371 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
372 {
373 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
374 buf[i] = pstr->trans[ch];
375 }
376 p = (const char *) buf;
377 }
378 else
379 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
380 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
381 if (BE (mbclen < (size_t) -2, 1))
382 {
383 wchar_t wcu = __towupper (wc);
384 if (wcu != wc)
385 {
386 size_t mbcdlen;
387
388 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
389 if (BE (mbclen == mbcdlen, 1))
390 memcpy (pstr->mbs + byte_idx, buf, mbclen);
391 else if (mbcdlen != (size_t) -1)
392 {
393 size_t i;
394
395 if (byte_idx + mbcdlen > pstr->bufs_len)
396 {
397 pstr->cur_state = prev_st;
398 break;
399 }
400
401 if (pstr->offsets == NULL)
402 {
403 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
404
405 if (pstr->offsets == NULL)
406 return REG_ESPACE;
407 }
408 if (!pstr->offsets_needed)
409 {
410 for (i = 0; i < (size_t) byte_idx; ++i)
411 pstr->offsets[i] = i;
412 pstr->offsets_needed = 1;
413 }
414
415 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
416 pstr->wcs[byte_idx] = wcu;
417 pstr->offsets[byte_idx] = src_idx;
418 for (i = 1; i < mbcdlen; ++i)
419 {
420 pstr->offsets[byte_idx + i]
421 = src_idx + (i < mbclen ? i : mbclen - 1);
422 pstr->wcs[byte_idx + i] = WEOF;
423 }
424 pstr->len += mbcdlen - mbclen;
425 if (pstr->raw_stop > src_idx)
426 pstr->stop += mbcdlen - mbclen;
427 end_idx = (pstr->bufs_len > pstr->len)
428 ? pstr->len : pstr->bufs_len;
429 byte_idx += mbcdlen;
430 src_idx += mbclen;
431 continue;
432 }
433 else
434 memcpy (pstr->mbs + byte_idx, p, mbclen);
435 }
436 else
437 memcpy (pstr->mbs + byte_idx, p, mbclen);
438
439 if (BE (pstr->offsets_needed != 0, 0))
440 {
441 size_t i;
442 for (i = 0; i < mbclen; ++i)
443 pstr->offsets[byte_idx + i] = src_idx + i;
444 }
445 src_idx += mbclen;
446
447 pstr->wcs[byte_idx++] = wcu;
448 /* Write paddings. */
449 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
450 pstr->wcs[byte_idx++] = WEOF;
451 }
452 else if (mbclen == (size_t) -1 || mbclen == 0
453 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
454 {
455 /* It is an invalid character or '\0'. Just use the byte. */
456 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
457
458 if (BE (pstr->trans != NULL, 0))
459 ch = pstr->trans [ch];
460 pstr->mbs[byte_idx] = ch;
461
462 if (BE (pstr->offsets_needed != 0, 0))
463 pstr->offsets[byte_idx] = src_idx;
464 ++src_idx;
465
466 /* And also cast it to wide char. */
467 pstr->wcs[byte_idx++] = (wchar_t) ch;
468 if (BE (mbclen == (size_t) -1, 0))
469 pstr->cur_state = prev_st;
470 }
471 else
472 {
473 /* The buffer doesn't have enough space, finish to build. */
474 pstr->cur_state = prev_st;
475 break;
476 }
477 }
478 pstr->valid_len = byte_idx;
479 pstr->valid_raw_len = src_idx;
480 return REG_NOERROR;
481 }
482
483 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
484 Return the index. */
485
486 static Idx
487 internal_function
re_string_skip_chars(re_string_t * pstr,Idx new_raw_idx,wint_t * last_wc)488 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
489 {
490 mbstate_t prev_st;
491 Idx rawbuf_idx;
492 size_t mbclen;
493 wint_t wc = WEOF;
494
495 /* Skip the characters which are not necessary to check. */
496 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
497 rawbuf_idx < new_raw_idx;)
498 {
499 wchar_t wc2;
500 Idx remain_len = pstr->raw_len - rawbuf_idx;
501 prev_st = pstr->cur_state;
502 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
503 remain_len, &pstr->cur_state);
504 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
505 {
506 /* We treat these cases as a single byte character. */
507 if (mbclen == 0 || remain_len == 0)
508 wc = L'\0';
509 else
510 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
511 mbclen = 1;
512 pstr->cur_state = prev_st;
513 }
514 else
515 wc = wc2;
516 /* Then proceed the next character. */
517 rawbuf_idx += mbclen;
518 }
519 *last_wc = wc;
520 return rawbuf_idx;
521 }
522 #endif /* RE_ENABLE_I18N */
523
524 /* Build the buffer PSTR->MBS, and apply the translation if we need.
525 This function is used in case of REG_ICASE. */
526
527 static void
528 internal_function
build_upper_buffer(re_string_t * pstr)529 build_upper_buffer (re_string_t *pstr)
530 {
531 Idx char_idx, end_idx;
532 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
533
534 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
535 {
536 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
537 if (BE (pstr->trans != NULL, 0))
538 ch = pstr->trans[ch];
539 pstr->mbs[char_idx] = toupper (ch);
540 }
541 pstr->valid_len = char_idx;
542 pstr->valid_raw_len = char_idx;
543 }
544
545 /* Apply TRANS to the buffer in PSTR. */
546
547 static void
548 internal_function
re_string_translate_buffer(re_string_t * pstr)549 re_string_translate_buffer (re_string_t *pstr)
550 {
551 Idx buf_idx, end_idx;
552 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
553
554 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
555 {
556 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
557 pstr->mbs[buf_idx] = pstr->trans[ch];
558 }
559
560 pstr->valid_len = buf_idx;
561 pstr->valid_raw_len = buf_idx;
562 }
563
564 /* This function re-construct the buffers.
565 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
566 convert to upper case in case of REG_ICASE, apply translation. */
567
568 static reg_errcode_t
569 internal_function __attribute_warn_unused_result__
re_string_reconstruct(re_string_t * pstr,Idx idx,int eflags)570 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
571 {
572 Idx offset;
573
574 if (BE (pstr->raw_mbs_idx <= idx, 0))
575 offset = idx - pstr->raw_mbs_idx;
576 else
577 {
578 /* Reset buffer. */
579 #ifdef RE_ENABLE_I18N
580 if (pstr->mb_cur_max > 1)
581 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
582 #endif /* RE_ENABLE_I18N */
583 pstr->len = pstr->raw_len;
584 pstr->stop = pstr->raw_stop;
585 pstr->valid_len = 0;
586 pstr->raw_mbs_idx = 0;
587 pstr->valid_raw_len = 0;
588 pstr->offsets_needed = 0;
589 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
590 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
591 if (!pstr->mbs_allocated)
592 pstr->mbs = (unsigned char *) pstr->raw_mbs;
593 offset = idx;
594 }
595
596 if (BE (offset != 0, 1))
597 {
598 /* Should the already checked characters be kept? */
599 if (BE (offset < pstr->valid_raw_len, 1))
600 {
601 /* Yes, move them to the front of the buffer. */
602 #ifdef RE_ENABLE_I18N
603 if (BE (pstr->offsets_needed, 0))
604 {
605 Idx low = 0, high = pstr->valid_len, mid;
606 do
607 {
608 mid = (high + low) / 2;
609 if (pstr->offsets[mid] > offset)
610 high = mid;
611 else if (pstr->offsets[mid] < offset)
612 low = mid + 1;
613 else
614 break;
615 }
616 while (low < high);
617 if (pstr->offsets[mid] < offset)
618 ++mid;
619 pstr->tip_context = re_string_context_at (pstr, mid - 1,
620 eflags);
621 /* This can be quite complicated, so handle specially
622 only the common and easy case where the character with
623 different length representation of lower and upper
624 case is present at or after offset. */
625 if (pstr->valid_len > offset
626 && mid == offset && pstr->offsets[mid] == offset)
627 {
628 memmove (pstr->wcs, pstr->wcs + offset,
629 (pstr->valid_len - offset) * sizeof (wint_t));
630 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
631 pstr->valid_len -= offset;
632 pstr->valid_raw_len -= offset;
633 for (low = 0; low < pstr->valid_len; low++)
634 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
635 }
636 else
637 {
638 /* Otherwise, just find out how long the partial multibyte
639 character at offset is and fill it with WEOF/255. */
640 pstr->len = pstr->raw_len - idx + offset;
641 pstr->stop = pstr->raw_stop - idx + offset;
642 pstr->offsets_needed = 0;
643 while (mid > 0 && pstr->offsets[mid - 1] == offset)
644 --mid;
645 while (mid < pstr->valid_len)
646 if (pstr->wcs[mid] != WEOF)
647 break;
648 else
649 ++mid;
650 if (mid == pstr->valid_len)
651 pstr->valid_len = 0;
652 else
653 {
654 pstr->valid_len = pstr->offsets[mid] - offset;
655 if (pstr->valid_len)
656 {
657 for (low = 0; low < pstr->valid_len; ++low)
658 pstr->wcs[low] = WEOF;
659 memset (pstr->mbs, 255, pstr->valid_len);
660 }
661 }
662 pstr->valid_raw_len = pstr->valid_len;
663 }
664 }
665 else
666 #endif
667 {
668 pstr->tip_context = re_string_context_at (pstr, offset - 1,
669 eflags);
670 #ifdef RE_ENABLE_I18N
671 if (pstr->mb_cur_max > 1)
672 memmove (pstr->wcs, pstr->wcs + offset,
673 (pstr->valid_len - offset) * sizeof (wint_t));
674 #endif /* RE_ENABLE_I18N */
675 if (BE (pstr->mbs_allocated, 0))
676 memmove (pstr->mbs, pstr->mbs + offset,
677 pstr->valid_len - offset);
678 pstr->valid_len -= offset;
679 pstr->valid_raw_len -= offset;
680 #if defined DEBUG && DEBUG
681 assert (pstr->valid_len > 0);
682 #endif
683 }
684 }
685 else
686 {
687 #ifdef RE_ENABLE_I18N
688 /* No, skip all characters until IDX. */
689 Idx prev_valid_len = pstr->valid_len;
690
691 if (BE (pstr->offsets_needed, 0))
692 {
693 pstr->len = pstr->raw_len - idx + offset;
694 pstr->stop = pstr->raw_stop - idx + offset;
695 pstr->offsets_needed = 0;
696 }
697 #endif
698 pstr->valid_len = 0;
699 #ifdef RE_ENABLE_I18N
700 if (pstr->mb_cur_max > 1)
701 {
702 Idx wcs_idx;
703 wint_t wc = WEOF;
704
705 if (pstr->is_utf8)
706 {
707 const unsigned char *raw, *p, *end;
708
709 /* Special case UTF-8. Multi-byte chars start with any
710 byte other than 0x80 - 0xbf. */
711 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
712 end = raw + (offset - pstr->mb_cur_max);
713 if (end < pstr->raw_mbs)
714 end = pstr->raw_mbs;
715 p = raw + offset - 1;
716 #ifdef _LIBC
717 /* We know the wchar_t encoding is UCS4, so for the simple
718 case, ASCII characters, skip the conversion step. */
719 if (isascii (*p) && BE (pstr->trans == NULL, 1))
720 {
721 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
722 /* pstr->valid_len = 0; */
723 wc = (wchar_t) *p;
724 }
725 else
726 #endif
727 for (; p >= end; --p)
728 if ((*p & 0xc0) != 0x80)
729 {
730 mbstate_t cur_state;
731 wchar_t wc2;
732 Idx mlen = raw + pstr->len - p;
733 unsigned char buf[6];
734 size_t mbclen;
735
736 const unsigned char *pp = p;
737 if (BE (pstr->trans != NULL, 0))
738 {
739 int i = mlen < 6 ? mlen : 6;
740 while (--i >= 0)
741 buf[i] = pstr->trans[p[i]];
742 pp = buf;
743 }
744 /* XXX Don't use mbrtowc, we know which conversion
745 to use (UTF-8 -> UCS4). */
746 memset (&cur_state, 0, sizeof (cur_state));
747 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
748 &cur_state);
749 if (raw + offset - p <= mbclen
750 && mbclen < (size_t) -2)
751 {
752 memset (&pstr->cur_state, '\0',
753 sizeof (mbstate_t));
754 pstr->valid_len = mbclen - (raw + offset - p);
755 wc = wc2;
756 }
757 break;
758 }
759 }
760
761 if (wc == WEOF)
762 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
763 if (wc == WEOF)
764 pstr->tip_context
765 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
766 else
767 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
768 && IS_WIDE_WORD_CHAR (wc))
769 ? CONTEXT_WORD
770 : ((IS_WIDE_NEWLINE (wc)
771 && pstr->newline_anchor)
772 ? CONTEXT_NEWLINE : 0));
773 if (BE (pstr->valid_len, 0))
774 {
775 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
776 pstr->wcs[wcs_idx] = WEOF;
777 if (pstr->mbs_allocated)
778 memset (pstr->mbs, 255, pstr->valid_len);
779 }
780 pstr->valid_raw_len = pstr->valid_len;
781 }
782 else
783 #endif /* RE_ENABLE_I18N */
784 {
785 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
786 pstr->valid_raw_len = 0;
787 if (pstr->trans)
788 c = pstr->trans[c];
789 pstr->tip_context = (bitset_contain (pstr->word_char, c)
790 ? CONTEXT_WORD
791 : ((IS_NEWLINE (c) && pstr->newline_anchor)
792 ? CONTEXT_NEWLINE : 0));
793 }
794 }
795 if (!BE (pstr->mbs_allocated, 0))
796 pstr->mbs += offset;
797 }
798 pstr->raw_mbs_idx = idx;
799 pstr->len -= offset;
800 pstr->stop -= offset;
801
802 /* Then build the buffers. */
803 #ifdef RE_ENABLE_I18N
804 if (pstr->mb_cur_max > 1)
805 {
806 if (pstr->icase)
807 {
808 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
809 if (BE (ret != REG_NOERROR, 0))
810 return ret;
811 }
812 else
813 build_wcs_buffer (pstr);
814 }
815 else
816 #endif /* RE_ENABLE_I18N */
817 if (BE (pstr->mbs_allocated, 0))
818 {
819 if (pstr->icase)
820 build_upper_buffer (pstr);
821 else if (pstr->trans != NULL)
822 re_string_translate_buffer (pstr);
823 }
824 else
825 pstr->valid_len = pstr->len;
826
827 pstr->cur_idx = 0;
828 return REG_NOERROR;
829 }
830
831 static unsigned char
832 internal_function __attribute__ ((pure))
re_string_peek_byte_case(const re_string_t * pstr,Idx idx)833 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
834 {
835 int ch;
836 Idx off;
837
838 /* Handle the common (easiest) cases first. */
839 if (BE (!pstr->mbs_allocated, 1))
840 return re_string_peek_byte (pstr, idx);
841
842 #ifdef RE_ENABLE_I18N
843 if (pstr->mb_cur_max > 1
844 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
845 return re_string_peek_byte (pstr, idx);
846 #endif
847
848 off = pstr->cur_idx + idx;
849 #ifdef RE_ENABLE_I18N
850 if (pstr->offsets_needed)
851 off = pstr->offsets[off];
852 #endif
853
854 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
855
856 #ifdef RE_ENABLE_I18N
857 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
858 this function returns CAPITAL LETTER I instead of first byte of
859 DOTLESS SMALL LETTER I. The latter would confuse the parser,
860 since peek_byte_case doesn't advance cur_idx in any way. */
861 if (pstr->offsets_needed && !isascii (ch))
862 return re_string_peek_byte (pstr, idx);
863 #endif
864
865 return ch;
866 }
867
868 static unsigned char
869 internal_function
re_string_fetch_byte_case(re_string_t * pstr)870 re_string_fetch_byte_case (re_string_t *pstr)
871 {
872 if (BE (!pstr->mbs_allocated, 1))
873 return re_string_fetch_byte (pstr);
874
875 #ifdef RE_ENABLE_I18N
876 if (pstr->offsets_needed)
877 {
878 Idx off;
879 int ch;
880
881 /* For tr_TR.UTF-8 [[:islower:]] there is
882 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
883 in that case the whole multi-byte character and return
884 the original letter. On the other side, with
885 [[: DOTLESS SMALL LETTER I return [[:I, as doing
886 anything else would complicate things too much. */
887
888 if (!re_string_first_byte (pstr, pstr->cur_idx))
889 return re_string_fetch_byte (pstr);
890
891 off = pstr->offsets[pstr->cur_idx];
892 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
893
894 if (! isascii (ch))
895 return re_string_fetch_byte (pstr);
896
897 re_string_skip_bytes (pstr,
898 re_string_char_size_at (pstr, pstr->cur_idx));
899 return ch;
900 }
901 #endif
902
903 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
904 }
905
906 static void
907 internal_function
re_string_destruct(re_string_t * pstr)908 re_string_destruct (re_string_t *pstr)
909 {
910 #ifdef RE_ENABLE_I18N
911 re_free (pstr->wcs);
912 re_free (pstr->offsets);
913 #endif /* RE_ENABLE_I18N */
914 if (pstr->mbs_allocated)
915 re_free (pstr->mbs);
916 }
917
918 /* Return the context at IDX in INPUT. */
919
920 static unsigned int
921 internal_function
re_string_context_at(const re_string_t * input,Idx idx,int eflags)922 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
923 {
924 int c;
925 if (BE (idx < 0, 0))
926 /* In this case, we use the value stored in input->tip_context,
927 since we can't know the character in input->mbs[-1] here. */
928 return input->tip_context;
929 if (BE (idx == input->len, 0))
930 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
931 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
932 #ifdef RE_ENABLE_I18N
933 if (input->mb_cur_max > 1)
934 {
935 wint_t wc;
936 Idx wc_idx = idx;
937 while(input->wcs[wc_idx] == WEOF)
938 {
939 #if defined DEBUG && DEBUG
940 /* It must not happen. */
941 assert (wc_idx >= 0);
942 #endif
943 --wc_idx;
944 if (wc_idx < 0)
945 return input->tip_context;
946 }
947 wc = input->wcs[wc_idx];
948 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
949 return CONTEXT_WORD;
950 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
951 ? CONTEXT_NEWLINE : 0);
952 }
953 else
954 #endif
955 {
956 c = re_string_byte_at (input, idx);
957 if (bitset_contain (input->word_char, c))
958 return CONTEXT_WORD;
959 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
960 }
961 }
962
963 /* Functions for set operation. */
964
965 static reg_errcode_t
966 internal_function __attribute_warn_unused_result__
re_node_set_alloc(re_node_set * set,Idx size)967 re_node_set_alloc (re_node_set *set, Idx size)
968 {
969 set->alloc = size;
970 set->nelem = 0;
971 set->elems = re_malloc (Idx, size);
972 if (BE (set->elems == NULL, 0) && (MALLOC_0_IS_NONNULL || size != 0))
973 return REG_ESPACE;
974 return REG_NOERROR;
975 }
976
977 static reg_errcode_t
978 internal_function __attribute_warn_unused_result__
re_node_set_init_1(re_node_set * set,Idx elem)979 re_node_set_init_1 (re_node_set *set, Idx elem)
980 {
981 set->alloc = 1;
982 set->nelem = 1;
983 set->elems = re_malloc (Idx, 1);
984 if (BE (set->elems == NULL, 0))
985 {
986 set->alloc = set->nelem = 0;
987 return REG_ESPACE;
988 }
989 set->elems[0] = elem;
990 return REG_NOERROR;
991 }
992
993 static reg_errcode_t
994 internal_function __attribute_warn_unused_result__
re_node_set_init_2(re_node_set * set,Idx elem1,Idx elem2)995 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
996 {
997 set->alloc = 2;
998 set->elems = re_malloc (Idx, 2);
999 if (BE (set->elems == NULL, 0))
1000 return REG_ESPACE;
1001 if (elem1 == elem2)
1002 {
1003 set->nelem = 1;
1004 set->elems[0] = elem1;
1005 }
1006 else
1007 {
1008 set->nelem = 2;
1009 if (elem1 < elem2)
1010 {
1011 set->elems[0] = elem1;
1012 set->elems[1] = elem2;
1013 }
1014 else
1015 {
1016 set->elems[0] = elem2;
1017 set->elems[1] = elem1;
1018 }
1019 }
1020 return REG_NOERROR;
1021 }
1022
1023 static reg_errcode_t
1024 internal_function __attribute_warn_unused_result__
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)1025 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1026 {
1027 dest->nelem = src->nelem;
1028 if (src->nelem > 0)
1029 {
1030 dest->alloc = dest->nelem;
1031 dest->elems = re_malloc (Idx, dest->alloc);
1032 if (BE (dest->elems == NULL, 0))
1033 {
1034 dest->alloc = dest->nelem = 0;
1035 return REG_ESPACE;
1036 }
1037 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1038 }
1039 else
1040 re_node_set_init_empty (dest);
1041 return REG_NOERROR;
1042 }
1043
1044 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1045 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1046 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1047
1048 static reg_errcode_t
1049 internal_function __attribute_warn_unused_result__
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1050 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1051 const re_node_set *src2)
1052 {
1053 Idx i1, i2, is, id, delta, sbase;
1054 if (src1->nelem == 0 || src2->nelem == 0)
1055 return REG_NOERROR;
1056
1057 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1058 conservative estimate. */
1059 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1060 {
1061 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1062 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1063 if (BE (new_elems == NULL, 0))
1064 return REG_ESPACE;
1065 dest->elems = new_elems;
1066 dest->alloc = new_alloc;
1067 }
1068
1069 /* Find the items in the intersection of SRC1 and SRC2, and copy
1070 into the top of DEST those that are not already in DEST itself. */
1071 sbase = dest->nelem + src1->nelem + src2->nelem;
1072 i1 = src1->nelem - 1;
1073 i2 = src2->nelem - 1;
1074 id = dest->nelem - 1;
1075 for (;;)
1076 {
1077 if (src1->elems[i1] == src2->elems[i2])
1078 {
1079 /* Try to find the item in DEST. Maybe we could binary search? */
1080 while (id >= 0 && dest->elems[id] > src1->elems[i1])
1081 --id;
1082
1083 if (id < 0 || dest->elems[id] != src1->elems[i1])
1084 dest->elems[--sbase] = src1->elems[i1];
1085
1086 if (--i1 < 0 || --i2 < 0)
1087 break;
1088 }
1089
1090 /* Lower the highest of the two items. */
1091 else if (src1->elems[i1] < src2->elems[i2])
1092 {
1093 if (--i2 < 0)
1094 break;
1095 }
1096 else
1097 {
1098 if (--i1 < 0)
1099 break;
1100 }
1101 }
1102
1103 id = dest->nelem - 1;
1104 is = dest->nelem + src1->nelem + src2->nelem - 1;
1105 delta = is - sbase + 1;
1106
1107 /* Now copy. When DELTA becomes zero, the remaining
1108 DEST elements are already in place; this is more or
1109 less the same loop that is in re_node_set_merge. */
1110 dest->nelem += delta;
1111 if (delta > 0 && id >= 0)
1112 for (;;)
1113 {
1114 if (dest->elems[is] > dest->elems[id])
1115 {
1116 /* Copy from the top. */
1117 dest->elems[id + delta--] = dest->elems[is--];
1118 if (delta == 0)
1119 break;
1120 }
1121 else
1122 {
1123 /* Slide from the bottom. */
1124 dest->elems[id + delta] = dest->elems[id];
1125 if (--id < 0)
1126 break;
1127 }
1128 }
1129
1130 /* Copy remaining SRC elements. */
1131 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1132
1133 return REG_NOERROR;
1134 }
1135
1136 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1137 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1138
1139 static reg_errcode_t
1140 internal_function __attribute_warn_unused_result__
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1141 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1142 const re_node_set *src2)
1143 {
1144 Idx i1, i2, id;
1145 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1146 {
1147 dest->alloc = src1->nelem + src2->nelem;
1148 dest->elems = re_malloc (Idx, dest->alloc);
1149 if (BE (dest->elems == NULL, 0))
1150 return REG_ESPACE;
1151 }
1152 else
1153 {
1154 if (src1 != NULL && src1->nelem > 0)
1155 return re_node_set_init_copy (dest, src1);
1156 else if (src2 != NULL && src2->nelem > 0)
1157 return re_node_set_init_copy (dest, src2);
1158 else
1159 re_node_set_init_empty (dest);
1160 return REG_NOERROR;
1161 }
1162 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1163 {
1164 if (src1->elems[i1] > src2->elems[i2])
1165 {
1166 dest->elems[id++] = src2->elems[i2++];
1167 continue;
1168 }
1169 if (src1->elems[i1] == src2->elems[i2])
1170 ++i2;
1171 dest->elems[id++] = src1->elems[i1++];
1172 }
1173 if (i1 < src1->nelem)
1174 {
1175 memcpy (dest->elems + id, src1->elems + i1,
1176 (src1->nelem - i1) * sizeof (Idx));
1177 id += src1->nelem - i1;
1178 }
1179 else if (i2 < src2->nelem)
1180 {
1181 memcpy (dest->elems + id, src2->elems + i2,
1182 (src2->nelem - i2) * sizeof (Idx));
1183 id += src2->nelem - i2;
1184 }
1185 dest->nelem = id;
1186 return REG_NOERROR;
1187 }
1188
1189 /* Calculate the union set of the sets DEST and SRC. And store it to
1190 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1191
1192 static reg_errcode_t
1193 internal_function __attribute_warn_unused_result__
re_node_set_merge(re_node_set * dest,const re_node_set * src)1194 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1195 {
1196 Idx is, id, sbase, delta;
1197 if (src == NULL || src->nelem == 0)
1198 return REG_NOERROR;
1199 if (dest->alloc < 2 * src->nelem + dest->nelem)
1200 {
1201 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1202 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1203 if (BE (new_buffer == NULL, 0))
1204 return REG_ESPACE;
1205 dest->elems = new_buffer;
1206 dest->alloc = new_alloc;
1207 }
1208
1209 if (BE (dest->nelem == 0, 0))
1210 {
1211 dest->nelem = src->nelem;
1212 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1213 return REG_NOERROR;
1214 }
1215
1216 /* Copy into the top of DEST the items of SRC that are not
1217 found in DEST. Maybe we could binary search in DEST? */
1218 for (sbase = dest->nelem + 2 * src->nelem,
1219 is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1220 {
1221 if (dest->elems[id] == src->elems[is])
1222 is--, id--;
1223 else if (dest->elems[id] < src->elems[is])
1224 dest->elems[--sbase] = src->elems[is--];
1225 else /* if (dest->elems[id] > src->elems[is]) */
1226 --id;
1227 }
1228
1229 if (is >= 0)
1230 {
1231 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1232 sbase -= is + 1;
1233 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1234 }
1235
1236 id = dest->nelem - 1;
1237 is = dest->nelem + 2 * src->nelem - 1;
1238 delta = is - sbase + 1;
1239 if (delta == 0)
1240 return REG_NOERROR;
1241
1242 /* Now copy. When DELTA becomes zero, the remaining
1243 DEST elements are already in place. */
1244 dest->nelem += delta;
1245 for (;;)
1246 {
1247 if (dest->elems[is] > dest->elems[id])
1248 {
1249 /* Copy from the top. */
1250 dest->elems[id + delta--] = dest->elems[is--];
1251 if (delta == 0)
1252 break;
1253 }
1254 else
1255 {
1256 /* Slide from the bottom. */
1257 dest->elems[id + delta] = dest->elems[id];
1258 if (--id < 0)
1259 {
1260 /* Copy remaining SRC elements. */
1261 memcpy (dest->elems, dest->elems + sbase,
1262 delta * sizeof (Idx));
1263 break;
1264 }
1265 }
1266 }
1267
1268 return REG_NOERROR;
1269 }
1270
1271 /* Insert the new element ELEM to the re_node_set* SET.
1272 SET should not already have ELEM.
1273 Return true if successful. */
1274
1275 static bool
1276 internal_function __attribute_warn_unused_result__
re_node_set_insert(re_node_set * set,Idx elem)1277 re_node_set_insert (re_node_set *set, Idx elem)
1278 {
1279 Idx idx;
1280 /* In case the set is empty. */
1281 if (set->alloc == 0)
1282 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1283
1284 if (BE (set->nelem, 0) == 0)
1285 {
1286 /* We already guaranteed above that set->alloc != 0. */
1287 set->elems[0] = elem;
1288 ++set->nelem;
1289 return true;
1290 }
1291
1292 /* Realloc if we need. */
1293 if (set->alloc == set->nelem)
1294 {
1295 Idx *new_elems;
1296 set->alloc = set->alloc * 2;
1297 new_elems = re_realloc (set->elems, Idx, set->alloc);
1298 if (BE (new_elems == NULL, 0))
1299 return false;
1300 set->elems = new_elems;
1301 }
1302
1303 /* Move the elements which follows the new element. Test the
1304 first element separately to skip a check in the inner loop. */
1305 if (elem < set->elems[0])
1306 {
1307 idx = 0;
1308 for (idx = set->nelem; idx > 0; idx--)
1309 set->elems[idx] = set->elems[idx - 1];
1310 }
1311 else
1312 {
1313 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1314 set->elems[idx] = set->elems[idx - 1];
1315 }
1316
1317 /* Insert the new element. */
1318 set->elems[idx] = elem;
1319 ++set->nelem;
1320 return true;
1321 }
1322
1323 /* Insert the new element ELEM to the re_node_set* SET.
1324 SET should not already have any element greater than or equal to ELEM.
1325 Return true if successful. */
1326
1327 static bool
1328 internal_function __attribute_warn_unused_result__
re_node_set_insert_last(re_node_set * set,Idx elem)1329 re_node_set_insert_last (re_node_set *set, Idx elem)
1330 {
1331 /* Realloc if we need. */
1332 if (set->alloc == set->nelem)
1333 {
1334 Idx *new_elems;
1335 set->alloc = (set->alloc + 1) * 2;
1336 new_elems = re_realloc (set->elems, Idx, set->alloc);
1337 if (BE (new_elems == NULL, 0))
1338 return false;
1339 set->elems = new_elems;
1340 }
1341
1342 /* Insert the new element. */
1343 set->elems[set->nelem++] = elem;
1344 return true;
1345 }
1346
1347 /* Compare two node sets SET1 and SET2.
1348 Return true if SET1 and SET2 are equivalent. */
1349
1350 static bool
1351 internal_function __attribute__ ((pure))
re_node_set_compare(const re_node_set * set1,const re_node_set * set2)1352 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1353 {
1354 Idx i;
1355 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1356 return false;
1357 for (i = set1->nelem ; --i >= 0 ; )
1358 if (set1->elems[i] != set2->elems[i])
1359 return false;
1360 return true;
1361 }
1362
1363 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1364
1365 static Idx
1366 internal_function __attribute__ ((pure))
re_node_set_contains(const re_node_set * set,Idx elem)1367 re_node_set_contains (const re_node_set *set, Idx elem)
1368 {
1369 __re_size_t idx, right, mid;
1370 if (set->nelem <= 0)
1371 return 0;
1372
1373 /* Binary search the element. */
1374 idx = 0;
1375 right = set->nelem - 1;
1376 while (idx < right)
1377 {
1378 mid = (idx + right) / 2;
1379 if (set->elems[mid] < elem)
1380 idx = mid + 1;
1381 else
1382 right = mid;
1383 }
1384 return set->elems[idx] == elem ? idx + 1 : 0;
1385 }
1386
1387 static void
1388 internal_function
re_node_set_remove_at(re_node_set * set,Idx idx)1389 re_node_set_remove_at (re_node_set *set, Idx idx)
1390 {
1391 if (idx < 0 || idx >= set->nelem)
1392 return;
1393 --set->nelem;
1394 for (; idx < set->nelem; idx++)
1395 set->elems[idx] = set->elems[idx + 1];
1396 }
1397
1398
1399 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1400 Or return -1 if an error occurred. */
1401
1402 static Idx
1403 internal_function
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)1404 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1405 {
1406 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1407 {
1408 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1409 Idx *new_nexts, *new_indices;
1410 re_node_set *new_edests, *new_eclosures;
1411 re_token_t *new_nodes;
1412
1413 /* Avoid overflows in realloc. */
1414 const size_t max_object_size = MAX (sizeof (re_token_t),
1415 MAX (sizeof (re_node_set),
1416 sizeof (Idx)));
1417 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
1418 return -1;
1419
1420 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1421 if (BE (new_nodes == NULL, 0))
1422 return -1;
1423 dfa->nodes = new_nodes;
1424 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1425 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1426 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1427 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1428 if (BE (new_nexts == NULL || new_indices == NULL
1429 || new_edests == NULL || new_eclosures == NULL, 0))
1430 {
1431 re_free (new_nexts);
1432 re_free (new_indices);
1433 re_free (new_edests);
1434 re_free (new_eclosures);
1435 return -1;
1436 }
1437 dfa->nexts = new_nexts;
1438 dfa->org_indices = new_indices;
1439 dfa->edests = new_edests;
1440 dfa->eclosures = new_eclosures;
1441 dfa->nodes_alloc = new_nodes_alloc;
1442 }
1443 dfa->nodes[dfa->nodes_len] = token;
1444 dfa->nodes[dfa->nodes_len].constraint = 0;
1445 #ifdef RE_ENABLE_I18N
1446 dfa->nodes[dfa->nodes_len].accept_mb =
1447 ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1448 || token.type == COMPLEX_BRACKET);
1449 #endif
1450 dfa->nexts[dfa->nodes_len] = -1;
1451 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1452 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1453 return dfa->nodes_len++;
1454 }
1455
1456 static re_hashval_t
1457 internal_function
calc_state_hash(const re_node_set * nodes,unsigned int context)1458 calc_state_hash (const re_node_set *nodes, unsigned int context)
1459 {
1460 re_hashval_t hash = nodes->nelem + context;
1461 Idx i;
1462 for (i = 0 ; i < nodes->nelem ; i++)
1463 hash += nodes->elems[i];
1464 return hash;
1465 }
1466
1467 /* Search for the state whose node_set is equivalent to NODES.
1468 Return the pointer to the state, if we found it in the DFA.
1469 Otherwise create the new one and return it. In case of an error
1470 return NULL and set the error code in ERR.
1471 Note: - We assume NULL as the invalid state, then it is possible that
1472 return value is NULL and ERR is REG_NOERROR.
1473 - We never return non-NULL value in case of any errors, it is for
1474 optimization. */
1475
1476 static re_dfastate_t *
1477 internal_function __attribute_warn_unused_result__
re_acquire_state(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes)1478 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1479 const re_node_set *nodes)
1480 {
1481 re_hashval_t hash;
1482 re_dfastate_t *new_state;
1483 struct re_state_table_entry *spot;
1484 Idx i;
1485 #if defined GCC_LINT || defined lint
1486 /* Suppress bogus uninitialized-variable warnings. */
1487 *err = REG_NOERROR;
1488 #endif
1489 if (BE (nodes->nelem == 0, 0))
1490 {
1491 *err = REG_NOERROR;
1492 return NULL;
1493 }
1494 hash = calc_state_hash (nodes, 0);
1495 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1496
1497 for (i = 0 ; i < spot->num ; i++)
1498 {
1499 re_dfastate_t *state = spot->array[i];
1500 if (hash != state->hash)
1501 continue;
1502 if (re_node_set_compare (&state->nodes, nodes))
1503 return state;
1504 }
1505
1506 /* There are no appropriate state in the dfa, create the new one. */
1507 new_state = create_ci_newstate (dfa, nodes, hash);
1508 if (BE (new_state == NULL, 0))
1509 *err = REG_ESPACE;
1510
1511 return new_state;
1512 }
1513
1514 /* Search for the state whose node_set is equivalent to NODES and
1515 whose context is equivalent to CONTEXT.
1516 Return the pointer to the state, if we found it in the DFA.
1517 Otherwise create the new one and return it. In case of an error
1518 return NULL and set the error code in ERR.
1519 Note: - We assume NULL as the invalid state, then it is possible that
1520 return value is NULL and ERR is REG_NOERROR.
1521 - We never return non-NULL value in case of any errors, it is for
1522 optimization. */
1523
1524 static re_dfastate_t *
1525 internal_function __attribute_warn_unused_result__
re_acquire_state_context(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)1526 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1527 const re_node_set *nodes, unsigned int context)
1528 {
1529 re_hashval_t hash;
1530 re_dfastate_t *new_state;
1531 struct re_state_table_entry *spot;
1532 Idx i;
1533 #if defined GCC_LINT || defined lint
1534 /* Suppress bogus uninitialized-variable warnings. */
1535 *err = REG_NOERROR;
1536 #endif
1537 if (nodes->nelem == 0)
1538 {
1539 *err = REG_NOERROR;
1540 return NULL;
1541 }
1542 hash = calc_state_hash (nodes, context);
1543 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1544
1545 for (i = 0 ; i < spot->num ; i++)
1546 {
1547 re_dfastate_t *state = spot->array[i];
1548 if (state->hash == hash
1549 && state->context == context
1550 && re_node_set_compare (state->entrance_nodes, nodes))
1551 return state;
1552 }
1553 /* There are no appropriate state in 'dfa', create the new one. */
1554 new_state = create_cd_newstate (dfa, nodes, context, hash);
1555 if (BE (new_state == NULL, 0))
1556 *err = REG_ESPACE;
1557
1558 return new_state;
1559 }
1560
1561 /* Finish initialization of the new state NEWSTATE, and using its hash value
1562 HASH put in the appropriate bucket of DFA's state table. Return value
1563 indicates the error code if failed. */
1564
1565 static reg_errcode_t
1566 __attribute_warn_unused_result__
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,re_hashval_t hash)1567 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1568 re_hashval_t hash)
1569 {
1570 struct re_state_table_entry *spot;
1571 reg_errcode_t err;
1572 Idx i;
1573
1574 newstate->hash = hash;
1575 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1576 if (BE (err != REG_NOERROR, 0))
1577 return REG_ESPACE;
1578 for (i = 0; i < newstate->nodes.nelem; i++)
1579 {
1580 Idx elem = newstate->nodes.elems[i];
1581 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1582 if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1583 return REG_ESPACE;
1584 }
1585
1586 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1587 if (BE (spot->alloc <= spot->num, 0))
1588 {
1589 Idx new_alloc = 2 * spot->num + 2;
1590 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1591 new_alloc);
1592 if (BE (new_array == NULL, 0))
1593 return REG_ESPACE;
1594 spot->array = new_array;
1595 spot->alloc = new_alloc;
1596 }
1597 spot->array[spot->num++] = newstate;
1598 return REG_NOERROR;
1599 }
1600
1601 static void
free_state(re_dfastate_t * state)1602 free_state (re_dfastate_t *state)
1603 {
1604 re_node_set_free (&state->non_eps_nodes);
1605 re_node_set_free (&state->inveclosure);
1606 if (state->entrance_nodes != &state->nodes)
1607 {
1608 re_node_set_free (state->entrance_nodes);
1609 re_free (state->entrance_nodes);
1610 }
1611 re_node_set_free (&state->nodes);
1612 re_free (state->word_trtable);
1613 re_free (state->trtable);
1614 re_free (state);
1615 }
1616
1617 /* Create the new state which is independent of contexts.
1618 Return the new state if succeeded, otherwise return NULL. */
1619
1620 static re_dfastate_t *
1621 internal_function __attribute_warn_unused_result__
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,re_hashval_t hash)1622 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1623 re_hashval_t hash)
1624 {
1625 Idx i;
1626 reg_errcode_t err;
1627 re_dfastate_t *newstate;
1628
1629 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1630 if (BE (newstate == NULL, 0))
1631 return NULL;
1632 err = re_node_set_init_copy (&newstate->nodes, nodes);
1633 if (BE (err != REG_NOERROR, 0))
1634 {
1635 re_free (newstate);
1636 return NULL;
1637 }
1638
1639 newstate->entrance_nodes = &newstate->nodes;
1640 for (i = 0 ; i < nodes->nelem ; i++)
1641 {
1642 re_token_t *node = dfa->nodes + nodes->elems[i];
1643 re_token_type_t type = node->type;
1644 if (type == CHARACTER && !node->constraint)
1645 continue;
1646 #ifdef RE_ENABLE_I18N
1647 newstate->accept_mb |= node->accept_mb;
1648 #endif /* RE_ENABLE_I18N */
1649
1650 /* If the state has the halt node, the state is a halt state. */
1651 if (type == END_OF_RE)
1652 newstate->halt = 1;
1653 else if (type == OP_BACK_REF)
1654 newstate->has_backref = 1;
1655 else if (type == ANCHOR || node->constraint)
1656 newstate->has_constraint = 1;
1657 }
1658 err = register_state (dfa, newstate, hash);
1659 if (BE (err != REG_NOERROR, 0))
1660 {
1661 free_state (newstate);
1662 newstate = NULL;
1663 }
1664 return newstate;
1665 }
1666
1667 /* Create the new state which is depend on the context CONTEXT.
1668 Return the new state if succeeded, otherwise return NULL. */
1669
1670 static re_dfastate_t *
1671 internal_function __attribute_warn_unused_result__
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,re_hashval_t hash)1672 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1673 unsigned int context, re_hashval_t hash)
1674 {
1675 Idx i, nctx_nodes = 0;
1676 reg_errcode_t err;
1677 re_dfastate_t *newstate;
1678
1679 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1680 if (BE (newstate == NULL, 0))
1681 return NULL;
1682 err = re_node_set_init_copy (&newstate->nodes, nodes);
1683 if (BE (err != REG_NOERROR, 0))
1684 {
1685 re_free (newstate);
1686 return NULL;
1687 }
1688
1689 newstate->context = context;
1690 newstate->entrance_nodes = &newstate->nodes;
1691
1692 for (i = 0 ; i < nodes->nelem ; i++)
1693 {
1694 re_token_t *node = dfa->nodes + nodes->elems[i];
1695 re_token_type_t type = node->type;
1696 unsigned int constraint = node->constraint;
1697
1698 if (type == CHARACTER && !constraint)
1699 continue;
1700 #ifdef RE_ENABLE_I18N
1701 newstate->accept_mb |= node->accept_mb;
1702 #endif /* RE_ENABLE_I18N */
1703
1704 /* If the state has the halt node, the state is a halt state. */
1705 if (type == END_OF_RE)
1706 newstate->halt = 1;
1707 else if (type == OP_BACK_REF)
1708 newstate->has_backref = 1;
1709
1710 if (constraint)
1711 {
1712 if (newstate->entrance_nodes == &newstate->nodes)
1713 {
1714 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1715 if (BE (newstate->entrance_nodes == NULL, 0))
1716 {
1717 free_state (newstate);
1718 return NULL;
1719 }
1720 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1721 != REG_NOERROR)
1722 return NULL;
1723 nctx_nodes = 0;
1724 newstate->has_constraint = 1;
1725 }
1726
1727 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1728 {
1729 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1730 ++nctx_nodes;
1731 }
1732 }
1733 }
1734 err = register_state (dfa, newstate, hash);
1735 if (BE (err != REG_NOERROR, 0))
1736 {
1737 free_state (newstate);
1738 newstate = NULL;
1739 }
1740 return newstate;
1741 }
1742