1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Extended regular expression matching and search library.
4    Copyright (C) 2002-2019 Free Software Foundation, Inc.
5    This file is part of the GNU C Library.
6    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
7 
8    The GNU C Library is free software; you can redistribute it and/or
9    modify it under the terms of the GNU General Public
10    License as published by the Free Software Foundation; either
11    version 3 of the License, or (at your option) any later version.
12 
13    The GNU C Library is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16    General Public License for more details.
17 
18    You should have received a copy of the GNU General Public
19    License along with the GNU C Library; if not, see
20    <https://www.gnu.org/licenses/>.  */
21 
22 static void re_string_construct_common (const char *str, Idx len,
23 					re_string_t *pstr,
24 					RE_TRANSLATE_TYPE trans, bool icase,
25 					const re_dfa_t *dfa);
26 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
27 					  const re_node_set *nodes,
28 					  re_hashval_t hash);
29 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
30 					  const re_node_set *nodes,
31 					  unsigned int context,
32 					  re_hashval_t hash);
33 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
34 						Idx new_buf_len);
35 #ifdef RE_ENABLE_I18N
36 static void build_wcs_buffer (re_string_t *pstr);
37 static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
38 #endif /* RE_ENABLE_I18N */
39 static void build_upper_buffer (re_string_t *pstr);
40 static void re_string_translate_buffer (re_string_t *pstr);
41 static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
42 					  int eflags) __attribute__ ((pure));
43 
44 /* Functions for string operation.  */
45 
46 /* This function allocate the buffers.  It is necessary to call
47    re_string_reconstruct before using the object.  */
48 
49 static reg_errcode_t
50 __attribute_warn_unused_result__
re_string_allocate(re_string_t * pstr,const char * str,Idx len,Idx init_len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)51 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
52 		    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
53 {
54   reg_errcode_t ret;
55   Idx init_buf_len;
56 
57   /* Ensure at least one character fits into the buffers.  */
58   if (init_len < dfa->mb_cur_max)
59     init_len = dfa->mb_cur_max;
60   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
61   re_string_construct_common (str, len, pstr, trans, icase, dfa);
62 
63   ret = re_string_realloc_buffers (pstr, init_buf_len);
64   if (__glibc_unlikely (ret != REG_NOERROR))
65     return ret;
66 
67   pstr->word_char = dfa->word_char;
68   pstr->word_ops_used = dfa->word_ops_used;
69   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
70   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
71   pstr->valid_raw_len = pstr->valid_len;
72   return REG_NOERROR;
73 }
74 
75 /* This function allocate the buffers, and initialize them.  */
76 
77 static reg_errcode_t
78 __attribute_warn_unused_result__
re_string_construct(re_string_t * pstr,const char * str,Idx len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)79 re_string_construct (re_string_t *pstr, const char *str, Idx len,
80 		     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
81 {
82   reg_errcode_t ret;
83   memset (pstr, '\0', sizeof (re_string_t));
84   re_string_construct_common (str, len, pstr, trans, icase, dfa);
85 
86   if (len > 0)
87     {
88       ret = re_string_realloc_buffers (pstr, len + 1);
89       if (__glibc_unlikely (ret != REG_NOERROR))
90 	return ret;
91     }
92   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
93 
94   if (icase)
95     {
96 #ifdef RE_ENABLE_I18N
97       if (dfa->mb_cur_max > 1)
98 	{
99 	  while (1)
100 	    {
101 	      ret = build_wcs_upper_buffer (pstr);
102 	      if (__glibc_unlikely (ret != REG_NOERROR))
103 		return ret;
104 	      if (pstr->valid_raw_len >= len)
105 		break;
106 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
107 		break;
108 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
109 	      if (__glibc_unlikely (ret != REG_NOERROR))
110 		return ret;
111 	    }
112 	}
113       else
114 #endif /* RE_ENABLE_I18N  */
115 	build_upper_buffer (pstr);
116     }
117   else
118     {
119 #ifdef RE_ENABLE_I18N
120       if (dfa->mb_cur_max > 1)
121 	build_wcs_buffer (pstr);
122       else
123 #endif /* RE_ENABLE_I18N  */
124 	{
125 	  if (trans != NULL)
126 	    re_string_translate_buffer (pstr);
127 	  else
128 	    {
129 	      pstr->valid_len = pstr->bufs_len;
130 	      pstr->valid_raw_len = pstr->bufs_len;
131 	    }
132 	}
133     }
134 
135   return REG_NOERROR;
136 }
137 
138 /* Helper functions for re_string_allocate, and re_string_construct.  */
139 
140 static reg_errcode_t
141 __attribute_warn_unused_result__
re_string_realloc_buffers(re_string_t * pstr,Idx new_buf_len)142 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
143 {
144 #ifdef RE_ENABLE_I18N
145   if (pstr->mb_cur_max > 1)
146     {
147       wint_t *new_wcs;
148 
149       /* Avoid overflow in realloc.  */
150       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
151       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
152 			    < new_buf_len))
153 	return REG_ESPACE;
154 
155       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
156       if (__glibc_unlikely (new_wcs == NULL))
157 	return REG_ESPACE;
158       pstr->wcs = new_wcs;
159       if (pstr->offsets != NULL)
160 	{
161 	  Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
162 	  if (__glibc_unlikely (new_offsets == NULL))
163 	    return REG_ESPACE;
164 	  pstr->offsets = new_offsets;
165 	}
166     }
167 #endif /* RE_ENABLE_I18N  */
168   if (pstr->mbs_allocated)
169     {
170       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
171 					   new_buf_len);
172       if (__glibc_unlikely (new_mbs == NULL))
173 	return REG_ESPACE;
174       pstr->mbs = new_mbs;
175     }
176   pstr->bufs_len = new_buf_len;
177   return REG_NOERROR;
178 }
179 
180 
181 static void
re_string_construct_common(const char * str,Idx len,re_string_t * pstr,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)182 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
183 			    RE_TRANSLATE_TYPE trans, bool icase,
184 			    const re_dfa_t *dfa)
185 {
186   pstr->raw_mbs = (const unsigned char *) str;
187   pstr->len = len;
188   pstr->raw_len = len;
189   pstr->trans = trans;
190   pstr->icase = icase;
191   pstr->mbs_allocated = (trans != NULL || icase);
192   pstr->mb_cur_max = dfa->mb_cur_max;
193   pstr->is_utf8 = dfa->is_utf8;
194   pstr->map_notascii = dfa->map_notascii;
195   pstr->stop = pstr->len;
196   pstr->raw_stop = pstr->stop;
197 }
198 
199 #ifdef RE_ENABLE_I18N
200 
201 /* Build wide character buffer PSTR->WCS.
202    If the byte sequence of the string are:
203      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
204    Then wide character buffer will be:
205      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
206    We use WEOF for padding, they indicate that the position isn't
207    a first byte of a multibyte character.
208 
209    Note that this function assumes PSTR->VALID_LEN elements are already
210    built and starts from PSTR->VALID_LEN.  */
211 
212 static void
build_wcs_buffer(re_string_t * pstr)213 build_wcs_buffer (re_string_t *pstr)
214 {
215 #ifdef _LIBC
216   unsigned char buf[MB_LEN_MAX];
217   assert (MB_LEN_MAX >= pstr->mb_cur_max);
218 #else
219   unsigned char buf[64];
220 #endif
221   mbstate_t prev_st;
222   Idx byte_idx, end_idx, remain_len;
223   size_t mbclen;
224 
225   /* Build the buffers from pstr->valid_len to either pstr->len or
226      pstr->bufs_len.  */
227   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
228   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
229     {
230       wchar_t wc;
231       const char *p;
232 
233       remain_len = end_idx - byte_idx;
234       prev_st = pstr->cur_state;
235       /* Apply the translation if we need.  */
236       if (__glibc_unlikely (pstr->trans != NULL))
237 	{
238 	  int i, ch;
239 
240 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
241 	    {
242 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
243 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
244 	    }
245 	  p = (const char *) buf;
246 	}
247       else
248 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
249       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
250       if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
251 			    || (mbclen == (size_t) -2
252 				&& pstr->bufs_len >= pstr->len)))
253 	{
254 	  /* We treat these cases as a singlebyte character.  */
255 	  mbclen = 1;
256 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
257 	  if (__glibc_unlikely (pstr->trans != NULL))
258 	    wc = pstr->trans[wc];
259 	  pstr->cur_state = prev_st;
260 	}
261       else if (__glibc_unlikely (mbclen == (size_t) -2))
262 	{
263 	  /* The buffer doesn't have enough space, finish to build.  */
264 	  pstr->cur_state = prev_st;
265 	  break;
266 	}
267 
268       /* Write wide character and padding.  */
269       pstr->wcs[byte_idx++] = wc;
270       /* Write paddings.  */
271       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
272 	pstr->wcs[byte_idx++] = WEOF;
273     }
274   pstr->valid_len = byte_idx;
275   pstr->valid_raw_len = byte_idx;
276 }
277 
278 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
279    but for REG_ICASE.  */
280 
281 static reg_errcode_t
282 __attribute_warn_unused_result__
build_wcs_upper_buffer(re_string_t * pstr)283 build_wcs_upper_buffer (re_string_t *pstr)
284 {
285   mbstate_t prev_st;
286   Idx src_idx, byte_idx, end_idx, remain_len;
287   size_t mbclen;
288 #ifdef _LIBC
289   char buf[MB_LEN_MAX];
290   assert (MB_LEN_MAX >= pstr->mb_cur_max);
291 #else
292   char buf[64];
293 #endif
294 
295   byte_idx = pstr->valid_len;
296   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
297 
298   /* The following optimization assumes that ASCII characters can be
299      mapped to wide characters with a simple cast.  */
300   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
301     {
302       while (byte_idx < end_idx)
303 	{
304 	  wchar_t wc;
305 
306 	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
307 	      && mbsinit (&pstr->cur_state))
308 	    {
309 	      /* In case of a singlebyte character.  */
310 	      pstr->mbs[byte_idx]
311 		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
312 	      /* The next step uses the assumption that wchar_t is encoded
313 		 ASCII-safe: all ASCII values can be converted like this.  */
314 	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
315 	      ++byte_idx;
316 	      continue;
317 	    }
318 
319 	  remain_len = end_idx - byte_idx;
320 	  prev_st = pstr->cur_state;
321 	  mbclen = __mbrtowc (&wc,
322 			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
323 			       + byte_idx), remain_len, &pstr->cur_state);
324 	  if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
325 	    {
326 	      wchar_t wcu = __towupper (wc);
327 	      if (wcu != wc)
328 		{
329 		  size_t mbcdlen;
330 
331 		  mbcdlen = __wcrtomb (buf, wcu, &prev_st);
332 		  if (__glibc_likely (mbclen == mbcdlen))
333 		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
334 		  else
335 		    {
336 		      src_idx = byte_idx;
337 		      goto offsets_needed;
338 		    }
339 		}
340 	      else
341 		memcpy (pstr->mbs + byte_idx,
342 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
343 	      pstr->wcs[byte_idx++] = wcu;
344 	      /* Write paddings.  */
345 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
346 		pstr->wcs[byte_idx++] = WEOF;
347 	    }
348 	  else if (mbclen == (size_t) -1 || mbclen == 0
349 		   || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
350 	    {
351 	      /* It is an invalid character, an incomplete character
352 		 at the end of the string, or '\0'.  Just use the byte.  */
353 	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
354 	      pstr->mbs[byte_idx] = ch;
355 	      /* And also cast it to wide char.  */
356 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
357 	      if (__glibc_unlikely (mbclen == (size_t) -1))
358 		pstr->cur_state = prev_st;
359 	    }
360 	  else
361 	    {
362 	      /* The buffer doesn't have enough space, finish to build.  */
363 	      pstr->cur_state = prev_st;
364 	      break;
365 	    }
366 	}
367       pstr->valid_len = byte_idx;
368       pstr->valid_raw_len = byte_idx;
369       return REG_NOERROR;
370     }
371   else
372     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
373       {
374 	wchar_t wc;
375 	const char *p;
376       offsets_needed:
377 	remain_len = end_idx - byte_idx;
378 	prev_st = pstr->cur_state;
379 	if (__glibc_unlikely (pstr->trans != NULL))
380 	  {
381 	    int i, ch;
382 
383 	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
384 	      {
385 		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
386 		buf[i] = pstr->trans[ch];
387 	      }
388 	    p = (const char *) buf;
389 	  }
390 	else
391 	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
392 	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
393 	if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
394 	  {
395 	    wchar_t wcu = __towupper (wc);
396 	    if (wcu != wc)
397 	      {
398 		size_t mbcdlen;
399 
400 		mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
401 		if (__glibc_likely (mbclen == mbcdlen))
402 		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
403 		else if (mbcdlen != (size_t) -1)
404 		  {
405 		    size_t i;
406 
407 		    if (byte_idx + mbcdlen > pstr->bufs_len)
408 		      {
409 			pstr->cur_state = prev_st;
410 			break;
411 		      }
412 
413 		    if (pstr->offsets == NULL)
414 		      {
415 			pstr->offsets = re_malloc (Idx, pstr->bufs_len);
416 
417 			if (pstr->offsets == NULL)
418 			  return REG_ESPACE;
419 		      }
420 		    if (!pstr->offsets_needed)
421 		      {
422 			for (i = 0; i < (size_t) byte_idx; ++i)
423 			  pstr->offsets[i] = i;
424 			pstr->offsets_needed = 1;
425 		      }
426 
427 		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
428 		    pstr->wcs[byte_idx] = wcu;
429 		    pstr->offsets[byte_idx] = src_idx;
430 		    for (i = 1; i < mbcdlen; ++i)
431 		      {
432 			pstr->offsets[byte_idx + i]
433 			  = src_idx + (i < mbclen ? i : mbclen - 1);
434 			pstr->wcs[byte_idx + i] = WEOF;
435 		      }
436 		    pstr->len += mbcdlen - mbclen;
437 		    if (pstr->raw_stop > src_idx)
438 		      pstr->stop += mbcdlen - mbclen;
439 		    end_idx = (pstr->bufs_len > pstr->len)
440 			      ? pstr->len : pstr->bufs_len;
441 		    byte_idx += mbcdlen;
442 		    src_idx += mbclen;
443 		    continue;
444 		  }
445 		else
446 		  memcpy (pstr->mbs + byte_idx, p, mbclen);
447 	      }
448 	    else
449 	      memcpy (pstr->mbs + byte_idx, p, mbclen);
450 
451 	    if (__glibc_unlikely (pstr->offsets_needed != 0))
452 	      {
453 		size_t i;
454 		for (i = 0; i < mbclen; ++i)
455 		  pstr->offsets[byte_idx + i] = src_idx + i;
456 	      }
457 	    src_idx += mbclen;
458 
459 	    pstr->wcs[byte_idx++] = wcu;
460 	    /* Write paddings.  */
461 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
462 	      pstr->wcs[byte_idx++] = WEOF;
463 	  }
464 	else if (mbclen == (size_t) -1 || mbclen == 0
465 		 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
466 	  {
467 	    /* It is an invalid character or '\0'.  Just use the byte.  */
468 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
469 
470 	    if (__glibc_unlikely (pstr->trans != NULL))
471 	      ch = pstr->trans [ch];
472 	    pstr->mbs[byte_idx] = ch;
473 
474 	    if (__glibc_unlikely (pstr->offsets_needed != 0))
475 	      pstr->offsets[byte_idx] = src_idx;
476 	    ++src_idx;
477 
478 	    /* And also cast it to wide char.  */
479 	    pstr->wcs[byte_idx++] = (wchar_t) ch;
480 	    if (__glibc_unlikely (mbclen == (size_t) -1))
481 	      pstr->cur_state = prev_st;
482 	  }
483 	else
484 	  {
485 	    /* The buffer doesn't have enough space, finish to build.  */
486 	    pstr->cur_state = prev_st;
487 	    break;
488 	  }
489       }
490   pstr->valid_len = byte_idx;
491   pstr->valid_raw_len = src_idx;
492   return REG_NOERROR;
493 }
494 
495 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
496    Return the index.  */
497 
498 static Idx
re_string_skip_chars(re_string_t * pstr,Idx new_raw_idx,wint_t * last_wc)499 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
500 {
501   mbstate_t prev_st;
502   Idx rawbuf_idx;
503   size_t mbclen;
504   wint_t wc = WEOF;
505 
506   /* Skip the characters which are not necessary to check.  */
507   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
508        rawbuf_idx < new_raw_idx;)
509     {
510       wchar_t wc2;
511       Idx remain_len = pstr->raw_len - rawbuf_idx;
512       prev_st = pstr->cur_state;
513       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
514 			  remain_len, &pstr->cur_state);
515       if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
516 			    || mbclen == 0))
517 	{
518 	  /* We treat these cases as a single byte character.  */
519 	  if (mbclen == 0 || remain_len == 0)
520 	    wc = L'\0';
521 	  else
522 	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
523 	  mbclen = 1;
524 	  pstr->cur_state = prev_st;
525 	}
526       else
527 	wc = wc2;
528       /* Then proceed the next character.  */
529       rawbuf_idx += mbclen;
530     }
531   *last_wc = wc;
532   return rawbuf_idx;
533 }
534 #endif /* RE_ENABLE_I18N  */
535 
536 /* Build the buffer PSTR->MBS, and apply the translation if we need.
537    This function is used in case of REG_ICASE.  */
538 
539 static void
build_upper_buffer(re_string_t * pstr)540 build_upper_buffer (re_string_t *pstr)
541 {
542   Idx char_idx, end_idx;
543   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
544 
545   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
546     {
547       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
548       if (__glibc_unlikely (pstr->trans != NULL))
549 	ch = pstr->trans[ch];
550       pstr->mbs[char_idx] = toupper (ch);
551     }
552   pstr->valid_len = char_idx;
553   pstr->valid_raw_len = char_idx;
554 }
555 
556 /* Apply TRANS to the buffer in PSTR.  */
557 
558 static void
re_string_translate_buffer(re_string_t * pstr)559 re_string_translate_buffer (re_string_t *pstr)
560 {
561   Idx buf_idx, end_idx;
562   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
563 
564   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
565     {
566       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
567       pstr->mbs[buf_idx] = pstr->trans[ch];
568     }
569 
570   pstr->valid_len = buf_idx;
571   pstr->valid_raw_len = buf_idx;
572 }
573 
574 /* This function re-construct the buffers.
575    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
576    convert to upper case in case of REG_ICASE, apply translation.  */
577 
578 static reg_errcode_t
579 __attribute_warn_unused_result__
re_string_reconstruct(re_string_t * pstr,Idx idx,int eflags)580 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
581 {
582   Idx offset;
583 
584   if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
585     offset = idx - pstr->raw_mbs_idx;
586   else
587     {
588       /* Reset buffer.  */
589 #ifdef RE_ENABLE_I18N
590       if (pstr->mb_cur_max > 1)
591 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
592 #endif /* RE_ENABLE_I18N */
593       pstr->len = pstr->raw_len;
594       pstr->stop = pstr->raw_stop;
595       pstr->valid_len = 0;
596       pstr->raw_mbs_idx = 0;
597       pstr->valid_raw_len = 0;
598       pstr->offsets_needed = 0;
599       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
600 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
601       if (!pstr->mbs_allocated)
602 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
603       offset = idx;
604     }
605 
606   if (__glibc_likely (offset != 0))
607     {
608       /* Should the already checked characters be kept?  */
609       if (__glibc_likely (offset < pstr->valid_raw_len))
610 	{
611 	  /* Yes, move them to the front of the buffer.  */
612 #ifdef RE_ENABLE_I18N
613 	  if (__glibc_unlikely (pstr->offsets_needed))
614 	    {
615 	      Idx low = 0, high = pstr->valid_len, mid;
616 	      do
617 		{
618 		  mid = (high + low) / 2;
619 		  if (pstr->offsets[mid] > offset)
620 		    high = mid;
621 		  else if (pstr->offsets[mid] < offset)
622 		    low = mid + 1;
623 		  else
624 		    break;
625 		}
626 	      while (low < high);
627 	      if (pstr->offsets[mid] < offset)
628 		++mid;
629 	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
630 							eflags);
631 	      /* This can be quite complicated, so handle specially
632 		 only the common and easy case where the character with
633 		 different length representation of lower and upper
634 		 case is present at or after offset.  */
635 	      if (pstr->valid_len > offset
636 		  && mid == offset && pstr->offsets[mid] == offset)
637 		{
638 		  memmove (pstr->wcs, pstr->wcs + offset,
639 			   (pstr->valid_len - offset) * sizeof (wint_t));
640 		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
641 		  pstr->valid_len -= offset;
642 		  pstr->valid_raw_len -= offset;
643 		  for (low = 0; low < pstr->valid_len; low++)
644 		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
645 		}
646 	      else
647 		{
648 		  /* Otherwise, just find out how long the partial multibyte
649 		     character at offset is and fill it with WEOF/255.  */
650 		  pstr->len = pstr->raw_len - idx + offset;
651 		  pstr->stop = pstr->raw_stop - idx + offset;
652 		  pstr->offsets_needed = 0;
653 		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
654 		    --mid;
655 		  while (mid < pstr->valid_len)
656 		    if (pstr->wcs[mid] != WEOF)
657 		      break;
658 		    else
659 		      ++mid;
660 		  if (mid == pstr->valid_len)
661 		    pstr->valid_len = 0;
662 		  else
663 		    {
664 		      pstr->valid_len = pstr->offsets[mid] - offset;
665 		      if (pstr->valid_len)
666 			{
667 			  for (low = 0; low < pstr->valid_len; ++low)
668 			    pstr->wcs[low] = WEOF;
669 			  memset (pstr->mbs, 255, pstr->valid_len);
670 			}
671 		    }
672 		  pstr->valid_raw_len = pstr->valid_len;
673 		}
674 	    }
675 	  else
676 #endif
677 	    {
678 	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
679 							eflags);
680 #ifdef RE_ENABLE_I18N
681 	      if (pstr->mb_cur_max > 1)
682 		memmove (pstr->wcs, pstr->wcs + offset,
683 			 (pstr->valid_len - offset) * sizeof (wint_t));
684 #endif /* RE_ENABLE_I18N */
685 	      if (__glibc_unlikely (pstr->mbs_allocated))
686 		memmove (pstr->mbs, pstr->mbs + offset,
687 			 pstr->valid_len - offset);
688 	      pstr->valid_len -= offset;
689 	      pstr->valid_raw_len -= offset;
690 #if defined DEBUG && DEBUG
691 	      assert (pstr->valid_len > 0);
692 #endif
693 	    }
694 	}
695       else
696 	{
697 #ifdef RE_ENABLE_I18N
698 	  /* No, skip all characters until IDX.  */
699 	  Idx prev_valid_len = pstr->valid_len;
700 
701 	  if (__glibc_unlikely (pstr->offsets_needed))
702 	    {
703 	      pstr->len = pstr->raw_len - idx + offset;
704 	      pstr->stop = pstr->raw_stop - idx + offset;
705 	      pstr->offsets_needed = 0;
706 	    }
707 #endif
708 	  pstr->valid_len = 0;
709 #ifdef RE_ENABLE_I18N
710 	  if (pstr->mb_cur_max > 1)
711 	    {
712 	      Idx wcs_idx;
713 	      wint_t wc = WEOF;
714 
715 	      if (pstr->is_utf8)
716 		{
717 		  const unsigned char *raw, *p, *end;
718 
719 		  /* Special case UTF-8.  Multi-byte chars start with any
720 		     byte other than 0x80 - 0xbf.  */
721 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
722 		  end = raw + (offset - pstr->mb_cur_max);
723 		  if (end < pstr->raw_mbs)
724 		    end = pstr->raw_mbs;
725 		  p = raw + offset - 1;
726 #ifdef _LIBC
727 		  /* We know the wchar_t encoding is UCS4, so for the simple
728 		     case, ASCII characters, skip the conversion step.  */
729 		  if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
730 		    {
731 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
732 		      /* pstr->valid_len = 0; */
733 		      wc = (wchar_t) *p;
734 		    }
735 		  else
736 #endif
737 		    for (; p >= end; --p)
738 		      if ((*p & 0xc0) != 0x80)
739 			{
740 			  mbstate_t cur_state;
741 			  wchar_t wc2;
742 			  Idx mlen = raw + pstr->len - p;
743 			  unsigned char buf[6];
744 			  size_t mbclen;
745 
746 			  const unsigned char *pp = p;
747 			  if (__glibc_unlikely (pstr->trans != NULL))
748 			    {
749 			      int i = mlen < 6 ? mlen : 6;
750 			      while (--i >= 0)
751 				buf[i] = pstr->trans[p[i]];
752 			      pp = buf;
753 			    }
754 			  /* XXX Don't use mbrtowc, we know which conversion
755 			     to use (UTF-8 -> UCS4).  */
756 			  memset (&cur_state, 0, sizeof (cur_state));
757 			  mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
758 					      &cur_state);
759 			  if (raw + offset - p <= mbclen
760 			      && mbclen < (size_t) -2)
761 			    {
762 			      memset (&pstr->cur_state, '\0',
763 				      sizeof (mbstate_t));
764 			      pstr->valid_len = mbclen - (raw + offset - p);
765 			      wc = wc2;
766 			    }
767 			  break;
768 			}
769 		}
770 
771 	      if (wc == WEOF)
772 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
773 	      if (wc == WEOF)
774 		pstr->tip_context
775 		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
776 	      else
777 		pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
778 				      && IS_WIDE_WORD_CHAR (wc))
779 				     ? CONTEXT_WORD
780 				     : ((IS_WIDE_NEWLINE (wc)
781 					 && pstr->newline_anchor)
782 					? CONTEXT_NEWLINE : 0));
783 	      if (__glibc_unlikely (pstr->valid_len))
784 		{
785 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
786 		    pstr->wcs[wcs_idx] = WEOF;
787 		  if (pstr->mbs_allocated)
788 		    memset (pstr->mbs, 255, pstr->valid_len);
789 		}
790 	      pstr->valid_raw_len = pstr->valid_len;
791 	    }
792 	  else
793 #endif /* RE_ENABLE_I18N */
794 	    {
795 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
796 	      pstr->valid_raw_len = 0;
797 	      if (pstr->trans)
798 		c = pstr->trans[c];
799 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
800 				   ? CONTEXT_WORD
801 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
802 				      ? CONTEXT_NEWLINE : 0));
803 	    }
804 	}
805       if (!__glibc_unlikely (pstr->mbs_allocated))
806 	pstr->mbs += offset;
807     }
808   pstr->raw_mbs_idx = idx;
809   pstr->len -= offset;
810   pstr->stop -= offset;
811 
812   /* Then build the buffers.  */
813 #ifdef RE_ENABLE_I18N
814   if (pstr->mb_cur_max > 1)
815     {
816       if (pstr->icase)
817 	{
818 	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
819 	  if (__glibc_unlikely (ret != REG_NOERROR))
820 	    return ret;
821 	}
822       else
823 	build_wcs_buffer (pstr);
824     }
825   else
826 #endif /* RE_ENABLE_I18N */
827     if (__glibc_unlikely (pstr->mbs_allocated))
828       {
829 	if (pstr->icase)
830 	  build_upper_buffer (pstr);
831 	else if (pstr->trans != NULL)
832 	  re_string_translate_buffer (pstr);
833       }
834     else
835       pstr->valid_len = pstr->len;
836 
837   pstr->cur_idx = 0;
838   return REG_NOERROR;
839 }
840 
841 static unsigned char
842 __attribute__ ((pure))
re_string_peek_byte_case(const re_string_t * pstr,Idx idx)843 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
844 {
845   int ch;
846   Idx off;
847 
848   /* Handle the common (easiest) cases first.  */
849   if (__glibc_likely (!pstr->mbs_allocated))
850     return re_string_peek_byte (pstr, idx);
851 
852 #ifdef RE_ENABLE_I18N
853   if (pstr->mb_cur_max > 1
854       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
855     return re_string_peek_byte (pstr, idx);
856 #endif
857 
858   off = pstr->cur_idx + idx;
859 #ifdef RE_ENABLE_I18N
860   if (pstr->offsets_needed)
861     off = pstr->offsets[off];
862 #endif
863 
864   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
865 
866 #ifdef RE_ENABLE_I18N
867   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
868      this function returns CAPITAL LETTER I instead of first byte of
869      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
870      since peek_byte_case doesn't advance cur_idx in any way.  */
871   if (pstr->offsets_needed && !isascii (ch))
872     return re_string_peek_byte (pstr, idx);
873 #endif
874 
875   return ch;
876 }
877 
878 static unsigned char
re_string_fetch_byte_case(re_string_t * pstr)879 re_string_fetch_byte_case (re_string_t *pstr)
880 {
881   if (__glibc_likely (!pstr->mbs_allocated))
882     return re_string_fetch_byte (pstr);
883 
884 #ifdef RE_ENABLE_I18N
885   if (pstr->offsets_needed)
886     {
887       Idx off;
888       int ch;
889 
890       /* For tr_TR.UTF-8 [[:islower:]] there is
891 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
892 	 in that case the whole multi-byte character and return
893 	 the original letter.  On the other side, with
894 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
895 	 anything else would complicate things too much.  */
896 
897       if (!re_string_first_byte (pstr, pstr->cur_idx))
898 	return re_string_fetch_byte (pstr);
899 
900       off = pstr->offsets[pstr->cur_idx];
901       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
902 
903       if (! isascii (ch))
904 	return re_string_fetch_byte (pstr);
905 
906       re_string_skip_bytes (pstr,
907 			    re_string_char_size_at (pstr, pstr->cur_idx));
908       return ch;
909     }
910 #endif
911 
912   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
913 }
914 
915 static void
re_string_destruct(re_string_t * pstr)916 re_string_destruct (re_string_t *pstr)
917 {
918 #ifdef RE_ENABLE_I18N
919   re_free (pstr->wcs);
920   re_free (pstr->offsets);
921 #endif /* RE_ENABLE_I18N  */
922   if (pstr->mbs_allocated)
923     re_free (pstr->mbs);
924 }
925 
926 /* Return the context at IDX in INPUT.  */
927 
928 static unsigned int
re_string_context_at(const re_string_t * input,Idx idx,int eflags)929 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
930 {
931   int c;
932   if (__glibc_unlikely (idx < 0))
933     /* In this case, we use the value stored in input->tip_context,
934        since we can't know the character in input->mbs[-1] here.  */
935     return input->tip_context;
936   if (__glibc_unlikely (idx == input->len))
937     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
938 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
939 #ifdef RE_ENABLE_I18N
940   if (input->mb_cur_max > 1)
941     {
942       wint_t wc;
943       Idx wc_idx = idx;
944       while(input->wcs[wc_idx] == WEOF)
945 	{
946 #if defined DEBUG && DEBUG
947 	  /* It must not happen.  */
948 	  assert (wc_idx >= 0);
949 #endif
950 	  --wc_idx;
951 	  if (wc_idx < 0)
952 	    return input->tip_context;
953 	}
954       wc = input->wcs[wc_idx];
955       if (__glibc_unlikely (input->word_ops_used != 0)
956 	  && IS_WIDE_WORD_CHAR (wc))
957 	return CONTEXT_WORD;
958       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
959 	      ? CONTEXT_NEWLINE : 0);
960     }
961   else
962 #endif
963     {
964       c = re_string_byte_at (input, idx);
965       if (bitset_contain (input->word_char, c))
966 	return CONTEXT_WORD;
967       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
968     }
969 }
970 
971 /* Functions for set operation.  */
972 
973 static reg_errcode_t
974 __attribute_warn_unused_result__
re_node_set_alloc(re_node_set * set,Idx size)975 re_node_set_alloc (re_node_set *set, Idx size)
976 {
977   set->alloc = size;
978   set->nelem = 0;
979   set->elems = re_malloc (Idx, size);
980   if (__glibc_unlikely (set->elems == NULL)
981       && (MALLOC_0_IS_NONNULL || size != 0))
982     return REG_ESPACE;
983   return REG_NOERROR;
984 }
985 
986 static reg_errcode_t
987 __attribute_warn_unused_result__
re_node_set_init_1(re_node_set * set,Idx elem)988 re_node_set_init_1 (re_node_set *set, Idx elem)
989 {
990   set->alloc = 1;
991   set->nelem = 1;
992   set->elems = re_malloc (Idx, 1);
993   if (__glibc_unlikely (set->elems == NULL))
994     {
995       set->alloc = set->nelem = 0;
996       return REG_ESPACE;
997     }
998   set->elems[0] = elem;
999   return REG_NOERROR;
1000 }
1001 
1002 static reg_errcode_t
1003 __attribute_warn_unused_result__
re_node_set_init_2(re_node_set * set,Idx elem1,Idx elem2)1004 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1005 {
1006   set->alloc = 2;
1007   set->elems = re_malloc (Idx, 2);
1008   if (__glibc_unlikely (set->elems == NULL))
1009     return REG_ESPACE;
1010   if (elem1 == elem2)
1011     {
1012       set->nelem = 1;
1013       set->elems[0] = elem1;
1014     }
1015   else
1016     {
1017       set->nelem = 2;
1018       if (elem1 < elem2)
1019 	{
1020 	  set->elems[0] = elem1;
1021 	  set->elems[1] = elem2;
1022 	}
1023       else
1024 	{
1025 	  set->elems[0] = elem2;
1026 	  set->elems[1] = elem1;
1027 	}
1028     }
1029   return REG_NOERROR;
1030 }
1031 
1032 static reg_errcode_t
1033 __attribute_warn_unused_result__
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)1034 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1035 {
1036   dest->nelem = src->nelem;
1037   if (src->nelem > 0)
1038     {
1039       dest->alloc = dest->nelem;
1040       dest->elems = re_malloc (Idx, dest->alloc);
1041       if (__glibc_unlikely (dest->elems == NULL))
1042 	{
1043 	  dest->alloc = dest->nelem = 0;
1044 	  return REG_ESPACE;
1045 	}
1046       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1047     }
1048   else
1049     re_node_set_init_empty (dest);
1050   return REG_NOERROR;
1051 }
1052 
1053 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1054    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1055    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1056 
1057 static reg_errcode_t
1058 __attribute_warn_unused_result__
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1059 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1060 			   const re_node_set *src2)
1061 {
1062   Idx i1, i2, is, id, delta, sbase;
1063   if (src1->nelem == 0 || src2->nelem == 0)
1064     return REG_NOERROR;
1065 
1066   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1067      conservative estimate.  */
1068   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1069     {
1070       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1071       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1072       if (__glibc_unlikely (new_elems == NULL))
1073 	return REG_ESPACE;
1074       dest->elems = new_elems;
1075       dest->alloc = new_alloc;
1076     }
1077 
1078   /* Find the items in the intersection of SRC1 and SRC2, and copy
1079      into the top of DEST those that are not already in DEST itself.  */
1080   sbase = dest->nelem + src1->nelem + src2->nelem;
1081   i1 = src1->nelem - 1;
1082   i2 = src2->nelem - 1;
1083   id = dest->nelem - 1;
1084   for (;;)
1085     {
1086       if (src1->elems[i1] == src2->elems[i2])
1087 	{
1088 	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1089 	  while (id >= 0 && dest->elems[id] > src1->elems[i1])
1090 	    --id;
1091 
1092 	  if (id < 0 || dest->elems[id] != src1->elems[i1])
1093             dest->elems[--sbase] = src1->elems[i1];
1094 
1095 	  if (--i1 < 0 || --i2 < 0)
1096 	    break;
1097 	}
1098 
1099       /* Lower the highest of the two items.  */
1100       else if (src1->elems[i1] < src2->elems[i2])
1101 	{
1102 	  if (--i2 < 0)
1103 	    break;
1104 	}
1105       else
1106 	{
1107 	  if (--i1 < 0)
1108 	    break;
1109 	}
1110     }
1111 
1112   id = dest->nelem - 1;
1113   is = dest->nelem + src1->nelem + src2->nelem - 1;
1114   delta = is - sbase + 1;
1115 
1116   /* Now copy.  When DELTA becomes zero, the remaining
1117      DEST elements are already in place; this is more or
1118      less the same loop that is in re_node_set_merge.  */
1119   dest->nelem += delta;
1120   if (delta > 0 && id >= 0)
1121     for (;;)
1122       {
1123 	if (dest->elems[is] > dest->elems[id])
1124 	  {
1125 	    /* Copy from the top.  */
1126 	    dest->elems[id + delta--] = dest->elems[is--];
1127 	    if (delta == 0)
1128 	      break;
1129 	  }
1130 	else
1131 	  {
1132 	    /* Slide from the bottom.  */
1133 	    dest->elems[id + delta] = dest->elems[id];
1134 	    if (--id < 0)
1135 	      break;
1136 	  }
1137       }
1138 
1139   /* Copy remaining SRC elements.  */
1140   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1141 
1142   return REG_NOERROR;
1143 }
1144 
1145 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1146    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1147 
1148 static reg_errcode_t
1149 __attribute_warn_unused_result__
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1150 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1151 			const re_node_set *src2)
1152 {
1153   Idx i1, i2, id;
1154   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1155     {
1156       dest->alloc = src1->nelem + src2->nelem;
1157       dest->elems = re_malloc (Idx, dest->alloc);
1158       if (__glibc_unlikely (dest->elems == NULL))
1159 	return REG_ESPACE;
1160     }
1161   else
1162     {
1163       if (src1 != NULL && src1->nelem > 0)
1164 	return re_node_set_init_copy (dest, src1);
1165       else if (src2 != NULL && src2->nelem > 0)
1166 	return re_node_set_init_copy (dest, src2);
1167       else
1168 	re_node_set_init_empty (dest);
1169       return REG_NOERROR;
1170     }
1171   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1172     {
1173       if (src1->elems[i1] > src2->elems[i2])
1174 	{
1175 	  dest->elems[id++] = src2->elems[i2++];
1176 	  continue;
1177 	}
1178       if (src1->elems[i1] == src2->elems[i2])
1179 	++i2;
1180       dest->elems[id++] = src1->elems[i1++];
1181     }
1182   if (i1 < src1->nelem)
1183     {
1184       memcpy (dest->elems + id, src1->elems + i1,
1185 	     (src1->nelem - i1) * sizeof (Idx));
1186       id += src1->nelem - i1;
1187     }
1188   else if (i2 < src2->nelem)
1189     {
1190       memcpy (dest->elems + id, src2->elems + i2,
1191 	     (src2->nelem - i2) * sizeof (Idx));
1192       id += src2->nelem - i2;
1193     }
1194   dest->nelem = id;
1195   return REG_NOERROR;
1196 }
1197 
1198 /* Calculate the union set of the sets DEST and SRC. And store it to
1199    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1200 
1201 static reg_errcode_t
1202 __attribute_warn_unused_result__
re_node_set_merge(re_node_set * dest,const re_node_set * src)1203 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1204 {
1205   Idx is, id, sbase, delta;
1206   if (src == NULL || src->nelem == 0)
1207     return REG_NOERROR;
1208   if (dest->alloc < 2 * src->nelem + dest->nelem)
1209     {
1210       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1211       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1212       if (__glibc_unlikely (new_buffer == NULL))
1213 	return REG_ESPACE;
1214       dest->elems = new_buffer;
1215       dest->alloc = new_alloc;
1216     }
1217 
1218   if (__glibc_unlikely (dest->nelem == 0))
1219     {
1220       dest->nelem = src->nelem;
1221       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1222       return REG_NOERROR;
1223     }
1224 
1225   /* Copy into the top of DEST the items of SRC that are not
1226      found in DEST.  Maybe we could binary search in DEST?  */
1227   for (sbase = dest->nelem + 2 * src->nelem,
1228        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1229     {
1230       if (dest->elems[id] == src->elems[is])
1231 	is--, id--;
1232       else if (dest->elems[id] < src->elems[is])
1233 	dest->elems[--sbase] = src->elems[is--];
1234       else /* if (dest->elems[id] > src->elems[is]) */
1235 	--id;
1236     }
1237 
1238   if (is >= 0)
1239     {
1240       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1241       sbase -= is + 1;
1242       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1243     }
1244 
1245   id = dest->nelem - 1;
1246   is = dest->nelem + 2 * src->nelem - 1;
1247   delta = is - sbase + 1;
1248   if (delta == 0)
1249     return REG_NOERROR;
1250 
1251   /* Now copy.  When DELTA becomes zero, the remaining
1252      DEST elements are already in place.  */
1253   dest->nelem += delta;
1254   for (;;)
1255     {
1256       if (dest->elems[is] > dest->elems[id])
1257 	{
1258 	  /* Copy from the top.  */
1259 	  dest->elems[id + delta--] = dest->elems[is--];
1260 	  if (delta == 0)
1261 	    break;
1262 	}
1263       else
1264 	{
1265 	  /* Slide from the bottom.  */
1266 	  dest->elems[id + delta] = dest->elems[id];
1267 	  if (--id < 0)
1268 	    {
1269 	      /* Copy remaining SRC elements.  */
1270 	      memcpy (dest->elems, dest->elems + sbase,
1271 		      delta * sizeof (Idx));
1272 	      break;
1273 	    }
1274 	}
1275     }
1276 
1277   return REG_NOERROR;
1278 }
1279 
1280 /* Insert the new element ELEM to the re_node_set* SET.
1281    SET should not already have ELEM.
1282    Return true if successful.  */
1283 
1284 static bool
1285 __attribute_warn_unused_result__
re_node_set_insert(re_node_set * set,Idx elem)1286 re_node_set_insert (re_node_set *set, Idx elem)
1287 {
1288   Idx idx;
1289   /* In case the set is empty.  */
1290   if (set->alloc == 0)
1291     return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1292 
1293   if (__glibc_unlikely (set->nelem) == 0)
1294     {
1295       /* We already guaranteed above that set->alloc != 0.  */
1296       set->elems[0] = elem;
1297       ++set->nelem;
1298       return true;
1299     }
1300 
1301   /* Realloc if we need.  */
1302   if (set->alloc == set->nelem)
1303     {
1304       Idx *new_elems;
1305       set->alloc = set->alloc * 2;
1306       new_elems = re_realloc (set->elems, Idx, set->alloc);
1307       if (__glibc_unlikely (new_elems == NULL))
1308 	return false;
1309       set->elems = new_elems;
1310     }
1311 
1312   /* Move the elements which follows the new element.  Test the
1313      first element separately to skip a check in the inner loop.  */
1314   if (elem < set->elems[0])
1315     {
1316       idx = 0;
1317       for (idx = set->nelem; idx > 0; idx--)
1318 	set->elems[idx] = set->elems[idx - 1];
1319     }
1320   else
1321     {
1322       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1323 	set->elems[idx] = set->elems[idx - 1];
1324     }
1325 
1326   /* Insert the new element.  */
1327   set->elems[idx] = elem;
1328   ++set->nelem;
1329   return true;
1330 }
1331 
1332 /* Insert the new element ELEM to the re_node_set* SET.
1333    SET should not already have any element greater than or equal to ELEM.
1334    Return true if successful.  */
1335 
1336 static bool
1337 __attribute_warn_unused_result__
re_node_set_insert_last(re_node_set * set,Idx elem)1338 re_node_set_insert_last (re_node_set *set, Idx elem)
1339 {
1340   /* Realloc if we need.  */
1341   if (set->alloc == set->nelem)
1342     {
1343       Idx *new_elems;
1344       set->alloc = (set->alloc + 1) * 2;
1345       new_elems = re_realloc (set->elems, Idx, set->alloc);
1346       if (__glibc_unlikely (new_elems == NULL))
1347 	return false;
1348       set->elems = new_elems;
1349     }
1350 
1351   /* Insert the new element.  */
1352   set->elems[set->nelem++] = elem;
1353   return true;
1354 }
1355 
1356 /* Compare two node sets SET1 and SET2.
1357    Return true if SET1 and SET2 are equivalent.  */
1358 
1359 static bool
1360 __attribute__ ((pure))
re_node_set_compare(const re_node_set * set1,const re_node_set * set2)1361 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1362 {
1363   Idx i;
1364   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1365     return false;
1366   for (i = set1->nelem ; --i >= 0 ; )
1367     if (set1->elems[i] != set2->elems[i])
1368       return false;
1369   return true;
1370 }
1371 
1372 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1373 
1374 static Idx
1375 __attribute__ ((pure))
re_node_set_contains(const re_node_set * set,Idx elem)1376 re_node_set_contains (const re_node_set *set, Idx elem)
1377 {
1378   __re_size_t idx, right, mid;
1379   if (set->nelem <= 0)
1380     return 0;
1381 
1382   /* Binary search the element.  */
1383   idx = 0;
1384   right = set->nelem - 1;
1385   while (idx < right)
1386     {
1387       mid = (idx + right) / 2;
1388       if (set->elems[mid] < elem)
1389 	idx = mid + 1;
1390       else
1391 	right = mid;
1392     }
1393   return set->elems[idx] == elem ? idx + 1 : 0;
1394 }
1395 
1396 static void
re_node_set_remove_at(re_node_set * set,Idx idx)1397 re_node_set_remove_at (re_node_set *set, Idx idx)
1398 {
1399   if (idx < 0 || idx >= set->nelem)
1400     return;
1401   --set->nelem;
1402   for (; idx < set->nelem; idx++)
1403     set->elems[idx] = set->elems[idx + 1];
1404 }
1405 
1406 
1407 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1408    Or return -1 if an error occurred.  */
1409 
1410 static Idx
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)1411 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1412 {
1413   if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1414     {
1415       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1416       Idx *new_nexts, *new_indices;
1417       re_node_set *new_edests, *new_eclosures;
1418       re_token_t *new_nodes;
1419 
1420       /* Avoid overflows in realloc.  */
1421       const size_t max_object_size = MAX (sizeof (re_token_t),
1422 					  MAX (sizeof (re_node_set),
1423 					       sizeof (Idx)));
1424       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1425 			    < new_nodes_alloc))
1426 	return -1;
1427 
1428       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1429       if (__glibc_unlikely (new_nodes == NULL))
1430 	return -1;
1431       dfa->nodes = new_nodes;
1432       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1433       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1434       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1435       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1436       if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1437 			    || new_edests == NULL || new_eclosures == NULL))
1438 	{
1439 	   re_free (new_nexts);
1440 	   re_free (new_indices);
1441 	   re_free (new_edests);
1442 	   re_free (new_eclosures);
1443 	   return -1;
1444 	}
1445       dfa->nexts = new_nexts;
1446       dfa->org_indices = new_indices;
1447       dfa->edests = new_edests;
1448       dfa->eclosures = new_eclosures;
1449       dfa->nodes_alloc = new_nodes_alloc;
1450     }
1451   dfa->nodes[dfa->nodes_len] = token;
1452   dfa->nodes[dfa->nodes_len].constraint = 0;
1453 #ifdef RE_ENABLE_I18N
1454   dfa->nodes[dfa->nodes_len].accept_mb =
1455     ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1456      || token.type == COMPLEX_BRACKET);
1457 #endif
1458   dfa->nexts[dfa->nodes_len] = -1;
1459   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1460   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1461   return dfa->nodes_len++;
1462 }
1463 
1464 static re_hashval_t
calc_state_hash(const re_node_set * nodes,unsigned int context)1465 calc_state_hash (const re_node_set *nodes, unsigned int context)
1466 {
1467   re_hashval_t hash = nodes->nelem + context;
1468   Idx i;
1469   for (i = 0 ; i < nodes->nelem ; i++)
1470     hash += nodes->elems[i];
1471   return hash;
1472 }
1473 
1474 /* Search for the state whose node_set is equivalent to NODES.
1475    Return the pointer to the state, if we found it in the DFA.
1476    Otherwise create the new one and return it.  In case of an error
1477    return NULL and set the error code in ERR.
1478    Note: - We assume NULL as the invalid state, then it is possible that
1479 	   return value is NULL and ERR is REG_NOERROR.
1480 	 - We never return non-NULL value in case of any errors, it is for
1481 	   optimization.  */
1482 
1483 static re_dfastate_t *
1484 __attribute_warn_unused_result__
re_acquire_state(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes)1485 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1486 		  const re_node_set *nodes)
1487 {
1488   re_hashval_t hash;
1489   re_dfastate_t *new_state;
1490   struct re_state_table_entry *spot;
1491   Idx i;
1492 #if defined GCC_LINT || defined lint
1493   /* Suppress bogus uninitialized-variable warnings.  */
1494   *err = REG_NOERROR;
1495 #endif
1496   if (__glibc_unlikely (nodes->nelem == 0))
1497     {
1498       *err = REG_NOERROR;
1499       return NULL;
1500     }
1501   hash = calc_state_hash (nodes, 0);
1502   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1503 
1504   for (i = 0 ; i < spot->num ; i++)
1505     {
1506       re_dfastate_t *state = spot->array[i];
1507       if (hash != state->hash)
1508 	continue;
1509       if (re_node_set_compare (&state->nodes, nodes))
1510 	return state;
1511     }
1512 
1513   /* There are no appropriate state in the dfa, create the new one.  */
1514   new_state = create_ci_newstate (dfa, nodes, hash);
1515   if (__glibc_unlikely (new_state == NULL))
1516     *err = REG_ESPACE;
1517 
1518   return new_state;
1519 }
1520 
1521 /* Search for the state whose node_set is equivalent to NODES and
1522    whose context is equivalent to CONTEXT.
1523    Return the pointer to the state, if we found it in the DFA.
1524    Otherwise create the new one and return it.  In case of an error
1525    return NULL and set the error code in ERR.
1526    Note: - We assume NULL as the invalid state, then it is possible that
1527 	   return value is NULL and ERR is REG_NOERROR.
1528 	 - We never return non-NULL value in case of any errors, it is for
1529 	   optimization.  */
1530 
1531 static re_dfastate_t *
1532 __attribute_warn_unused_result__
re_acquire_state_context(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)1533 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1534 			  const re_node_set *nodes, unsigned int context)
1535 {
1536   re_hashval_t hash;
1537   re_dfastate_t *new_state;
1538   struct re_state_table_entry *spot;
1539   Idx i;
1540 #if defined GCC_LINT || defined lint
1541   /* Suppress bogus uninitialized-variable warnings.  */
1542   *err = REG_NOERROR;
1543 #endif
1544   if (nodes->nelem == 0)
1545     {
1546       *err = REG_NOERROR;
1547       return NULL;
1548     }
1549   hash = calc_state_hash (nodes, context);
1550   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1551 
1552   for (i = 0 ; i < spot->num ; i++)
1553     {
1554       re_dfastate_t *state = spot->array[i];
1555       if (state->hash == hash
1556 	  && state->context == context
1557 	  && re_node_set_compare (state->entrance_nodes, nodes))
1558 	return state;
1559     }
1560   /* There are no appropriate state in 'dfa', create the new one.  */
1561   new_state = create_cd_newstate (dfa, nodes, context, hash);
1562   if (__glibc_unlikely (new_state == NULL))
1563     *err = REG_ESPACE;
1564 
1565   return new_state;
1566 }
1567 
1568 /* Finish initialization of the new state NEWSTATE, and using its hash value
1569    HASH put in the appropriate bucket of DFA's state table.  Return value
1570    indicates the error code if failed.  */
1571 
1572 static reg_errcode_t
1573 __attribute_warn_unused_result__
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,re_hashval_t hash)1574 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1575 		re_hashval_t hash)
1576 {
1577   struct re_state_table_entry *spot;
1578   reg_errcode_t err;
1579   Idx i;
1580 
1581   newstate->hash = hash;
1582   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1583   if (__glibc_unlikely (err != REG_NOERROR))
1584     return REG_ESPACE;
1585   for (i = 0; i < newstate->nodes.nelem; i++)
1586     {
1587       Idx elem = newstate->nodes.elems[i];
1588       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1589 	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1590 	  return REG_ESPACE;
1591     }
1592 
1593   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1594   if (__glibc_unlikely (spot->alloc <= spot->num))
1595     {
1596       Idx new_alloc = 2 * spot->num + 2;
1597       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1598 					      new_alloc);
1599       if (__glibc_unlikely (new_array == NULL))
1600 	return REG_ESPACE;
1601       spot->array = new_array;
1602       spot->alloc = new_alloc;
1603     }
1604   spot->array[spot->num++] = newstate;
1605   return REG_NOERROR;
1606 }
1607 
1608 static void
free_state(re_dfastate_t * state)1609 free_state (re_dfastate_t *state)
1610 {
1611   re_node_set_free (&state->non_eps_nodes);
1612   re_node_set_free (&state->inveclosure);
1613   if (state->entrance_nodes != &state->nodes)
1614     {
1615       re_node_set_free (state->entrance_nodes);
1616       re_free (state->entrance_nodes);
1617     }
1618   re_node_set_free (&state->nodes);
1619   re_free (state->word_trtable);
1620   re_free (state->trtable);
1621   re_free (state);
1622 }
1623 
1624 /* Create the new state which is independent of contexts.
1625    Return the new state if succeeded, otherwise return NULL.  */
1626 
1627 static re_dfastate_t *
1628 __attribute_warn_unused_result__
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,re_hashval_t hash)1629 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1630 		    re_hashval_t hash)
1631 {
1632   Idx i;
1633   reg_errcode_t err;
1634   re_dfastate_t *newstate;
1635 
1636   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1637   if (__glibc_unlikely (newstate == NULL))
1638     return NULL;
1639   err = re_node_set_init_copy (&newstate->nodes, nodes);
1640   if (__glibc_unlikely (err != REG_NOERROR))
1641     {
1642       re_free (newstate);
1643       return NULL;
1644     }
1645 
1646   newstate->entrance_nodes = &newstate->nodes;
1647   for (i = 0 ; i < nodes->nelem ; i++)
1648     {
1649       re_token_t *node = dfa->nodes + nodes->elems[i];
1650       re_token_type_t type = node->type;
1651       if (type == CHARACTER && !node->constraint)
1652 	continue;
1653 #ifdef RE_ENABLE_I18N
1654       newstate->accept_mb |= node->accept_mb;
1655 #endif /* RE_ENABLE_I18N */
1656 
1657       /* If the state has the halt node, the state is a halt state.  */
1658       if (type == END_OF_RE)
1659 	newstate->halt = 1;
1660       else if (type == OP_BACK_REF)
1661 	newstate->has_backref = 1;
1662       else if (type == ANCHOR || node->constraint)
1663 	newstate->has_constraint = 1;
1664     }
1665   err = register_state (dfa, newstate, hash);
1666   if (__glibc_unlikely (err != REG_NOERROR))
1667     {
1668       free_state (newstate);
1669       newstate = NULL;
1670     }
1671   return newstate;
1672 }
1673 
1674 /* Create the new state which is depend on the context CONTEXT.
1675    Return the new state if succeeded, otherwise return NULL.  */
1676 
1677 static re_dfastate_t *
1678 __attribute_warn_unused_result__
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,re_hashval_t hash)1679 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1680 		    unsigned int context, re_hashval_t hash)
1681 {
1682   Idx i, nctx_nodes = 0;
1683   reg_errcode_t err;
1684   re_dfastate_t *newstate;
1685 
1686   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1687   if (__glibc_unlikely (newstate == NULL))
1688     return NULL;
1689   err = re_node_set_init_copy (&newstate->nodes, nodes);
1690   if (__glibc_unlikely (err != REG_NOERROR))
1691     {
1692       re_free (newstate);
1693       return NULL;
1694     }
1695 
1696   newstate->context = context;
1697   newstate->entrance_nodes = &newstate->nodes;
1698 
1699   for (i = 0 ; i < nodes->nelem ; i++)
1700     {
1701       re_token_t *node = dfa->nodes + nodes->elems[i];
1702       re_token_type_t type = node->type;
1703       unsigned int constraint = node->constraint;
1704 
1705       if (type == CHARACTER && !constraint)
1706 	continue;
1707 #ifdef RE_ENABLE_I18N
1708       newstate->accept_mb |= node->accept_mb;
1709 #endif /* RE_ENABLE_I18N */
1710 
1711       /* If the state has the halt node, the state is a halt state.  */
1712       if (type == END_OF_RE)
1713 	newstate->halt = 1;
1714       else if (type == OP_BACK_REF)
1715 	newstate->has_backref = 1;
1716 
1717       if (constraint)
1718 	{
1719 	  if (newstate->entrance_nodes == &newstate->nodes)
1720 	    {
1721 	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1722 	      if (__glibc_unlikely (newstate->entrance_nodes == NULL))
1723 		{
1724 		  free_state (newstate);
1725 		  return NULL;
1726 		}
1727 	      if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1728 		  != REG_NOERROR)
1729 		return NULL;
1730 	      nctx_nodes = 0;
1731 	      newstate->has_constraint = 1;
1732 	    }
1733 
1734 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1735 	    {
1736 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1737 	      ++nctx_nodes;
1738 	    }
1739 	}
1740     }
1741   err = register_state (dfa, newstate, hash);
1742   if (__glibc_unlikely (err != REG_NOERROR))
1743     {
1744       free_state (newstate);
1745       newstate = NULL;
1746     }
1747   return  newstate;
1748 }
1749