1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2017 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <http://www.gnu.org/licenses/>.  */
19 
20 static void re_string_construct_common (const char *str, Idx len,
21 					re_string_t *pstr,
22 					RE_TRANSLATE_TYPE trans, bool icase,
23 					const re_dfa_t *dfa) internal_function;
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 					  const re_node_set *nodes,
26 					  re_hashval_t hash) internal_function;
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 					  const re_node_set *nodes,
29 					  unsigned int context,
30 					  re_hashval_t hash) internal_function;
31 
32 /* Functions for string operation.  */
33 
34 /* This function allocate the buffers.  It is necessary to call
35    re_string_reconstruct before using the object.  */
36 
37 static reg_errcode_t
38 internal_function __attribute_warn_unused_result__
re_string_allocate(re_string_t * pstr,const char * str,Idx len,Idx init_len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)39 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
40 		    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
41 {
42   reg_errcode_t ret;
43   Idx init_buf_len;
44 
45   /* Ensure at least one character fits into the buffers.  */
46   if (init_len < dfa->mb_cur_max)
47     init_len = dfa->mb_cur_max;
48   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49   re_string_construct_common (str, len, pstr, trans, icase, dfa);
50 
51   ret = re_string_realloc_buffers (pstr, init_buf_len);
52   if (BE (ret != REG_NOERROR, 0))
53     return ret;
54 
55   pstr->word_char = dfa->word_char;
56   pstr->word_ops_used = dfa->word_ops_used;
57   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59   pstr->valid_raw_len = pstr->valid_len;
60   return REG_NOERROR;
61 }
62 
63 /* This function allocate the buffers, and initialize them.  */
64 
65 static reg_errcode_t
66 internal_function __attribute_warn_unused_result__
re_string_construct(re_string_t * pstr,const char * str,Idx len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)67 re_string_construct (re_string_t *pstr, const char *str, Idx len,
68 		     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
69 {
70   reg_errcode_t ret;
71   memset (pstr, '\0', sizeof (re_string_t));
72   re_string_construct_common (str, len, pstr, trans, icase, dfa);
73 
74   if (len > 0)
75     {
76       ret = re_string_realloc_buffers (pstr, len + 1);
77       if (BE (ret != REG_NOERROR, 0))
78 	return ret;
79     }
80   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
81 
82   if (icase)
83     {
84 #ifdef RE_ENABLE_I18N
85       if (dfa->mb_cur_max > 1)
86 	{
87 	  while (1)
88 	    {
89 	      ret = build_wcs_upper_buffer (pstr);
90 	      if (BE (ret != REG_NOERROR, 0))
91 		return ret;
92 	      if (pstr->valid_raw_len >= len)
93 		break;
94 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
95 		break;
96 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97 	      if (BE (ret != REG_NOERROR, 0))
98 		return ret;
99 	    }
100 	}
101       else
102 #endif /* RE_ENABLE_I18N  */
103 	build_upper_buffer (pstr);
104     }
105   else
106     {
107 #ifdef RE_ENABLE_I18N
108       if (dfa->mb_cur_max > 1)
109 	build_wcs_buffer (pstr);
110       else
111 #endif /* RE_ENABLE_I18N  */
112 	{
113 	  if (trans != NULL)
114 	    re_string_translate_buffer (pstr);
115 	  else
116 	    {
117 	      pstr->valid_len = pstr->bufs_len;
118 	      pstr->valid_raw_len = pstr->bufs_len;
119 	    }
120 	}
121     }
122 
123   return REG_NOERROR;
124 }
125 
126 /* Helper functions for re_string_allocate, and re_string_construct.  */
127 
128 static reg_errcode_t
129 internal_function __attribute_warn_unused_result__
re_string_realloc_buffers(re_string_t * pstr,Idx new_buf_len)130 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
131 {
132 #ifdef RE_ENABLE_I18N
133   if (pstr->mb_cur_max > 1)
134     {
135       wint_t *new_wcs;
136 
137       /* Avoid overflow in realloc.  */
138       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
139       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
140 	return REG_ESPACE;
141 
142       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
143       if (BE (new_wcs == NULL, 0))
144 	return REG_ESPACE;
145       pstr->wcs = new_wcs;
146       if (pstr->offsets != NULL)
147 	{
148 	  Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
149 	  if (BE (new_offsets == NULL, 0))
150 	    return REG_ESPACE;
151 	  pstr->offsets = new_offsets;
152 	}
153     }
154 #endif /* RE_ENABLE_I18N  */
155   if (pstr->mbs_allocated)
156     {
157       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
158 					   new_buf_len);
159       if (BE (new_mbs == NULL, 0))
160 	return REG_ESPACE;
161       pstr->mbs = new_mbs;
162     }
163   pstr->bufs_len = new_buf_len;
164   return REG_NOERROR;
165 }
166 
167 
168 static void
169 internal_function
re_string_construct_common(const char * str,Idx len,re_string_t * pstr,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)170 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
171 			    RE_TRANSLATE_TYPE trans, bool icase,
172 			    const re_dfa_t *dfa)
173 {
174   pstr->raw_mbs = (const unsigned char *) str;
175   pstr->len = len;
176   pstr->raw_len = len;
177   pstr->trans = trans;
178   pstr->icase = icase;
179   pstr->mbs_allocated = (trans != NULL || icase);
180   pstr->mb_cur_max = dfa->mb_cur_max;
181   pstr->is_utf8 = dfa->is_utf8;
182   pstr->map_notascii = dfa->map_notascii;
183   pstr->stop = pstr->len;
184   pstr->raw_stop = pstr->stop;
185 }
186 
187 #ifdef RE_ENABLE_I18N
188 
189 /* Build wide character buffer PSTR->WCS.
190    If the byte sequence of the string are:
191      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
192    Then wide character buffer will be:
193      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
194    We use WEOF for padding, they indicate that the position isn't
195    a first byte of a multibyte character.
196 
197    Note that this function assumes PSTR->VALID_LEN elements are already
198    built and starts from PSTR->VALID_LEN.  */
199 
200 static void
201 internal_function
build_wcs_buffer(re_string_t * pstr)202 build_wcs_buffer (re_string_t *pstr)
203 {
204 #ifdef _LIBC
205   unsigned char buf[MB_LEN_MAX];
206   assert (MB_LEN_MAX >= pstr->mb_cur_max);
207 #else
208   unsigned char buf[64];
209 #endif
210   mbstate_t prev_st;
211   Idx byte_idx, end_idx, remain_len;
212   size_t mbclen;
213 
214   /* Build the buffers from pstr->valid_len to either pstr->len or
215      pstr->bufs_len.  */
216   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
217   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
218     {
219       wchar_t wc;
220       const char *p;
221 
222       remain_len = end_idx - byte_idx;
223       prev_st = pstr->cur_state;
224       /* Apply the translation if we need.  */
225       if (BE (pstr->trans != NULL, 0))
226 	{
227 	  int i, ch;
228 
229 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
230 	    {
231 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
232 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
233 	    }
234 	  p = (const char *) buf;
235 	}
236       else
237 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
238       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
239       if (BE (mbclen == (size_t) -1 || mbclen == 0
240 	      || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
241 	{
242 	  /* We treat these cases as a singlebyte character.  */
243 	  mbclen = 1;
244 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
245 	  if (BE (pstr->trans != NULL, 0))
246 	    wc = pstr->trans[wc];
247 	  pstr->cur_state = prev_st;
248 	}
249       else if (BE (mbclen == (size_t) -2, 0))
250 	{
251 	  /* The buffer doesn't have enough space, finish to build.  */
252 	  pstr->cur_state = prev_st;
253 	  break;
254 	}
255 
256       /* Write wide character and padding.  */
257       pstr->wcs[byte_idx++] = wc;
258       /* Write paddings.  */
259       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
260 	pstr->wcs[byte_idx++] = WEOF;
261     }
262   pstr->valid_len = byte_idx;
263   pstr->valid_raw_len = byte_idx;
264 }
265 
266 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
267    but for REG_ICASE.  */
268 
269 static reg_errcode_t
270 internal_function __attribute_warn_unused_result__
build_wcs_upper_buffer(re_string_t * pstr)271 build_wcs_upper_buffer (re_string_t *pstr)
272 {
273   mbstate_t prev_st;
274   Idx src_idx, byte_idx, end_idx, remain_len;
275   size_t mbclen;
276 #ifdef _LIBC
277   char buf[MB_LEN_MAX];
278   assert (MB_LEN_MAX >= pstr->mb_cur_max);
279 #else
280   char buf[64];
281 #endif
282 
283   byte_idx = pstr->valid_len;
284   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
285 
286   /* The following optimization assumes that ASCII characters can be
287      mapped to wide characters with a simple cast.  */
288   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
289     {
290       while (byte_idx < end_idx)
291 	{
292 	  wchar_t wc;
293 
294 	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
295 	      && mbsinit (&pstr->cur_state))
296 	    {
297 	      /* In case of a singlebyte character.  */
298 	      pstr->mbs[byte_idx]
299 		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
300 	      /* The next step uses the assumption that wchar_t is encoded
301 		 ASCII-safe: all ASCII values can be converted like this.  */
302 	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
303 	      ++byte_idx;
304 	      continue;
305 	    }
306 
307 	  remain_len = end_idx - byte_idx;
308 	  prev_st = pstr->cur_state;
309 	  mbclen = __mbrtowc (&wc,
310 			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
311 			       + byte_idx), remain_len, &pstr->cur_state);
312 	  if (BE (mbclen < (size_t) -2, 1))
313 	    {
314 	      wchar_t wcu = __towupper (wc);
315 	      if (wcu != wc)
316 		{
317 		  size_t mbcdlen;
318 
319 		  mbcdlen = __wcrtomb (buf, wcu, &prev_st);
320 		  if (BE (mbclen == mbcdlen, 1))
321 		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
322 		  else
323 		    {
324 		      src_idx = byte_idx;
325 		      goto offsets_needed;
326 		    }
327 		}
328 	      else
329 		memcpy (pstr->mbs + byte_idx,
330 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
331 	      pstr->wcs[byte_idx++] = wcu;
332 	      /* Write paddings.  */
333 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
334 		pstr->wcs[byte_idx++] = WEOF;
335 	    }
336 	  else if (mbclen == (size_t) -1 || mbclen == 0
337 		   || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
338 	    {
339 	      /* It is an invalid character, an incomplete character
340 		 at the end of the string, or '\0'.  Just use the byte.  */
341 	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
342 	      pstr->mbs[byte_idx] = ch;
343 	      /* And also cast it to wide char.  */
344 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
345 	      if (BE (mbclen == (size_t) -1, 0))
346 		pstr->cur_state = prev_st;
347 	    }
348 	  else
349 	    {
350 	      /* The buffer doesn't have enough space, finish to build.  */
351 	      pstr->cur_state = prev_st;
352 	      break;
353 	    }
354 	}
355       pstr->valid_len = byte_idx;
356       pstr->valid_raw_len = byte_idx;
357       return REG_NOERROR;
358     }
359   else
360     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
361       {
362 	wchar_t wc;
363 	const char *p;
364       offsets_needed:
365 	remain_len = end_idx - byte_idx;
366 	prev_st = pstr->cur_state;
367 	if (BE (pstr->trans != NULL, 0))
368 	  {
369 	    int i, ch;
370 
371 	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
372 	      {
373 		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
374 		buf[i] = pstr->trans[ch];
375 	      }
376 	    p = (const char *) buf;
377 	  }
378 	else
379 	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
380 	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
381 	if (BE (mbclen < (size_t) -2, 1))
382 	  {
383 	    wchar_t wcu = __towupper (wc);
384 	    if (wcu != wc)
385 	      {
386 		size_t mbcdlen;
387 
388 		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
389 		if (BE (mbclen == mbcdlen, 1))
390 		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
391 		else if (mbcdlen != (size_t) -1)
392 		  {
393 		    size_t i;
394 
395 		    if (byte_idx + mbcdlen > pstr->bufs_len)
396 		      {
397 			pstr->cur_state = prev_st;
398 			break;
399 		      }
400 
401 		    if (pstr->offsets == NULL)
402 		      {
403 			pstr->offsets = re_malloc (Idx, pstr->bufs_len);
404 
405 			if (pstr->offsets == NULL)
406 			  return REG_ESPACE;
407 		      }
408 		    if (!pstr->offsets_needed)
409 		      {
410 			for (i = 0; i < (size_t) byte_idx; ++i)
411 			  pstr->offsets[i] = i;
412 			pstr->offsets_needed = 1;
413 		      }
414 
415 		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
416 		    pstr->wcs[byte_idx] = wcu;
417 		    pstr->offsets[byte_idx] = src_idx;
418 		    for (i = 1; i < mbcdlen; ++i)
419 		      {
420 			pstr->offsets[byte_idx + i]
421 			  = src_idx + (i < mbclen ? i : mbclen - 1);
422 			pstr->wcs[byte_idx + i] = WEOF;
423 		      }
424 		    pstr->len += mbcdlen - mbclen;
425 		    if (pstr->raw_stop > src_idx)
426 		      pstr->stop += mbcdlen - mbclen;
427 		    end_idx = (pstr->bufs_len > pstr->len)
428 			      ? pstr->len : pstr->bufs_len;
429 		    byte_idx += mbcdlen;
430 		    src_idx += mbclen;
431 		    continue;
432 		  }
433 		else
434 		  memcpy (pstr->mbs + byte_idx, p, mbclen);
435 	      }
436 	    else
437 	      memcpy (pstr->mbs + byte_idx, p, mbclen);
438 
439 	    if (BE (pstr->offsets_needed != 0, 0))
440 	      {
441 		size_t i;
442 		for (i = 0; i < mbclen; ++i)
443 		  pstr->offsets[byte_idx + i] = src_idx + i;
444 	      }
445 	    src_idx += mbclen;
446 
447 	    pstr->wcs[byte_idx++] = wcu;
448 	    /* Write paddings.  */
449 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
450 	      pstr->wcs[byte_idx++] = WEOF;
451 	  }
452 	else if (mbclen == (size_t) -1 || mbclen == 0
453 		 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
454 	  {
455 	    /* It is an invalid character or '\0'.  Just use the byte.  */
456 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
457 
458 	    if (BE (pstr->trans != NULL, 0))
459 	      ch = pstr->trans [ch];
460 	    pstr->mbs[byte_idx] = ch;
461 
462 	    if (BE (pstr->offsets_needed != 0, 0))
463 	      pstr->offsets[byte_idx] = src_idx;
464 	    ++src_idx;
465 
466 	    /* And also cast it to wide char.  */
467 	    pstr->wcs[byte_idx++] = (wchar_t) ch;
468 	    if (BE (mbclen == (size_t) -1, 0))
469 	      pstr->cur_state = prev_st;
470 	  }
471 	else
472 	  {
473 	    /* The buffer doesn't have enough space, finish to build.  */
474 	    pstr->cur_state = prev_st;
475 	    break;
476 	  }
477       }
478   pstr->valid_len = byte_idx;
479   pstr->valid_raw_len = src_idx;
480   return REG_NOERROR;
481 }
482 
483 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
484    Return the index.  */
485 
486 static Idx
487 internal_function
re_string_skip_chars(re_string_t * pstr,Idx new_raw_idx,wint_t * last_wc)488 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
489 {
490   mbstate_t prev_st;
491   Idx rawbuf_idx;
492   size_t mbclen;
493   wint_t wc = WEOF;
494 
495   /* Skip the characters which are not necessary to check.  */
496   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
497        rawbuf_idx < new_raw_idx;)
498     {
499       wchar_t wc2;
500       Idx remain_len = pstr->raw_len - rawbuf_idx;
501       prev_st = pstr->cur_state;
502       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
503 			  remain_len, &pstr->cur_state);
504       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
505 	{
506 	  /* We treat these cases as a single byte character.  */
507 	  if (mbclen == 0 || remain_len == 0)
508 	    wc = L'\0';
509 	  else
510 	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
511 	  mbclen = 1;
512 	  pstr->cur_state = prev_st;
513 	}
514       else
515 	wc = wc2;
516       /* Then proceed the next character.  */
517       rawbuf_idx += mbclen;
518     }
519   *last_wc = wc;
520   return rawbuf_idx;
521 }
522 #endif /* RE_ENABLE_I18N  */
523 
524 /* Build the buffer PSTR->MBS, and apply the translation if we need.
525    This function is used in case of REG_ICASE.  */
526 
527 static void
528 internal_function
build_upper_buffer(re_string_t * pstr)529 build_upper_buffer (re_string_t *pstr)
530 {
531   Idx char_idx, end_idx;
532   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
533 
534   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
535     {
536       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
537       if (BE (pstr->trans != NULL, 0))
538 	ch = pstr->trans[ch];
539       pstr->mbs[char_idx] = toupper (ch);
540     }
541   pstr->valid_len = char_idx;
542   pstr->valid_raw_len = char_idx;
543 }
544 
545 /* Apply TRANS to the buffer in PSTR.  */
546 
547 static void
548 internal_function
re_string_translate_buffer(re_string_t * pstr)549 re_string_translate_buffer (re_string_t *pstr)
550 {
551   Idx buf_idx, end_idx;
552   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
553 
554   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
555     {
556       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
557       pstr->mbs[buf_idx] = pstr->trans[ch];
558     }
559 
560   pstr->valid_len = buf_idx;
561   pstr->valid_raw_len = buf_idx;
562 }
563 
564 /* This function re-construct the buffers.
565    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
566    convert to upper case in case of REG_ICASE, apply translation.  */
567 
568 static reg_errcode_t
569 internal_function __attribute_warn_unused_result__
re_string_reconstruct(re_string_t * pstr,Idx idx,int eflags)570 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
571 {
572   Idx offset;
573 
574   if (BE (pstr->raw_mbs_idx <= idx, 0))
575     offset = idx - pstr->raw_mbs_idx;
576   else
577     {
578       /* Reset buffer.  */
579 #ifdef RE_ENABLE_I18N
580       if (pstr->mb_cur_max > 1)
581 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
582 #endif /* RE_ENABLE_I18N */
583       pstr->len = pstr->raw_len;
584       pstr->stop = pstr->raw_stop;
585       pstr->valid_len = 0;
586       pstr->raw_mbs_idx = 0;
587       pstr->valid_raw_len = 0;
588       pstr->offsets_needed = 0;
589       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
590 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
591       if (!pstr->mbs_allocated)
592 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
593       offset = idx;
594     }
595 
596   if (BE (offset != 0, 1))
597     {
598       /* Should the already checked characters be kept?  */
599       if (BE (offset < pstr->valid_raw_len, 1))
600 	{
601 	  /* Yes, move them to the front of the buffer.  */
602 #ifdef RE_ENABLE_I18N
603 	  if (BE (pstr->offsets_needed, 0))
604 	    {
605 	      Idx low = 0, high = pstr->valid_len, mid;
606 	      do
607 		{
608 		  mid = (high + low) / 2;
609 		  if (pstr->offsets[mid] > offset)
610 		    high = mid;
611 		  else if (pstr->offsets[mid] < offset)
612 		    low = mid + 1;
613 		  else
614 		    break;
615 		}
616 	      while (low < high);
617 	      if (pstr->offsets[mid] < offset)
618 		++mid;
619 	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
620 							eflags);
621 	      /* This can be quite complicated, so handle specially
622 		 only the common and easy case where the character with
623 		 different length representation of lower and upper
624 		 case is present at or after offset.  */
625 	      if (pstr->valid_len > offset
626 		  && mid == offset && pstr->offsets[mid] == offset)
627 		{
628 		  memmove (pstr->wcs, pstr->wcs + offset,
629 			   (pstr->valid_len - offset) * sizeof (wint_t));
630 		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
631 		  pstr->valid_len -= offset;
632 		  pstr->valid_raw_len -= offset;
633 		  for (low = 0; low < pstr->valid_len; low++)
634 		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
635 		}
636 	      else
637 		{
638 		  /* Otherwise, just find out how long the partial multibyte
639 		     character at offset is and fill it with WEOF/255.  */
640 		  pstr->len = pstr->raw_len - idx + offset;
641 		  pstr->stop = pstr->raw_stop - idx + offset;
642 		  pstr->offsets_needed = 0;
643 		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
644 		    --mid;
645 		  while (mid < pstr->valid_len)
646 		    if (pstr->wcs[mid] != WEOF)
647 		      break;
648 		    else
649 		      ++mid;
650 		  if (mid == pstr->valid_len)
651 		    pstr->valid_len = 0;
652 		  else
653 		    {
654 		      pstr->valid_len = pstr->offsets[mid] - offset;
655 		      if (pstr->valid_len)
656 			{
657 			  for (low = 0; low < pstr->valid_len; ++low)
658 			    pstr->wcs[low] = WEOF;
659 			  memset (pstr->mbs, 255, pstr->valid_len);
660 			}
661 		    }
662 		  pstr->valid_raw_len = pstr->valid_len;
663 		}
664 	    }
665 	  else
666 #endif
667 	    {
668 	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
669 							eflags);
670 #ifdef RE_ENABLE_I18N
671 	      if (pstr->mb_cur_max > 1)
672 		memmove (pstr->wcs, pstr->wcs + offset,
673 			 (pstr->valid_len - offset) * sizeof (wint_t));
674 #endif /* RE_ENABLE_I18N */
675 	      if (BE (pstr->mbs_allocated, 0))
676 		memmove (pstr->mbs, pstr->mbs + offset,
677 			 pstr->valid_len - offset);
678 	      pstr->valid_len -= offset;
679 	      pstr->valid_raw_len -= offset;
680 #if defined DEBUG && DEBUG
681 	      assert (pstr->valid_len > 0);
682 #endif
683 	    }
684 	}
685       else
686 	{
687 #ifdef RE_ENABLE_I18N
688 	  /* No, skip all characters until IDX.  */
689 	  Idx prev_valid_len = pstr->valid_len;
690 
691 	  if (BE (pstr->offsets_needed, 0))
692 	    {
693 	      pstr->len = pstr->raw_len - idx + offset;
694 	      pstr->stop = pstr->raw_stop - idx + offset;
695 	      pstr->offsets_needed = 0;
696 	    }
697 #endif
698 	  pstr->valid_len = 0;
699 #ifdef RE_ENABLE_I18N
700 	  if (pstr->mb_cur_max > 1)
701 	    {
702 	      Idx wcs_idx;
703 	      wint_t wc = WEOF;
704 
705 	      if (pstr->is_utf8)
706 		{
707 		  const unsigned char *raw, *p, *end;
708 
709 		  /* Special case UTF-8.  Multi-byte chars start with any
710 		     byte other than 0x80 - 0xbf.  */
711 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
712 		  end = raw + (offset - pstr->mb_cur_max);
713 		  if (end < pstr->raw_mbs)
714 		    end = pstr->raw_mbs;
715 		  p = raw + offset - 1;
716 #ifdef _LIBC
717 		  /* We know the wchar_t encoding is UCS4, so for the simple
718 		     case, ASCII characters, skip the conversion step.  */
719 		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
720 		    {
721 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
722 		      /* pstr->valid_len = 0; */
723 		      wc = (wchar_t) *p;
724 		    }
725 		  else
726 #endif
727 		    for (; p >= end; --p)
728 		      if ((*p & 0xc0) != 0x80)
729 			{
730 			  mbstate_t cur_state;
731 			  wchar_t wc2;
732 			  Idx mlen = raw + pstr->len - p;
733 			  unsigned char buf[6];
734 			  size_t mbclen;
735 
736 			  const unsigned char *pp = p;
737 			  if (BE (pstr->trans != NULL, 0))
738 			    {
739 			      int i = mlen < 6 ? mlen : 6;
740 			      while (--i >= 0)
741 				buf[i] = pstr->trans[p[i]];
742 			      pp = buf;
743 			    }
744 			  /* XXX Don't use mbrtowc, we know which conversion
745 			     to use (UTF-8 -> UCS4).  */
746 			  memset (&cur_state, 0, sizeof (cur_state));
747 			  mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
748 					      &cur_state);
749 			  if (raw + offset - p <= mbclen
750 			      && mbclen < (size_t) -2)
751 			    {
752 			      memset (&pstr->cur_state, '\0',
753 				      sizeof (mbstate_t));
754 			      pstr->valid_len = mbclen - (raw + offset - p);
755 			      wc = wc2;
756 			    }
757 			  break;
758 			}
759 		}
760 
761 	      if (wc == WEOF)
762 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
763 	      if (wc == WEOF)
764 		pstr->tip_context
765 		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
766 	      else
767 		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
768 				      && IS_WIDE_WORD_CHAR (wc))
769 				     ? CONTEXT_WORD
770 				     : ((IS_WIDE_NEWLINE (wc)
771 					 && pstr->newline_anchor)
772 					? CONTEXT_NEWLINE : 0));
773 	      if (BE (pstr->valid_len, 0))
774 		{
775 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
776 		    pstr->wcs[wcs_idx] = WEOF;
777 		  if (pstr->mbs_allocated)
778 		    memset (pstr->mbs, 255, pstr->valid_len);
779 		}
780 	      pstr->valid_raw_len = pstr->valid_len;
781 	    }
782 	  else
783 #endif /* RE_ENABLE_I18N */
784 	    {
785 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
786 	      pstr->valid_raw_len = 0;
787 	      if (pstr->trans)
788 		c = pstr->trans[c];
789 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
790 				   ? CONTEXT_WORD
791 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
792 				      ? CONTEXT_NEWLINE : 0));
793 	    }
794 	}
795       if (!BE (pstr->mbs_allocated, 0))
796 	pstr->mbs += offset;
797     }
798   pstr->raw_mbs_idx = idx;
799   pstr->len -= offset;
800   pstr->stop -= offset;
801 
802   /* Then build the buffers.  */
803 #ifdef RE_ENABLE_I18N
804   if (pstr->mb_cur_max > 1)
805     {
806       if (pstr->icase)
807 	{
808 	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
809 	  if (BE (ret != REG_NOERROR, 0))
810 	    return ret;
811 	}
812       else
813 	build_wcs_buffer (pstr);
814     }
815   else
816 #endif /* RE_ENABLE_I18N */
817     if (BE (pstr->mbs_allocated, 0))
818       {
819 	if (pstr->icase)
820 	  build_upper_buffer (pstr);
821 	else if (pstr->trans != NULL)
822 	  re_string_translate_buffer (pstr);
823       }
824     else
825       pstr->valid_len = pstr->len;
826 
827   pstr->cur_idx = 0;
828   return REG_NOERROR;
829 }
830 
831 static unsigned char
832 internal_function __attribute__ ((pure))
re_string_peek_byte_case(const re_string_t * pstr,Idx idx)833 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
834 {
835   int ch;
836   Idx off;
837 
838   /* Handle the common (easiest) cases first.  */
839   if (BE (!pstr->mbs_allocated, 1))
840     return re_string_peek_byte (pstr, idx);
841 
842 #ifdef RE_ENABLE_I18N
843   if (pstr->mb_cur_max > 1
844       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
845     return re_string_peek_byte (pstr, idx);
846 #endif
847 
848   off = pstr->cur_idx + idx;
849 #ifdef RE_ENABLE_I18N
850   if (pstr->offsets_needed)
851     off = pstr->offsets[off];
852 #endif
853 
854   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
855 
856 #ifdef RE_ENABLE_I18N
857   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
858      this function returns CAPITAL LETTER I instead of first byte of
859      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
860      since peek_byte_case doesn't advance cur_idx in any way.  */
861   if (pstr->offsets_needed && !isascii (ch))
862     return re_string_peek_byte (pstr, idx);
863 #endif
864 
865   return ch;
866 }
867 
868 static unsigned char
869 internal_function
re_string_fetch_byte_case(re_string_t * pstr)870 re_string_fetch_byte_case (re_string_t *pstr)
871 {
872   if (BE (!pstr->mbs_allocated, 1))
873     return re_string_fetch_byte (pstr);
874 
875 #ifdef RE_ENABLE_I18N
876   if (pstr->offsets_needed)
877     {
878       Idx off;
879       int ch;
880 
881       /* For tr_TR.UTF-8 [[:islower:]] there is
882 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
883 	 in that case the whole multi-byte character and return
884 	 the original letter.  On the other side, with
885 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
886 	 anything else would complicate things too much.  */
887 
888       if (!re_string_first_byte (pstr, pstr->cur_idx))
889 	return re_string_fetch_byte (pstr);
890 
891       off = pstr->offsets[pstr->cur_idx];
892       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
893 
894       if (! isascii (ch))
895 	return re_string_fetch_byte (pstr);
896 
897       re_string_skip_bytes (pstr,
898 			    re_string_char_size_at (pstr, pstr->cur_idx));
899       return ch;
900     }
901 #endif
902 
903   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
904 }
905 
906 static void
907 internal_function
re_string_destruct(re_string_t * pstr)908 re_string_destruct (re_string_t *pstr)
909 {
910 #ifdef RE_ENABLE_I18N
911   re_free (pstr->wcs);
912   re_free (pstr->offsets);
913 #endif /* RE_ENABLE_I18N  */
914   if (pstr->mbs_allocated)
915     re_free (pstr->mbs);
916 }
917 
918 /* Return the context at IDX in INPUT.  */
919 
920 static unsigned int
921 internal_function
re_string_context_at(const re_string_t * input,Idx idx,int eflags)922 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
923 {
924   int c;
925   if (BE (idx < 0, 0))
926     /* In this case, we use the value stored in input->tip_context,
927        since we can't know the character in input->mbs[-1] here.  */
928     return input->tip_context;
929   if (BE (idx == input->len, 0))
930     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
931 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
932 #ifdef RE_ENABLE_I18N
933   if (input->mb_cur_max > 1)
934     {
935       wint_t wc;
936       Idx wc_idx = idx;
937       while(input->wcs[wc_idx] == WEOF)
938 	{
939 #if defined DEBUG && DEBUG
940 	  /* It must not happen.  */
941 	  assert (wc_idx >= 0);
942 #endif
943 	  --wc_idx;
944 	  if (wc_idx < 0)
945 	    return input->tip_context;
946 	}
947       wc = input->wcs[wc_idx];
948       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
949 	return CONTEXT_WORD;
950       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
951 	      ? CONTEXT_NEWLINE : 0);
952     }
953   else
954 #endif
955     {
956       c = re_string_byte_at (input, idx);
957       if (bitset_contain (input->word_char, c))
958 	return CONTEXT_WORD;
959       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
960     }
961 }
962 
963 /* Functions for set operation.  */
964 
965 static reg_errcode_t
966 internal_function __attribute_warn_unused_result__
re_node_set_alloc(re_node_set * set,Idx size)967 re_node_set_alloc (re_node_set *set, Idx size)
968 {
969   set->alloc = size;
970   set->nelem = 0;
971   set->elems = re_malloc (Idx, size);
972   if (BE (set->elems == NULL, 0) && (MALLOC_0_IS_NONNULL || size != 0))
973     return REG_ESPACE;
974   return REG_NOERROR;
975 }
976 
977 static reg_errcode_t
978 internal_function __attribute_warn_unused_result__
re_node_set_init_1(re_node_set * set,Idx elem)979 re_node_set_init_1 (re_node_set *set, Idx elem)
980 {
981   set->alloc = 1;
982   set->nelem = 1;
983   set->elems = re_malloc (Idx, 1);
984   if (BE (set->elems == NULL, 0))
985     {
986       set->alloc = set->nelem = 0;
987       return REG_ESPACE;
988     }
989   set->elems[0] = elem;
990   return REG_NOERROR;
991 }
992 
993 static reg_errcode_t
994 internal_function __attribute_warn_unused_result__
re_node_set_init_2(re_node_set * set,Idx elem1,Idx elem2)995 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
996 {
997   set->alloc = 2;
998   set->elems = re_malloc (Idx, 2);
999   if (BE (set->elems == NULL, 0))
1000     return REG_ESPACE;
1001   if (elem1 == elem2)
1002     {
1003       set->nelem = 1;
1004       set->elems[0] = elem1;
1005     }
1006   else
1007     {
1008       set->nelem = 2;
1009       if (elem1 < elem2)
1010 	{
1011 	  set->elems[0] = elem1;
1012 	  set->elems[1] = elem2;
1013 	}
1014       else
1015 	{
1016 	  set->elems[0] = elem2;
1017 	  set->elems[1] = elem1;
1018 	}
1019     }
1020   return REG_NOERROR;
1021 }
1022 
1023 static reg_errcode_t
1024 internal_function __attribute_warn_unused_result__
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)1025 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1026 {
1027   dest->nelem = src->nelem;
1028   if (src->nelem > 0)
1029     {
1030       dest->alloc = dest->nelem;
1031       dest->elems = re_malloc (Idx, dest->alloc);
1032       if (BE (dest->elems == NULL, 0))
1033 	{
1034 	  dest->alloc = dest->nelem = 0;
1035 	  return REG_ESPACE;
1036 	}
1037       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1038     }
1039   else
1040     re_node_set_init_empty (dest);
1041   return REG_NOERROR;
1042 }
1043 
1044 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1045    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1046    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1047 
1048 static reg_errcode_t
1049 internal_function __attribute_warn_unused_result__
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1050 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1051 			   const re_node_set *src2)
1052 {
1053   Idx i1, i2, is, id, delta, sbase;
1054   if (src1->nelem == 0 || src2->nelem == 0)
1055     return REG_NOERROR;
1056 
1057   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1058      conservative estimate.  */
1059   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1060     {
1061       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1062       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1063       if (BE (new_elems == NULL, 0))
1064 	return REG_ESPACE;
1065       dest->elems = new_elems;
1066       dest->alloc = new_alloc;
1067     }
1068 
1069   /* Find the items in the intersection of SRC1 and SRC2, and copy
1070      into the top of DEST those that are not already in DEST itself.  */
1071   sbase = dest->nelem + src1->nelem + src2->nelem;
1072   i1 = src1->nelem - 1;
1073   i2 = src2->nelem - 1;
1074   id = dest->nelem - 1;
1075   for (;;)
1076     {
1077       if (src1->elems[i1] == src2->elems[i2])
1078 	{
1079 	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1080 	  while (id >= 0 && dest->elems[id] > src1->elems[i1])
1081 	    --id;
1082 
1083 	  if (id < 0 || dest->elems[id] != src1->elems[i1])
1084             dest->elems[--sbase] = src1->elems[i1];
1085 
1086 	  if (--i1 < 0 || --i2 < 0)
1087 	    break;
1088 	}
1089 
1090       /* Lower the highest of the two items.  */
1091       else if (src1->elems[i1] < src2->elems[i2])
1092 	{
1093 	  if (--i2 < 0)
1094 	    break;
1095 	}
1096       else
1097 	{
1098 	  if (--i1 < 0)
1099 	    break;
1100 	}
1101     }
1102 
1103   id = dest->nelem - 1;
1104   is = dest->nelem + src1->nelem + src2->nelem - 1;
1105   delta = is - sbase + 1;
1106 
1107   /* Now copy.  When DELTA becomes zero, the remaining
1108      DEST elements are already in place; this is more or
1109      less the same loop that is in re_node_set_merge.  */
1110   dest->nelem += delta;
1111   if (delta > 0 && id >= 0)
1112     for (;;)
1113       {
1114 	if (dest->elems[is] > dest->elems[id])
1115 	  {
1116 	    /* Copy from the top.  */
1117 	    dest->elems[id + delta--] = dest->elems[is--];
1118 	    if (delta == 0)
1119 	      break;
1120 	  }
1121 	else
1122 	  {
1123 	    /* Slide from the bottom.  */
1124 	    dest->elems[id + delta] = dest->elems[id];
1125 	    if (--id < 0)
1126 	      break;
1127 	  }
1128       }
1129 
1130   /* Copy remaining SRC elements.  */
1131   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1132 
1133   return REG_NOERROR;
1134 }
1135 
1136 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1137    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1138 
1139 static reg_errcode_t
1140 internal_function __attribute_warn_unused_result__
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1141 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1142 			const re_node_set *src2)
1143 {
1144   Idx i1, i2, id;
1145   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1146     {
1147       dest->alloc = src1->nelem + src2->nelem;
1148       dest->elems = re_malloc (Idx, dest->alloc);
1149       if (BE (dest->elems == NULL, 0))
1150 	return REG_ESPACE;
1151     }
1152   else
1153     {
1154       if (src1 != NULL && src1->nelem > 0)
1155 	return re_node_set_init_copy (dest, src1);
1156       else if (src2 != NULL && src2->nelem > 0)
1157 	return re_node_set_init_copy (dest, src2);
1158       else
1159 	re_node_set_init_empty (dest);
1160       return REG_NOERROR;
1161     }
1162   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1163     {
1164       if (src1->elems[i1] > src2->elems[i2])
1165 	{
1166 	  dest->elems[id++] = src2->elems[i2++];
1167 	  continue;
1168 	}
1169       if (src1->elems[i1] == src2->elems[i2])
1170 	++i2;
1171       dest->elems[id++] = src1->elems[i1++];
1172     }
1173   if (i1 < src1->nelem)
1174     {
1175       memcpy (dest->elems + id, src1->elems + i1,
1176 	     (src1->nelem - i1) * sizeof (Idx));
1177       id += src1->nelem - i1;
1178     }
1179   else if (i2 < src2->nelem)
1180     {
1181       memcpy (dest->elems + id, src2->elems + i2,
1182 	     (src2->nelem - i2) * sizeof (Idx));
1183       id += src2->nelem - i2;
1184     }
1185   dest->nelem = id;
1186   return REG_NOERROR;
1187 }
1188 
1189 /* Calculate the union set of the sets DEST and SRC. And store it to
1190    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1191 
1192 static reg_errcode_t
1193 internal_function __attribute_warn_unused_result__
re_node_set_merge(re_node_set * dest,const re_node_set * src)1194 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1195 {
1196   Idx is, id, sbase, delta;
1197   if (src == NULL || src->nelem == 0)
1198     return REG_NOERROR;
1199   if (dest->alloc < 2 * src->nelem + dest->nelem)
1200     {
1201       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1202       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1203       if (BE (new_buffer == NULL, 0))
1204 	return REG_ESPACE;
1205       dest->elems = new_buffer;
1206       dest->alloc = new_alloc;
1207     }
1208 
1209   if (BE (dest->nelem == 0, 0))
1210     {
1211       dest->nelem = src->nelem;
1212       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1213       return REG_NOERROR;
1214     }
1215 
1216   /* Copy into the top of DEST the items of SRC that are not
1217      found in DEST.  Maybe we could binary search in DEST?  */
1218   for (sbase = dest->nelem + 2 * src->nelem,
1219        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1220     {
1221       if (dest->elems[id] == src->elems[is])
1222 	is--, id--;
1223       else if (dest->elems[id] < src->elems[is])
1224 	dest->elems[--sbase] = src->elems[is--];
1225       else /* if (dest->elems[id] > src->elems[is]) */
1226 	--id;
1227     }
1228 
1229   if (is >= 0)
1230     {
1231       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1232       sbase -= is + 1;
1233       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1234     }
1235 
1236   id = dest->nelem - 1;
1237   is = dest->nelem + 2 * src->nelem - 1;
1238   delta = is - sbase + 1;
1239   if (delta == 0)
1240     return REG_NOERROR;
1241 
1242   /* Now copy.  When DELTA becomes zero, the remaining
1243      DEST elements are already in place.  */
1244   dest->nelem += delta;
1245   for (;;)
1246     {
1247       if (dest->elems[is] > dest->elems[id])
1248 	{
1249 	  /* Copy from the top.  */
1250 	  dest->elems[id + delta--] = dest->elems[is--];
1251 	  if (delta == 0)
1252 	    break;
1253 	}
1254       else
1255 	{
1256 	  /* Slide from the bottom.  */
1257 	  dest->elems[id + delta] = dest->elems[id];
1258 	  if (--id < 0)
1259 	    {
1260 	      /* Copy remaining SRC elements.  */
1261 	      memcpy (dest->elems, dest->elems + sbase,
1262 		      delta * sizeof (Idx));
1263 	      break;
1264 	    }
1265 	}
1266     }
1267 
1268   return REG_NOERROR;
1269 }
1270 
1271 /* Insert the new element ELEM to the re_node_set* SET.
1272    SET should not already have ELEM.
1273    Return true if successful.  */
1274 
1275 static bool
1276 internal_function __attribute_warn_unused_result__
re_node_set_insert(re_node_set * set,Idx elem)1277 re_node_set_insert (re_node_set *set, Idx elem)
1278 {
1279   Idx idx;
1280   /* In case the set is empty.  */
1281   if (set->alloc == 0)
1282     return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1283 
1284   if (BE (set->nelem, 0) == 0)
1285     {
1286       /* We already guaranteed above that set->alloc != 0.  */
1287       set->elems[0] = elem;
1288       ++set->nelem;
1289       return true;
1290     }
1291 
1292   /* Realloc if we need.  */
1293   if (set->alloc == set->nelem)
1294     {
1295       Idx *new_elems;
1296       set->alloc = set->alloc * 2;
1297       new_elems = re_realloc (set->elems, Idx, set->alloc);
1298       if (BE (new_elems == NULL, 0))
1299 	return false;
1300       set->elems = new_elems;
1301     }
1302 
1303   /* Move the elements which follows the new element.  Test the
1304      first element separately to skip a check in the inner loop.  */
1305   if (elem < set->elems[0])
1306     {
1307       idx = 0;
1308       for (idx = set->nelem; idx > 0; idx--)
1309 	set->elems[idx] = set->elems[idx - 1];
1310     }
1311   else
1312     {
1313       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1314 	set->elems[idx] = set->elems[idx - 1];
1315     }
1316 
1317   /* Insert the new element.  */
1318   set->elems[idx] = elem;
1319   ++set->nelem;
1320   return true;
1321 }
1322 
1323 /* Insert the new element ELEM to the re_node_set* SET.
1324    SET should not already have any element greater than or equal to ELEM.
1325    Return true if successful.  */
1326 
1327 static bool
1328 internal_function __attribute_warn_unused_result__
re_node_set_insert_last(re_node_set * set,Idx elem)1329 re_node_set_insert_last (re_node_set *set, Idx elem)
1330 {
1331   /* Realloc if we need.  */
1332   if (set->alloc == set->nelem)
1333     {
1334       Idx *new_elems;
1335       set->alloc = (set->alloc + 1) * 2;
1336       new_elems = re_realloc (set->elems, Idx, set->alloc);
1337       if (BE (new_elems == NULL, 0))
1338 	return false;
1339       set->elems = new_elems;
1340     }
1341 
1342   /* Insert the new element.  */
1343   set->elems[set->nelem++] = elem;
1344   return true;
1345 }
1346 
1347 /* Compare two node sets SET1 and SET2.
1348    Return true if SET1 and SET2 are equivalent.  */
1349 
1350 static bool
1351 internal_function __attribute__ ((pure))
re_node_set_compare(const re_node_set * set1,const re_node_set * set2)1352 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1353 {
1354   Idx i;
1355   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1356     return false;
1357   for (i = set1->nelem ; --i >= 0 ; )
1358     if (set1->elems[i] != set2->elems[i])
1359       return false;
1360   return true;
1361 }
1362 
1363 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1364 
1365 static Idx
1366 internal_function __attribute__ ((pure))
re_node_set_contains(const re_node_set * set,Idx elem)1367 re_node_set_contains (const re_node_set *set, Idx elem)
1368 {
1369   __re_size_t idx, right, mid;
1370   if (set->nelem <= 0)
1371     return 0;
1372 
1373   /* Binary search the element.  */
1374   idx = 0;
1375   right = set->nelem - 1;
1376   while (idx < right)
1377     {
1378       mid = (idx + right) / 2;
1379       if (set->elems[mid] < elem)
1380 	idx = mid + 1;
1381       else
1382 	right = mid;
1383     }
1384   return set->elems[idx] == elem ? idx + 1 : 0;
1385 }
1386 
1387 static void
1388 internal_function
re_node_set_remove_at(re_node_set * set,Idx idx)1389 re_node_set_remove_at (re_node_set *set, Idx idx)
1390 {
1391   if (idx < 0 || idx >= set->nelem)
1392     return;
1393   --set->nelem;
1394   for (; idx < set->nelem; idx++)
1395     set->elems[idx] = set->elems[idx + 1];
1396 }
1397 
1398 
1399 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1400    Or return -1 if an error occurred.  */
1401 
1402 static Idx
1403 internal_function
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)1404 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1405 {
1406   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1407     {
1408       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1409       Idx *new_nexts, *new_indices;
1410       re_node_set *new_edests, *new_eclosures;
1411       re_token_t *new_nodes;
1412 
1413       /* Avoid overflows in realloc.  */
1414       const size_t max_object_size = MAX (sizeof (re_token_t),
1415 					  MAX (sizeof (re_node_set),
1416 					       sizeof (Idx)));
1417       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
1418 	return -1;
1419 
1420       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1421       if (BE (new_nodes == NULL, 0))
1422 	return -1;
1423       dfa->nodes = new_nodes;
1424       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1425       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1426       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1427       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1428       if (BE (new_nexts == NULL || new_indices == NULL
1429 	      || new_edests == NULL || new_eclosures == NULL, 0))
1430 	{
1431 	   re_free (new_nexts);
1432 	   re_free (new_indices);
1433 	   re_free (new_edests);
1434 	   re_free (new_eclosures);
1435 	   return -1;
1436 	}
1437       dfa->nexts = new_nexts;
1438       dfa->org_indices = new_indices;
1439       dfa->edests = new_edests;
1440       dfa->eclosures = new_eclosures;
1441       dfa->nodes_alloc = new_nodes_alloc;
1442     }
1443   dfa->nodes[dfa->nodes_len] = token;
1444   dfa->nodes[dfa->nodes_len].constraint = 0;
1445 #ifdef RE_ENABLE_I18N
1446   dfa->nodes[dfa->nodes_len].accept_mb =
1447     ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1448      || token.type == COMPLEX_BRACKET);
1449 #endif
1450   dfa->nexts[dfa->nodes_len] = -1;
1451   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1452   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1453   return dfa->nodes_len++;
1454 }
1455 
1456 static re_hashval_t
1457 internal_function
calc_state_hash(const re_node_set * nodes,unsigned int context)1458 calc_state_hash (const re_node_set *nodes, unsigned int context)
1459 {
1460   re_hashval_t hash = nodes->nelem + context;
1461   Idx i;
1462   for (i = 0 ; i < nodes->nelem ; i++)
1463     hash += nodes->elems[i];
1464   return hash;
1465 }
1466 
1467 /* Search for the state whose node_set is equivalent to NODES.
1468    Return the pointer to the state, if we found it in the DFA.
1469    Otherwise create the new one and return it.  In case of an error
1470    return NULL and set the error code in ERR.
1471    Note: - We assume NULL as the invalid state, then it is possible that
1472 	   return value is NULL and ERR is REG_NOERROR.
1473 	 - We never return non-NULL value in case of any errors, it is for
1474 	   optimization.  */
1475 
1476 static re_dfastate_t *
1477 internal_function __attribute_warn_unused_result__
re_acquire_state(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes)1478 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1479 		  const re_node_set *nodes)
1480 {
1481   re_hashval_t hash;
1482   re_dfastate_t *new_state;
1483   struct re_state_table_entry *spot;
1484   Idx i;
1485 #if defined GCC_LINT || defined lint
1486   /* Suppress bogus uninitialized-variable warnings.  */
1487   *err = REG_NOERROR;
1488 #endif
1489   if (BE (nodes->nelem == 0, 0))
1490     {
1491       *err = REG_NOERROR;
1492       return NULL;
1493     }
1494   hash = calc_state_hash (nodes, 0);
1495   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1496 
1497   for (i = 0 ; i < spot->num ; i++)
1498     {
1499       re_dfastate_t *state = spot->array[i];
1500       if (hash != state->hash)
1501 	continue;
1502       if (re_node_set_compare (&state->nodes, nodes))
1503 	return state;
1504     }
1505 
1506   /* There are no appropriate state in the dfa, create the new one.  */
1507   new_state = create_ci_newstate (dfa, nodes, hash);
1508   if (BE (new_state == NULL, 0))
1509     *err = REG_ESPACE;
1510 
1511   return new_state;
1512 }
1513 
1514 /* Search for the state whose node_set is equivalent to NODES and
1515    whose context is equivalent to CONTEXT.
1516    Return the pointer to the state, if we found it in the DFA.
1517    Otherwise create the new one and return it.  In case of an error
1518    return NULL and set the error code in ERR.
1519    Note: - We assume NULL as the invalid state, then it is possible that
1520 	   return value is NULL and ERR is REG_NOERROR.
1521 	 - We never return non-NULL value in case of any errors, it is for
1522 	   optimization.  */
1523 
1524 static re_dfastate_t *
1525 internal_function __attribute_warn_unused_result__
re_acquire_state_context(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)1526 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1527 			  const re_node_set *nodes, unsigned int context)
1528 {
1529   re_hashval_t hash;
1530   re_dfastate_t *new_state;
1531   struct re_state_table_entry *spot;
1532   Idx i;
1533 #if defined GCC_LINT || defined lint
1534   /* Suppress bogus uninitialized-variable warnings.  */
1535   *err = REG_NOERROR;
1536 #endif
1537   if (nodes->nelem == 0)
1538     {
1539       *err = REG_NOERROR;
1540       return NULL;
1541     }
1542   hash = calc_state_hash (nodes, context);
1543   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1544 
1545   for (i = 0 ; i < spot->num ; i++)
1546     {
1547       re_dfastate_t *state = spot->array[i];
1548       if (state->hash == hash
1549 	  && state->context == context
1550 	  && re_node_set_compare (state->entrance_nodes, nodes))
1551 	return state;
1552     }
1553   /* There are no appropriate state in 'dfa', create the new one.  */
1554   new_state = create_cd_newstate (dfa, nodes, context, hash);
1555   if (BE (new_state == NULL, 0))
1556     *err = REG_ESPACE;
1557 
1558   return new_state;
1559 }
1560 
1561 /* Finish initialization of the new state NEWSTATE, and using its hash value
1562    HASH put in the appropriate bucket of DFA's state table.  Return value
1563    indicates the error code if failed.  */
1564 
1565 static reg_errcode_t
1566 __attribute_warn_unused_result__
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,re_hashval_t hash)1567 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1568 		re_hashval_t hash)
1569 {
1570   struct re_state_table_entry *spot;
1571   reg_errcode_t err;
1572   Idx i;
1573 
1574   newstate->hash = hash;
1575   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1576   if (BE (err != REG_NOERROR, 0))
1577     return REG_ESPACE;
1578   for (i = 0; i < newstate->nodes.nelem; i++)
1579     {
1580       Idx elem = newstate->nodes.elems[i];
1581       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1582 	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1583 	  return REG_ESPACE;
1584     }
1585 
1586   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1587   if (BE (spot->alloc <= spot->num, 0))
1588     {
1589       Idx new_alloc = 2 * spot->num + 2;
1590       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1591 					      new_alloc);
1592       if (BE (new_array == NULL, 0))
1593 	return REG_ESPACE;
1594       spot->array = new_array;
1595       spot->alloc = new_alloc;
1596     }
1597   spot->array[spot->num++] = newstate;
1598   return REG_NOERROR;
1599 }
1600 
1601 static void
free_state(re_dfastate_t * state)1602 free_state (re_dfastate_t *state)
1603 {
1604   re_node_set_free (&state->non_eps_nodes);
1605   re_node_set_free (&state->inveclosure);
1606   if (state->entrance_nodes != &state->nodes)
1607     {
1608       re_node_set_free (state->entrance_nodes);
1609       re_free (state->entrance_nodes);
1610     }
1611   re_node_set_free (&state->nodes);
1612   re_free (state->word_trtable);
1613   re_free (state->trtable);
1614   re_free (state);
1615 }
1616 
1617 /* Create the new state which is independent of contexts.
1618    Return the new state if succeeded, otherwise return NULL.  */
1619 
1620 static re_dfastate_t *
1621 internal_function __attribute_warn_unused_result__
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,re_hashval_t hash)1622 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1623 		    re_hashval_t hash)
1624 {
1625   Idx i;
1626   reg_errcode_t err;
1627   re_dfastate_t *newstate;
1628 
1629   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1630   if (BE (newstate == NULL, 0))
1631     return NULL;
1632   err = re_node_set_init_copy (&newstate->nodes, nodes);
1633   if (BE (err != REG_NOERROR, 0))
1634     {
1635       re_free (newstate);
1636       return NULL;
1637     }
1638 
1639   newstate->entrance_nodes = &newstate->nodes;
1640   for (i = 0 ; i < nodes->nelem ; i++)
1641     {
1642       re_token_t *node = dfa->nodes + nodes->elems[i];
1643       re_token_type_t type = node->type;
1644       if (type == CHARACTER && !node->constraint)
1645 	continue;
1646 #ifdef RE_ENABLE_I18N
1647       newstate->accept_mb |= node->accept_mb;
1648 #endif /* RE_ENABLE_I18N */
1649 
1650       /* If the state has the halt node, the state is a halt state.  */
1651       if (type == END_OF_RE)
1652 	newstate->halt = 1;
1653       else if (type == OP_BACK_REF)
1654 	newstate->has_backref = 1;
1655       else if (type == ANCHOR || node->constraint)
1656 	newstate->has_constraint = 1;
1657     }
1658   err = register_state (dfa, newstate, hash);
1659   if (BE (err != REG_NOERROR, 0))
1660     {
1661       free_state (newstate);
1662       newstate = NULL;
1663     }
1664   return newstate;
1665 }
1666 
1667 /* Create the new state which is depend on the context CONTEXT.
1668    Return the new state if succeeded, otherwise return NULL.  */
1669 
1670 static re_dfastate_t *
1671 internal_function __attribute_warn_unused_result__
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,re_hashval_t hash)1672 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1673 		    unsigned int context, re_hashval_t hash)
1674 {
1675   Idx i, nctx_nodes = 0;
1676   reg_errcode_t err;
1677   re_dfastate_t *newstate;
1678 
1679   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1680   if (BE (newstate == NULL, 0))
1681     return NULL;
1682   err = re_node_set_init_copy (&newstate->nodes, nodes);
1683   if (BE (err != REG_NOERROR, 0))
1684     {
1685       re_free (newstate);
1686       return NULL;
1687     }
1688 
1689   newstate->context = context;
1690   newstate->entrance_nodes = &newstate->nodes;
1691 
1692   for (i = 0 ; i < nodes->nelem ; i++)
1693     {
1694       re_token_t *node = dfa->nodes + nodes->elems[i];
1695       re_token_type_t type = node->type;
1696       unsigned int constraint = node->constraint;
1697 
1698       if (type == CHARACTER && !constraint)
1699 	continue;
1700 #ifdef RE_ENABLE_I18N
1701       newstate->accept_mb |= node->accept_mb;
1702 #endif /* RE_ENABLE_I18N */
1703 
1704       /* If the state has the halt node, the state is a halt state.  */
1705       if (type == END_OF_RE)
1706 	newstate->halt = 1;
1707       else if (type == OP_BACK_REF)
1708 	newstate->has_backref = 1;
1709 
1710       if (constraint)
1711 	{
1712 	  if (newstate->entrance_nodes == &newstate->nodes)
1713 	    {
1714 	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1715 	      if (BE (newstate->entrance_nodes == NULL, 0))
1716 		{
1717 		  free_state (newstate);
1718 		  return NULL;
1719 		}
1720 	      if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1721 		  != REG_NOERROR)
1722 		return NULL;
1723 	      nctx_nodes = 0;
1724 	      newstate->has_constraint = 1;
1725 	    }
1726 
1727 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1728 	    {
1729 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1730 	      ++nctx_nodes;
1731 	    }
1732 	}
1733     }
1734   err = register_state (dfa, newstate, hash);
1735   if (BE (err != REG_NOERROR, 0))
1736     {
1737       free_state (newstate);
1738       newstate = NULL;
1739     }
1740   return  newstate;
1741 }
1742