1 /* Extended regular expression matching and search library.
2    Copyright (C) 2002-2014 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5 
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10 
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15 
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <http://www.gnu.org/licenses/>.  */
19 
20 static void re_string_construct_common (const char *str, Idx len,
21 					re_string_t *pstr,
22 					RE_TRANSLATE_TYPE trans, bool icase,
23 					const re_dfa_t *dfa) internal_function;
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 					  const re_node_set *nodes,
26 					  re_hashval_t hash) internal_function;
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 					  const re_node_set *nodes,
29 					  unsigned int context,
30 					  re_hashval_t hash) internal_function;
31 
32 /* Functions for string operation.  */
33 
34 /* This function allocate the buffers.  It is necessary to call
35    re_string_reconstruct before using the object.  */
36 
37 static reg_errcode_t
38 internal_function __attribute_warn_unused_result__
re_string_allocate(re_string_t * pstr,const char * str,Idx len,Idx init_len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)39 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
40 		    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
41 {
42   reg_errcode_t ret;
43   Idx init_buf_len;
44 
45   /* Ensure at least one character fits into the buffers.  */
46   if (init_len < dfa->mb_cur_max)
47     init_len = dfa->mb_cur_max;
48   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49   re_string_construct_common (str, len, pstr, trans, icase, dfa);
50 
51   ret = re_string_realloc_buffers (pstr, init_buf_len);
52   if (BE (ret != REG_NOERROR, 0))
53     return ret;
54 
55   pstr->word_char = dfa->word_char;
56   pstr->word_ops_used = dfa->word_ops_used;
57   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59   pstr->valid_raw_len = pstr->valid_len;
60   return REG_NOERROR;
61 }
62 
63 /* This function allocate the buffers, and initialize them.  */
64 
65 static reg_errcode_t
66 internal_function __attribute_warn_unused_result__
re_string_construct(re_string_t * pstr,const char * str,Idx len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)67 re_string_construct (re_string_t *pstr, const char *str, Idx len,
68 		     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
69 {
70   reg_errcode_t ret;
71   memset (pstr, '\0', sizeof (re_string_t));
72   re_string_construct_common (str, len, pstr, trans, icase, dfa);
73 
74   if (len > 0)
75     {
76       ret = re_string_realloc_buffers (pstr, len + 1);
77       if (BE (ret != REG_NOERROR, 0))
78 	return ret;
79     }
80   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
81 
82   if (icase)
83     {
84 #ifdef RE_ENABLE_I18N
85       if (dfa->mb_cur_max > 1)
86 	{
87 	  while (1)
88 	    {
89 	      ret = build_wcs_upper_buffer (pstr);
90 	      if (BE (ret != REG_NOERROR, 0))
91 		return ret;
92 	      if (pstr->valid_raw_len >= len)
93 		break;
94 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
95 		break;
96 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97 	      if (BE (ret != REG_NOERROR, 0))
98 		return ret;
99 	    }
100 	}
101       else
102 #endif /* RE_ENABLE_I18N  */
103 	build_upper_buffer (pstr);
104     }
105   else
106     {
107 #ifdef RE_ENABLE_I18N
108       if (dfa->mb_cur_max > 1)
109 	build_wcs_buffer (pstr);
110       else
111 #endif /* RE_ENABLE_I18N  */
112 	{
113 	  if (trans != NULL)
114 	    re_string_translate_buffer (pstr);
115 	  else
116 	    {
117 	      pstr->valid_len = pstr->bufs_len;
118 	      pstr->valid_raw_len = pstr->bufs_len;
119 	    }
120 	}
121     }
122 
123   return REG_NOERROR;
124 }
125 
126 /* Helper functions for re_string_allocate, and re_string_construct.  */
127 
128 static reg_errcode_t
129 internal_function __attribute_warn_unused_result__
re_string_realloc_buffers(re_string_t * pstr,Idx new_buf_len)130 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
131 {
132 #ifdef RE_ENABLE_I18N
133   if (pstr->mb_cur_max > 1)
134     {
135       wint_t *new_wcs;
136 
137       /* Avoid overflow in realloc.  */
138       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
139       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
140 	return REG_ESPACE;
141 
142       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
143       if (BE (new_wcs == NULL, 0))
144 	return REG_ESPACE;
145       pstr->wcs = new_wcs;
146       if (pstr->offsets != NULL)
147 	{
148 	  Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
149 	  if (BE (new_offsets == NULL, 0))
150 	    return REG_ESPACE;
151 	  pstr->offsets = new_offsets;
152 	}
153     }
154 #endif /* RE_ENABLE_I18N  */
155   if (pstr->mbs_allocated)
156     {
157       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
158 					   new_buf_len);
159       if (BE (new_mbs == NULL, 0))
160 	return REG_ESPACE;
161       pstr->mbs = new_mbs;
162     }
163   pstr->bufs_len = new_buf_len;
164   return REG_NOERROR;
165 }
166 
167 
168 static void
169 internal_function
re_string_construct_common(const char * str,Idx len,re_string_t * pstr,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)170 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
171 			    RE_TRANSLATE_TYPE trans, bool icase,
172 			    const re_dfa_t *dfa)
173 {
174   pstr->raw_mbs = (const unsigned char *) str;
175   pstr->len = len;
176   pstr->raw_len = len;
177   pstr->trans = trans;
178   pstr->icase = icase;
179   pstr->mbs_allocated = (trans != NULL || icase);
180   pstr->mb_cur_max = dfa->mb_cur_max;
181   pstr->is_utf8 = dfa->is_utf8;
182   pstr->map_notascii = dfa->map_notascii;
183   pstr->stop = pstr->len;
184   pstr->raw_stop = pstr->stop;
185 }
186 
187 #ifdef RE_ENABLE_I18N
188 
189 /* Build wide character buffer PSTR->WCS.
190    If the byte sequence of the string are:
191      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
192    Then wide character buffer will be:
193      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
194    We use WEOF for padding, they indicate that the position isn't
195    a first byte of a multibyte character.
196 
197    Note that this function assumes PSTR->VALID_LEN elements are already
198    built and starts from PSTR->VALID_LEN.  */
199 
200 static void
201 internal_function
build_wcs_buffer(re_string_t * pstr)202 build_wcs_buffer (re_string_t *pstr)
203 {
204 #ifdef _LIBC
205   unsigned char buf[MB_LEN_MAX];
206   assert (MB_LEN_MAX >= pstr->mb_cur_max);
207 #else
208   unsigned char buf[64];
209 #endif
210   mbstate_t prev_st;
211   Idx byte_idx, end_idx, remain_len;
212   size_t mbclen;
213 
214   /* Build the buffers from pstr->valid_len to either pstr->len or
215      pstr->bufs_len.  */
216   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
217   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
218     {
219       wchar_t wc;
220       const char *p;
221 
222       remain_len = end_idx - byte_idx;
223       prev_st = pstr->cur_state;
224       /* Apply the translation if we need.  */
225       if (BE (pstr->trans != NULL, 0))
226 	{
227 	  int i, ch;
228 
229 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
230 	    {
231 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
232 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
233 	    }
234 	  p = (const char *) buf;
235 	}
236       else
237 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
238       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
239       if (BE (mbclen == (size_t) -1 || mbclen == 0
240 	      || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
241 	{
242 	  /* We treat these cases as a singlebyte character.  */
243 	  mbclen = 1;
244 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
245 	  if (BE (pstr->trans != NULL, 0))
246 	    wc = pstr->trans[wc];
247 	  pstr->cur_state = prev_st;
248 	}
249       else if (BE (mbclen == (size_t) -2, 0))
250 	{
251 	  /* The buffer doesn't have enough space, finish to build.  */
252 	  pstr->cur_state = prev_st;
253 	  break;
254 	}
255 
256       /* Write wide character and padding.  */
257       pstr->wcs[byte_idx++] = wc;
258       /* Write paddings.  */
259       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
260 	pstr->wcs[byte_idx++] = WEOF;
261     }
262   pstr->valid_len = byte_idx;
263   pstr->valid_raw_len = byte_idx;
264 }
265 
266 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
267    but for REG_ICASE.  */
268 
269 static reg_errcode_t
270 internal_function __attribute_warn_unused_result__
build_wcs_upper_buffer(re_string_t * pstr)271 build_wcs_upper_buffer (re_string_t *pstr)
272 {
273   mbstate_t prev_st;
274   Idx src_idx, byte_idx, end_idx, remain_len;
275   size_t mbclen;
276 #ifdef _LIBC
277   char buf[MB_LEN_MAX];
278   assert (MB_LEN_MAX >= pstr->mb_cur_max);
279 #else
280   char buf[64];
281 #endif
282 
283   byte_idx = pstr->valid_len;
284   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
285 
286   /* The following optimization assumes that ASCII characters can be
287      mapped to wide characters with a simple cast.  */
288   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
289     {
290       while (byte_idx < end_idx)
291 	{
292 	  wchar_t wc;
293 
294 	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
295 	      && mbsinit (&pstr->cur_state))
296 	    {
297 	      /* In case of a singlebyte character.  */
298 	      pstr->mbs[byte_idx]
299 		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
300 	      /* The next step uses the assumption that wchar_t is encoded
301 		 ASCII-safe: all ASCII values can be converted like this.  */
302 	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
303 	      ++byte_idx;
304 	      continue;
305 	    }
306 
307 	  remain_len = end_idx - byte_idx;
308 	  prev_st = pstr->cur_state;
309 	  mbclen = __mbrtowc (&wc,
310 			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
311 			       + byte_idx), remain_len, &pstr->cur_state);
312 	  if (BE (mbclen < (size_t) -2, 1))
313 	    {
314 	      wchar_t wcu = towupper (wc);
315 	      if (wcu != wc)
316 		{
317 		  size_t mbcdlen;
318 
319 		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
320 		  if (BE (mbclen == mbcdlen, 1))
321 		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
322 		  else
323 		    {
324 		      src_idx = byte_idx;
325 		      goto offsets_needed;
326 		    }
327 		}
328 	      else
329 		memcpy (pstr->mbs + byte_idx,
330 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
331 	      pstr->wcs[byte_idx++] = wcu;
332 	      /* Write paddings.  */
333 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
334 		pstr->wcs[byte_idx++] = WEOF;
335 	    }
336 	  else if (mbclen == (size_t) -1 || mbclen == 0
337 		   || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
338 	    {
339 	      /* It is an invalid character, an incomplete character
340 		 at the end of the string, or '\0'.  Just use the byte.  */
341 	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
342 	      pstr->mbs[byte_idx] = ch;
343 	      /* And also cast it to wide char.  */
344 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
345 	      if (BE (mbclen == (size_t) -1, 0))
346 		pstr->cur_state = prev_st;
347 	    }
348 	  else
349 	    {
350 	      /* The buffer doesn't have enough space, finish to build.  */
351 	      pstr->cur_state = prev_st;
352 	      break;
353 	    }
354 	}
355       pstr->valid_len = byte_idx;
356       pstr->valid_raw_len = byte_idx;
357       return REG_NOERROR;
358     }
359   else
360     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
361       {
362 	wchar_t wc;
363 	const char *p;
364       offsets_needed:
365 	remain_len = end_idx - byte_idx;
366 	prev_st = pstr->cur_state;
367 	if (BE (pstr->trans != NULL, 0))
368 	  {
369 	    int i, ch;
370 
371 	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
372 	      {
373 		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
374 		buf[i] = pstr->trans[ch];
375 	      }
376 	    p = (const char *) buf;
377 	  }
378 	else
379 	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
380 	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
381 	if (BE (mbclen < (size_t) -2, 1))
382 	  {
383 	    wchar_t wcu = towupper (wc);
384 	    if (wcu != wc)
385 	      {
386 		size_t mbcdlen;
387 
388 		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
389 		if (BE (mbclen == mbcdlen, 1))
390 		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
391 		else if (mbcdlen != (size_t) -1)
392 		  {
393 		    size_t i;
394 
395 		    if (byte_idx + mbcdlen > pstr->bufs_len)
396 		      {
397 			pstr->cur_state = prev_st;
398 			break;
399 		      }
400 
401 		    if (pstr->offsets == NULL)
402 		      {
403 			pstr->offsets = re_malloc (Idx, pstr->bufs_len);
404 
405 			if (pstr->offsets == NULL)
406 			  return REG_ESPACE;
407 		      }
408 		    if (!pstr->offsets_needed)
409 		      {
410 			for (i = 0; i < (size_t) byte_idx; ++i)
411 			  pstr->offsets[i] = i;
412 			pstr->offsets_needed = 1;
413 		      }
414 
415 		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
416 		    pstr->wcs[byte_idx] = wcu;
417 		    pstr->offsets[byte_idx] = src_idx;
418 		    for (i = 1; i < mbcdlen; ++i)
419 		      {
420 			pstr->offsets[byte_idx + i]
421 			  = src_idx + (i < mbclen ? i : mbclen - 1);
422 			pstr->wcs[byte_idx + i] = WEOF;
423 		      }
424 		    pstr->len += mbcdlen - mbclen;
425 		    if (pstr->raw_stop > src_idx)
426 		      pstr->stop += mbcdlen - mbclen;
427 		    end_idx = (pstr->bufs_len > pstr->len)
428 			      ? pstr->len : pstr->bufs_len;
429 		    byte_idx += mbcdlen;
430 		    src_idx += mbclen;
431 		    continue;
432 		  }
433 		else
434 		  memcpy (pstr->mbs + byte_idx, p, mbclen);
435 	      }
436 	    else
437 	      memcpy (pstr->mbs + byte_idx, p, mbclen);
438 
439 	    if (BE (pstr->offsets_needed != 0, 0))
440 	      {
441 		size_t i;
442 		for (i = 0; i < mbclen; ++i)
443 		  pstr->offsets[byte_idx + i] = src_idx + i;
444 	      }
445 	    src_idx += mbclen;
446 
447 	    pstr->wcs[byte_idx++] = wcu;
448 	    /* Write paddings.  */
449 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
450 	      pstr->wcs[byte_idx++] = WEOF;
451 	  }
452 	else if (mbclen == (size_t) -1 || mbclen == 0
453 		 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
454 	  {
455 	    /* It is an invalid character or '\0'.  Just use the byte.  */
456 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
457 
458 	    if (BE (pstr->trans != NULL, 0))
459 	      ch = pstr->trans [ch];
460 	    pstr->mbs[byte_idx] = ch;
461 
462 	    if (BE (pstr->offsets_needed != 0, 0))
463 	      pstr->offsets[byte_idx] = src_idx;
464 	    ++src_idx;
465 
466 	    /* And also cast it to wide char.  */
467 	    pstr->wcs[byte_idx++] = (wchar_t) ch;
468 	    if (BE (mbclen == (size_t) -1, 0))
469 	      pstr->cur_state = prev_st;
470 	  }
471 	else
472 	  {
473 	    /* The buffer doesn't have enough space, finish to build.  */
474 	    pstr->cur_state = prev_st;
475 	    break;
476 	  }
477       }
478   pstr->valid_len = byte_idx;
479   pstr->valid_raw_len = src_idx;
480   return REG_NOERROR;
481 }
482 
483 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
484    Return the index.  */
485 
486 static Idx
487 internal_function
re_string_skip_chars(re_string_t * pstr,Idx new_raw_idx,wint_t * last_wc)488 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
489 {
490   mbstate_t prev_st;
491   Idx rawbuf_idx;
492   size_t mbclen;
493   wint_t wc = WEOF;
494 
495   /* Skip the characters which are not necessary to check.  */
496   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
497        rawbuf_idx < new_raw_idx;)
498     {
499       wchar_t wc2;
500       Idx remain_len = pstr->raw_len - rawbuf_idx;
501       prev_st = pstr->cur_state;
502       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
503 			  remain_len, &pstr->cur_state);
504       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
505 	{
506 	  /* We treat these cases as a single byte character.  */
507 	  if (mbclen == 0 || remain_len == 0)
508 	    wc = L'\0';
509 	  else
510 	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
511 	  mbclen = 1;
512 	  pstr->cur_state = prev_st;
513 	}
514       else
515 	wc = wc2;
516       /* Then proceed the next character.  */
517       rawbuf_idx += mbclen;
518     }
519   *last_wc = wc;
520   return rawbuf_idx;
521 }
522 #endif /* RE_ENABLE_I18N  */
523 
524 /* Build the buffer PSTR->MBS, and apply the translation if we need.
525    This function is used in case of REG_ICASE.  */
526 
527 static void
528 internal_function
build_upper_buffer(re_string_t * pstr)529 build_upper_buffer (re_string_t *pstr)
530 {
531   Idx char_idx, end_idx;
532   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
533 
534   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
535     {
536       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
537       if (BE (pstr->trans != NULL, 0))
538 	ch = pstr->trans[ch];
539       pstr->mbs[char_idx] = toupper (ch);
540     }
541   pstr->valid_len = char_idx;
542   pstr->valid_raw_len = char_idx;
543 }
544 
545 /* Apply TRANS to the buffer in PSTR.  */
546 
547 static void
548 internal_function
re_string_translate_buffer(re_string_t * pstr)549 re_string_translate_buffer (re_string_t *pstr)
550 {
551   Idx buf_idx, end_idx;
552   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
553 
554   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
555     {
556       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
557       pstr->mbs[buf_idx] = pstr->trans[ch];
558     }
559 
560   pstr->valid_len = buf_idx;
561   pstr->valid_raw_len = buf_idx;
562 }
563 
564 /* This function re-construct the buffers.
565    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
566    convert to upper case in case of REG_ICASE, apply translation.  */
567 
568 static reg_errcode_t
569 internal_function __attribute_warn_unused_result__
re_string_reconstruct(re_string_t * pstr,Idx idx,int eflags)570 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
571 {
572   Idx offset;
573 
574   if (BE (pstr->raw_mbs_idx <= idx, 0))
575     offset = idx - pstr->raw_mbs_idx;
576   else
577     {
578       /* Reset buffer.  */
579 #ifdef RE_ENABLE_I18N
580       if (pstr->mb_cur_max > 1)
581 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
582 #endif /* RE_ENABLE_I18N */
583       pstr->len = pstr->raw_len;
584       pstr->stop = pstr->raw_stop;
585       pstr->valid_len = 0;
586       pstr->raw_mbs_idx = 0;
587       pstr->valid_raw_len = 0;
588       pstr->offsets_needed = 0;
589       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
590 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
591       if (!pstr->mbs_allocated)
592 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
593       offset = idx;
594     }
595 
596   if (BE (offset != 0, 1))
597     {
598       /* Should the already checked characters be kept?  */
599       if (BE (offset < pstr->valid_raw_len, 1))
600 	{
601 	  /* Yes, move them to the front of the buffer.  */
602 #ifdef RE_ENABLE_I18N
603 	  if (BE (pstr->offsets_needed, 0))
604 	    {
605 	      Idx low = 0, high = pstr->valid_len, mid;
606 	      do
607 		{
608 		  mid = (high + low) / 2;
609 		  if (pstr->offsets[mid] > offset)
610 		    high = mid;
611 		  else if (pstr->offsets[mid] < offset)
612 		    low = mid + 1;
613 		  else
614 		    break;
615 		}
616 	      while (low < high);
617 	      if (pstr->offsets[mid] < offset)
618 		++mid;
619 	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
620 							eflags);
621 	      /* This can be quite complicated, so handle specially
622 		 only the common and easy case where the character with
623 		 different length representation of lower and upper
624 		 case is present at or after offset.  */
625 	      if (pstr->valid_len > offset
626 		  && mid == offset && pstr->offsets[mid] == offset)
627 		{
628 		  memmove (pstr->wcs, pstr->wcs + offset,
629 			   (pstr->valid_len - offset) * sizeof (wint_t));
630 		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
631 		  pstr->valid_len -= offset;
632 		  pstr->valid_raw_len -= offset;
633 		  for (low = 0; low < pstr->valid_len; low++)
634 		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
635 		}
636 	      else
637 		{
638 		  /* Otherwise, just find out how long the partial multibyte
639 		     character at offset is and fill it with WEOF/255.  */
640 		  pstr->len = pstr->raw_len - idx + offset;
641 		  pstr->stop = pstr->raw_stop - idx + offset;
642 		  pstr->offsets_needed = 0;
643 		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
644 		    --mid;
645 		  while (mid < pstr->valid_len)
646 		    if (pstr->wcs[mid] != WEOF)
647 		      break;
648 		    else
649 		      ++mid;
650 		  if (mid == pstr->valid_len)
651 		    pstr->valid_len = 0;
652 		  else
653 		    {
654 		      pstr->valid_len = pstr->offsets[mid] - offset;
655 		      if (pstr->valid_len)
656 			{
657 			  for (low = 0; low < pstr->valid_len; ++low)
658 			    pstr->wcs[low] = WEOF;
659 			  memset (pstr->mbs, 255, pstr->valid_len);
660 			}
661 		    }
662 		  pstr->valid_raw_len = pstr->valid_len;
663 		}
664 	    }
665 	  else
666 #endif
667 	    {
668 	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
669 							eflags);
670 #ifdef RE_ENABLE_I18N
671 	      if (pstr->mb_cur_max > 1)
672 		memmove (pstr->wcs, pstr->wcs + offset,
673 			 (pstr->valid_len - offset) * sizeof (wint_t));
674 #endif /* RE_ENABLE_I18N */
675 	      if (BE (pstr->mbs_allocated, 0))
676 		memmove (pstr->mbs, pstr->mbs + offset,
677 			 pstr->valid_len - offset);
678 	      pstr->valid_len -= offset;
679 	      pstr->valid_raw_len -= offset;
680 #if defined DEBUG && DEBUG
681 	      assert (pstr->valid_len > 0);
682 #endif
683 	    }
684 	}
685       else
686 	{
687 #ifdef RE_ENABLE_I18N
688 	  /* No, skip all characters until IDX.  */
689 	  Idx prev_valid_len = pstr->valid_len;
690 
691 	  if (BE (pstr->offsets_needed, 0))
692 	    {
693 	      pstr->len = pstr->raw_len - idx + offset;
694 	      pstr->stop = pstr->raw_stop - idx + offset;
695 	      pstr->offsets_needed = 0;
696 	    }
697 #endif
698 	  pstr->valid_len = 0;
699 #ifdef RE_ENABLE_I18N
700 	  if (pstr->mb_cur_max > 1)
701 	    {
702 	      Idx wcs_idx;
703 	      wint_t wc = WEOF;
704 
705 	      if (pstr->is_utf8)
706 		{
707 		  const unsigned char *raw, *p, *end;
708 
709 		  /* Special case UTF-8.  Multi-byte chars start with any
710 		     byte other than 0x80 - 0xbf.  */
711 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
712 		  end = raw + (offset - pstr->mb_cur_max);
713 		  if (end < pstr->raw_mbs)
714 		    end = pstr->raw_mbs;
715 		  p = raw + offset - 1;
716 #ifdef _LIBC
717 		  /* We know the wchar_t encoding is UCS4, so for the simple
718 		     case, ASCII characters, skip the conversion step.  */
719 		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
720 		    {
721 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
722 		      /* pstr->valid_len = 0; */
723 		      wc = (wchar_t) *p;
724 		    }
725 		  else
726 #endif
727 		    for (; p >= end; --p)
728 		      if ((*p & 0xc0) != 0x80)
729 			{
730 			  mbstate_t cur_state;
731 			  wchar_t wc2;
732 			  Idx mlen = raw + pstr->len - p;
733 			  unsigned char buf[6];
734 			  size_t mbclen;
735 
736 			  const unsigned char *pp = p;
737 			  if (BE (pstr->trans != NULL, 0))
738 			    {
739 			      int i = mlen < 6 ? mlen : 6;
740 			      while (--i >= 0)
741 				buf[i] = pstr->trans[p[i]];
742 			      pp = buf;
743 			    }
744 			  /* XXX Don't use mbrtowc, we know which conversion
745 			     to use (UTF-8 -> UCS4).  */
746 			  memset (&cur_state, 0, sizeof (cur_state));
747 			  mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
748 					      &cur_state);
749 			  if (raw + offset - p <= mbclen
750 			      && mbclen < (size_t) -2)
751 			    {
752 			      memset (&pstr->cur_state, '\0',
753 				      sizeof (mbstate_t));
754 			      pstr->valid_len = mbclen - (raw + offset - p);
755 			      wc = wc2;
756 			    }
757 			  break;
758 			}
759 		}
760 
761 	      if (wc == WEOF)
762 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
763 	      if (wc == WEOF)
764 		pstr->tip_context
765 		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
766 	      else
767 		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
768 				      && IS_WIDE_WORD_CHAR (wc))
769 				     ? CONTEXT_WORD
770 				     : ((IS_WIDE_NEWLINE (wc)
771 					 && pstr->newline_anchor)
772 					? CONTEXT_NEWLINE : 0));
773 	      if (BE (pstr->valid_len, 0))
774 		{
775 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
776 		    pstr->wcs[wcs_idx] = WEOF;
777 		  if (pstr->mbs_allocated)
778 		    memset (pstr->mbs, 255, pstr->valid_len);
779 		}
780 	      pstr->valid_raw_len = pstr->valid_len;
781 	    }
782 	  else
783 #endif /* RE_ENABLE_I18N */
784 	    {
785 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
786 	      pstr->valid_raw_len = 0;
787 	      if (pstr->trans)
788 		c = pstr->trans[c];
789 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
790 				   ? CONTEXT_WORD
791 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
792 				      ? CONTEXT_NEWLINE : 0));
793 	    }
794 	}
795       if (!BE (pstr->mbs_allocated, 0))
796 	pstr->mbs += offset;
797     }
798   pstr->raw_mbs_idx = idx;
799   pstr->len -= offset;
800   pstr->stop -= offset;
801 
802   /* Then build the buffers.  */
803 #ifdef RE_ENABLE_I18N
804   if (pstr->mb_cur_max > 1)
805     {
806       if (pstr->icase)
807 	{
808 	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
809 	  if (BE (ret != REG_NOERROR, 0))
810 	    return ret;
811 	}
812       else
813 	build_wcs_buffer (pstr);
814     }
815   else
816 #endif /* RE_ENABLE_I18N */
817     if (BE (pstr->mbs_allocated, 0))
818       {
819 	if (pstr->icase)
820 	  build_upper_buffer (pstr);
821 	else if (pstr->trans != NULL)
822 	  re_string_translate_buffer (pstr);
823       }
824     else
825       pstr->valid_len = pstr->len;
826 
827   pstr->cur_idx = 0;
828   return REG_NOERROR;
829 }
830 
831 static unsigned char
832 internal_function __attribute__ ((pure))
re_string_peek_byte_case(const re_string_t * pstr,Idx idx)833 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
834 {
835   int ch;
836   Idx off;
837 
838   /* Handle the common (easiest) cases first.  */
839   if (BE (!pstr->mbs_allocated, 1))
840     return re_string_peek_byte (pstr, idx);
841 
842 #ifdef RE_ENABLE_I18N
843   if (pstr->mb_cur_max > 1
844       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
845     return re_string_peek_byte (pstr, idx);
846 #endif
847 
848   off = pstr->cur_idx + idx;
849 #ifdef RE_ENABLE_I18N
850   if (pstr->offsets_needed)
851     off = pstr->offsets[off];
852 #endif
853 
854   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
855 
856 #ifdef RE_ENABLE_I18N
857   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
858      this function returns CAPITAL LETTER I instead of first byte of
859      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
860      since peek_byte_case doesn't advance cur_idx in any way.  */
861   if (pstr->offsets_needed && !isascii (ch))
862     return re_string_peek_byte (pstr, idx);
863 #endif
864 
865   return ch;
866 }
867 
868 static unsigned char
869 internal_function
re_string_fetch_byte_case(re_string_t * pstr)870 re_string_fetch_byte_case (re_string_t *pstr)
871 {
872   if (BE (!pstr->mbs_allocated, 1))
873     return re_string_fetch_byte (pstr);
874 
875 #ifdef RE_ENABLE_I18N
876   if (pstr->offsets_needed)
877     {
878       Idx off;
879       int ch;
880 
881       /* For tr_TR.UTF-8 [[:islower:]] there is
882 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
883 	 in that case the whole multi-byte character and return
884 	 the original letter.  On the other side, with
885 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
886 	 anything else would complicate things too much.  */
887 
888       if (!re_string_first_byte (pstr, pstr->cur_idx))
889 	return re_string_fetch_byte (pstr);
890 
891       off = pstr->offsets[pstr->cur_idx];
892       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
893 
894       if (! isascii (ch))
895 	return re_string_fetch_byte (pstr);
896 
897       re_string_skip_bytes (pstr,
898 			    re_string_char_size_at (pstr, pstr->cur_idx));
899       return ch;
900     }
901 #endif
902 
903   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
904 }
905 
906 static void
907 internal_function
re_string_destruct(re_string_t * pstr)908 re_string_destruct (re_string_t *pstr)
909 {
910 #ifdef RE_ENABLE_I18N
911   re_free (pstr->wcs);
912   re_free (pstr->offsets);
913 #endif /* RE_ENABLE_I18N  */
914   if (pstr->mbs_allocated)
915     re_free (pstr->mbs);
916 }
917 
918 /* Return the context at IDX in INPUT.  */
919 
920 static unsigned int
921 internal_function
re_string_context_at(const re_string_t * input,Idx idx,int eflags)922 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
923 {
924   int c;
925   if (BE (! REG_VALID_INDEX (idx), 0))
926     /* In this case, we use the value stored in input->tip_context,
927        since we can't know the character in input->mbs[-1] here.  */
928     return input->tip_context;
929   if (BE (idx == input->len, 0))
930     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
931 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
932 #ifdef RE_ENABLE_I18N
933   if (input->mb_cur_max > 1)
934     {
935       wint_t wc;
936       Idx wc_idx = idx;
937       while(input->wcs[wc_idx] == WEOF)
938 	{
939 #if defined DEBUG && DEBUG
940 	  /* It must not happen.  */
941 	  assert (REG_VALID_INDEX (wc_idx));
942 #endif
943 	  --wc_idx;
944 	  if (! REG_VALID_INDEX (wc_idx))
945 	    return input->tip_context;
946 	}
947       wc = input->wcs[wc_idx];
948       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
949 	return CONTEXT_WORD;
950       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
951 	      ? CONTEXT_NEWLINE : 0);
952     }
953   else
954 #endif
955     {
956       c = re_string_byte_at (input, idx);
957       if (bitset_contain (input->word_char, c))
958 	return CONTEXT_WORD;
959       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
960     }
961 }
962 
963 /* Functions for set operation.  */
964 
965 static reg_errcode_t
966 internal_function __attribute_warn_unused_result__
re_node_set_alloc(re_node_set * set,Idx size)967 re_node_set_alloc (re_node_set *set, Idx size)
968 {
969   set->alloc = size;
970   set->nelem = 0;
971   set->elems = re_malloc (Idx, size);
972   if (BE (set->elems == NULL, 0) && (MALLOC_0_IS_NONNULL || size != 0))
973     return REG_ESPACE;
974   return REG_NOERROR;
975 }
976 
977 static reg_errcode_t
978 internal_function __attribute_warn_unused_result__
re_node_set_init_1(re_node_set * set,Idx elem)979 re_node_set_init_1 (re_node_set *set, Idx elem)
980 {
981   set->alloc = 1;
982   set->nelem = 1;
983   set->elems = re_malloc (Idx, 1);
984   if (BE (set->elems == NULL, 0))
985     {
986       set->alloc = set->nelem = 0;
987       return REG_ESPACE;
988     }
989   set->elems[0] = elem;
990   return REG_NOERROR;
991 }
992 
993 static reg_errcode_t
994 internal_function __attribute_warn_unused_result__
re_node_set_init_2(re_node_set * set,Idx elem1,Idx elem2)995 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
996 {
997   set->alloc = 2;
998   set->elems = re_malloc (Idx, 2);
999   if (BE (set->elems == NULL, 0))
1000     return REG_ESPACE;
1001   if (elem1 == elem2)
1002     {
1003       set->nelem = 1;
1004       set->elems[0] = elem1;
1005     }
1006   else
1007     {
1008       set->nelem = 2;
1009       if (elem1 < elem2)
1010 	{
1011 	  set->elems[0] = elem1;
1012 	  set->elems[1] = elem2;
1013 	}
1014       else
1015 	{
1016 	  set->elems[0] = elem2;
1017 	  set->elems[1] = elem1;
1018 	}
1019     }
1020   return REG_NOERROR;
1021 }
1022 
1023 static reg_errcode_t
1024 internal_function __attribute_warn_unused_result__
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)1025 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1026 {
1027   dest->nelem = src->nelem;
1028   if (src->nelem > 0)
1029     {
1030       dest->alloc = dest->nelem;
1031       dest->elems = re_malloc (Idx, dest->alloc);
1032       if (BE (dest->elems == NULL, 0))
1033 	{
1034 	  dest->alloc = dest->nelem = 0;
1035 	  return REG_ESPACE;
1036 	}
1037       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1038     }
1039   else
1040     re_node_set_init_empty (dest);
1041   return REG_NOERROR;
1042 }
1043 
1044 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1045    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1046    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1047 
1048 static reg_errcode_t
1049 internal_function __attribute_warn_unused_result__
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1050 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1051 			   const re_node_set *src2)
1052 {
1053   Idx i1, i2, is, id, delta, sbase;
1054   if (src1->nelem == 0 || src2->nelem == 0)
1055     return REG_NOERROR;
1056 
1057   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1058      conservative estimate.  */
1059   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1060     {
1061       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1062       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1063       if (BE (new_elems == NULL, 0))
1064 	return REG_ESPACE;
1065       dest->elems = new_elems;
1066       dest->alloc = new_alloc;
1067     }
1068 
1069   /* Find the items in the intersection of SRC1 and SRC2, and copy
1070      into the top of DEST those that are not already in DEST itself.  */
1071   sbase = dest->nelem + src1->nelem + src2->nelem;
1072   i1 = src1->nelem - 1;
1073   i2 = src2->nelem - 1;
1074   id = dest->nelem - 1;
1075   for (;;)
1076     {
1077       if (src1->elems[i1] == src2->elems[i2])
1078 	{
1079 	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1080 	  while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1081 	    --id;
1082 
1083           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1084             dest->elems[--sbase] = src1->elems[i1];
1085 
1086 	  if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1087 	    break;
1088 	}
1089 
1090       /* Lower the highest of the two items.  */
1091       else if (src1->elems[i1] < src2->elems[i2])
1092 	{
1093 	  if (! REG_VALID_INDEX (--i2))
1094 	    break;
1095 	}
1096       else
1097 	{
1098 	  if (! REG_VALID_INDEX (--i1))
1099 	    break;
1100 	}
1101     }
1102 
1103   id = dest->nelem - 1;
1104   is = dest->nelem + src1->nelem + src2->nelem - 1;
1105   delta = is - sbase + 1;
1106 
1107   /* Now copy.  When DELTA becomes zero, the remaining
1108      DEST elements are already in place; this is more or
1109      less the same loop that is in re_node_set_merge.  */
1110   dest->nelem += delta;
1111   if (delta > 0 && REG_VALID_INDEX (id))
1112     for (;;)
1113       {
1114 	if (dest->elems[is] > dest->elems[id])
1115 	  {
1116 	    /* Copy from the top.  */
1117 	    dest->elems[id + delta--] = dest->elems[is--];
1118 	    if (delta == 0)
1119 	      break;
1120 	  }
1121 	else
1122 	  {
1123 	    /* Slide from the bottom.  */
1124 	    dest->elems[id + delta] = dest->elems[id];
1125 	    if (! REG_VALID_INDEX (--id))
1126 	      break;
1127 	  }
1128       }
1129 
1130   /* Copy remaining SRC elements.  */
1131   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1132 
1133   return REG_NOERROR;
1134 }
1135 
1136 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1137    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1138 
1139 static reg_errcode_t
1140 internal_function __attribute_warn_unused_result__
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1141 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1142 			const re_node_set *src2)
1143 {
1144   Idx i1, i2, id;
1145   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1146     {
1147       dest->alloc = src1->nelem + src2->nelem;
1148       dest->elems = re_malloc (Idx, dest->alloc);
1149       if (BE (dest->elems == NULL, 0))
1150 	return REG_ESPACE;
1151     }
1152   else
1153     {
1154       if (src1 != NULL && src1->nelem > 0)
1155 	return re_node_set_init_copy (dest, src1);
1156       else if (src2 != NULL && src2->nelem > 0)
1157 	return re_node_set_init_copy (dest, src2);
1158       else
1159 	re_node_set_init_empty (dest);
1160       return REG_NOERROR;
1161     }
1162   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1163     {
1164       if (src1->elems[i1] > src2->elems[i2])
1165 	{
1166 	  dest->elems[id++] = src2->elems[i2++];
1167 	  continue;
1168 	}
1169       if (src1->elems[i1] == src2->elems[i2])
1170 	++i2;
1171       dest->elems[id++] = src1->elems[i1++];
1172     }
1173   if (i1 < src1->nelem)
1174     {
1175       memcpy (dest->elems + id, src1->elems + i1,
1176 	     (src1->nelem - i1) * sizeof (Idx));
1177       id += src1->nelem - i1;
1178     }
1179   else if (i2 < src2->nelem)
1180     {
1181       memcpy (dest->elems + id, src2->elems + i2,
1182 	     (src2->nelem - i2) * sizeof (Idx));
1183       id += src2->nelem - i2;
1184     }
1185   dest->nelem = id;
1186   return REG_NOERROR;
1187 }
1188 
1189 /* Calculate the union set of the sets DEST and SRC. And store it to
1190    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1191 
1192 static reg_errcode_t
1193 internal_function __attribute_warn_unused_result__
re_node_set_merge(re_node_set * dest,const re_node_set * src)1194 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1195 {
1196   Idx is, id, sbase, delta;
1197   if (src == NULL || src->nelem == 0)
1198     return REG_NOERROR;
1199   if (dest->alloc < 2 * src->nelem + dest->nelem)
1200     {
1201       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1202       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1203       if (BE (new_buffer == NULL, 0))
1204 	return REG_ESPACE;
1205       dest->elems = new_buffer;
1206       dest->alloc = new_alloc;
1207     }
1208 
1209   if (BE (dest->nelem == 0, 0))
1210     {
1211       dest->nelem = src->nelem;
1212       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1213       return REG_NOERROR;
1214     }
1215 
1216   /* Copy into the top of DEST the items of SRC that are not
1217      found in DEST.  Maybe we could binary search in DEST?  */
1218   for (sbase = dest->nelem + 2 * src->nelem,
1219        is = src->nelem - 1, id = dest->nelem - 1;
1220        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1221     {
1222       if (dest->elems[id] == src->elems[is])
1223 	is--, id--;
1224       else if (dest->elems[id] < src->elems[is])
1225 	dest->elems[--sbase] = src->elems[is--];
1226       else /* if (dest->elems[id] > src->elems[is]) */
1227 	--id;
1228     }
1229 
1230   if (REG_VALID_INDEX (is))
1231     {
1232       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1233       sbase -= is + 1;
1234       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1235     }
1236 
1237   id = dest->nelem - 1;
1238   is = dest->nelem + 2 * src->nelem - 1;
1239   delta = is - sbase + 1;
1240   if (delta == 0)
1241     return REG_NOERROR;
1242 
1243   /* Now copy.  When DELTA becomes zero, the remaining
1244      DEST elements are already in place.  */
1245   dest->nelem += delta;
1246   for (;;)
1247     {
1248       if (dest->elems[is] > dest->elems[id])
1249 	{
1250 	  /* Copy from the top.  */
1251 	  dest->elems[id + delta--] = dest->elems[is--];
1252 	  if (delta == 0)
1253 	    break;
1254 	}
1255       else
1256 	{
1257 	  /* Slide from the bottom.  */
1258 	  dest->elems[id + delta] = dest->elems[id];
1259 	  if (! REG_VALID_INDEX (--id))
1260 	    {
1261 	      /* Copy remaining SRC elements.  */
1262 	      memcpy (dest->elems, dest->elems + sbase,
1263 		      delta * sizeof (Idx));
1264 	      break;
1265 	    }
1266 	}
1267     }
1268 
1269   return REG_NOERROR;
1270 }
1271 
1272 /* Insert the new element ELEM to the re_node_set* SET.
1273    SET should not already have ELEM.
1274    Return true if successful.  */
1275 
1276 static bool
1277 internal_function __attribute_warn_unused_result__
re_node_set_insert(re_node_set * set,Idx elem)1278 re_node_set_insert (re_node_set *set, Idx elem)
1279 {
1280   Idx idx;
1281   /* In case the set is empty.  */
1282   if (set->alloc == 0)
1283     return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1284 
1285   if (BE (set->nelem, 0) == 0)
1286     {
1287       /* We already guaranteed above that set->alloc != 0.  */
1288       set->elems[0] = elem;
1289       ++set->nelem;
1290       return true;
1291     }
1292 
1293   /* Realloc if we need.  */
1294   if (set->alloc == set->nelem)
1295     {
1296       Idx *new_elems;
1297       set->alloc = set->alloc * 2;
1298       new_elems = re_realloc (set->elems, Idx, set->alloc);
1299       if (BE (new_elems == NULL, 0))
1300 	return false;
1301       set->elems = new_elems;
1302     }
1303 
1304   /* Move the elements which follows the new element.  Test the
1305      first element separately to skip a check in the inner loop.  */
1306   if (elem < set->elems[0])
1307     {
1308       idx = 0;
1309       for (idx = set->nelem; idx > 0; idx--)
1310 	set->elems[idx] = set->elems[idx - 1];
1311     }
1312   else
1313     {
1314       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1315 	set->elems[idx] = set->elems[idx - 1];
1316     }
1317 
1318   /* Insert the new element.  */
1319   set->elems[idx] = elem;
1320   ++set->nelem;
1321   return true;
1322 }
1323 
1324 /* Insert the new element ELEM to the re_node_set* SET.
1325    SET should not already have any element greater than or equal to ELEM.
1326    Return true if successful.  */
1327 
1328 static bool
1329 internal_function __attribute_warn_unused_result__
re_node_set_insert_last(re_node_set * set,Idx elem)1330 re_node_set_insert_last (re_node_set *set, Idx elem)
1331 {
1332   /* Realloc if we need.  */
1333   if (set->alloc == set->nelem)
1334     {
1335       Idx *new_elems;
1336       set->alloc = (set->alloc + 1) * 2;
1337       new_elems = re_realloc (set->elems, Idx, set->alloc);
1338       if (BE (new_elems == NULL, 0))
1339 	return false;
1340       set->elems = new_elems;
1341     }
1342 
1343   /* Insert the new element.  */
1344   set->elems[set->nelem++] = elem;
1345   return true;
1346 }
1347 
1348 /* Compare two node sets SET1 and SET2.
1349    Return true if SET1 and SET2 are equivalent.  */
1350 
1351 static bool
1352 internal_function __attribute__ ((pure))
re_node_set_compare(const re_node_set * set1,const re_node_set * set2)1353 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1354 {
1355   Idx i;
1356   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1357     return false;
1358   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1359     if (set1->elems[i] != set2->elems[i])
1360       return false;
1361   return true;
1362 }
1363 
1364 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1365 
1366 static Idx
1367 internal_function __attribute__ ((pure))
re_node_set_contains(const re_node_set * set,Idx elem)1368 re_node_set_contains (const re_node_set *set, Idx elem)
1369 {
1370   __re_size_t idx, right, mid;
1371   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1372     return 0;
1373 
1374   /* Binary search the element.  */
1375   idx = 0;
1376   right = set->nelem - 1;
1377   while (idx < right)
1378     {
1379       mid = (idx + right) / 2;
1380       if (set->elems[mid] < elem)
1381 	idx = mid + 1;
1382       else
1383 	right = mid;
1384     }
1385   return set->elems[idx] == elem ? idx + 1 : 0;
1386 }
1387 
1388 static void
1389 internal_function
re_node_set_remove_at(re_node_set * set,Idx idx)1390 re_node_set_remove_at (re_node_set *set, Idx idx)
1391 {
1392   if (idx < 0 || idx >= set->nelem)
1393     return;
1394   --set->nelem;
1395   for (; idx < set->nelem; idx++)
1396     set->elems[idx] = set->elems[idx + 1];
1397 }
1398 
1399 
1400 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1401    Or return REG_MISSING if an error occurred.  */
1402 
1403 static Idx
1404 internal_function
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)1405 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1406 {
1407   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1408     {
1409       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1410       Idx *new_nexts, *new_indices;
1411       re_node_set *new_edests, *new_eclosures;
1412       re_token_t *new_nodes;
1413 
1414       /* Avoid overflows in realloc.  */
1415       const size_t max_object_size = MAX (sizeof (re_token_t),
1416 					  MAX (sizeof (re_node_set),
1417 					       sizeof (Idx)));
1418       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
1419 	return REG_MISSING;
1420 
1421       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1422       if (BE (new_nodes == NULL, 0))
1423 	return REG_MISSING;
1424       dfa->nodes = new_nodes;
1425       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1426       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1427       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1428       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1429       if (BE (new_nexts == NULL || new_indices == NULL
1430 	      || new_edests == NULL || new_eclosures == NULL, 0))
1431 	return REG_MISSING;
1432       dfa->nexts = new_nexts;
1433       dfa->org_indices = new_indices;
1434       dfa->edests = new_edests;
1435       dfa->eclosures = new_eclosures;
1436       dfa->nodes_alloc = new_nodes_alloc;
1437     }
1438   dfa->nodes[dfa->nodes_len] = token;
1439   dfa->nodes[dfa->nodes_len].constraint = 0;
1440 #ifdef RE_ENABLE_I18N
1441   dfa->nodes[dfa->nodes_len].accept_mb =
1442     ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1443      || token.type == COMPLEX_BRACKET);
1444 #endif
1445   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1446   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1447   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1448   return dfa->nodes_len++;
1449 }
1450 
1451 static re_hashval_t
1452 internal_function
calc_state_hash(const re_node_set * nodes,unsigned int context)1453 calc_state_hash (const re_node_set *nodes, unsigned int context)
1454 {
1455   re_hashval_t hash = nodes->nelem + context;
1456   Idx i;
1457   for (i = 0 ; i < nodes->nelem ; i++)
1458     hash += nodes->elems[i];
1459   return hash;
1460 }
1461 
1462 /* Search for the state whose node_set is equivalent to NODES.
1463    Return the pointer to the state, if we found it in the DFA.
1464    Otherwise create the new one and return it.  In case of an error
1465    return NULL and set the error code in ERR.
1466    Note: - We assume NULL as the invalid state, then it is possible that
1467 	   return value is NULL and ERR is REG_NOERROR.
1468 	 - We never return non-NULL value in case of any errors, it is for
1469 	   optimization.  */
1470 
1471 static re_dfastate_t *
1472 internal_function __attribute_warn_unused_result__
re_acquire_state(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes)1473 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1474 		  const re_node_set *nodes)
1475 {
1476   re_hashval_t hash;
1477   re_dfastate_t *new_state;
1478   struct re_state_table_entry *spot;
1479   Idx i;
1480 #ifdef lint
1481   /* Suppress bogus uninitialized-variable warnings.  */
1482   *err = REG_NOERROR;
1483 #endif
1484   if (BE (nodes->nelem == 0, 0))
1485     {
1486       *err = REG_NOERROR;
1487       return NULL;
1488     }
1489   hash = calc_state_hash (nodes, 0);
1490   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1491 
1492   for (i = 0 ; i < spot->num ; i++)
1493     {
1494       re_dfastate_t *state = spot->array[i];
1495       if (hash != state->hash)
1496 	continue;
1497       if (re_node_set_compare (&state->nodes, nodes))
1498 	return state;
1499     }
1500 
1501   /* There are no appropriate state in the dfa, create the new one.  */
1502   new_state = create_ci_newstate (dfa, nodes, hash);
1503   if (BE (new_state == NULL, 0))
1504     *err = REG_ESPACE;
1505 
1506   return new_state;
1507 }
1508 
1509 /* Search for the state whose node_set is equivalent to NODES and
1510    whose context is equivalent to CONTEXT.
1511    Return the pointer to the state, if we found it in the DFA.
1512    Otherwise create the new one and return it.  In case of an error
1513    return NULL and set the error code in ERR.
1514    Note: - We assume NULL as the invalid state, then it is possible that
1515 	   return value is NULL and ERR is REG_NOERROR.
1516 	 - We never return non-NULL value in case of any errors, it is for
1517 	   optimization.  */
1518 
1519 static re_dfastate_t *
1520 internal_function __attribute_warn_unused_result__
re_acquire_state_context(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)1521 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1522 			  const re_node_set *nodes, unsigned int context)
1523 {
1524   re_hashval_t hash;
1525   re_dfastate_t *new_state;
1526   struct re_state_table_entry *spot;
1527   Idx i;
1528 #ifdef lint
1529   /* Suppress bogus uninitialized-variable warnings.  */
1530   *err = REG_NOERROR;
1531 #endif
1532   if (nodes->nelem == 0)
1533     {
1534       *err = REG_NOERROR;
1535       return NULL;
1536     }
1537   hash = calc_state_hash (nodes, context);
1538   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1539 
1540   for (i = 0 ; i < spot->num ; i++)
1541     {
1542       re_dfastate_t *state = spot->array[i];
1543       if (state->hash == hash
1544 	  && state->context == context
1545 	  && re_node_set_compare (state->entrance_nodes, nodes))
1546 	return state;
1547     }
1548   /* There are no appropriate state in 'dfa', create the new one.  */
1549   new_state = create_cd_newstate (dfa, nodes, context, hash);
1550   if (BE (new_state == NULL, 0))
1551     *err = REG_ESPACE;
1552 
1553   return new_state;
1554 }
1555 
1556 /* Finish initialization of the new state NEWSTATE, and using its hash value
1557    HASH put in the appropriate bucket of DFA's state table.  Return value
1558    indicates the error code if failed.  */
1559 
1560 static reg_errcode_t
1561 __attribute_warn_unused_result__
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,re_hashval_t hash)1562 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1563 		re_hashval_t hash)
1564 {
1565   struct re_state_table_entry *spot;
1566   reg_errcode_t err;
1567   Idx i;
1568 
1569   newstate->hash = hash;
1570   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1571   if (BE (err != REG_NOERROR, 0))
1572     return REG_ESPACE;
1573   for (i = 0; i < newstate->nodes.nelem; i++)
1574     {
1575       Idx elem = newstate->nodes.elems[i];
1576       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1577 	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1578 	  return REG_ESPACE;
1579     }
1580 
1581   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1582   if (BE (spot->alloc <= spot->num, 0))
1583     {
1584       Idx new_alloc = 2 * spot->num + 2;
1585       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1586 					      new_alloc);
1587       if (BE (new_array == NULL, 0))
1588 	return REG_ESPACE;
1589       spot->array = new_array;
1590       spot->alloc = new_alloc;
1591     }
1592   spot->array[spot->num++] = newstate;
1593   return REG_NOERROR;
1594 }
1595 
1596 static void
free_state(re_dfastate_t * state)1597 free_state (re_dfastate_t *state)
1598 {
1599   re_node_set_free (&state->non_eps_nodes);
1600   re_node_set_free (&state->inveclosure);
1601   if (state->entrance_nodes != &state->nodes)
1602     {
1603       re_node_set_free (state->entrance_nodes);
1604       re_free (state->entrance_nodes);
1605     }
1606   re_node_set_free (&state->nodes);
1607   re_free (state->word_trtable);
1608   re_free (state->trtable);
1609   re_free (state);
1610 }
1611 
1612 /* Create the new state which is independent of contexts.
1613    Return the new state if succeeded, otherwise return NULL.  */
1614 
1615 static re_dfastate_t *
1616 internal_function __attribute_warn_unused_result__
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,re_hashval_t hash)1617 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1618 		    re_hashval_t hash)
1619 {
1620   Idx i;
1621   reg_errcode_t err;
1622   re_dfastate_t *newstate;
1623 
1624   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1625   if (BE (newstate == NULL, 0))
1626     return NULL;
1627   err = re_node_set_init_copy (&newstate->nodes, nodes);
1628   if (BE (err != REG_NOERROR, 0))
1629     {
1630       re_free (newstate);
1631       return NULL;
1632     }
1633 
1634   newstate->entrance_nodes = &newstate->nodes;
1635   for (i = 0 ; i < nodes->nelem ; i++)
1636     {
1637       re_token_t *node = dfa->nodes + nodes->elems[i];
1638       re_token_type_t type = node->type;
1639       if (type == CHARACTER && !node->constraint)
1640 	continue;
1641 #ifdef RE_ENABLE_I18N
1642       newstate->accept_mb |= node->accept_mb;
1643 #endif /* RE_ENABLE_I18N */
1644 
1645       /* If the state has the halt node, the state is a halt state.  */
1646       if (type == END_OF_RE)
1647 	newstate->halt = 1;
1648       else if (type == OP_BACK_REF)
1649 	newstate->has_backref = 1;
1650       else if (type == ANCHOR || node->constraint)
1651 	newstate->has_constraint = 1;
1652     }
1653   err = register_state (dfa, newstate, hash);
1654   if (BE (err != REG_NOERROR, 0))
1655     {
1656       free_state (newstate);
1657       newstate = NULL;
1658     }
1659   return newstate;
1660 }
1661 
1662 /* Create the new state which is depend on the context CONTEXT.
1663    Return the new state if succeeded, otherwise return NULL.  */
1664 
1665 static re_dfastate_t *
1666 internal_function __attribute_warn_unused_result__
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,re_hashval_t hash)1667 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1668 		    unsigned int context, re_hashval_t hash)
1669 {
1670   Idx i, nctx_nodes = 0;
1671   reg_errcode_t err;
1672   re_dfastate_t *newstate;
1673 
1674   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1675   if (BE (newstate == NULL, 0))
1676     return NULL;
1677   err = re_node_set_init_copy (&newstate->nodes, nodes);
1678   if (BE (err != REG_NOERROR, 0))
1679     {
1680       re_free (newstate);
1681       return NULL;
1682     }
1683 
1684   newstate->context = context;
1685   newstate->entrance_nodes = &newstate->nodes;
1686 
1687   for (i = 0 ; i < nodes->nelem ; i++)
1688     {
1689       re_token_t *node = dfa->nodes + nodes->elems[i];
1690       re_token_type_t type = node->type;
1691       unsigned int constraint = node->constraint;
1692 
1693       if (type == CHARACTER && !constraint)
1694 	continue;
1695 #ifdef RE_ENABLE_I18N
1696       newstate->accept_mb |= node->accept_mb;
1697 #endif /* RE_ENABLE_I18N */
1698 
1699       /* If the state has the halt node, the state is a halt state.  */
1700       if (type == END_OF_RE)
1701 	newstate->halt = 1;
1702       else if (type == OP_BACK_REF)
1703 	newstate->has_backref = 1;
1704 
1705       if (constraint)
1706 	{
1707 	  if (newstate->entrance_nodes == &newstate->nodes)
1708 	    {
1709 	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1710 	      if (BE (newstate->entrance_nodes == NULL, 0))
1711 		{
1712 		  free_state (newstate);
1713 		  return NULL;
1714 		}
1715 	      if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1716 		  != REG_NOERROR)
1717 		return NULL;
1718 	      nctx_nodes = 0;
1719 	      newstate->has_constraint = 1;
1720 	    }
1721 
1722 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1723 	    {
1724 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1725 	      ++nctx_nodes;
1726 	    }
1727 	}
1728     }
1729   err = register_state (dfa, newstate, hash);
1730   if (BE (err != REG_NOERROR, 0))
1731     {
1732       free_state (newstate);
1733       newstate = NULL;
1734     }
1735   return  newstate;
1736 }
1737