144b87433SJohn Marino /* Extended regular expression matching and search library.
2*6ea1f93eSDaniel Fojt    Copyright (C) 2002-2018 Free Software Foundation, Inc.
344b87433SJohn Marino    This file is part of the GNU C Library.
444b87433SJohn Marino    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
544b87433SJohn Marino 
64536c563SJohn Marino    The GNU C Library is free software; you can redistribute it and/or
74536c563SJohn Marino    modify it under the terms of the GNU General Public
84536c563SJohn Marino    License as published by the Free Software Foundation; either
94536c563SJohn Marino    version 3 of the License, or (at your option) any later version.
1044b87433SJohn Marino 
114536c563SJohn Marino    The GNU C Library is distributed in the hope that it will be useful,
1244b87433SJohn Marino    but WITHOUT ANY WARRANTY; without even the implied warranty of
134536c563SJohn Marino    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
144536c563SJohn Marino    General Public License for more details.
1544b87433SJohn Marino 
164536c563SJohn Marino    You should have received a copy of the GNU General Public
174536c563SJohn Marino    License along with the GNU C Library; if not, see
18*6ea1f93eSDaniel Fojt    <https://www.gnu.org/licenses/>.  */
1944b87433SJohn Marino 
2044b87433SJohn Marino static void re_string_construct_common (const char *str, Idx len,
2144b87433SJohn Marino 					re_string_t *pstr,
2244b87433SJohn Marino 					RE_TRANSLATE_TYPE trans, bool icase,
23*6ea1f93eSDaniel Fojt 					const re_dfa_t *dfa);
2444b87433SJohn Marino static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
2544b87433SJohn Marino 					  const re_node_set *nodes,
26*6ea1f93eSDaniel Fojt 					  re_hashval_t hash);
2744b87433SJohn Marino static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
2844b87433SJohn Marino 					  const re_node_set *nodes,
2944b87433SJohn Marino 					  unsigned int context,
30*6ea1f93eSDaniel Fojt 					  re_hashval_t hash);
31*6ea1f93eSDaniel Fojt static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32*6ea1f93eSDaniel Fojt 						Idx new_buf_len);
33*6ea1f93eSDaniel Fojt #ifdef RE_ENABLE_I18N
34*6ea1f93eSDaniel Fojt static void build_wcs_buffer (re_string_t *pstr);
35*6ea1f93eSDaniel Fojt static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
36*6ea1f93eSDaniel Fojt #endif /* RE_ENABLE_I18N */
37*6ea1f93eSDaniel Fojt static void build_upper_buffer (re_string_t *pstr);
38*6ea1f93eSDaniel Fojt static void re_string_translate_buffer (re_string_t *pstr);
39*6ea1f93eSDaniel Fojt static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
40*6ea1f93eSDaniel Fojt 					  int eflags) __attribute__ ((pure));
4144b87433SJohn Marino 
4244b87433SJohn Marino /* Functions for string operation.  */
4344b87433SJohn Marino 
4444b87433SJohn Marino /* This function allocate the buffers.  It is necessary to call
4544b87433SJohn Marino    re_string_reconstruct before using the object.  */
4644b87433SJohn Marino 
4744b87433SJohn Marino static reg_errcode_t
48*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
re_string_allocate(re_string_t * pstr,const char * str,Idx len,Idx init_len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)4944b87433SJohn Marino re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
5044b87433SJohn Marino 		    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
5144b87433SJohn Marino {
5244b87433SJohn Marino   reg_errcode_t ret;
5344b87433SJohn Marino   Idx init_buf_len;
5444b87433SJohn Marino 
5544b87433SJohn Marino   /* Ensure at least one character fits into the buffers.  */
5644b87433SJohn Marino   if (init_len < dfa->mb_cur_max)
5744b87433SJohn Marino     init_len = dfa->mb_cur_max;
5844b87433SJohn Marino   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
5944b87433SJohn Marino   re_string_construct_common (str, len, pstr, trans, icase, dfa);
6044b87433SJohn Marino 
6144b87433SJohn Marino   ret = re_string_realloc_buffers (pstr, init_buf_len);
62*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (ret != REG_NOERROR))
6344b87433SJohn Marino     return ret;
6444b87433SJohn Marino 
6544b87433SJohn Marino   pstr->word_char = dfa->word_char;
6644b87433SJohn Marino   pstr->word_ops_used = dfa->word_ops_used;
6744b87433SJohn Marino   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
6844b87433SJohn Marino   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
6944b87433SJohn Marino   pstr->valid_raw_len = pstr->valid_len;
7044b87433SJohn Marino   return REG_NOERROR;
7144b87433SJohn Marino }
7244b87433SJohn Marino 
7344b87433SJohn Marino /* This function allocate the buffers, and initialize them.  */
7444b87433SJohn Marino 
7544b87433SJohn Marino static reg_errcode_t
76*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
re_string_construct(re_string_t * pstr,const char * str,Idx len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)7744b87433SJohn Marino re_string_construct (re_string_t *pstr, const char *str, Idx len,
7844b87433SJohn Marino 		     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
7944b87433SJohn Marino {
8044b87433SJohn Marino   reg_errcode_t ret;
8144b87433SJohn Marino   memset (pstr, '\0', sizeof (re_string_t));
8244b87433SJohn Marino   re_string_construct_common (str, len, pstr, trans, icase, dfa);
8344b87433SJohn Marino 
8444b87433SJohn Marino   if (len > 0)
8544b87433SJohn Marino     {
8644b87433SJohn Marino       ret = re_string_realloc_buffers (pstr, len + 1);
87*6ea1f93eSDaniel Fojt       if (__glibc_unlikely (ret != REG_NOERROR))
8844b87433SJohn Marino 	return ret;
8944b87433SJohn Marino     }
9044b87433SJohn Marino   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
9144b87433SJohn Marino 
9244b87433SJohn Marino   if (icase)
9344b87433SJohn Marino     {
9444b87433SJohn Marino #ifdef RE_ENABLE_I18N
9544b87433SJohn Marino       if (dfa->mb_cur_max > 1)
9644b87433SJohn Marino 	{
9744b87433SJohn Marino 	  while (1)
9844b87433SJohn Marino 	    {
9944b87433SJohn Marino 	      ret = build_wcs_upper_buffer (pstr);
100*6ea1f93eSDaniel Fojt 	      if (__glibc_unlikely (ret != REG_NOERROR))
10144b87433SJohn Marino 		return ret;
10244b87433SJohn Marino 	      if (pstr->valid_raw_len >= len)
10344b87433SJohn Marino 		break;
10444b87433SJohn Marino 	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
10544b87433SJohn Marino 		break;
10644b87433SJohn Marino 	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
107*6ea1f93eSDaniel Fojt 	      if (__glibc_unlikely (ret != REG_NOERROR))
10844b87433SJohn Marino 		return ret;
10944b87433SJohn Marino 	    }
11044b87433SJohn Marino 	}
11144b87433SJohn Marino       else
11244b87433SJohn Marino #endif /* RE_ENABLE_I18N  */
11344b87433SJohn Marino 	build_upper_buffer (pstr);
11444b87433SJohn Marino     }
11544b87433SJohn Marino   else
11644b87433SJohn Marino     {
11744b87433SJohn Marino #ifdef RE_ENABLE_I18N
11844b87433SJohn Marino       if (dfa->mb_cur_max > 1)
11944b87433SJohn Marino 	build_wcs_buffer (pstr);
12044b87433SJohn Marino       else
12144b87433SJohn Marino #endif /* RE_ENABLE_I18N  */
12244b87433SJohn Marino 	{
12344b87433SJohn Marino 	  if (trans != NULL)
12444b87433SJohn Marino 	    re_string_translate_buffer (pstr);
12544b87433SJohn Marino 	  else
12644b87433SJohn Marino 	    {
12744b87433SJohn Marino 	      pstr->valid_len = pstr->bufs_len;
12844b87433SJohn Marino 	      pstr->valid_raw_len = pstr->bufs_len;
12944b87433SJohn Marino 	    }
13044b87433SJohn Marino 	}
13144b87433SJohn Marino     }
13244b87433SJohn Marino 
13344b87433SJohn Marino   return REG_NOERROR;
13444b87433SJohn Marino }
13544b87433SJohn Marino 
13644b87433SJohn Marino /* Helper functions for re_string_allocate, and re_string_construct.  */
13744b87433SJohn Marino 
13844b87433SJohn Marino static reg_errcode_t
139*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
re_string_realloc_buffers(re_string_t * pstr,Idx new_buf_len)14044b87433SJohn Marino re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
14144b87433SJohn Marino {
14244b87433SJohn Marino #ifdef RE_ENABLE_I18N
14344b87433SJohn Marino   if (pstr->mb_cur_max > 1)
14444b87433SJohn Marino     {
14544b87433SJohn Marino       wint_t *new_wcs;
14644b87433SJohn Marino 
1474536c563SJohn Marino       /* Avoid overflow in realloc.  */
1484536c563SJohn Marino       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
149*6ea1f93eSDaniel Fojt       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
150*6ea1f93eSDaniel Fojt 			    < new_buf_len))
15144b87433SJohn Marino 	return REG_ESPACE;
15244b87433SJohn Marino 
15344b87433SJohn Marino       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
154*6ea1f93eSDaniel Fojt       if (__glibc_unlikely (new_wcs == NULL))
15544b87433SJohn Marino 	return REG_ESPACE;
15644b87433SJohn Marino       pstr->wcs = new_wcs;
15744b87433SJohn Marino       if (pstr->offsets != NULL)
15844b87433SJohn Marino 	{
15944b87433SJohn Marino 	  Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
160*6ea1f93eSDaniel Fojt 	  if (__glibc_unlikely (new_offsets == NULL))
16144b87433SJohn Marino 	    return REG_ESPACE;
16244b87433SJohn Marino 	  pstr->offsets = new_offsets;
16344b87433SJohn Marino 	}
16444b87433SJohn Marino     }
16544b87433SJohn Marino #endif /* RE_ENABLE_I18N  */
16644b87433SJohn Marino   if (pstr->mbs_allocated)
16744b87433SJohn Marino     {
16844b87433SJohn Marino       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
16944b87433SJohn Marino 					   new_buf_len);
170*6ea1f93eSDaniel Fojt       if (__glibc_unlikely (new_mbs == NULL))
17144b87433SJohn Marino 	return REG_ESPACE;
17244b87433SJohn Marino       pstr->mbs = new_mbs;
17344b87433SJohn Marino     }
17444b87433SJohn Marino   pstr->bufs_len = new_buf_len;
17544b87433SJohn Marino   return REG_NOERROR;
17644b87433SJohn Marino }
17744b87433SJohn Marino 
17844b87433SJohn Marino 
17944b87433SJohn Marino static void
re_string_construct_common(const char * str,Idx len,re_string_t * pstr,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)18044b87433SJohn Marino re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
18144b87433SJohn Marino 			    RE_TRANSLATE_TYPE trans, bool icase,
18244b87433SJohn Marino 			    const re_dfa_t *dfa)
18344b87433SJohn Marino {
18444b87433SJohn Marino   pstr->raw_mbs = (const unsigned char *) str;
18544b87433SJohn Marino   pstr->len = len;
18644b87433SJohn Marino   pstr->raw_len = len;
18744b87433SJohn Marino   pstr->trans = trans;
18844b87433SJohn Marino   pstr->icase = icase;
18944b87433SJohn Marino   pstr->mbs_allocated = (trans != NULL || icase);
19044b87433SJohn Marino   pstr->mb_cur_max = dfa->mb_cur_max;
19144b87433SJohn Marino   pstr->is_utf8 = dfa->is_utf8;
19244b87433SJohn Marino   pstr->map_notascii = dfa->map_notascii;
19344b87433SJohn Marino   pstr->stop = pstr->len;
19444b87433SJohn Marino   pstr->raw_stop = pstr->stop;
19544b87433SJohn Marino }
19644b87433SJohn Marino 
19744b87433SJohn Marino #ifdef RE_ENABLE_I18N
19844b87433SJohn Marino 
19944b87433SJohn Marino /* Build wide character buffer PSTR->WCS.
20044b87433SJohn Marino    If the byte sequence of the string are:
20144b87433SJohn Marino      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
20244b87433SJohn Marino    Then wide character buffer will be:
20344b87433SJohn Marino      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
20444b87433SJohn Marino    We use WEOF for padding, they indicate that the position isn't
20544b87433SJohn Marino    a first byte of a multibyte character.
20644b87433SJohn Marino 
20744b87433SJohn Marino    Note that this function assumes PSTR->VALID_LEN elements are already
20844b87433SJohn Marino    built and starts from PSTR->VALID_LEN.  */
20944b87433SJohn Marino 
21044b87433SJohn Marino static void
build_wcs_buffer(re_string_t * pstr)21144b87433SJohn Marino build_wcs_buffer (re_string_t *pstr)
21244b87433SJohn Marino {
21344b87433SJohn Marino #ifdef _LIBC
21444b87433SJohn Marino   unsigned char buf[MB_LEN_MAX];
21544b87433SJohn Marino   assert (MB_LEN_MAX >= pstr->mb_cur_max);
21644b87433SJohn Marino #else
21744b87433SJohn Marino   unsigned char buf[64];
21844b87433SJohn Marino #endif
21944b87433SJohn Marino   mbstate_t prev_st;
22044b87433SJohn Marino   Idx byte_idx, end_idx, remain_len;
22144b87433SJohn Marino   size_t mbclen;
22244b87433SJohn Marino 
22344b87433SJohn Marino   /* Build the buffers from pstr->valid_len to either pstr->len or
22444b87433SJohn Marino      pstr->bufs_len.  */
22544b87433SJohn Marino   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
22644b87433SJohn Marino   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
22744b87433SJohn Marino     {
22844b87433SJohn Marino       wchar_t wc;
22944b87433SJohn Marino       const char *p;
23044b87433SJohn Marino 
23144b87433SJohn Marino       remain_len = end_idx - byte_idx;
23244b87433SJohn Marino       prev_st = pstr->cur_state;
23344b87433SJohn Marino       /* Apply the translation if we need.  */
234*6ea1f93eSDaniel Fojt       if (__glibc_unlikely (pstr->trans != NULL))
23544b87433SJohn Marino 	{
23644b87433SJohn Marino 	  int i, ch;
23744b87433SJohn Marino 
23844b87433SJohn Marino 	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
23944b87433SJohn Marino 	    {
24044b87433SJohn Marino 	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
24144b87433SJohn Marino 	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
24244b87433SJohn Marino 	    }
24344b87433SJohn Marino 	  p = (const char *) buf;
24444b87433SJohn Marino 	}
24544b87433SJohn Marino       else
24644b87433SJohn Marino 	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
24744b87433SJohn Marino       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
248*6ea1f93eSDaniel Fojt       if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
249*6ea1f93eSDaniel Fojt 			    || (mbclen == (size_t) -2
250*6ea1f93eSDaniel Fojt 				&& pstr->bufs_len >= pstr->len)))
25144b87433SJohn Marino 	{
25244b87433SJohn Marino 	  /* We treat these cases as a singlebyte character.  */
25344b87433SJohn Marino 	  mbclen = 1;
25444b87433SJohn Marino 	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
255*6ea1f93eSDaniel Fojt 	  if (__glibc_unlikely (pstr->trans != NULL))
25644b87433SJohn Marino 	    wc = pstr->trans[wc];
25744b87433SJohn Marino 	  pstr->cur_state = prev_st;
25844b87433SJohn Marino 	}
259*6ea1f93eSDaniel Fojt       else if (__glibc_unlikely (mbclen == (size_t) -2))
2604536c563SJohn Marino 	{
2614536c563SJohn Marino 	  /* The buffer doesn't have enough space, finish to build.  */
2624536c563SJohn Marino 	  pstr->cur_state = prev_st;
2634536c563SJohn Marino 	  break;
2644536c563SJohn Marino 	}
26544b87433SJohn Marino 
26644b87433SJohn Marino       /* Write wide character and padding.  */
26744b87433SJohn Marino       pstr->wcs[byte_idx++] = wc;
26844b87433SJohn Marino       /* Write paddings.  */
26944b87433SJohn Marino       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
27044b87433SJohn Marino 	pstr->wcs[byte_idx++] = WEOF;
27144b87433SJohn Marino     }
27244b87433SJohn Marino   pstr->valid_len = byte_idx;
27344b87433SJohn Marino   pstr->valid_raw_len = byte_idx;
27444b87433SJohn Marino }
27544b87433SJohn Marino 
27644b87433SJohn Marino /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
27744b87433SJohn Marino    but for REG_ICASE.  */
27844b87433SJohn Marino 
27944b87433SJohn Marino static reg_errcode_t
280*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
build_wcs_upper_buffer(re_string_t * pstr)28144b87433SJohn Marino build_wcs_upper_buffer (re_string_t *pstr)
28244b87433SJohn Marino {
28344b87433SJohn Marino   mbstate_t prev_st;
28444b87433SJohn Marino   Idx src_idx, byte_idx, end_idx, remain_len;
28544b87433SJohn Marino   size_t mbclen;
28644b87433SJohn Marino #ifdef _LIBC
28744b87433SJohn Marino   char buf[MB_LEN_MAX];
28844b87433SJohn Marino   assert (MB_LEN_MAX >= pstr->mb_cur_max);
28944b87433SJohn Marino #else
29044b87433SJohn Marino   char buf[64];
29144b87433SJohn Marino #endif
29244b87433SJohn Marino 
29344b87433SJohn Marino   byte_idx = pstr->valid_len;
29444b87433SJohn Marino   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
29544b87433SJohn Marino 
29644b87433SJohn Marino   /* The following optimization assumes that ASCII characters can be
29744b87433SJohn Marino      mapped to wide characters with a simple cast.  */
29844b87433SJohn Marino   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
29944b87433SJohn Marino     {
30044b87433SJohn Marino       while (byte_idx < end_idx)
30144b87433SJohn Marino 	{
30244b87433SJohn Marino 	  wchar_t wc;
30344b87433SJohn Marino 
30444b87433SJohn Marino 	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
30544b87433SJohn Marino 	      && mbsinit (&pstr->cur_state))
30644b87433SJohn Marino 	    {
30744b87433SJohn Marino 	      /* In case of a singlebyte character.  */
30844b87433SJohn Marino 	      pstr->mbs[byte_idx]
30944b87433SJohn Marino 		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
31044b87433SJohn Marino 	      /* The next step uses the assumption that wchar_t is encoded
31144b87433SJohn Marino 		 ASCII-safe: all ASCII values can be converted like this.  */
31244b87433SJohn Marino 	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
31344b87433SJohn Marino 	      ++byte_idx;
31444b87433SJohn Marino 	      continue;
31544b87433SJohn Marino 	    }
31644b87433SJohn Marino 
31744b87433SJohn Marino 	  remain_len = end_idx - byte_idx;
31844b87433SJohn Marino 	  prev_st = pstr->cur_state;
31944b87433SJohn Marino 	  mbclen = __mbrtowc (&wc,
32044b87433SJohn Marino 			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
32144b87433SJohn Marino 			       + byte_idx), remain_len, &pstr->cur_state);
322*6ea1f93eSDaniel Fojt 	  if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
32344b87433SJohn Marino 	    {
324*6ea1f93eSDaniel Fojt 	      wchar_t wcu = __towupper (wc);
325*6ea1f93eSDaniel Fojt 	      if (wcu != wc)
32644b87433SJohn Marino 		{
32744b87433SJohn Marino 		  size_t mbcdlen;
32844b87433SJohn Marino 
329*6ea1f93eSDaniel Fojt 		  mbcdlen = __wcrtomb (buf, wcu, &prev_st);
330*6ea1f93eSDaniel Fojt 		  if (__glibc_likely (mbclen == mbcdlen))
33144b87433SJohn Marino 		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
33244b87433SJohn Marino 		  else
33344b87433SJohn Marino 		    {
33444b87433SJohn Marino 		      src_idx = byte_idx;
33544b87433SJohn Marino 		      goto offsets_needed;
33644b87433SJohn Marino 		    }
33744b87433SJohn Marino 		}
33844b87433SJohn Marino 	      else
33944b87433SJohn Marino 		memcpy (pstr->mbs + byte_idx,
34044b87433SJohn Marino 			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
34144b87433SJohn Marino 	      pstr->wcs[byte_idx++] = wcu;
34244b87433SJohn Marino 	      /* Write paddings.  */
34344b87433SJohn Marino 	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
34444b87433SJohn Marino 		pstr->wcs[byte_idx++] = WEOF;
34544b87433SJohn Marino 	    }
3464536c563SJohn Marino 	  else if (mbclen == (size_t) -1 || mbclen == 0
3474536c563SJohn Marino 		   || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
34844b87433SJohn Marino 	    {
3494536c563SJohn Marino 	      /* It is an invalid character, an incomplete character
3504536c563SJohn Marino 		 at the end of the string, or '\0'.  Just use the byte.  */
35144b87433SJohn Marino 	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
35244b87433SJohn Marino 	      pstr->mbs[byte_idx] = ch;
35344b87433SJohn Marino 	      /* And also cast it to wide char.  */
35444b87433SJohn Marino 	      pstr->wcs[byte_idx++] = (wchar_t) ch;
355*6ea1f93eSDaniel Fojt 	      if (__glibc_unlikely (mbclen == (size_t) -1))
35644b87433SJohn Marino 		pstr->cur_state = prev_st;
35744b87433SJohn Marino 	    }
35844b87433SJohn Marino 	  else
35944b87433SJohn Marino 	    {
36044b87433SJohn Marino 	      /* The buffer doesn't have enough space, finish to build.  */
36144b87433SJohn Marino 	      pstr->cur_state = prev_st;
36244b87433SJohn Marino 	      break;
36344b87433SJohn Marino 	    }
36444b87433SJohn Marino 	}
36544b87433SJohn Marino       pstr->valid_len = byte_idx;
36644b87433SJohn Marino       pstr->valid_raw_len = byte_idx;
36744b87433SJohn Marino       return REG_NOERROR;
36844b87433SJohn Marino     }
36944b87433SJohn Marino   else
37044b87433SJohn Marino     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
37144b87433SJohn Marino       {
37244b87433SJohn Marino 	wchar_t wc;
37344b87433SJohn Marino 	const char *p;
37444b87433SJohn Marino       offsets_needed:
37544b87433SJohn Marino 	remain_len = end_idx - byte_idx;
37644b87433SJohn Marino 	prev_st = pstr->cur_state;
377*6ea1f93eSDaniel Fojt 	if (__glibc_unlikely (pstr->trans != NULL))
37844b87433SJohn Marino 	  {
37944b87433SJohn Marino 	    int i, ch;
38044b87433SJohn Marino 
38144b87433SJohn Marino 	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
38244b87433SJohn Marino 	      {
38344b87433SJohn Marino 		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
38444b87433SJohn Marino 		buf[i] = pstr->trans[ch];
38544b87433SJohn Marino 	      }
38644b87433SJohn Marino 	    p = (const char *) buf;
38744b87433SJohn Marino 	  }
38844b87433SJohn Marino 	else
38944b87433SJohn Marino 	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
39044b87433SJohn Marino 	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
391*6ea1f93eSDaniel Fojt 	if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
39244b87433SJohn Marino 	  {
393*6ea1f93eSDaniel Fojt 	    wchar_t wcu = __towupper (wc);
394*6ea1f93eSDaniel Fojt 	    if (wcu != wc)
39544b87433SJohn Marino 	      {
39644b87433SJohn Marino 		size_t mbcdlen;
39744b87433SJohn Marino 
398*6ea1f93eSDaniel Fojt 		mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
399*6ea1f93eSDaniel Fojt 		if (__glibc_likely (mbclen == mbcdlen))
40044b87433SJohn Marino 		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
40144b87433SJohn Marino 		else if (mbcdlen != (size_t) -1)
40244b87433SJohn Marino 		  {
40344b87433SJohn Marino 		    size_t i;
40444b87433SJohn Marino 
40544b87433SJohn Marino 		    if (byte_idx + mbcdlen > pstr->bufs_len)
40644b87433SJohn Marino 		      {
40744b87433SJohn Marino 			pstr->cur_state = prev_st;
40844b87433SJohn Marino 			break;
40944b87433SJohn Marino 		      }
41044b87433SJohn Marino 
41144b87433SJohn Marino 		    if (pstr->offsets == NULL)
41244b87433SJohn Marino 		      {
41344b87433SJohn Marino 			pstr->offsets = re_malloc (Idx, pstr->bufs_len);
41444b87433SJohn Marino 
41544b87433SJohn Marino 			if (pstr->offsets == NULL)
41644b87433SJohn Marino 			  return REG_ESPACE;
41744b87433SJohn Marino 		      }
41844b87433SJohn Marino 		    if (!pstr->offsets_needed)
41944b87433SJohn Marino 		      {
42044b87433SJohn Marino 			for (i = 0; i < (size_t) byte_idx; ++i)
42144b87433SJohn Marino 			  pstr->offsets[i] = i;
42244b87433SJohn Marino 			pstr->offsets_needed = 1;
42344b87433SJohn Marino 		      }
42444b87433SJohn Marino 
42544b87433SJohn Marino 		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
42644b87433SJohn Marino 		    pstr->wcs[byte_idx] = wcu;
42744b87433SJohn Marino 		    pstr->offsets[byte_idx] = src_idx;
42844b87433SJohn Marino 		    for (i = 1; i < mbcdlen; ++i)
42944b87433SJohn Marino 		      {
43044b87433SJohn Marino 			pstr->offsets[byte_idx + i]
43144b87433SJohn Marino 			  = src_idx + (i < mbclen ? i : mbclen - 1);
43244b87433SJohn Marino 			pstr->wcs[byte_idx + i] = WEOF;
43344b87433SJohn Marino 		      }
43444b87433SJohn Marino 		    pstr->len += mbcdlen - mbclen;
43544b87433SJohn Marino 		    if (pstr->raw_stop > src_idx)
43644b87433SJohn Marino 		      pstr->stop += mbcdlen - mbclen;
43744b87433SJohn Marino 		    end_idx = (pstr->bufs_len > pstr->len)
43844b87433SJohn Marino 			      ? pstr->len : pstr->bufs_len;
43944b87433SJohn Marino 		    byte_idx += mbcdlen;
44044b87433SJohn Marino 		    src_idx += mbclen;
44144b87433SJohn Marino 		    continue;
44244b87433SJohn Marino 		  }
44344b87433SJohn Marino 		else
44444b87433SJohn Marino 		  memcpy (pstr->mbs + byte_idx, p, mbclen);
44544b87433SJohn Marino 	      }
44644b87433SJohn Marino 	    else
44744b87433SJohn Marino 	      memcpy (pstr->mbs + byte_idx, p, mbclen);
44844b87433SJohn Marino 
449*6ea1f93eSDaniel Fojt 	    if (__glibc_unlikely (pstr->offsets_needed != 0))
45044b87433SJohn Marino 	      {
45144b87433SJohn Marino 		size_t i;
45244b87433SJohn Marino 		for (i = 0; i < mbclen; ++i)
45344b87433SJohn Marino 		  pstr->offsets[byte_idx + i] = src_idx + i;
45444b87433SJohn Marino 	      }
45544b87433SJohn Marino 	    src_idx += mbclen;
45644b87433SJohn Marino 
45744b87433SJohn Marino 	    pstr->wcs[byte_idx++] = wcu;
45844b87433SJohn Marino 	    /* Write paddings.  */
45944b87433SJohn Marino 	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
46044b87433SJohn Marino 	      pstr->wcs[byte_idx++] = WEOF;
46144b87433SJohn Marino 	  }
4624536c563SJohn Marino 	else if (mbclen == (size_t) -1 || mbclen == 0
4634536c563SJohn Marino 		 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
46444b87433SJohn Marino 	  {
46544b87433SJohn Marino 	    /* It is an invalid character or '\0'.  Just use the byte.  */
46644b87433SJohn Marino 	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
46744b87433SJohn Marino 
468*6ea1f93eSDaniel Fojt 	    if (__glibc_unlikely (pstr->trans != NULL))
46944b87433SJohn Marino 	      ch = pstr->trans [ch];
47044b87433SJohn Marino 	    pstr->mbs[byte_idx] = ch;
47144b87433SJohn Marino 
472*6ea1f93eSDaniel Fojt 	    if (__glibc_unlikely (pstr->offsets_needed != 0))
47344b87433SJohn Marino 	      pstr->offsets[byte_idx] = src_idx;
47444b87433SJohn Marino 	    ++src_idx;
47544b87433SJohn Marino 
47644b87433SJohn Marino 	    /* And also cast it to wide char.  */
47744b87433SJohn Marino 	    pstr->wcs[byte_idx++] = (wchar_t) ch;
478*6ea1f93eSDaniel Fojt 	    if (__glibc_unlikely (mbclen == (size_t) -1))
47944b87433SJohn Marino 	      pstr->cur_state = prev_st;
48044b87433SJohn Marino 	  }
48144b87433SJohn Marino 	else
48244b87433SJohn Marino 	  {
48344b87433SJohn Marino 	    /* The buffer doesn't have enough space, finish to build.  */
48444b87433SJohn Marino 	    pstr->cur_state = prev_st;
48544b87433SJohn Marino 	    break;
48644b87433SJohn Marino 	  }
48744b87433SJohn Marino       }
48844b87433SJohn Marino   pstr->valid_len = byte_idx;
48944b87433SJohn Marino   pstr->valid_raw_len = src_idx;
49044b87433SJohn Marino   return REG_NOERROR;
49144b87433SJohn Marino }
49244b87433SJohn Marino 
49344b87433SJohn Marino /* Skip characters until the index becomes greater than NEW_RAW_IDX.
49444b87433SJohn Marino    Return the index.  */
49544b87433SJohn Marino 
49644b87433SJohn Marino static Idx
re_string_skip_chars(re_string_t * pstr,Idx new_raw_idx,wint_t * last_wc)49744b87433SJohn Marino re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
49844b87433SJohn Marino {
49944b87433SJohn Marino   mbstate_t prev_st;
50044b87433SJohn Marino   Idx rawbuf_idx;
50144b87433SJohn Marino   size_t mbclen;
50244b87433SJohn Marino   wint_t wc = WEOF;
50344b87433SJohn Marino 
50444b87433SJohn Marino   /* Skip the characters which are not necessary to check.  */
50544b87433SJohn Marino   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
50644b87433SJohn Marino        rawbuf_idx < new_raw_idx;)
50744b87433SJohn Marino     {
50844b87433SJohn Marino       wchar_t wc2;
5094536c563SJohn Marino       Idx remain_len = pstr->raw_len - rawbuf_idx;
51044b87433SJohn Marino       prev_st = pstr->cur_state;
51144b87433SJohn Marino       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
51244b87433SJohn Marino 			  remain_len, &pstr->cur_state);
513*6ea1f93eSDaniel Fojt       if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
514*6ea1f93eSDaniel Fojt 			    || mbclen == 0))
51544b87433SJohn Marino 	{
51644b87433SJohn Marino 	  /* We treat these cases as a single byte character.  */
51744b87433SJohn Marino 	  if (mbclen == 0 || remain_len == 0)
51844b87433SJohn Marino 	    wc = L'\0';
51944b87433SJohn Marino 	  else
52044b87433SJohn Marino 	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
52144b87433SJohn Marino 	  mbclen = 1;
52244b87433SJohn Marino 	  pstr->cur_state = prev_st;
52344b87433SJohn Marino 	}
52444b87433SJohn Marino       else
52544b87433SJohn Marino 	wc = wc2;
52644b87433SJohn Marino       /* Then proceed the next character.  */
52744b87433SJohn Marino       rawbuf_idx += mbclen;
52844b87433SJohn Marino     }
52944b87433SJohn Marino   *last_wc = wc;
53044b87433SJohn Marino   return rawbuf_idx;
53144b87433SJohn Marino }
53244b87433SJohn Marino #endif /* RE_ENABLE_I18N  */
53344b87433SJohn Marino 
53444b87433SJohn Marino /* Build the buffer PSTR->MBS, and apply the translation if we need.
53544b87433SJohn Marino    This function is used in case of REG_ICASE.  */
53644b87433SJohn Marino 
53744b87433SJohn Marino static void
build_upper_buffer(re_string_t * pstr)53844b87433SJohn Marino build_upper_buffer (re_string_t *pstr)
53944b87433SJohn Marino {
54044b87433SJohn Marino   Idx char_idx, end_idx;
54144b87433SJohn Marino   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
54244b87433SJohn Marino 
54344b87433SJohn Marino   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
54444b87433SJohn Marino     {
54544b87433SJohn Marino       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
546*6ea1f93eSDaniel Fojt       if (__glibc_unlikely (pstr->trans != NULL))
54744b87433SJohn Marino 	ch = pstr->trans[ch];
54844b87433SJohn Marino       pstr->mbs[char_idx] = toupper (ch);
54944b87433SJohn Marino     }
55044b87433SJohn Marino   pstr->valid_len = char_idx;
55144b87433SJohn Marino   pstr->valid_raw_len = char_idx;
55244b87433SJohn Marino }
55344b87433SJohn Marino 
55444b87433SJohn Marino /* Apply TRANS to the buffer in PSTR.  */
55544b87433SJohn Marino 
55644b87433SJohn Marino static void
re_string_translate_buffer(re_string_t * pstr)55744b87433SJohn Marino re_string_translate_buffer (re_string_t *pstr)
55844b87433SJohn Marino {
55944b87433SJohn Marino   Idx buf_idx, end_idx;
56044b87433SJohn Marino   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
56144b87433SJohn Marino 
56244b87433SJohn Marino   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
56344b87433SJohn Marino     {
56444b87433SJohn Marino       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
56544b87433SJohn Marino       pstr->mbs[buf_idx] = pstr->trans[ch];
56644b87433SJohn Marino     }
56744b87433SJohn Marino 
56844b87433SJohn Marino   pstr->valid_len = buf_idx;
56944b87433SJohn Marino   pstr->valid_raw_len = buf_idx;
57044b87433SJohn Marino }
57144b87433SJohn Marino 
57244b87433SJohn Marino /* This function re-construct the buffers.
57344b87433SJohn Marino    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
57444b87433SJohn Marino    convert to upper case in case of REG_ICASE, apply translation.  */
57544b87433SJohn Marino 
57644b87433SJohn Marino static reg_errcode_t
577*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
re_string_reconstruct(re_string_t * pstr,Idx idx,int eflags)57844b87433SJohn Marino re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
57944b87433SJohn Marino {
58044b87433SJohn Marino   Idx offset;
58144b87433SJohn Marino 
582*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
58344b87433SJohn Marino     offset = idx - pstr->raw_mbs_idx;
58444b87433SJohn Marino   else
58544b87433SJohn Marino     {
58644b87433SJohn Marino       /* Reset buffer.  */
58744b87433SJohn Marino #ifdef RE_ENABLE_I18N
58844b87433SJohn Marino       if (pstr->mb_cur_max > 1)
58944b87433SJohn Marino 	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
59044b87433SJohn Marino #endif /* RE_ENABLE_I18N */
59144b87433SJohn Marino       pstr->len = pstr->raw_len;
59244b87433SJohn Marino       pstr->stop = pstr->raw_stop;
59344b87433SJohn Marino       pstr->valid_len = 0;
59444b87433SJohn Marino       pstr->raw_mbs_idx = 0;
59544b87433SJohn Marino       pstr->valid_raw_len = 0;
59644b87433SJohn Marino       pstr->offsets_needed = 0;
59744b87433SJohn Marino       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
59844b87433SJohn Marino 			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
59944b87433SJohn Marino       if (!pstr->mbs_allocated)
60044b87433SJohn Marino 	pstr->mbs = (unsigned char *) pstr->raw_mbs;
60144b87433SJohn Marino       offset = idx;
60244b87433SJohn Marino     }
60344b87433SJohn Marino 
604*6ea1f93eSDaniel Fojt   if (__glibc_likely (offset != 0))
60544b87433SJohn Marino     {
60644b87433SJohn Marino       /* Should the already checked characters be kept?  */
607*6ea1f93eSDaniel Fojt       if (__glibc_likely (offset < pstr->valid_raw_len))
60844b87433SJohn Marino 	{
60944b87433SJohn Marino 	  /* Yes, move them to the front of the buffer.  */
61044b87433SJohn Marino #ifdef RE_ENABLE_I18N
611*6ea1f93eSDaniel Fojt 	  if (__glibc_unlikely (pstr->offsets_needed))
61244b87433SJohn Marino 	    {
61344b87433SJohn Marino 	      Idx low = 0, high = pstr->valid_len, mid;
61444b87433SJohn Marino 	      do
61544b87433SJohn Marino 		{
61644b87433SJohn Marino 		  mid = (high + low) / 2;
61744b87433SJohn Marino 		  if (pstr->offsets[mid] > offset)
61844b87433SJohn Marino 		    high = mid;
61944b87433SJohn Marino 		  else if (pstr->offsets[mid] < offset)
62044b87433SJohn Marino 		    low = mid + 1;
62144b87433SJohn Marino 		  else
62244b87433SJohn Marino 		    break;
62344b87433SJohn Marino 		}
62444b87433SJohn Marino 	      while (low < high);
62544b87433SJohn Marino 	      if (pstr->offsets[mid] < offset)
62644b87433SJohn Marino 		++mid;
62744b87433SJohn Marino 	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
62844b87433SJohn Marino 							eflags);
62944b87433SJohn Marino 	      /* This can be quite complicated, so handle specially
63044b87433SJohn Marino 		 only the common and easy case where the character with
63144b87433SJohn Marino 		 different length representation of lower and upper
63244b87433SJohn Marino 		 case is present at or after offset.  */
63344b87433SJohn Marino 	      if (pstr->valid_len > offset
63444b87433SJohn Marino 		  && mid == offset && pstr->offsets[mid] == offset)
63544b87433SJohn Marino 		{
63644b87433SJohn Marino 		  memmove (pstr->wcs, pstr->wcs + offset,
63744b87433SJohn Marino 			   (pstr->valid_len - offset) * sizeof (wint_t));
63844b87433SJohn Marino 		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
63944b87433SJohn Marino 		  pstr->valid_len -= offset;
64044b87433SJohn Marino 		  pstr->valid_raw_len -= offset;
64144b87433SJohn Marino 		  for (low = 0; low < pstr->valid_len; low++)
64244b87433SJohn Marino 		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
64344b87433SJohn Marino 		}
64444b87433SJohn Marino 	      else
64544b87433SJohn Marino 		{
64644b87433SJohn Marino 		  /* Otherwise, just find out how long the partial multibyte
64744b87433SJohn Marino 		     character at offset is and fill it with WEOF/255.  */
64844b87433SJohn Marino 		  pstr->len = pstr->raw_len - idx + offset;
64944b87433SJohn Marino 		  pstr->stop = pstr->raw_stop - idx + offset;
65044b87433SJohn Marino 		  pstr->offsets_needed = 0;
65144b87433SJohn Marino 		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
65244b87433SJohn Marino 		    --mid;
65344b87433SJohn Marino 		  while (mid < pstr->valid_len)
65444b87433SJohn Marino 		    if (pstr->wcs[mid] != WEOF)
65544b87433SJohn Marino 		      break;
65644b87433SJohn Marino 		    else
65744b87433SJohn Marino 		      ++mid;
65844b87433SJohn Marino 		  if (mid == pstr->valid_len)
65944b87433SJohn Marino 		    pstr->valid_len = 0;
66044b87433SJohn Marino 		  else
66144b87433SJohn Marino 		    {
66244b87433SJohn Marino 		      pstr->valid_len = pstr->offsets[mid] - offset;
66344b87433SJohn Marino 		      if (pstr->valid_len)
66444b87433SJohn Marino 			{
66544b87433SJohn Marino 			  for (low = 0; low < pstr->valid_len; ++low)
66644b87433SJohn Marino 			    pstr->wcs[low] = WEOF;
66744b87433SJohn Marino 			  memset (pstr->mbs, 255, pstr->valid_len);
66844b87433SJohn Marino 			}
66944b87433SJohn Marino 		    }
67044b87433SJohn Marino 		  pstr->valid_raw_len = pstr->valid_len;
67144b87433SJohn Marino 		}
67244b87433SJohn Marino 	    }
67344b87433SJohn Marino 	  else
67444b87433SJohn Marino #endif
67544b87433SJohn Marino 	    {
67644b87433SJohn Marino 	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
67744b87433SJohn Marino 							eflags);
67844b87433SJohn Marino #ifdef RE_ENABLE_I18N
67944b87433SJohn Marino 	      if (pstr->mb_cur_max > 1)
68044b87433SJohn Marino 		memmove (pstr->wcs, pstr->wcs + offset,
68144b87433SJohn Marino 			 (pstr->valid_len - offset) * sizeof (wint_t));
68244b87433SJohn Marino #endif /* RE_ENABLE_I18N */
683*6ea1f93eSDaniel Fojt 	      if (__glibc_unlikely (pstr->mbs_allocated))
68444b87433SJohn Marino 		memmove (pstr->mbs, pstr->mbs + offset,
68544b87433SJohn Marino 			 pstr->valid_len - offset);
68644b87433SJohn Marino 	      pstr->valid_len -= offset;
68744b87433SJohn Marino 	      pstr->valid_raw_len -= offset;
688*6ea1f93eSDaniel Fojt #if defined DEBUG && DEBUG
68944b87433SJohn Marino 	      assert (pstr->valid_len > 0);
69044b87433SJohn Marino #endif
69144b87433SJohn Marino 	    }
69244b87433SJohn Marino 	}
69344b87433SJohn Marino       else
69444b87433SJohn Marino 	{
69544b87433SJohn Marino #ifdef RE_ENABLE_I18N
69644b87433SJohn Marino 	  /* No, skip all characters until IDX.  */
69744b87433SJohn Marino 	  Idx prev_valid_len = pstr->valid_len;
69844b87433SJohn Marino 
699*6ea1f93eSDaniel Fojt 	  if (__glibc_unlikely (pstr->offsets_needed))
70044b87433SJohn Marino 	    {
70144b87433SJohn Marino 	      pstr->len = pstr->raw_len - idx + offset;
70244b87433SJohn Marino 	      pstr->stop = pstr->raw_stop - idx + offset;
70344b87433SJohn Marino 	      pstr->offsets_needed = 0;
70444b87433SJohn Marino 	    }
70544b87433SJohn Marino #endif
70644b87433SJohn Marino 	  pstr->valid_len = 0;
70744b87433SJohn Marino #ifdef RE_ENABLE_I18N
70844b87433SJohn Marino 	  if (pstr->mb_cur_max > 1)
70944b87433SJohn Marino 	    {
71044b87433SJohn Marino 	      Idx wcs_idx;
71144b87433SJohn Marino 	      wint_t wc = WEOF;
71244b87433SJohn Marino 
71344b87433SJohn Marino 	      if (pstr->is_utf8)
71444b87433SJohn Marino 		{
71544b87433SJohn Marino 		  const unsigned char *raw, *p, *end;
71644b87433SJohn Marino 
71744b87433SJohn Marino 		  /* Special case UTF-8.  Multi-byte chars start with any
71844b87433SJohn Marino 		     byte other than 0x80 - 0xbf.  */
71944b87433SJohn Marino 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
72044b87433SJohn Marino 		  end = raw + (offset - pstr->mb_cur_max);
72144b87433SJohn Marino 		  if (end < pstr->raw_mbs)
72244b87433SJohn Marino 		    end = pstr->raw_mbs;
72344b87433SJohn Marino 		  p = raw + offset - 1;
72444b87433SJohn Marino #ifdef _LIBC
72544b87433SJohn Marino 		  /* We know the wchar_t encoding is UCS4, so for the simple
72644b87433SJohn Marino 		     case, ASCII characters, skip the conversion step.  */
727*6ea1f93eSDaniel Fojt 		  if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
72844b87433SJohn Marino 		    {
72944b87433SJohn Marino 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
73044b87433SJohn Marino 		      /* pstr->valid_len = 0; */
73144b87433SJohn Marino 		      wc = (wchar_t) *p;
73244b87433SJohn Marino 		    }
73344b87433SJohn Marino 		  else
73444b87433SJohn Marino #endif
73544b87433SJohn Marino 		    for (; p >= end; --p)
73644b87433SJohn Marino 		      if ((*p & 0xc0) != 0x80)
73744b87433SJohn Marino 			{
73844b87433SJohn Marino 			  mbstate_t cur_state;
73944b87433SJohn Marino 			  wchar_t wc2;
74044b87433SJohn Marino 			  Idx mlen = raw + pstr->len - p;
7414536c563SJohn Marino 			  unsigned char buf[6];
74244b87433SJohn Marino 			  size_t mbclen;
74344b87433SJohn Marino 
7444536c563SJohn Marino 			  const unsigned char *pp = p;
745*6ea1f93eSDaniel Fojt 			  if (__glibc_unlikely (pstr->trans != NULL))
74644b87433SJohn Marino 			    {
74744b87433SJohn Marino 			      int i = mlen < 6 ? mlen : 6;
74844b87433SJohn Marino 			      while (--i >= 0)
74944b87433SJohn Marino 				buf[i] = pstr->trans[p[i]];
7504536c563SJohn Marino 			      pp = buf;
75144b87433SJohn Marino 			    }
75244b87433SJohn Marino 			  /* XXX Don't use mbrtowc, we know which conversion
75344b87433SJohn Marino 			     to use (UTF-8 -> UCS4).  */
75444b87433SJohn Marino 			  memset (&cur_state, 0, sizeof (cur_state));
7554536c563SJohn Marino 			  mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
75644b87433SJohn Marino 					      &cur_state);
75744b87433SJohn Marino 			  if (raw + offset - p <= mbclen
75844b87433SJohn Marino 			      && mbclen < (size_t) -2)
75944b87433SJohn Marino 			    {
76044b87433SJohn Marino 			      memset (&pstr->cur_state, '\0',
76144b87433SJohn Marino 				      sizeof (mbstate_t));
76244b87433SJohn Marino 			      pstr->valid_len = mbclen - (raw + offset - p);
76344b87433SJohn Marino 			      wc = wc2;
76444b87433SJohn Marino 			    }
76544b87433SJohn Marino 			  break;
76644b87433SJohn Marino 			}
76744b87433SJohn Marino 		}
76844b87433SJohn Marino 
76944b87433SJohn Marino 	      if (wc == WEOF)
77044b87433SJohn Marino 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
77144b87433SJohn Marino 	      if (wc == WEOF)
77244b87433SJohn Marino 		pstr->tip_context
77344b87433SJohn Marino 		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
77444b87433SJohn Marino 	      else
775*6ea1f93eSDaniel Fojt 		pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
77644b87433SJohn Marino 				      && IS_WIDE_WORD_CHAR (wc))
77744b87433SJohn Marino 				     ? CONTEXT_WORD
77844b87433SJohn Marino 				     : ((IS_WIDE_NEWLINE (wc)
77944b87433SJohn Marino 					 && pstr->newline_anchor)
78044b87433SJohn Marino 					? CONTEXT_NEWLINE : 0));
781*6ea1f93eSDaniel Fojt 	      if (__glibc_unlikely (pstr->valid_len))
78244b87433SJohn Marino 		{
78344b87433SJohn Marino 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
78444b87433SJohn Marino 		    pstr->wcs[wcs_idx] = WEOF;
78544b87433SJohn Marino 		  if (pstr->mbs_allocated)
78644b87433SJohn Marino 		    memset (pstr->mbs, 255, pstr->valid_len);
78744b87433SJohn Marino 		}
78844b87433SJohn Marino 	      pstr->valid_raw_len = pstr->valid_len;
78944b87433SJohn Marino 	    }
79044b87433SJohn Marino 	  else
79144b87433SJohn Marino #endif /* RE_ENABLE_I18N */
79244b87433SJohn Marino 	    {
79344b87433SJohn Marino 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
79444b87433SJohn Marino 	      pstr->valid_raw_len = 0;
79544b87433SJohn Marino 	      if (pstr->trans)
79644b87433SJohn Marino 		c = pstr->trans[c];
79744b87433SJohn Marino 	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
79844b87433SJohn Marino 				   ? CONTEXT_WORD
79944b87433SJohn Marino 				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
80044b87433SJohn Marino 				      ? CONTEXT_NEWLINE : 0));
80144b87433SJohn Marino 	    }
80244b87433SJohn Marino 	}
803*6ea1f93eSDaniel Fojt       if (!__glibc_unlikely (pstr->mbs_allocated))
80444b87433SJohn Marino 	pstr->mbs += offset;
80544b87433SJohn Marino     }
80644b87433SJohn Marino   pstr->raw_mbs_idx = idx;
80744b87433SJohn Marino   pstr->len -= offset;
80844b87433SJohn Marino   pstr->stop -= offset;
80944b87433SJohn Marino 
81044b87433SJohn Marino   /* Then build the buffers.  */
81144b87433SJohn Marino #ifdef RE_ENABLE_I18N
81244b87433SJohn Marino   if (pstr->mb_cur_max > 1)
81344b87433SJohn Marino     {
81444b87433SJohn Marino       if (pstr->icase)
81544b87433SJohn Marino 	{
81644b87433SJohn Marino 	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
817*6ea1f93eSDaniel Fojt 	  if (__glibc_unlikely (ret != REG_NOERROR))
81844b87433SJohn Marino 	    return ret;
81944b87433SJohn Marino 	}
82044b87433SJohn Marino       else
82144b87433SJohn Marino 	build_wcs_buffer (pstr);
82244b87433SJohn Marino     }
82344b87433SJohn Marino   else
82444b87433SJohn Marino #endif /* RE_ENABLE_I18N */
825*6ea1f93eSDaniel Fojt     if (__glibc_unlikely (pstr->mbs_allocated))
82644b87433SJohn Marino       {
82744b87433SJohn Marino 	if (pstr->icase)
82844b87433SJohn Marino 	  build_upper_buffer (pstr);
82944b87433SJohn Marino 	else if (pstr->trans != NULL)
83044b87433SJohn Marino 	  re_string_translate_buffer (pstr);
83144b87433SJohn Marino       }
83244b87433SJohn Marino     else
83344b87433SJohn Marino       pstr->valid_len = pstr->len;
83444b87433SJohn Marino 
83544b87433SJohn Marino   pstr->cur_idx = 0;
83644b87433SJohn Marino   return REG_NOERROR;
83744b87433SJohn Marino }
83844b87433SJohn Marino 
83944b87433SJohn Marino static unsigned char
840*6ea1f93eSDaniel Fojt __attribute__ ((pure))
re_string_peek_byte_case(const re_string_t * pstr,Idx idx)84144b87433SJohn Marino re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
84244b87433SJohn Marino {
84344b87433SJohn Marino   int ch;
84444b87433SJohn Marino   Idx off;
84544b87433SJohn Marino 
84644b87433SJohn Marino   /* Handle the common (easiest) cases first.  */
847*6ea1f93eSDaniel Fojt   if (__glibc_likely (!pstr->mbs_allocated))
84844b87433SJohn Marino     return re_string_peek_byte (pstr, idx);
84944b87433SJohn Marino 
85044b87433SJohn Marino #ifdef RE_ENABLE_I18N
85144b87433SJohn Marino   if (pstr->mb_cur_max > 1
85244b87433SJohn Marino       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
85344b87433SJohn Marino     return re_string_peek_byte (pstr, idx);
85444b87433SJohn Marino #endif
85544b87433SJohn Marino 
85644b87433SJohn Marino   off = pstr->cur_idx + idx;
85744b87433SJohn Marino #ifdef RE_ENABLE_I18N
85844b87433SJohn Marino   if (pstr->offsets_needed)
85944b87433SJohn Marino     off = pstr->offsets[off];
86044b87433SJohn Marino #endif
86144b87433SJohn Marino 
86244b87433SJohn Marino   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
86344b87433SJohn Marino 
86444b87433SJohn Marino #ifdef RE_ENABLE_I18N
86544b87433SJohn Marino   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
86644b87433SJohn Marino      this function returns CAPITAL LETTER I instead of first byte of
86744b87433SJohn Marino      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
86844b87433SJohn Marino      since peek_byte_case doesn't advance cur_idx in any way.  */
86944b87433SJohn Marino   if (pstr->offsets_needed && !isascii (ch))
87044b87433SJohn Marino     return re_string_peek_byte (pstr, idx);
87144b87433SJohn Marino #endif
87244b87433SJohn Marino 
87344b87433SJohn Marino   return ch;
87444b87433SJohn Marino }
87544b87433SJohn Marino 
87644b87433SJohn Marino static unsigned char
re_string_fetch_byte_case(re_string_t * pstr)87744b87433SJohn Marino re_string_fetch_byte_case (re_string_t *pstr)
87844b87433SJohn Marino {
879*6ea1f93eSDaniel Fojt   if (__glibc_likely (!pstr->mbs_allocated))
88044b87433SJohn Marino     return re_string_fetch_byte (pstr);
88144b87433SJohn Marino 
88244b87433SJohn Marino #ifdef RE_ENABLE_I18N
88344b87433SJohn Marino   if (pstr->offsets_needed)
88444b87433SJohn Marino     {
88544b87433SJohn Marino       Idx off;
88644b87433SJohn Marino       int ch;
88744b87433SJohn Marino 
88844b87433SJohn Marino       /* For tr_TR.UTF-8 [[:islower:]] there is
88944b87433SJohn Marino 	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
89044b87433SJohn Marino 	 in that case the whole multi-byte character and return
89144b87433SJohn Marino 	 the original letter.  On the other side, with
89244b87433SJohn Marino 	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
89344b87433SJohn Marino 	 anything else would complicate things too much.  */
89444b87433SJohn Marino 
89544b87433SJohn Marino       if (!re_string_first_byte (pstr, pstr->cur_idx))
89644b87433SJohn Marino 	return re_string_fetch_byte (pstr);
89744b87433SJohn Marino 
89844b87433SJohn Marino       off = pstr->offsets[pstr->cur_idx];
89944b87433SJohn Marino       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
90044b87433SJohn Marino 
90144b87433SJohn Marino       if (! isascii (ch))
90244b87433SJohn Marino 	return re_string_fetch_byte (pstr);
90344b87433SJohn Marino 
90444b87433SJohn Marino       re_string_skip_bytes (pstr,
90544b87433SJohn Marino 			    re_string_char_size_at (pstr, pstr->cur_idx));
90644b87433SJohn Marino       return ch;
90744b87433SJohn Marino     }
90844b87433SJohn Marino #endif
90944b87433SJohn Marino 
91044b87433SJohn Marino   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
91144b87433SJohn Marino }
91244b87433SJohn Marino 
91344b87433SJohn Marino static void
re_string_destruct(re_string_t * pstr)91444b87433SJohn Marino re_string_destruct (re_string_t *pstr)
91544b87433SJohn Marino {
91644b87433SJohn Marino #ifdef RE_ENABLE_I18N
91744b87433SJohn Marino   re_free (pstr->wcs);
91844b87433SJohn Marino   re_free (pstr->offsets);
91944b87433SJohn Marino #endif /* RE_ENABLE_I18N  */
92044b87433SJohn Marino   if (pstr->mbs_allocated)
92144b87433SJohn Marino     re_free (pstr->mbs);
92244b87433SJohn Marino }
92344b87433SJohn Marino 
92444b87433SJohn Marino /* Return the context at IDX in INPUT.  */
92544b87433SJohn Marino 
92644b87433SJohn Marino static unsigned int
re_string_context_at(const re_string_t * input,Idx idx,int eflags)92744b87433SJohn Marino re_string_context_at (const re_string_t *input, Idx idx, int eflags)
92844b87433SJohn Marino {
92944b87433SJohn Marino   int c;
930*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (idx < 0))
93144b87433SJohn Marino     /* In this case, we use the value stored in input->tip_context,
93244b87433SJohn Marino        since we can't know the character in input->mbs[-1] here.  */
93344b87433SJohn Marino     return input->tip_context;
934*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (idx == input->len))
93544b87433SJohn Marino     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
93644b87433SJohn Marino 	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
93744b87433SJohn Marino #ifdef RE_ENABLE_I18N
93844b87433SJohn Marino   if (input->mb_cur_max > 1)
93944b87433SJohn Marino     {
94044b87433SJohn Marino       wint_t wc;
94144b87433SJohn Marino       Idx wc_idx = idx;
94244b87433SJohn Marino       while(input->wcs[wc_idx] == WEOF)
94344b87433SJohn Marino 	{
944*6ea1f93eSDaniel Fojt #if defined DEBUG && DEBUG
94544b87433SJohn Marino 	  /* It must not happen.  */
946*6ea1f93eSDaniel Fojt 	  assert (wc_idx >= 0);
94744b87433SJohn Marino #endif
94844b87433SJohn Marino 	  --wc_idx;
949*6ea1f93eSDaniel Fojt 	  if (wc_idx < 0)
95044b87433SJohn Marino 	    return input->tip_context;
95144b87433SJohn Marino 	}
95244b87433SJohn Marino       wc = input->wcs[wc_idx];
953*6ea1f93eSDaniel Fojt       if (__glibc_unlikely (input->word_ops_used != 0)
954*6ea1f93eSDaniel Fojt 	  && IS_WIDE_WORD_CHAR (wc))
95544b87433SJohn Marino 	return CONTEXT_WORD;
95644b87433SJohn Marino       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
95744b87433SJohn Marino 	      ? CONTEXT_NEWLINE : 0);
95844b87433SJohn Marino     }
95944b87433SJohn Marino   else
96044b87433SJohn Marino #endif
96144b87433SJohn Marino     {
96244b87433SJohn Marino       c = re_string_byte_at (input, idx);
96344b87433SJohn Marino       if (bitset_contain (input->word_char, c))
96444b87433SJohn Marino 	return CONTEXT_WORD;
96544b87433SJohn Marino       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
96644b87433SJohn Marino     }
96744b87433SJohn Marino }
96844b87433SJohn Marino 
96944b87433SJohn Marino /* Functions for set operation.  */
97044b87433SJohn Marino 
97144b87433SJohn Marino static reg_errcode_t
972*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
re_node_set_alloc(re_node_set * set,Idx size)97344b87433SJohn Marino re_node_set_alloc (re_node_set *set, Idx size)
97444b87433SJohn Marino {
97544b87433SJohn Marino   set->alloc = size;
97644b87433SJohn Marino   set->nelem = 0;
97744b87433SJohn Marino   set->elems = re_malloc (Idx, size);
978*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (set->elems == NULL)
979*6ea1f93eSDaniel Fojt       && (MALLOC_0_IS_NONNULL || size != 0))
98044b87433SJohn Marino     return REG_ESPACE;
98144b87433SJohn Marino   return REG_NOERROR;
98244b87433SJohn Marino }
98344b87433SJohn Marino 
98444b87433SJohn Marino static reg_errcode_t
985*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
re_node_set_init_1(re_node_set * set,Idx elem)98644b87433SJohn Marino re_node_set_init_1 (re_node_set *set, Idx elem)
98744b87433SJohn Marino {
98844b87433SJohn Marino   set->alloc = 1;
98944b87433SJohn Marino   set->nelem = 1;
99044b87433SJohn Marino   set->elems = re_malloc (Idx, 1);
991*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (set->elems == NULL))
99244b87433SJohn Marino     {
99344b87433SJohn Marino       set->alloc = set->nelem = 0;
99444b87433SJohn Marino       return REG_ESPACE;
99544b87433SJohn Marino     }
99644b87433SJohn Marino   set->elems[0] = elem;
99744b87433SJohn Marino   return REG_NOERROR;
99844b87433SJohn Marino }
99944b87433SJohn Marino 
100044b87433SJohn Marino static reg_errcode_t
1001*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
re_node_set_init_2(re_node_set * set,Idx elem1,Idx elem2)100244b87433SJohn Marino re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
100344b87433SJohn Marino {
100444b87433SJohn Marino   set->alloc = 2;
100544b87433SJohn Marino   set->elems = re_malloc (Idx, 2);
1006*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (set->elems == NULL))
100744b87433SJohn Marino     return REG_ESPACE;
100844b87433SJohn Marino   if (elem1 == elem2)
100944b87433SJohn Marino     {
101044b87433SJohn Marino       set->nelem = 1;
101144b87433SJohn Marino       set->elems[0] = elem1;
101244b87433SJohn Marino     }
101344b87433SJohn Marino   else
101444b87433SJohn Marino     {
101544b87433SJohn Marino       set->nelem = 2;
101644b87433SJohn Marino       if (elem1 < elem2)
101744b87433SJohn Marino 	{
101844b87433SJohn Marino 	  set->elems[0] = elem1;
101944b87433SJohn Marino 	  set->elems[1] = elem2;
102044b87433SJohn Marino 	}
102144b87433SJohn Marino       else
102244b87433SJohn Marino 	{
102344b87433SJohn Marino 	  set->elems[0] = elem2;
102444b87433SJohn Marino 	  set->elems[1] = elem1;
102544b87433SJohn Marino 	}
102644b87433SJohn Marino     }
102744b87433SJohn Marino   return REG_NOERROR;
102844b87433SJohn Marino }
102944b87433SJohn Marino 
103044b87433SJohn Marino static reg_errcode_t
1031*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)103244b87433SJohn Marino re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
103344b87433SJohn Marino {
103444b87433SJohn Marino   dest->nelem = src->nelem;
103544b87433SJohn Marino   if (src->nelem > 0)
103644b87433SJohn Marino     {
103744b87433SJohn Marino       dest->alloc = dest->nelem;
103844b87433SJohn Marino       dest->elems = re_malloc (Idx, dest->alloc);
1039*6ea1f93eSDaniel Fojt       if (__glibc_unlikely (dest->elems == NULL))
104044b87433SJohn Marino 	{
104144b87433SJohn Marino 	  dest->alloc = dest->nelem = 0;
104244b87433SJohn Marino 	  return REG_ESPACE;
104344b87433SJohn Marino 	}
104444b87433SJohn Marino       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
104544b87433SJohn Marino     }
104644b87433SJohn Marino   else
104744b87433SJohn Marino     re_node_set_init_empty (dest);
104844b87433SJohn Marino   return REG_NOERROR;
104944b87433SJohn Marino }
105044b87433SJohn Marino 
105144b87433SJohn Marino /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
105244b87433SJohn Marino    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
105344b87433SJohn Marino    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
105444b87433SJohn Marino 
105544b87433SJohn Marino static reg_errcode_t
1056*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)105744b87433SJohn Marino re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
105844b87433SJohn Marino 			   const re_node_set *src2)
105944b87433SJohn Marino {
106044b87433SJohn Marino   Idx i1, i2, is, id, delta, sbase;
106144b87433SJohn Marino   if (src1->nelem == 0 || src2->nelem == 0)
106244b87433SJohn Marino     return REG_NOERROR;
106344b87433SJohn Marino 
106444b87433SJohn Marino   /* We need dest->nelem + 2 * elems_in_intersection; this is a
106544b87433SJohn Marino      conservative estimate.  */
106644b87433SJohn Marino   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
106744b87433SJohn Marino     {
106844b87433SJohn Marino       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
106944b87433SJohn Marino       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1070*6ea1f93eSDaniel Fojt       if (__glibc_unlikely (new_elems == NULL))
107144b87433SJohn Marino 	return REG_ESPACE;
107244b87433SJohn Marino       dest->elems = new_elems;
107344b87433SJohn Marino       dest->alloc = new_alloc;
107444b87433SJohn Marino     }
107544b87433SJohn Marino 
107644b87433SJohn Marino   /* Find the items in the intersection of SRC1 and SRC2, and copy
107744b87433SJohn Marino      into the top of DEST those that are not already in DEST itself.  */
107844b87433SJohn Marino   sbase = dest->nelem + src1->nelem + src2->nelem;
107944b87433SJohn Marino   i1 = src1->nelem - 1;
108044b87433SJohn Marino   i2 = src2->nelem - 1;
108144b87433SJohn Marino   id = dest->nelem - 1;
108244b87433SJohn Marino   for (;;)
108344b87433SJohn Marino     {
108444b87433SJohn Marino       if (src1->elems[i1] == src2->elems[i2])
108544b87433SJohn Marino 	{
108644b87433SJohn Marino 	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1087*6ea1f93eSDaniel Fojt 	  while (id >= 0 && dest->elems[id] > src1->elems[i1])
108844b87433SJohn Marino 	    --id;
108944b87433SJohn Marino 
1090*6ea1f93eSDaniel Fojt 	  if (id < 0 || dest->elems[id] != src1->elems[i1])
109144b87433SJohn Marino             dest->elems[--sbase] = src1->elems[i1];
109244b87433SJohn Marino 
1093*6ea1f93eSDaniel Fojt 	  if (--i1 < 0 || --i2 < 0)
109444b87433SJohn Marino 	    break;
109544b87433SJohn Marino 	}
109644b87433SJohn Marino 
109744b87433SJohn Marino       /* Lower the highest of the two items.  */
109844b87433SJohn Marino       else if (src1->elems[i1] < src2->elems[i2])
109944b87433SJohn Marino 	{
1100*6ea1f93eSDaniel Fojt 	  if (--i2 < 0)
110144b87433SJohn Marino 	    break;
110244b87433SJohn Marino 	}
110344b87433SJohn Marino       else
110444b87433SJohn Marino 	{
1105*6ea1f93eSDaniel Fojt 	  if (--i1 < 0)
110644b87433SJohn Marino 	    break;
110744b87433SJohn Marino 	}
110844b87433SJohn Marino     }
110944b87433SJohn Marino 
111044b87433SJohn Marino   id = dest->nelem - 1;
111144b87433SJohn Marino   is = dest->nelem + src1->nelem + src2->nelem - 1;
111244b87433SJohn Marino   delta = is - sbase + 1;
111344b87433SJohn Marino 
111444b87433SJohn Marino   /* Now copy.  When DELTA becomes zero, the remaining
111544b87433SJohn Marino      DEST elements are already in place; this is more or
111644b87433SJohn Marino      less the same loop that is in re_node_set_merge.  */
111744b87433SJohn Marino   dest->nelem += delta;
1118*6ea1f93eSDaniel Fojt   if (delta > 0 && id >= 0)
111944b87433SJohn Marino     for (;;)
112044b87433SJohn Marino       {
112144b87433SJohn Marino 	if (dest->elems[is] > dest->elems[id])
112244b87433SJohn Marino 	  {
112344b87433SJohn Marino 	    /* Copy from the top.  */
112444b87433SJohn Marino 	    dest->elems[id + delta--] = dest->elems[is--];
112544b87433SJohn Marino 	    if (delta == 0)
112644b87433SJohn Marino 	      break;
112744b87433SJohn Marino 	  }
112844b87433SJohn Marino 	else
112944b87433SJohn Marino 	  {
113044b87433SJohn Marino 	    /* Slide from the bottom.  */
113144b87433SJohn Marino 	    dest->elems[id + delta] = dest->elems[id];
1132*6ea1f93eSDaniel Fojt 	    if (--id < 0)
113344b87433SJohn Marino 	      break;
113444b87433SJohn Marino 	  }
113544b87433SJohn Marino       }
113644b87433SJohn Marino 
113744b87433SJohn Marino   /* Copy remaining SRC elements.  */
113844b87433SJohn Marino   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
113944b87433SJohn Marino 
114044b87433SJohn Marino   return REG_NOERROR;
114144b87433SJohn Marino }
114244b87433SJohn Marino 
114344b87433SJohn Marino /* Calculate the union set of the sets SRC1 and SRC2. And store it to
114444b87433SJohn Marino    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
114544b87433SJohn Marino 
114644b87433SJohn Marino static reg_errcode_t
1147*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)114844b87433SJohn Marino re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
114944b87433SJohn Marino 			const re_node_set *src2)
115044b87433SJohn Marino {
115144b87433SJohn Marino   Idx i1, i2, id;
115244b87433SJohn Marino   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
115344b87433SJohn Marino     {
115444b87433SJohn Marino       dest->alloc = src1->nelem + src2->nelem;
115544b87433SJohn Marino       dest->elems = re_malloc (Idx, dest->alloc);
1156*6ea1f93eSDaniel Fojt       if (__glibc_unlikely (dest->elems == NULL))
115744b87433SJohn Marino 	return REG_ESPACE;
115844b87433SJohn Marino     }
115944b87433SJohn Marino   else
116044b87433SJohn Marino     {
116144b87433SJohn Marino       if (src1 != NULL && src1->nelem > 0)
116244b87433SJohn Marino 	return re_node_set_init_copy (dest, src1);
116344b87433SJohn Marino       else if (src2 != NULL && src2->nelem > 0)
116444b87433SJohn Marino 	return re_node_set_init_copy (dest, src2);
116544b87433SJohn Marino       else
116644b87433SJohn Marino 	re_node_set_init_empty (dest);
116744b87433SJohn Marino       return REG_NOERROR;
116844b87433SJohn Marino     }
116944b87433SJohn Marino   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
117044b87433SJohn Marino     {
117144b87433SJohn Marino       if (src1->elems[i1] > src2->elems[i2])
117244b87433SJohn Marino 	{
117344b87433SJohn Marino 	  dest->elems[id++] = src2->elems[i2++];
117444b87433SJohn Marino 	  continue;
117544b87433SJohn Marino 	}
117644b87433SJohn Marino       if (src1->elems[i1] == src2->elems[i2])
117744b87433SJohn Marino 	++i2;
117844b87433SJohn Marino       dest->elems[id++] = src1->elems[i1++];
117944b87433SJohn Marino     }
118044b87433SJohn Marino   if (i1 < src1->nelem)
118144b87433SJohn Marino     {
118244b87433SJohn Marino       memcpy (dest->elems + id, src1->elems + i1,
118344b87433SJohn Marino 	     (src1->nelem - i1) * sizeof (Idx));
118444b87433SJohn Marino       id += src1->nelem - i1;
118544b87433SJohn Marino     }
118644b87433SJohn Marino   else if (i2 < src2->nelem)
118744b87433SJohn Marino     {
118844b87433SJohn Marino       memcpy (dest->elems + id, src2->elems + i2,
118944b87433SJohn Marino 	     (src2->nelem - i2) * sizeof (Idx));
119044b87433SJohn Marino       id += src2->nelem - i2;
119144b87433SJohn Marino     }
119244b87433SJohn Marino   dest->nelem = id;
119344b87433SJohn Marino   return REG_NOERROR;
119444b87433SJohn Marino }
119544b87433SJohn Marino 
119644b87433SJohn Marino /* Calculate the union set of the sets DEST and SRC. And store it to
119744b87433SJohn Marino    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
119844b87433SJohn Marino 
119944b87433SJohn Marino static reg_errcode_t
1200*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
re_node_set_merge(re_node_set * dest,const re_node_set * src)120144b87433SJohn Marino re_node_set_merge (re_node_set *dest, const re_node_set *src)
120244b87433SJohn Marino {
120344b87433SJohn Marino   Idx is, id, sbase, delta;
120444b87433SJohn Marino   if (src == NULL || src->nelem == 0)
120544b87433SJohn Marino     return REG_NOERROR;
120644b87433SJohn Marino   if (dest->alloc < 2 * src->nelem + dest->nelem)
120744b87433SJohn Marino     {
120844b87433SJohn Marino       Idx new_alloc = 2 * (src->nelem + dest->alloc);
120944b87433SJohn Marino       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1210*6ea1f93eSDaniel Fojt       if (__glibc_unlikely (new_buffer == NULL))
121144b87433SJohn Marino 	return REG_ESPACE;
121244b87433SJohn Marino       dest->elems = new_buffer;
121344b87433SJohn Marino       dest->alloc = new_alloc;
121444b87433SJohn Marino     }
121544b87433SJohn Marino 
1216*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (dest->nelem == 0))
121744b87433SJohn Marino     {
121844b87433SJohn Marino       dest->nelem = src->nelem;
121944b87433SJohn Marino       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
122044b87433SJohn Marino       return REG_NOERROR;
122144b87433SJohn Marino     }
122244b87433SJohn Marino 
122344b87433SJohn Marino   /* Copy into the top of DEST the items of SRC that are not
122444b87433SJohn Marino      found in DEST.  Maybe we could binary search in DEST?  */
122544b87433SJohn Marino   for (sbase = dest->nelem + 2 * src->nelem,
1226*6ea1f93eSDaniel Fojt        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
122744b87433SJohn Marino     {
122844b87433SJohn Marino       if (dest->elems[id] == src->elems[is])
122944b87433SJohn Marino 	is--, id--;
123044b87433SJohn Marino       else if (dest->elems[id] < src->elems[is])
123144b87433SJohn Marino 	dest->elems[--sbase] = src->elems[is--];
123244b87433SJohn Marino       else /* if (dest->elems[id] > src->elems[is]) */
123344b87433SJohn Marino 	--id;
123444b87433SJohn Marino     }
123544b87433SJohn Marino 
1236*6ea1f93eSDaniel Fojt   if (is >= 0)
123744b87433SJohn Marino     {
123844b87433SJohn Marino       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
123944b87433SJohn Marino       sbase -= is + 1;
124044b87433SJohn Marino       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
124144b87433SJohn Marino     }
124244b87433SJohn Marino 
124344b87433SJohn Marino   id = dest->nelem - 1;
124444b87433SJohn Marino   is = dest->nelem + 2 * src->nelem - 1;
124544b87433SJohn Marino   delta = is - sbase + 1;
124644b87433SJohn Marino   if (delta == 0)
124744b87433SJohn Marino     return REG_NOERROR;
124844b87433SJohn Marino 
124944b87433SJohn Marino   /* Now copy.  When DELTA becomes zero, the remaining
125044b87433SJohn Marino      DEST elements are already in place.  */
125144b87433SJohn Marino   dest->nelem += delta;
125244b87433SJohn Marino   for (;;)
125344b87433SJohn Marino     {
125444b87433SJohn Marino       if (dest->elems[is] > dest->elems[id])
125544b87433SJohn Marino 	{
125644b87433SJohn Marino 	  /* Copy from the top.  */
125744b87433SJohn Marino 	  dest->elems[id + delta--] = dest->elems[is--];
125844b87433SJohn Marino 	  if (delta == 0)
125944b87433SJohn Marino 	    break;
126044b87433SJohn Marino 	}
126144b87433SJohn Marino       else
126244b87433SJohn Marino 	{
126344b87433SJohn Marino 	  /* Slide from the bottom.  */
126444b87433SJohn Marino 	  dest->elems[id + delta] = dest->elems[id];
1265*6ea1f93eSDaniel Fojt 	  if (--id < 0)
126644b87433SJohn Marino 	    {
126744b87433SJohn Marino 	      /* Copy remaining SRC elements.  */
126844b87433SJohn Marino 	      memcpy (dest->elems, dest->elems + sbase,
126944b87433SJohn Marino 		      delta * sizeof (Idx));
127044b87433SJohn Marino 	      break;
127144b87433SJohn Marino 	    }
127244b87433SJohn Marino 	}
127344b87433SJohn Marino     }
127444b87433SJohn Marino 
127544b87433SJohn Marino   return REG_NOERROR;
127644b87433SJohn Marino }
127744b87433SJohn Marino 
127844b87433SJohn Marino /* Insert the new element ELEM to the re_node_set* SET.
127944b87433SJohn Marino    SET should not already have ELEM.
128044b87433SJohn Marino    Return true if successful.  */
128144b87433SJohn Marino 
128244b87433SJohn Marino static bool
1283*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
re_node_set_insert(re_node_set * set,Idx elem)128444b87433SJohn Marino re_node_set_insert (re_node_set *set, Idx elem)
128544b87433SJohn Marino {
128644b87433SJohn Marino   Idx idx;
128744b87433SJohn Marino   /* In case the set is empty.  */
128844b87433SJohn Marino   if (set->alloc == 0)
1289*6ea1f93eSDaniel Fojt     return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
129044b87433SJohn Marino 
1291*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (set->nelem) == 0)
129244b87433SJohn Marino     {
129344b87433SJohn Marino       /* We already guaranteed above that set->alloc != 0.  */
129444b87433SJohn Marino       set->elems[0] = elem;
129544b87433SJohn Marino       ++set->nelem;
129644b87433SJohn Marino       return true;
129744b87433SJohn Marino     }
129844b87433SJohn Marino 
129944b87433SJohn Marino   /* Realloc if we need.  */
130044b87433SJohn Marino   if (set->alloc == set->nelem)
130144b87433SJohn Marino     {
130244b87433SJohn Marino       Idx *new_elems;
130344b87433SJohn Marino       set->alloc = set->alloc * 2;
130444b87433SJohn Marino       new_elems = re_realloc (set->elems, Idx, set->alloc);
1305*6ea1f93eSDaniel Fojt       if (__glibc_unlikely (new_elems == NULL))
130644b87433SJohn Marino 	return false;
130744b87433SJohn Marino       set->elems = new_elems;
130844b87433SJohn Marino     }
130944b87433SJohn Marino 
131044b87433SJohn Marino   /* Move the elements which follows the new element.  Test the
131144b87433SJohn Marino      first element separately to skip a check in the inner loop.  */
131244b87433SJohn Marino   if (elem < set->elems[0])
131344b87433SJohn Marino     {
131444b87433SJohn Marino       idx = 0;
131544b87433SJohn Marino       for (idx = set->nelem; idx > 0; idx--)
131644b87433SJohn Marino 	set->elems[idx] = set->elems[idx - 1];
131744b87433SJohn Marino     }
131844b87433SJohn Marino   else
131944b87433SJohn Marino     {
132044b87433SJohn Marino       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
132144b87433SJohn Marino 	set->elems[idx] = set->elems[idx - 1];
132244b87433SJohn Marino     }
132344b87433SJohn Marino 
132444b87433SJohn Marino   /* Insert the new element.  */
132544b87433SJohn Marino   set->elems[idx] = elem;
132644b87433SJohn Marino   ++set->nelem;
132744b87433SJohn Marino   return true;
132844b87433SJohn Marino }
132944b87433SJohn Marino 
133044b87433SJohn Marino /* Insert the new element ELEM to the re_node_set* SET.
133144b87433SJohn Marino    SET should not already have any element greater than or equal to ELEM.
133244b87433SJohn Marino    Return true if successful.  */
133344b87433SJohn Marino 
133444b87433SJohn Marino static bool
1335*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
re_node_set_insert_last(re_node_set * set,Idx elem)133644b87433SJohn Marino re_node_set_insert_last (re_node_set *set, Idx elem)
133744b87433SJohn Marino {
133844b87433SJohn Marino   /* Realloc if we need.  */
133944b87433SJohn Marino   if (set->alloc == set->nelem)
134044b87433SJohn Marino     {
134144b87433SJohn Marino       Idx *new_elems;
134244b87433SJohn Marino       set->alloc = (set->alloc + 1) * 2;
134344b87433SJohn Marino       new_elems = re_realloc (set->elems, Idx, set->alloc);
1344*6ea1f93eSDaniel Fojt       if (__glibc_unlikely (new_elems == NULL))
134544b87433SJohn Marino 	return false;
134644b87433SJohn Marino       set->elems = new_elems;
134744b87433SJohn Marino     }
134844b87433SJohn Marino 
134944b87433SJohn Marino   /* Insert the new element.  */
135044b87433SJohn Marino   set->elems[set->nelem++] = elem;
135144b87433SJohn Marino   return true;
135244b87433SJohn Marino }
135344b87433SJohn Marino 
135444b87433SJohn Marino /* Compare two node sets SET1 and SET2.
135544b87433SJohn Marino    Return true if SET1 and SET2 are equivalent.  */
135644b87433SJohn Marino 
135744b87433SJohn Marino static bool
1358*6ea1f93eSDaniel Fojt __attribute__ ((pure))
re_node_set_compare(const re_node_set * set1,const re_node_set * set2)135944b87433SJohn Marino re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
136044b87433SJohn Marino {
136144b87433SJohn Marino   Idx i;
136244b87433SJohn Marino   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
136344b87433SJohn Marino     return false;
1364*6ea1f93eSDaniel Fojt   for (i = set1->nelem ; --i >= 0 ; )
136544b87433SJohn Marino     if (set1->elems[i] != set2->elems[i])
136644b87433SJohn Marino       return false;
136744b87433SJohn Marino   return true;
136844b87433SJohn Marino }
136944b87433SJohn Marino 
137044b87433SJohn Marino /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
137144b87433SJohn Marino 
137244b87433SJohn Marino static Idx
1373*6ea1f93eSDaniel Fojt __attribute__ ((pure))
re_node_set_contains(const re_node_set * set,Idx elem)137444b87433SJohn Marino re_node_set_contains (const re_node_set *set, Idx elem)
137544b87433SJohn Marino {
137644b87433SJohn Marino   __re_size_t idx, right, mid;
1377*6ea1f93eSDaniel Fojt   if (set->nelem <= 0)
137844b87433SJohn Marino     return 0;
137944b87433SJohn Marino 
138044b87433SJohn Marino   /* Binary search the element.  */
138144b87433SJohn Marino   idx = 0;
138244b87433SJohn Marino   right = set->nelem - 1;
138344b87433SJohn Marino   while (idx < right)
138444b87433SJohn Marino     {
138544b87433SJohn Marino       mid = (idx + right) / 2;
138644b87433SJohn Marino       if (set->elems[mid] < elem)
138744b87433SJohn Marino 	idx = mid + 1;
138844b87433SJohn Marino       else
138944b87433SJohn Marino 	right = mid;
139044b87433SJohn Marino     }
139144b87433SJohn Marino   return set->elems[idx] == elem ? idx + 1 : 0;
139244b87433SJohn Marino }
139344b87433SJohn Marino 
139444b87433SJohn Marino static void
re_node_set_remove_at(re_node_set * set,Idx idx)139544b87433SJohn Marino re_node_set_remove_at (re_node_set *set, Idx idx)
139644b87433SJohn Marino {
1397*6ea1f93eSDaniel Fojt   if (idx < 0 || idx >= set->nelem)
139844b87433SJohn Marino     return;
139944b87433SJohn Marino   --set->nelem;
140044b87433SJohn Marino   for (; idx < set->nelem; idx++)
140144b87433SJohn Marino     set->elems[idx] = set->elems[idx + 1];
140244b87433SJohn Marino }
140344b87433SJohn Marino 
140444b87433SJohn Marino 
140544b87433SJohn Marino /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1406*6ea1f93eSDaniel Fojt    Or return -1 if an error occurred.  */
140744b87433SJohn Marino 
140844b87433SJohn Marino static Idx
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)140944b87433SJohn Marino re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
141044b87433SJohn Marino {
1411*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
141244b87433SJohn Marino     {
141344b87433SJohn Marino       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
141444b87433SJohn Marino       Idx *new_nexts, *new_indices;
141544b87433SJohn Marino       re_node_set *new_edests, *new_eclosures;
141644b87433SJohn Marino       re_token_t *new_nodes;
14174536c563SJohn Marino 
14184536c563SJohn Marino       /* Avoid overflows in realloc.  */
14194536c563SJohn Marino       const size_t max_object_size = MAX (sizeof (re_token_t),
142044b87433SJohn Marino 					  MAX (sizeof (re_node_set),
142144b87433SJohn Marino 					       sizeof (Idx)));
1422*6ea1f93eSDaniel Fojt       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1423*6ea1f93eSDaniel Fojt 			    < new_nodes_alloc))
1424*6ea1f93eSDaniel Fojt 	return -1;
142544b87433SJohn Marino 
142644b87433SJohn Marino       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1427*6ea1f93eSDaniel Fojt       if (__glibc_unlikely (new_nodes == NULL))
1428*6ea1f93eSDaniel Fojt 	return -1;
142944b87433SJohn Marino       dfa->nodes = new_nodes;
143044b87433SJohn Marino       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
143144b87433SJohn Marino       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
143244b87433SJohn Marino       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
143344b87433SJohn Marino       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1434*6ea1f93eSDaniel Fojt       if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1435*6ea1f93eSDaniel Fojt 			    || new_edests == NULL || new_eclosures == NULL))
1436*6ea1f93eSDaniel Fojt 	{
1437*6ea1f93eSDaniel Fojt 	   re_free (new_nexts);
1438*6ea1f93eSDaniel Fojt 	   re_free (new_indices);
1439*6ea1f93eSDaniel Fojt 	   re_free (new_edests);
1440*6ea1f93eSDaniel Fojt 	   re_free (new_eclosures);
1441*6ea1f93eSDaniel Fojt 	   return -1;
1442*6ea1f93eSDaniel Fojt 	}
144344b87433SJohn Marino       dfa->nexts = new_nexts;
144444b87433SJohn Marino       dfa->org_indices = new_indices;
144544b87433SJohn Marino       dfa->edests = new_edests;
144644b87433SJohn Marino       dfa->eclosures = new_eclosures;
144744b87433SJohn Marino       dfa->nodes_alloc = new_nodes_alloc;
144844b87433SJohn Marino     }
144944b87433SJohn Marino   dfa->nodes[dfa->nodes_len] = token;
145044b87433SJohn Marino   dfa->nodes[dfa->nodes_len].constraint = 0;
145144b87433SJohn Marino #ifdef RE_ENABLE_I18N
145244b87433SJohn Marino   dfa->nodes[dfa->nodes_len].accept_mb =
14534536c563SJohn Marino     ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
14544536c563SJohn Marino      || token.type == COMPLEX_BRACKET);
145544b87433SJohn Marino #endif
1456*6ea1f93eSDaniel Fojt   dfa->nexts[dfa->nodes_len] = -1;
145744b87433SJohn Marino   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
145844b87433SJohn Marino   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
145944b87433SJohn Marino   return dfa->nodes_len++;
146044b87433SJohn Marino }
146144b87433SJohn Marino 
14624536c563SJohn Marino static re_hashval_t
calc_state_hash(const re_node_set * nodes,unsigned int context)146344b87433SJohn Marino calc_state_hash (const re_node_set *nodes, unsigned int context)
146444b87433SJohn Marino {
146544b87433SJohn Marino   re_hashval_t hash = nodes->nelem + context;
146644b87433SJohn Marino   Idx i;
146744b87433SJohn Marino   for (i = 0 ; i < nodes->nelem ; i++)
146844b87433SJohn Marino     hash += nodes->elems[i];
146944b87433SJohn Marino   return hash;
147044b87433SJohn Marino }
147144b87433SJohn Marino 
147244b87433SJohn Marino /* Search for the state whose node_set is equivalent to NODES.
147344b87433SJohn Marino    Return the pointer to the state, if we found it in the DFA.
147444b87433SJohn Marino    Otherwise create the new one and return it.  In case of an error
147544b87433SJohn Marino    return NULL and set the error code in ERR.
147644b87433SJohn Marino    Note: - We assume NULL as the invalid state, then it is possible that
147744b87433SJohn Marino 	   return value is NULL and ERR is REG_NOERROR.
147844b87433SJohn Marino 	 - We never return non-NULL value in case of any errors, it is for
147944b87433SJohn Marino 	   optimization.  */
148044b87433SJohn Marino 
148144b87433SJohn Marino static re_dfastate_t *
1482*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
re_acquire_state(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes)148344b87433SJohn Marino re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
148444b87433SJohn Marino 		  const re_node_set *nodes)
148544b87433SJohn Marino {
148644b87433SJohn Marino   re_hashval_t hash;
148744b87433SJohn Marino   re_dfastate_t *new_state;
148844b87433SJohn Marino   struct re_state_table_entry *spot;
148944b87433SJohn Marino   Idx i;
1490*6ea1f93eSDaniel Fojt #if defined GCC_LINT || defined lint
149144b87433SJohn Marino   /* Suppress bogus uninitialized-variable warnings.  */
149244b87433SJohn Marino   *err = REG_NOERROR;
149344b87433SJohn Marino #endif
1494*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (nodes->nelem == 0))
149544b87433SJohn Marino     {
149644b87433SJohn Marino       *err = REG_NOERROR;
149744b87433SJohn Marino       return NULL;
149844b87433SJohn Marino     }
149944b87433SJohn Marino   hash = calc_state_hash (nodes, 0);
150044b87433SJohn Marino   spot = dfa->state_table + (hash & dfa->state_hash_mask);
150144b87433SJohn Marino 
150244b87433SJohn Marino   for (i = 0 ; i < spot->num ; i++)
150344b87433SJohn Marino     {
150444b87433SJohn Marino       re_dfastate_t *state = spot->array[i];
150544b87433SJohn Marino       if (hash != state->hash)
150644b87433SJohn Marino 	continue;
150744b87433SJohn Marino       if (re_node_set_compare (&state->nodes, nodes))
150844b87433SJohn Marino 	return state;
150944b87433SJohn Marino     }
151044b87433SJohn Marino 
151144b87433SJohn Marino   /* There are no appropriate state in the dfa, create the new one.  */
151244b87433SJohn Marino   new_state = create_ci_newstate (dfa, nodes, hash);
1513*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (new_state == NULL))
151444b87433SJohn Marino     *err = REG_ESPACE;
151544b87433SJohn Marino 
151644b87433SJohn Marino   return new_state;
151744b87433SJohn Marino }
151844b87433SJohn Marino 
151944b87433SJohn Marino /* Search for the state whose node_set is equivalent to NODES and
152044b87433SJohn Marino    whose context is equivalent to CONTEXT.
152144b87433SJohn Marino    Return the pointer to the state, if we found it in the DFA.
152244b87433SJohn Marino    Otherwise create the new one and return it.  In case of an error
152344b87433SJohn Marino    return NULL and set the error code in ERR.
152444b87433SJohn Marino    Note: - We assume NULL as the invalid state, then it is possible that
152544b87433SJohn Marino 	   return value is NULL and ERR is REG_NOERROR.
152644b87433SJohn Marino 	 - We never return non-NULL value in case of any errors, it is for
152744b87433SJohn Marino 	   optimization.  */
152844b87433SJohn Marino 
152944b87433SJohn Marino static re_dfastate_t *
1530*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
re_acquire_state_context(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)153144b87433SJohn Marino re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
153244b87433SJohn Marino 			  const re_node_set *nodes, unsigned int context)
153344b87433SJohn Marino {
153444b87433SJohn Marino   re_hashval_t hash;
153544b87433SJohn Marino   re_dfastate_t *new_state;
153644b87433SJohn Marino   struct re_state_table_entry *spot;
153744b87433SJohn Marino   Idx i;
1538*6ea1f93eSDaniel Fojt #if defined GCC_LINT || defined lint
153944b87433SJohn Marino   /* Suppress bogus uninitialized-variable warnings.  */
154044b87433SJohn Marino   *err = REG_NOERROR;
154144b87433SJohn Marino #endif
154244b87433SJohn Marino   if (nodes->nelem == 0)
154344b87433SJohn Marino     {
154444b87433SJohn Marino       *err = REG_NOERROR;
154544b87433SJohn Marino       return NULL;
154644b87433SJohn Marino     }
154744b87433SJohn Marino   hash = calc_state_hash (nodes, context);
154844b87433SJohn Marino   spot = dfa->state_table + (hash & dfa->state_hash_mask);
154944b87433SJohn Marino 
155044b87433SJohn Marino   for (i = 0 ; i < spot->num ; i++)
155144b87433SJohn Marino     {
155244b87433SJohn Marino       re_dfastate_t *state = spot->array[i];
155344b87433SJohn Marino       if (state->hash == hash
155444b87433SJohn Marino 	  && state->context == context
155544b87433SJohn Marino 	  && re_node_set_compare (state->entrance_nodes, nodes))
155644b87433SJohn Marino 	return state;
155744b87433SJohn Marino     }
15584536c563SJohn Marino   /* There are no appropriate state in 'dfa', create the new one.  */
155944b87433SJohn Marino   new_state = create_cd_newstate (dfa, nodes, context, hash);
1560*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (new_state == NULL))
156144b87433SJohn Marino     *err = REG_ESPACE;
156244b87433SJohn Marino 
156344b87433SJohn Marino   return new_state;
156444b87433SJohn Marino }
156544b87433SJohn Marino 
156644b87433SJohn Marino /* Finish initialization of the new state NEWSTATE, and using its hash value
156744b87433SJohn Marino    HASH put in the appropriate bucket of DFA's state table.  Return value
156844b87433SJohn Marino    indicates the error code if failed.  */
156944b87433SJohn Marino 
157044b87433SJohn Marino static reg_errcode_t
157144b87433SJohn Marino __attribute_warn_unused_result__
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,re_hashval_t hash)157244b87433SJohn Marino register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
157344b87433SJohn Marino 		re_hashval_t hash)
157444b87433SJohn Marino {
157544b87433SJohn Marino   struct re_state_table_entry *spot;
157644b87433SJohn Marino   reg_errcode_t err;
157744b87433SJohn Marino   Idx i;
157844b87433SJohn Marino 
157944b87433SJohn Marino   newstate->hash = hash;
158044b87433SJohn Marino   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1581*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (err != REG_NOERROR))
158244b87433SJohn Marino     return REG_ESPACE;
158344b87433SJohn Marino   for (i = 0; i < newstate->nodes.nelem; i++)
158444b87433SJohn Marino     {
158544b87433SJohn Marino       Idx elem = newstate->nodes.elems[i];
158644b87433SJohn Marino       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
15874536c563SJohn Marino 	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
158844b87433SJohn Marino 	  return REG_ESPACE;
158944b87433SJohn Marino     }
159044b87433SJohn Marino 
159144b87433SJohn Marino   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1592*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (spot->alloc <= spot->num))
159344b87433SJohn Marino     {
159444b87433SJohn Marino       Idx new_alloc = 2 * spot->num + 2;
159544b87433SJohn Marino       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
159644b87433SJohn Marino 					      new_alloc);
1597*6ea1f93eSDaniel Fojt       if (__glibc_unlikely (new_array == NULL))
159844b87433SJohn Marino 	return REG_ESPACE;
159944b87433SJohn Marino       spot->array = new_array;
160044b87433SJohn Marino       spot->alloc = new_alloc;
160144b87433SJohn Marino     }
160244b87433SJohn Marino   spot->array[spot->num++] = newstate;
160344b87433SJohn Marino   return REG_NOERROR;
160444b87433SJohn Marino }
160544b87433SJohn Marino 
160644b87433SJohn Marino static void
free_state(re_dfastate_t * state)160744b87433SJohn Marino free_state (re_dfastate_t *state)
160844b87433SJohn Marino {
160944b87433SJohn Marino   re_node_set_free (&state->non_eps_nodes);
161044b87433SJohn Marino   re_node_set_free (&state->inveclosure);
161144b87433SJohn Marino   if (state->entrance_nodes != &state->nodes)
161244b87433SJohn Marino     {
161344b87433SJohn Marino       re_node_set_free (state->entrance_nodes);
161444b87433SJohn Marino       re_free (state->entrance_nodes);
161544b87433SJohn Marino     }
161644b87433SJohn Marino   re_node_set_free (&state->nodes);
161744b87433SJohn Marino   re_free (state->word_trtable);
161844b87433SJohn Marino   re_free (state->trtable);
161944b87433SJohn Marino   re_free (state);
162044b87433SJohn Marino }
162144b87433SJohn Marino 
16224536c563SJohn Marino /* Create the new state which is independent of contexts.
162344b87433SJohn Marino    Return the new state if succeeded, otherwise return NULL.  */
162444b87433SJohn Marino 
162544b87433SJohn Marino static re_dfastate_t *
1626*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,re_hashval_t hash)162744b87433SJohn Marino create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
162844b87433SJohn Marino 		    re_hashval_t hash)
162944b87433SJohn Marino {
163044b87433SJohn Marino   Idx i;
163144b87433SJohn Marino   reg_errcode_t err;
163244b87433SJohn Marino   re_dfastate_t *newstate;
163344b87433SJohn Marino 
163444b87433SJohn Marino   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1635*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (newstate == NULL))
163644b87433SJohn Marino     return NULL;
163744b87433SJohn Marino   err = re_node_set_init_copy (&newstate->nodes, nodes);
1638*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (err != REG_NOERROR))
163944b87433SJohn Marino     {
164044b87433SJohn Marino       re_free (newstate);
164144b87433SJohn Marino       return NULL;
164244b87433SJohn Marino     }
164344b87433SJohn Marino 
164444b87433SJohn Marino   newstate->entrance_nodes = &newstate->nodes;
164544b87433SJohn Marino   for (i = 0 ; i < nodes->nelem ; i++)
164644b87433SJohn Marino     {
164744b87433SJohn Marino       re_token_t *node = dfa->nodes + nodes->elems[i];
164844b87433SJohn Marino       re_token_type_t type = node->type;
164944b87433SJohn Marino       if (type == CHARACTER && !node->constraint)
165044b87433SJohn Marino 	continue;
165144b87433SJohn Marino #ifdef RE_ENABLE_I18N
165244b87433SJohn Marino       newstate->accept_mb |= node->accept_mb;
165344b87433SJohn Marino #endif /* RE_ENABLE_I18N */
165444b87433SJohn Marino 
165544b87433SJohn Marino       /* If the state has the halt node, the state is a halt state.  */
165644b87433SJohn Marino       if (type == END_OF_RE)
165744b87433SJohn Marino 	newstate->halt = 1;
165844b87433SJohn Marino       else if (type == OP_BACK_REF)
165944b87433SJohn Marino 	newstate->has_backref = 1;
166044b87433SJohn Marino       else if (type == ANCHOR || node->constraint)
166144b87433SJohn Marino 	newstate->has_constraint = 1;
166244b87433SJohn Marino     }
166344b87433SJohn Marino   err = register_state (dfa, newstate, hash);
1664*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (err != REG_NOERROR))
166544b87433SJohn Marino     {
166644b87433SJohn Marino       free_state (newstate);
166744b87433SJohn Marino       newstate = NULL;
166844b87433SJohn Marino     }
166944b87433SJohn Marino   return newstate;
167044b87433SJohn Marino }
167144b87433SJohn Marino 
167244b87433SJohn Marino /* Create the new state which is depend on the context CONTEXT.
167344b87433SJohn Marino    Return the new state if succeeded, otherwise return NULL.  */
167444b87433SJohn Marino 
167544b87433SJohn Marino static re_dfastate_t *
1676*6ea1f93eSDaniel Fojt __attribute_warn_unused_result__
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,re_hashval_t hash)167744b87433SJohn Marino create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
167844b87433SJohn Marino 		    unsigned int context, re_hashval_t hash)
167944b87433SJohn Marino {
168044b87433SJohn Marino   Idx i, nctx_nodes = 0;
168144b87433SJohn Marino   reg_errcode_t err;
168244b87433SJohn Marino   re_dfastate_t *newstate;
168344b87433SJohn Marino 
168444b87433SJohn Marino   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1685*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (newstate == NULL))
168644b87433SJohn Marino     return NULL;
168744b87433SJohn Marino   err = re_node_set_init_copy (&newstate->nodes, nodes);
1688*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (err != REG_NOERROR))
168944b87433SJohn Marino     {
169044b87433SJohn Marino       re_free (newstate);
169144b87433SJohn Marino       return NULL;
169244b87433SJohn Marino     }
169344b87433SJohn Marino 
169444b87433SJohn Marino   newstate->context = context;
169544b87433SJohn Marino   newstate->entrance_nodes = &newstate->nodes;
169644b87433SJohn Marino 
169744b87433SJohn Marino   for (i = 0 ; i < nodes->nelem ; i++)
169844b87433SJohn Marino     {
169944b87433SJohn Marino       re_token_t *node = dfa->nodes + nodes->elems[i];
170044b87433SJohn Marino       re_token_type_t type = node->type;
170144b87433SJohn Marino       unsigned int constraint = node->constraint;
170244b87433SJohn Marino 
170344b87433SJohn Marino       if (type == CHARACTER && !constraint)
170444b87433SJohn Marino 	continue;
170544b87433SJohn Marino #ifdef RE_ENABLE_I18N
170644b87433SJohn Marino       newstate->accept_mb |= node->accept_mb;
170744b87433SJohn Marino #endif /* RE_ENABLE_I18N */
170844b87433SJohn Marino 
170944b87433SJohn Marino       /* If the state has the halt node, the state is a halt state.  */
171044b87433SJohn Marino       if (type == END_OF_RE)
171144b87433SJohn Marino 	newstate->halt = 1;
171244b87433SJohn Marino       else if (type == OP_BACK_REF)
171344b87433SJohn Marino 	newstate->has_backref = 1;
171444b87433SJohn Marino 
171544b87433SJohn Marino       if (constraint)
171644b87433SJohn Marino 	{
171744b87433SJohn Marino 	  if (newstate->entrance_nodes == &newstate->nodes)
171844b87433SJohn Marino 	    {
171944b87433SJohn Marino 	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1720*6ea1f93eSDaniel Fojt 	      if (__glibc_unlikely (newstate->entrance_nodes == NULL))
172144b87433SJohn Marino 		{
172244b87433SJohn Marino 		  free_state (newstate);
172344b87433SJohn Marino 		  return NULL;
172444b87433SJohn Marino 		}
172544b87433SJohn Marino 	      if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
172644b87433SJohn Marino 		  != REG_NOERROR)
172744b87433SJohn Marino 		return NULL;
172844b87433SJohn Marino 	      nctx_nodes = 0;
172944b87433SJohn Marino 	      newstate->has_constraint = 1;
173044b87433SJohn Marino 	    }
173144b87433SJohn Marino 
173244b87433SJohn Marino 	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
173344b87433SJohn Marino 	    {
173444b87433SJohn Marino 	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
173544b87433SJohn Marino 	      ++nctx_nodes;
173644b87433SJohn Marino 	    }
173744b87433SJohn Marino 	}
173844b87433SJohn Marino     }
173944b87433SJohn Marino   err = register_state (dfa, newstate, hash);
1740*6ea1f93eSDaniel Fojt   if (__glibc_unlikely (err != REG_NOERROR))
174144b87433SJohn Marino     {
174244b87433SJohn Marino       free_state (newstate);
174344b87433SJohn Marino       newstate = NULL;
174444b87433SJohn Marino     }
174544b87433SJohn Marino   return  newstate;
174644b87433SJohn Marino }
1747