195b7b453SJohn Marino /* Extended regular expression matching and search library.
2*09d4459fSDaniel Fojt Copyright (C) 2002-2020 Free Software Foundation, Inc.
395b7b453SJohn Marino This file is part of the GNU C Library.
495b7b453SJohn Marino Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
595b7b453SJohn Marino
6680a9cb8SJohn Marino The GNU C Library is free software; you can redistribute it and/or
7680a9cb8SJohn Marino modify it under the terms of the GNU General Public
8680a9cb8SJohn Marino License as published by the Free Software Foundation; either
9680a9cb8SJohn Marino version 3 of the License, or (at your option) any later version.
1095b7b453SJohn Marino
11680a9cb8SJohn Marino The GNU C Library is distributed in the hope that it will be useful,
1295b7b453SJohn Marino but WITHOUT ANY WARRANTY; without even the implied warranty of
13680a9cb8SJohn Marino MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14680a9cb8SJohn Marino General Public License for more details.
1595b7b453SJohn Marino
16680a9cb8SJohn Marino You should have received a copy of the GNU General Public
17680a9cb8SJohn Marino License along with the GNU C Library; if not, see
18*09d4459fSDaniel Fojt <https://www.gnu.org/licenses/>. */
1995b7b453SJohn Marino
2095b7b453SJohn Marino static void re_string_construct_common (const char *str, Idx len,
2195b7b453SJohn Marino re_string_t *pstr,
2295b7b453SJohn Marino RE_TRANSLATE_TYPE trans, bool icase,
23*09d4459fSDaniel Fojt const re_dfa_t *dfa);
2495b7b453SJohn Marino static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
2595b7b453SJohn Marino const re_node_set *nodes,
26*09d4459fSDaniel Fojt re_hashval_t hash);
2795b7b453SJohn Marino static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
2895b7b453SJohn Marino const re_node_set *nodes,
2995b7b453SJohn Marino unsigned int context,
30*09d4459fSDaniel Fojt re_hashval_t hash);
31*09d4459fSDaniel Fojt static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
32*09d4459fSDaniel Fojt Idx new_buf_len);
33*09d4459fSDaniel Fojt #ifdef RE_ENABLE_I18N
34*09d4459fSDaniel Fojt static void build_wcs_buffer (re_string_t *pstr);
35*09d4459fSDaniel Fojt static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
36*09d4459fSDaniel Fojt #endif /* RE_ENABLE_I18N */
37*09d4459fSDaniel Fojt static void build_upper_buffer (re_string_t *pstr);
38*09d4459fSDaniel Fojt static void re_string_translate_buffer (re_string_t *pstr);
39*09d4459fSDaniel Fojt static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
40*09d4459fSDaniel Fojt int eflags) __attribute__ ((pure));
4195b7b453SJohn Marino
4295b7b453SJohn Marino /* Functions for string operation. */
4395b7b453SJohn Marino
4495b7b453SJohn Marino /* This function allocate the buffers. It is necessary to call
4595b7b453SJohn Marino re_string_reconstruct before using the object. */
4695b7b453SJohn Marino
4795b7b453SJohn Marino static reg_errcode_t
48*09d4459fSDaniel Fojt __attribute_warn_unused_result__
re_string_allocate(re_string_t * pstr,const char * str,Idx len,Idx init_len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)4995b7b453SJohn Marino re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
5095b7b453SJohn Marino RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
5195b7b453SJohn Marino {
5295b7b453SJohn Marino reg_errcode_t ret;
5395b7b453SJohn Marino Idx init_buf_len;
5495b7b453SJohn Marino
5595b7b453SJohn Marino /* Ensure at least one character fits into the buffers. */
5695b7b453SJohn Marino if (init_len < dfa->mb_cur_max)
5795b7b453SJohn Marino init_len = dfa->mb_cur_max;
5895b7b453SJohn Marino init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
5995b7b453SJohn Marino re_string_construct_common (str, len, pstr, trans, icase, dfa);
6095b7b453SJohn Marino
6195b7b453SJohn Marino ret = re_string_realloc_buffers (pstr, init_buf_len);
62*09d4459fSDaniel Fojt if (__glibc_unlikely (ret != REG_NOERROR))
6395b7b453SJohn Marino return ret;
6495b7b453SJohn Marino
6595b7b453SJohn Marino pstr->word_char = dfa->word_char;
6695b7b453SJohn Marino pstr->word_ops_used = dfa->word_ops_used;
6795b7b453SJohn Marino pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
6895b7b453SJohn Marino pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
6995b7b453SJohn Marino pstr->valid_raw_len = pstr->valid_len;
7095b7b453SJohn Marino return REG_NOERROR;
7195b7b453SJohn Marino }
7295b7b453SJohn Marino
7395b7b453SJohn Marino /* This function allocate the buffers, and initialize them. */
7495b7b453SJohn Marino
7595b7b453SJohn Marino static reg_errcode_t
76*09d4459fSDaniel Fojt __attribute_warn_unused_result__
re_string_construct(re_string_t * pstr,const char * str,Idx len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)7795b7b453SJohn Marino re_string_construct (re_string_t *pstr, const char *str, Idx len,
7895b7b453SJohn Marino RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
7995b7b453SJohn Marino {
8095b7b453SJohn Marino reg_errcode_t ret;
8195b7b453SJohn Marino memset (pstr, '\0', sizeof (re_string_t));
8295b7b453SJohn Marino re_string_construct_common (str, len, pstr, trans, icase, dfa);
8395b7b453SJohn Marino
8495b7b453SJohn Marino if (len > 0)
8595b7b453SJohn Marino {
8695b7b453SJohn Marino ret = re_string_realloc_buffers (pstr, len + 1);
87*09d4459fSDaniel Fojt if (__glibc_unlikely (ret != REG_NOERROR))
8895b7b453SJohn Marino return ret;
8995b7b453SJohn Marino }
9095b7b453SJohn Marino pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
9195b7b453SJohn Marino
9295b7b453SJohn Marino if (icase)
9395b7b453SJohn Marino {
9495b7b453SJohn Marino #ifdef RE_ENABLE_I18N
9595b7b453SJohn Marino if (dfa->mb_cur_max > 1)
9695b7b453SJohn Marino {
9795b7b453SJohn Marino while (1)
9895b7b453SJohn Marino {
9995b7b453SJohn Marino ret = build_wcs_upper_buffer (pstr);
100*09d4459fSDaniel Fojt if (__glibc_unlikely (ret != REG_NOERROR))
10195b7b453SJohn Marino return ret;
10295b7b453SJohn Marino if (pstr->valid_raw_len >= len)
10395b7b453SJohn Marino break;
10495b7b453SJohn Marino if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
10595b7b453SJohn Marino break;
10695b7b453SJohn Marino ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
107*09d4459fSDaniel Fojt if (__glibc_unlikely (ret != REG_NOERROR))
10895b7b453SJohn Marino return ret;
10995b7b453SJohn Marino }
11095b7b453SJohn Marino }
11195b7b453SJohn Marino else
11295b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
11395b7b453SJohn Marino build_upper_buffer (pstr);
11495b7b453SJohn Marino }
11595b7b453SJohn Marino else
11695b7b453SJohn Marino {
11795b7b453SJohn Marino #ifdef RE_ENABLE_I18N
11895b7b453SJohn Marino if (dfa->mb_cur_max > 1)
11995b7b453SJohn Marino build_wcs_buffer (pstr);
12095b7b453SJohn Marino else
12195b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
12295b7b453SJohn Marino {
12395b7b453SJohn Marino if (trans != NULL)
12495b7b453SJohn Marino re_string_translate_buffer (pstr);
12595b7b453SJohn Marino else
12695b7b453SJohn Marino {
12795b7b453SJohn Marino pstr->valid_len = pstr->bufs_len;
12895b7b453SJohn Marino pstr->valid_raw_len = pstr->bufs_len;
12995b7b453SJohn Marino }
13095b7b453SJohn Marino }
13195b7b453SJohn Marino }
13295b7b453SJohn Marino
13395b7b453SJohn Marino return REG_NOERROR;
13495b7b453SJohn Marino }
13595b7b453SJohn Marino
13695b7b453SJohn Marino /* Helper functions for re_string_allocate, and re_string_construct. */
13795b7b453SJohn Marino
13895b7b453SJohn Marino static reg_errcode_t
139*09d4459fSDaniel Fojt __attribute_warn_unused_result__
re_string_realloc_buffers(re_string_t * pstr,Idx new_buf_len)14095b7b453SJohn Marino re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
14195b7b453SJohn Marino {
14295b7b453SJohn Marino #ifdef RE_ENABLE_I18N
14395b7b453SJohn Marino if (pstr->mb_cur_max > 1)
14495b7b453SJohn Marino {
14595b7b453SJohn Marino wint_t *new_wcs;
14695b7b453SJohn Marino
147cf28ed85SJohn Marino /* Avoid overflow in realloc. */
148cf28ed85SJohn Marino const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
149*09d4459fSDaniel Fojt if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
150*09d4459fSDaniel Fojt < new_buf_len))
15195b7b453SJohn Marino return REG_ESPACE;
15295b7b453SJohn Marino
15395b7b453SJohn Marino new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
154*09d4459fSDaniel Fojt if (__glibc_unlikely (new_wcs == NULL))
15595b7b453SJohn Marino return REG_ESPACE;
15695b7b453SJohn Marino pstr->wcs = new_wcs;
15795b7b453SJohn Marino if (pstr->offsets != NULL)
15895b7b453SJohn Marino {
15995b7b453SJohn Marino Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
160*09d4459fSDaniel Fojt if (__glibc_unlikely (new_offsets == NULL))
16195b7b453SJohn Marino return REG_ESPACE;
16295b7b453SJohn Marino pstr->offsets = new_offsets;
16395b7b453SJohn Marino }
16495b7b453SJohn Marino }
16595b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
16695b7b453SJohn Marino if (pstr->mbs_allocated)
16795b7b453SJohn Marino {
16895b7b453SJohn Marino unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
16995b7b453SJohn Marino new_buf_len);
170*09d4459fSDaniel Fojt if (__glibc_unlikely (new_mbs == NULL))
17195b7b453SJohn Marino return REG_ESPACE;
17295b7b453SJohn Marino pstr->mbs = new_mbs;
17395b7b453SJohn Marino }
17495b7b453SJohn Marino pstr->bufs_len = new_buf_len;
17595b7b453SJohn Marino return REG_NOERROR;
17695b7b453SJohn Marino }
17795b7b453SJohn Marino
17895b7b453SJohn Marino
17995b7b453SJohn Marino static void
re_string_construct_common(const char * str,Idx len,re_string_t * pstr,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)18095b7b453SJohn Marino re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
18195b7b453SJohn Marino RE_TRANSLATE_TYPE trans, bool icase,
18295b7b453SJohn Marino const re_dfa_t *dfa)
18395b7b453SJohn Marino {
18495b7b453SJohn Marino pstr->raw_mbs = (const unsigned char *) str;
18595b7b453SJohn Marino pstr->len = len;
18695b7b453SJohn Marino pstr->raw_len = len;
18795b7b453SJohn Marino pstr->trans = trans;
18895b7b453SJohn Marino pstr->icase = icase;
18995b7b453SJohn Marino pstr->mbs_allocated = (trans != NULL || icase);
19095b7b453SJohn Marino pstr->mb_cur_max = dfa->mb_cur_max;
19195b7b453SJohn Marino pstr->is_utf8 = dfa->is_utf8;
19295b7b453SJohn Marino pstr->map_notascii = dfa->map_notascii;
19395b7b453SJohn Marino pstr->stop = pstr->len;
19495b7b453SJohn Marino pstr->raw_stop = pstr->stop;
19595b7b453SJohn Marino }
19695b7b453SJohn Marino
19795b7b453SJohn Marino #ifdef RE_ENABLE_I18N
19895b7b453SJohn Marino
19995b7b453SJohn Marino /* Build wide character buffer PSTR->WCS.
20095b7b453SJohn Marino If the byte sequence of the string are:
20195b7b453SJohn Marino <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
20295b7b453SJohn Marino Then wide character buffer will be:
20395b7b453SJohn Marino <wc1> , WEOF , <wc2> , WEOF , <wc3>
20495b7b453SJohn Marino We use WEOF for padding, they indicate that the position isn't
20595b7b453SJohn Marino a first byte of a multibyte character.
20695b7b453SJohn Marino
20795b7b453SJohn Marino Note that this function assumes PSTR->VALID_LEN elements are already
20895b7b453SJohn Marino built and starts from PSTR->VALID_LEN. */
20995b7b453SJohn Marino
21095b7b453SJohn Marino static void
build_wcs_buffer(re_string_t * pstr)21195b7b453SJohn Marino build_wcs_buffer (re_string_t *pstr)
21295b7b453SJohn Marino {
21395b7b453SJohn Marino #ifdef _LIBC
21495b7b453SJohn Marino unsigned char buf[MB_LEN_MAX];
215*09d4459fSDaniel Fojt DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
21695b7b453SJohn Marino #else
21795b7b453SJohn Marino unsigned char buf[64];
21895b7b453SJohn Marino #endif
21995b7b453SJohn Marino mbstate_t prev_st;
22095b7b453SJohn Marino Idx byte_idx, end_idx, remain_len;
22195b7b453SJohn Marino size_t mbclen;
22295b7b453SJohn Marino
22395b7b453SJohn Marino /* Build the buffers from pstr->valid_len to either pstr->len or
22495b7b453SJohn Marino pstr->bufs_len. */
22595b7b453SJohn Marino end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
22695b7b453SJohn Marino for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
22795b7b453SJohn Marino {
22895b7b453SJohn Marino wchar_t wc;
22995b7b453SJohn Marino const char *p;
23095b7b453SJohn Marino
23195b7b453SJohn Marino remain_len = end_idx - byte_idx;
23295b7b453SJohn Marino prev_st = pstr->cur_state;
23395b7b453SJohn Marino /* Apply the translation if we need. */
234*09d4459fSDaniel Fojt if (__glibc_unlikely (pstr->trans != NULL))
23595b7b453SJohn Marino {
23695b7b453SJohn Marino int i, ch;
23795b7b453SJohn Marino
23895b7b453SJohn Marino for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
23995b7b453SJohn Marino {
24095b7b453SJohn Marino ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
24195b7b453SJohn Marino buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
24295b7b453SJohn Marino }
24395b7b453SJohn Marino p = (const char *) buf;
24495b7b453SJohn Marino }
24595b7b453SJohn Marino else
24695b7b453SJohn Marino p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
24795b7b453SJohn Marino mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
248*09d4459fSDaniel Fojt if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
249*09d4459fSDaniel Fojt || (mbclen == (size_t) -2
250*09d4459fSDaniel Fojt && pstr->bufs_len >= pstr->len)))
25195b7b453SJohn Marino {
25295b7b453SJohn Marino /* We treat these cases as a singlebyte character. */
25395b7b453SJohn Marino mbclen = 1;
25495b7b453SJohn Marino wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
255*09d4459fSDaniel Fojt if (__glibc_unlikely (pstr->trans != NULL))
25695b7b453SJohn Marino wc = pstr->trans[wc];
25795b7b453SJohn Marino pstr->cur_state = prev_st;
25895b7b453SJohn Marino }
259*09d4459fSDaniel Fojt else if (__glibc_unlikely (mbclen == (size_t) -2))
260cf28ed85SJohn Marino {
261cf28ed85SJohn Marino /* The buffer doesn't have enough space, finish to build. */
262cf28ed85SJohn Marino pstr->cur_state = prev_st;
263cf28ed85SJohn Marino break;
264cf28ed85SJohn Marino }
26595b7b453SJohn Marino
26695b7b453SJohn Marino /* Write wide character and padding. */
26795b7b453SJohn Marino pstr->wcs[byte_idx++] = wc;
26895b7b453SJohn Marino /* Write paddings. */
26995b7b453SJohn Marino for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
27095b7b453SJohn Marino pstr->wcs[byte_idx++] = WEOF;
27195b7b453SJohn Marino }
27295b7b453SJohn Marino pstr->valid_len = byte_idx;
27395b7b453SJohn Marino pstr->valid_raw_len = byte_idx;
27495b7b453SJohn Marino }
27595b7b453SJohn Marino
27695b7b453SJohn Marino /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
27795b7b453SJohn Marino but for REG_ICASE. */
27895b7b453SJohn Marino
27995b7b453SJohn Marino static reg_errcode_t
280*09d4459fSDaniel Fojt __attribute_warn_unused_result__
build_wcs_upper_buffer(re_string_t * pstr)28195b7b453SJohn Marino build_wcs_upper_buffer (re_string_t *pstr)
28295b7b453SJohn Marino {
28395b7b453SJohn Marino mbstate_t prev_st;
28495b7b453SJohn Marino Idx src_idx, byte_idx, end_idx, remain_len;
28595b7b453SJohn Marino size_t mbclen;
28695b7b453SJohn Marino #ifdef _LIBC
28795b7b453SJohn Marino char buf[MB_LEN_MAX];
288*09d4459fSDaniel Fojt DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
28995b7b453SJohn Marino #else
29095b7b453SJohn Marino char buf[64];
29195b7b453SJohn Marino #endif
29295b7b453SJohn Marino
29395b7b453SJohn Marino byte_idx = pstr->valid_len;
29495b7b453SJohn Marino end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
29595b7b453SJohn Marino
29695b7b453SJohn Marino /* The following optimization assumes that ASCII characters can be
29795b7b453SJohn Marino mapped to wide characters with a simple cast. */
29895b7b453SJohn Marino if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
29995b7b453SJohn Marino {
30095b7b453SJohn Marino while (byte_idx < end_idx)
30195b7b453SJohn Marino {
30295b7b453SJohn Marino wchar_t wc;
30395b7b453SJohn Marino
30495b7b453SJohn Marino if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
30595b7b453SJohn Marino && mbsinit (&pstr->cur_state))
30695b7b453SJohn Marino {
30795b7b453SJohn Marino /* In case of a singlebyte character. */
30895b7b453SJohn Marino pstr->mbs[byte_idx]
30995b7b453SJohn Marino = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
31095b7b453SJohn Marino /* The next step uses the assumption that wchar_t is encoded
31195b7b453SJohn Marino ASCII-safe: all ASCII values can be converted like this. */
31295b7b453SJohn Marino pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
31395b7b453SJohn Marino ++byte_idx;
31495b7b453SJohn Marino continue;
31595b7b453SJohn Marino }
31695b7b453SJohn Marino
31795b7b453SJohn Marino remain_len = end_idx - byte_idx;
31895b7b453SJohn Marino prev_st = pstr->cur_state;
31995b7b453SJohn Marino mbclen = __mbrtowc (&wc,
32095b7b453SJohn Marino ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
32195b7b453SJohn Marino + byte_idx), remain_len, &pstr->cur_state);
322*09d4459fSDaniel Fojt if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
32395b7b453SJohn Marino {
324dc7c36e4SJohn Marino wchar_t wcu = __towupper (wc);
325680a9cb8SJohn Marino if (wcu != wc)
32695b7b453SJohn Marino {
32795b7b453SJohn Marino size_t mbcdlen;
32895b7b453SJohn Marino
329dc7c36e4SJohn Marino mbcdlen = __wcrtomb (buf, wcu, &prev_st);
330*09d4459fSDaniel Fojt if (__glibc_likely (mbclen == mbcdlen))
33195b7b453SJohn Marino memcpy (pstr->mbs + byte_idx, buf, mbclen);
33295b7b453SJohn Marino else
33395b7b453SJohn Marino {
33495b7b453SJohn Marino src_idx = byte_idx;
33595b7b453SJohn Marino goto offsets_needed;
33695b7b453SJohn Marino }
33795b7b453SJohn Marino }
33895b7b453SJohn Marino else
33995b7b453SJohn Marino memcpy (pstr->mbs + byte_idx,
34095b7b453SJohn Marino pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
34195b7b453SJohn Marino pstr->wcs[byte_idx++] = wcu;
34295b7b453SJohn Marino /* Write paddings. */
34395b7b453SJohn Marino for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
34495b7b453SJohn Marino pstr->wcs[byte_idx++] = WEOF;
34595b7b453SJohn Marino }
346cf28ed85SJohn Marino else if (mbclen == (size_t) -1 || mbclen == 0
347cf28ed85SJohn Marino || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
34895b7b453SJohn Marino {
349cf28ed85SJohn Marino /* It is an invalid character, an incomplete character
350cf28ed85SJohn Marino at the end of the string, or '\0'. Just use the byte. */
35195b7b453SJohn Marino int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
35295b7b453SJohn Marino pstr->mbs[byte_idx] = ch;
35395b7b453SJohn Marino /* And also cast it to wide char. */
35495b7b453SJohn Marino pstr->wcs[byte_idx++] = (wchar_t) ch;
355*09d4459fSDaniel Fojt if (__glibc_unlikely (mbclen == (size_t) -1))
35695b7b453SJohn Marino pstr->cur_state = prev_st;
35795b7b453SJohn Marino }
35895b7b453SJohn Marino else
35995b7b453SJohn Marino {
36095b7b453SJohn Marino /* The buffer doesn't have enough space, finish to build. */
36195b7b453SJohn Marino pstr->cur_state = prev_st;
36295b7b453SJohn Marino break;
36395b7b453SJohn Marino }
36495b7b453SJohn Marino }
36595b7b453SJohn Marino pstr->valid_len = byte_idx;
36695b7b453SJohn Marino pstr->valid_raw_len = byte_idx;
36795b7b453SJohn Marino return REG_NOERROR;
36895b7b453SJohn Marino }
36995b7b453SJohn Marino else
37095b7b453SJohn Marino for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
37195b7b453SJohn Marino {
37295b7b453SJohn Marino wchar_t wc;
37395b7b453SJohn Marino const char *p;
37495b7b453SJohn Marino offsets_needed:
37595b7b453SJohn Marino remain_len = end_idx - byte_idx;
37695b7b453SJohn Marino prev_st = pstr->cur_state;
377*09d4459fSDaniel Fojt if (__glibc_unlikely (pstr->trans != NULL))
37895b7b453SJohn Marino {
37995b7b453SJohn Marino int i, ch;
38095b7b453SJohn Marino
38195b7b453SJohn Marino for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
38295b7b453SJohn Marino {
38395b7b453SJohn Marino ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
38495b7b453SJohn Marino buf[i] = pstr->trans[ch];
38595b7b453SJohn Marino }
38695b7b453SJohn Marino p = (const char *) buf;
38795b7b453SJohn Marino }
38895b7b453SJohn Marino else
38995b7b453SJohn Marino p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
39095b7b453SJohn Marino mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
391*09d4459fSDaniel Fojt if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
39295b7b453SJohn Marino {
393dc7c36e4SJohn Marino wchar_t wcu = __towupper (wc);
394680a9cb8SJohn Marino if (wcu != wc)
39595b7b453SJohn Marino {
39695b7b453SJohn Marino size_t mbcdlen;
39795b7b453SJohn Marino
398*09d4459fSDaniel Fojt mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
399*09d4459fSDaniel Fojt if (__glibc_likely (mbclen == mbcdlen))
40095b7b453SJohn Marino memcpy (pstr->mbs + byte_idx, buf, mbclen);
40195b7b453SJohn Marino else if (mbcdlen != (size_t) -1)
40295b7b453SJohn Marino {
40395b7b453SJohn Marino size_t i;
40495b7b453SJohn Marino
40595b7b453SJohn Marino if (byte_idx + mbcdlen > pstr->bufs_len)
40695b7b453SJohn Marino {
40795b7b453SJohn Marino pstr->cur_state = prev_st;
40895b7b453SJohn Marino break;
40995b7b453SJohn Marino }
41095b7b453SJohn Marino
41195b7b453SJohn Marino if (pstr->offsets == NULL)
41295b7b453SJohn Marino {
41395b7b453SJohn Marino pstr->offsets = re_malloc (Idx, pstr->bufs_len);
41495b7b453SJohn Marino
41595b7b453SJohn Marino if (pstr->offsets == NULL)
41695b7b453SJohn Marino return REG_ESPACE;
41795b7b453SJohn Marino }
41895b7b453SJohn Marino if (!pstr->offsets_needed)
41995b7b453SJohn Marino {
42095b7b453SJohn Marino for (i = 0; i < (size_t) byte_idx; ++i)
42195b7b453SJohn Marino pstr->offsets[i] = i;
42295b7b453SJohn Marino pstr->offsets_needed = 1;
42395b7b453SJohn Marino }
42495b7b453SJohn Marino
42595b7b453SJohn Marino memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
42695b7b453SJohn Marino pstr->wcs[byte_idx] = wcu;
42795b7b453SJohn Marino pstr->offsets[byte_idx] = src_idx;
42895b7b453SJohn Marino for (i = 1; i < mbcdlen; ++i)
42995b7b453SJohn Marino {
43095b7b453SJohn Marino pstr->offsets[byte_idx + i]
43195b7b453SJohn Marino = src_idx + (i < mbclen ? i : mbclen - 1);
43295b7b453SJohn Marino pstr->wcs[byte_idx + i] = WEOF;
43395b7b453SJohn Marino }
43495b7b453SJohn Marino pstr->len += mbcdlen - mbclen;
43595b7b453SJohn Marino if (pstr->raw_stop > src_idx)
43695b7b453SJohn Marino pstr->stop += mbcdlen - mbclen;
43795b7b453SJohn Marino end_idx = (pstr->bufs_len > pstr->len)
43895b7b453SJohn Marino ? pstr->len : pstr->bufs_len;
43995b7b453SJohn Marino byte_idx += mbcdlen;
44095b7b453SJohn Marino src_idx += mbclen;
44195b7b453SJohn Marino continue;
44295b7b453SJohn Marino }
44395b7b453SJohn Marino else
44495b7b453SJohn Marino memcpy (pstr->mbs + byte_idx, p, mbclen);
44595b7b453SJohn Marino }
44695b7b453SJohn Marino else
44795b7b453SJohn Marino memcpy (pstr->mbs + byte_idx, p, mbclen);
44895b7b453SJohn Marino
449*09d4459fSDaniel Fojt if (__glibc_unlikely (pstr->offsets_needed != 0))
45095b7b453SJohn Marino {
45195b7b453SJohn Marino size_t i;
45295b7b453SJohn Marino for (i = 0; i < mbclen; ++i)
45395b7b453SJohn Marino pstr->offsets[byte_idx + i] = src_idx + i;
45495b7b453SJohn Marino }
45595b7b453SJohn Marino src_idx += mbclen;
45695b7b453SJohn Marino
45795b7b453SJohn Marino pstr->wcs[byte_idx++] = wcu;
45895b7b453SJohn Marino /* Write paddings. */
45995b7b453SJohn Marino for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
46095b7b453SJohn Marino pstr->wcs[byte_idx++] = WEOF;
46195b7b453SJohn Marino }
462cf28ed85SJohn Marino else if (mbclen == (size_t) -1 || mbclen == 0
463cf28ed85SJohn Marino || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
46495b7b453SJohn Marino {
46595b7b453SJohn Marino /* It is an invalid character or '\0'. Just use the byte. */
46695b7b453SJohn Marino int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
46795b7b453SJohn Marino
468*09d4459fSDaniel Fojt if (__glibc_unlikely (pstr->trans != NULL))
46995b7b453SJohn Marino ch = pstr->trans [ch];
47095b7b453SJohn Marino pstr->mbs[byte_idx] = ch;
47195b7b453SJohn Marino
472*09d4459fSDaniel Fojt if (__glibc_unlikely (pstr->offsets_needed != 0))
47395b7b453SJohn Marino pstr->offsets[byte_idx] = src_idx;
47495b7b453SJohn Marino ++src_idx;
47595b7b453SJohn Marino
47695b7b453SJohn Marino /* And also cast it to wide char. */
47795b7b453SJohn Marino pstr->wcs[byte_idx++] = (wchar_t) ch;
478*09d4459fSDaniel Fojt if (__glibc_unlikely (mbclen == (size_t) -1))
47995b7b453SJohn Marino pstr->cur_state = prev_st;
48095b7b453SJohn Marino }
48195b7b453SJohn Marino else
48295b7b453SJohn Marino {
48395b7b453SJohn Marino /* The buffer doesn't have enough space, finish to build. */
48495b7b453SJohn Marino pstr->cur_state = prev_st;
48595b7b453SJohn Marino break;
48695b7b453SJohn Marino }
48795b7b453SJohn Marino }
48895b7b453SJohn Marino pstr->valid_len = byte_idx;
48995b7b453SJohn Marino pstr->valid_raw_len = src_idx;
49095b7b453SJohn Marino return REG_NOERROR;
49195b7b453SJohn Marino }
49295b7b453SJohn Marino
49395b7b453SJohn Marino /* Skip characters until the index becomes greater than NEW_RAW_IDX.
49495b7b453SJohn Marino Return the index. */
49595b7b453SJohn Marino
49695b7b453SJohn Marino static Idx
re_string_skip_chars(re_string_t * pstr,Idx new_raw_idx,wint_t * last_wc)49795b7b453SJohn Marino re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
49895b7b453SJohn Marino {
49995b7b453SJohn Marino mbstate_t prev_st;
50095b7b453SJohn Marino Idx rawbuf_idx;
50195b7b453SJohn Marino size_t mbclen;
50295b7b453SJohn Marino wint_t wc = WEOF;
50395b7b453SJohn Marino
50495b7b453SJohn Marino /* Skip the characters which are not necessary to check. */
50595b7b453SJohn Marino for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
50695b7b453SJohn Marino rawbuf_idx < new_raw_idx;)
50795b7b453SJohn Marino {
50895b7b453SJohn Marino wchar_t wc2;
509cf28ed85SJohn Marino Idx remain_len = pstr->raw_len - rawbuf_idx;
51095b7b453SJohn Marino prev_st = pstr->cur_state;
51195b7b453SJohn Marino mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
51295b7b453SJohn Marino remain_len, &pstr->cur_state);
513*09d4459fSDaniel Fojt if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
514*09d4459fSDaniel Fojt || mbclen == 0))
51595b7b453SJohn Marino {
51695b7b453SJohn Marino /* We treat these cases as a single byte character. */
51795b7b453SJohn Marino if (mbclen == 0 || remain_len == 0)
51895b7b453SJohn Marino wc = L'\0';
51995b7b453SJohn Marino else
52095b7b453SJohn Marino wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
52195b7b453SJohn Marino mbclen = 1;
52295b7b453SJohn Marino pstr->cur_state = prev_st;
52395b7b453SJohn Marino }
52495b7b453SJohn Marino else
52595b7b453SJohn Marino wc = wc2;
52695b7b453SJohn Marino /* Then proceed the next character. */
52795b7b453SJohn Marino rawbuf_idx += mbclen;
52895b7b453SJohn Marino }
52995b7b453SJohn Marino *last_wc = wc;
53095b7b453SJohn Marino return rawbuf_idx;
53195b7b453SJohn Marino }
53295b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
53395b7b453SJohn Marino
53495b7b453SJohn Marino /* Build the buffer PSTR->MBS, and apply the translation if we need.
53595b7b453SJohn Marino This function is used in case of REG_ICASE. */
53695b7b453SJohn Marino
53795b7b453SJohn Marino static void
build_upper_buffer(re_string_t * pstr)53895b7b453SJohn Marino build_upper_buffer (re_string_t *pstr)
53995b7b453SJohn Marino {
54095b7b453SJohn Marino Idx char_idx, end_idx;
54195b7b453SJohn Marino end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
54295b7b453SJohn Marino
54395b7b453SJohn Marino for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
54495b7b453SJohn Marino {
54595b7b453SJohn Marino int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
546*09d4459fSDaniel Fojt if (__glibc_unlikely (pstr->trans != NULL))
54795b7b453SJohn Marino ch = pstr->trans[ch];
54895b7b453SJohn Marino pstr->mbs[char_idx] = toupper (ch);
54995b7b453SJohn Marino }
55095b7b453SJohn Marino pstr->valid_len = char_idx;
55195b7b453SJohn Marino pstr->valid_raw_len = char_idx;
55295b7b453SJohn Marino }
55395b7b453SJohn Marino
55495b7b453SJohn Marino /* Apply TRANS to the buffer in PSTR. */
55595b7b453SJohn Marino
55695b7b453SJohn Marino static void
re_string_translate_buffer(re_string_t * pstr)55795b7b453SJohn Marino re_string_translate_buffer (re_string_t *pstr)
55895b7b453SJohn Marino {
55995b7b453SJohn Marino Idx buf_idx, end_idx;
56095b7b453SJohn Marino end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
56195b7b453SJohn Marino
56295b7b453SJohn Marino for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
56395b7b453SJohn Marino {
56495b7b453SJohn Marino int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
56595b7b453SJohn Marino pstr->mbs[buf_idx] = pstr->trans[ch];
56695b7b453SJohn Marino }
56795b7b453SJohn Marino
56895b7b453SJohn Marino pstr->valid_len = buf_idx;
56995b7b453SJohn Marino pstr->valid_raw_len = buf_idx;
57095b7b453SJohn Marino }
57195b7b453SJohn Marino
57295b7b453SJohn Marino /* This function re-construct the buffers.
57395b7b453SJohn Marino Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
57495b7b453SJohn Marino convert to upper case in case of REG_ICASE, apply translation. */
57595b7b453SJohn Marino
57695b7b453SJohn Marino static reg_errcode_t
577*09d4459fSDaniel Fojt __attribute_warn_unused_result__
re_string_reconstruct(re_string_t * pstr,Idx idx,int eflags)57895b7b453SJohn Marino re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
57995b7b453SJohn Marino {
58095b7b453SJohn Marino Idx offset;
58195b7b453SJohn Marino
582*09d4459fSDaniel Fojt if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
58395b7b453SJohn Marino offset = idx - pstr->raw_mbs_idx;
58495b7b453SJohn Marino else
58595b7b453SJohn Marino {
58695b7b453SJohn Marino /* Reset buffer. */
58795b7b453SJohn Marino #ifdef RE_ENABLE_I18N
58895b7b453SJohn Marino if (pstr->mb_cur_max > 1)
58995b7b453SJohn Marino memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
59095b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
59195b7b453SJohn Marino pstr->len = pstr->raw_len;
59295b7b453SJohn Marino pstr->stop = pstr->raw_stop;
59395b7b453SJohn Marino pstr->valid_len = 0;
59495b7b453SJohn Marino pstr->raw_mbs_idx = 0;
59595b7b453SJohn Marino pstr->valid_raw_len = 0;
59695b7b453SJohn Marino pstr->offsets_needed = 0;
59795b7b453SJohn Marino pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
59895b7b453SJohn Marino : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
59995b7b453SJohn Marino if (!pstr->mbs_allocated)
60095b7b453SJohn Marino pstr->mbs = (unsigned char *) pstr->raw_mbs;
60195b7b453SJohn Marino offset = idx;
60295b7b453SJohn Marino }
60395b7b453SJohn Marino
604*09d4459fSDaniel Fojt if (__glibc_likely (offset != 0))
60595b7b453SJohn Marino {
60695b7b453SJohn Marino /* Should the already checked characters be kept? */
607*09d4459fSDaniel Fojt if (__glibc_likely (offset < pstr->valid_raw_len))
60895b7b453SJohn Marino {
60995b7b453SJohn Marino /* Yes, move them to the front of the buffer. */
61095b7b453SJohn Marino #ifdef RE_ENABLE_I18N
611*09d4459fSDaniel Fojt if (__glibc_unlikely (pstr->offsets_needed))
61295b7b453SJohn Marino {
61395b7b453SJohn Marino Idx low = 0, high = pstr->valid_len, mid;
61495b7b453SJohn Marino do
61595b7b453SJohn Marino {
61695b7b453SJohn Marino mid = (high + low) / 2;
61795b7b453SJohn Marino if (pstr->offsets[mid] > offset)
61895b7b453SJohn Marino high = mid;
61995b7b453SJohn Marino else if (pstr->offsets[mid] < offset)
62095b7b453SJohn Marino low = mid + 1;
62195b7b453SJohn Marino else
62295b7b453SJohn Marino break;
62395b7b453SJohn Marino }
62495b7b453SJohn Marino while (low < high);
62595b7b453SJohn Marino if (pstr->offsets[mid] < offset)
62695b7b453SJohn Marino ++mid;
62795b7b453SJohn Marino pstr->tip_context = re_string_context_at (pstr, mid - 1,
62895b7b453SJohn Marino eflags);
62995b7b453SJohn Marino /* This can be quite complicated, so handle specially
63095b7b453SJohn Marino only the common and easy case where the character with
63195b7b453SJohn Marino different length representation of lower and upper
63295b7b453SJohn Marino case is present at or after offset. */
63395b7b453SJohn Marino if (pstr->valid_len > offset
63495b7b453SJohn Marino && mid == offset && pstr->offsets[mid] == offset)
63595b7b453SJohn Marino {
63695b7b453SJohn Marino memmove (pstr->wcs, pstr->wcs + offset,
63795b7b453SJohn Marino (pstr->valid_len - offset) * sizeof (wint_t));
63895b7b453SJohn Marino memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
63995b7b453SJohn Marino pstr->valid_len -= offset;
64095b7b453SJohn Marino pstr->valid_raw_len -= offset;
64195b7b453SJohn Marino for (low = 0; low < pstr->valid_len; low++)
64295b7b453SJohn Marino pstr->offsets[low] = pstr->offsets[low + offset] - offset;
64395b7b453SJohn Marino }
64495b7b453SJohn Marino else
64595b7b453SJohn Marino {
64695b7b453SJohn Marino /* Otherwise, just find out how long the partial multibyte
64795b7b453SJohn Marino character at offset is and fill it with WEOF/255. */
64895b7b453SJohn Marino pstr->len = pstr->raw_len - idx + offset;
64995b7b453SJohn Marino pstr->stop = pstr->raw_stop - idx + offset;
65095b7b453SJohn Marino pstr->offsets_needed = 0;
65195b7b453SJohn Marino while (mid > 0 && pstr->offsets[mid - 1] == offset)
65295b7b453SJohn Marino --mid;
65395b7b453SJohn Marino while (mid < pstr->valid_len)
65495b7b453SJohn Marino if (pstr->wcs[mid] != WEOF)
65595b7b453SJohn Marino break;
65695b7b453SJohn Marino else
65795b7b453SJohn Marino ++mid;
65895b7b453SJohn Marino if (mid == pstr->valid_len)
65995b7b453SJohn Marino pstr->valid_len = 0;
66095b7b453SJohn Marino else
66195b7b453SJohn Marino {
66295b7b453SJohn Marino pstr->valid_len = pstr->offsets[mid] - offset;
66395b7b453SJohn Marino if (pstr->valid_len)
66495b7b453SJohn Marino {
66595b7b453SJohn Marino for (low = 0; low < pstr->valid_len; ++low)
66695b7b453SJohn Marino pstr->wcs[low] = WEOF;
66795b7b453SJohn Marino memset (pstr->mbs, 255, pstr->valid_len);
66895b7b453SJohn Marino }
66995b7b453SJohn Marino }
67095b7b453SJohn Marino pstr->valid_raw_len = pstr->valid_len;
67195b7b453SJohn Marino }
67295b7b453SJohn Marino }
67395b7b453SJohn Marino else
67495b7b453SJohn Marino #endif
67595b7b453SJohn Marino {
67695b7b453SJohn Marino pstr->tip_context = re_string_context_at (pstr, offset - 1,
67795b7b453SJohn Marino eflags);
67895b7b453SJohn Marino #ifdef RE_ENABLE_I18N
67995b7b453SJohn Marino if (pstr->mb_cur_max > 1)
68095b7b453SJohn Marino memmove (pstr->wcs, pstr->wcs + offset,
68195b7b453SJohn Marino (pstr->valid_len - offset) * sizeof (wint_t));
68295b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
683*09d4459fSDaniel Fojt if (__glibc_unlikely (pstr->mbs_allocated))
68495b7b453SJohn Marino memmove (pstr->mbs, pstr->mbs + offset,
68595b7b453SJohn Marino pstr->valid_len - offset);
68695b7b453SJohn Marino pstr->valid_len -= offset;
68795b7b453SJohn Marino pstr->valid_raw_len -= offset;
688*09d4459fSDaniel Fojt DEBUG_ASSERT (pstr->valid_len > 0);
68995b7b453SJohn Marino }
69095b7b453SJohn Marino }
69195b7b453SJohn Marino else
69295b7b453SJohn Marino {
69395b7b453SJohn Marino #ifdef RE_ENABLE_I18N
69495b7b453SJohn Marino /* No, skip all characters until IDX. */
69595b7b453SJohn Marino Idx prev_valid_len = pstr->valid_len;
69695b7b453SJohn Marino
697*09d4459fSDaniel Fojt if (__glibc_unlikely (pstr->offsets_needed))
69895b7b453SJohn Marino {
69995b7b453SJohn Marino pstr->len = pstr->raw_len - idx + offset;
70095b7b453SJohn Marino pstr->stop = pstr->raw_stop - idx + offset;
70195b7b453SJohn Marino pstr->offsets_needed = 0;
70295b7b453SJohn Marino }
70395b7b453SJohn Marino #endif
70495b7b453SJohn Marino pstr->valid_len = 0;
70595b7b453SJohn Marino #ifdef RE_ENABLE_I18N
70695b7b453SJohn Marino if (pstr->mb_cur_max > 1)
70795b7b453SJohn Marino {
70895b7b453SJohn Marino Idx wcs_idx;
70995b7b453SJohn Marino wint_t wc = WEOF;
71095b7b453SJohn Marino
71195b7b453SJohn Marino if (pstr->is_utf8)
71295b7b453SJohn Marino {
71395b7b453SJohn Marino const unsigned char *raw, *p, *end;
71495b7b453SJohn Marino
71595b7b453SJohn Marino /* Special case UTF-8. Multi-byte chars start with any
71695b7b453SJohn Marino byte other than 0x80 - 0xbf. */
71795b7b453SJohn Marino raw = pstr->raw_mbs + pstr->raw_mbs_idx;
71895b7b453SJohn Marino end = raw + (offset - pstr->mb_cur_max);
71995b7b453SJohn Marino if (end < pstr->raw_mbs)
72095b7b453SJohn Marino end = pstr->raw_mbs;
72195b7b453SJohn Marino p = raw + offset - 1;
72295b7b453SJohn Marino #ifdef _LIBC
72395b7b453SJohn Marino /* We know the wchar_t encoding is UCS4, so for the simple
72495b7b453SJohn Marino case, ASCII characters, skip the conversion step. */
725*09d4459fSDaniel Fojt if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
72695b7b453SJohn Marino {
72795b7b453SJohn Marino memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
72895b7b453SJohn Marino /* pstr->valid_len = 0; */
72995b7b453SJohn Marino wc = (wchar_t) *p;
73095b7b453SJohn Marino }
73195b7b453SJohn Marino else
73295b7b453SJohn Marino #endif
73395b7b453SJohn Marino for (; p >= end; --p)
73495b7b453SJohn Marino if ((*p & 0xc0) != 0x80)
73595b7b453SJohn Marino {
73695b7b453SJohn Marino mbstate_t cur_state;
73795b7b453SJohn Marino wchar_t wc2;
73895b7b453SJohn Marino Idx mlen = raw + pstr->len - p;
739cf28ed85SJohn Marino unsigned char buf[6];
74095b7b453SJohn Marino size_t mbclen;
74195b7b453SJohn Marino
742cf28ed85SJohn Marino const unsigned char *pp = p;
743*09d4459fSDaniel Fojt if (__glibc_unlikely (pstr->trans != NULL))
74495b7b453SJohn Marino {
74595b7b453SJohn Marino int i = mlen < 6 ? mlen : 6;
74695b7b453SJohn Marino while (--i >= 0)
74795b7b453SJohn Marino buf[i] = pstr->trans[p[i]];
748cf28ed85SJohn Marino pp = buf;
74995b7b453SJohn Marino }
75095b7b453SJohn Marino /* XXX Don't use mbrtowc, we know which conversion
75195b7b453SJohn Marino to use (UTF-8 -> UCS4). */
75295b7b453SJohn Marino memset (&cur_state, 0, sizeof (cur_state));
753cf28ed85SJohn Marino mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
75495b7b453SJohn Marino &cur_state);
75595b7b453SJohn Marino if (raw + offset - p <= mbclen
75695b7b453SJohn Marino && mbclen < (size_t) -2)
75795b7b453SJohn Marino {
75895b7b453SJohn Marino memset (&pstr->cur_state, '\0',
75995b7b453SJohn Marino sizeof (mbstate_t));
76095b7b453SJohn Marino pstr->valid_len = mbclen - (raw + offset - p);
76195b7b453SJohn Marino wc = wc2;
76295b7b453SJohn Marino }
76395b7b453SJohn Marino break;
76495b7b453SJohn Marino }
76595b7b453SJohn Marino }
76695b7b453SJohn Marino
76795b7b453SJohn Marino if (wc == WEOF)
76895b7b453SJohn Marino pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
76995b7b453SJohn Marino if (wc == WEOF)
77095b7b453SJohn Marino pstr->tip_context
77195b7b453SJohn Marino = re_string_context_at (pstr, prev_valid_len - 1, eflags);
77295b7b453SJohn Marino else
773*09d4459fSDaniel Fojt pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
77495b7b453SJohn Marino && IS_WIDE_WORD_CHAR (wc))
77595b7b453SJohn Marino ? CONTEXT_WORD
77695b7b453SJohn Marino : ((IS_WIDE_NEWLINE (wc)
77795b7b453SJohn Marino && pstr->newline_anchor)
77895b7b453SJohn Marino ? CONTEXT_NEWLINE : 0));
779*09d4459fSDaniel Fojt if (__glibc_unlikely (pstr->valid_len))
78095b7b453SJohn Marino {
78195b7b453SJohn Marino for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
78295b7b453SJohn Marino pstr->wcs[wcs_idx] = WEOF;
78395b7b453SJohn Marino if (pstr->mbs_allocated)
78495b7b453SJohn Marino memset (pstr->mbs, 255, pstr->valid_len);
78595b7b453SJohn Marino }
78695b7b453SJohn Marino pstr->valid_raw_len = pstr->valid_len;
78795b7b453SJohn Marino }
78895b7b453SJohn Marino else
78995b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
79095b7b453SJohn Marino {
79195b7b453SJohn Marino int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
79295b7b453SJohn Marino pstr->valid_raw_len = 0;
79395b7b453SJohn Marino if (pstr->trans)
79495b7b453SJohn Marino c = pstr->trans[c];
79595b7b453SJohn Marino pstr->tip_context = (bitset_contain (pstr->word_char, c)
79695b7b453SJohn Marino ? CONTEXT_WORD
79795b7b453SJohn Marino : ((IS_NEWLINE (c) && pstr->newline_anchor)
79895b7b453SJohn Marino ? CONTEXT_NEWLINE : 0));
79995b7b453SJohn Marino }
80095b7b453SJohn Marino }
801*09d4459fSDaniel Fojt if (!__glibc_unlikely (pstr->mbs_allocated))
80295b7b453SJohn Marino pstr->mbs += offset;
80395b7b453SJohn Marino }
80495b7b453SJohn Marino pstr->raw_mbs_idx = idx;
80595b7b453SJohn Marino pstr->len -= offset;
80695b7b453SJohn Marino pstr->stop -= offset;
80795b7b453SJohn Marino
80895b7b453SJohn Marino /* Then build the buffers. */
80995b7b453SJohn Marino #ifdef RE_ENABLE_I18N
81095b7b453SJohn Marino if (pstr->mb_cur_max > 1)
81195b7b453SJohn Marino {
81295b7b453SJohn Marino if (pstr->icase)
81395b7b453SJohn Marino {
81495b7b453SJohn Marino reg_errcode_t ret = build_wcs_upper_buffer (pstr);
815*09d4459fSDaniel Fojt if (__glibc_unlikely (ret != REG_NOERROR))
81695b7b453SJohn Marino return ret;
81795b7b453SJohn Marino }
81895b7b453SJohn Marino else
81995b7b453SJohn Marino build_wcs_buffer (pstr);
82095b7b453SJohn Marino }
82195b7b453SJohn Marino else
82295b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
823*09d4459fSDaniel Fojt if (__glibc_unlikely (pstr->mbs_allocated))
82495b7b453SJohn Marino {
82595b7b453SJohn Marino if (pstr->icase)
82695b7b453SJohn Marino build_upper_buffer (pstr);
82795b7b453SJohn Marino else if (pstr->trans != NULL)
82895b7b453SJohn Marino re_string_translate_buffer (pstr);
82995b7b453SJohn Marino }
83095b7b453SJohn Marino else
83195b7b453SJohn Marino pstr->valid_len = pstr->len;
83295b7b453SJohn Marino
83395b7b453SJohn Marino pstr->cur_idx = 0;
83495b7b453SJohn Marino return REG_NOERROR;
83595b7b453SJohn Marino }
83695b7b453SJohn Marino
83795b7b453SJohn Marino static unsigned char
838*09d4459fSDaniel Fojt __attribute__ ((pure))
re_string_peek_byte_case(const re_string_t * pstr,Idx idx)83995b7b453SJohn Marino re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
84095b7b453SJohn Marino {
84195b7b453SJohn Marino int ch;
84295b7b453SJohn Marino Idx off;
84395b7b453SJohn Marino
84495b7b453SJohn Marino /* Handle the common (easiest) cases first. */
845*09d4459fSDaniel Fojt if (__glibc_likely (!pstr->mbs_allocated))
84695b7b453SJohn Marino return re_string_peek_byte (pstr, idx);
84795b7b453SJohn Marino
84895b7b453SJohn Marino #ifdef RE_ENABLE_I18N
84995b7b453SJohn Marino if (pstr->mb_cur_max > 1
85095b7b453SJohn Marino && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
85195b7b453SJohn Marino return re_string_peek_byte (pstr, idx);
85295b7b453SJohn Marino #endif
85395b7b453SJohn Marino
85495b7b453SJohn Marino off = pstr->cur_idx + idx;
85595b7b453SJohn Marino #ifdef RE_ENABLE_I18N
85695b7b453SJohn Marino if (pstr->offsets_needed)
85795b7b453SJohn Marino off = pstr->offsets[off];
85895b7b453SJohn Marino #endif
85995b7b453SJohn Marino
86095b7b453SJohn Marino ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
86195b7b453SJohn Marino
86295b7b453SJohn Marino #ifdef RE_ENABLE_I18N
86395b7b453SJohn Marino /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
86495b7b453SJohn Marino this function returns CAPITAL LETTER I instead of first byte of
86595b7b453SJohn Marino DOTLESS SMALL LETTER I. The latter would confuse the parser,
86695b7b453SJohn Marino since peek_byte_case doesn't advance cur_idx in any way. */
86795b7b453SJohn Marino if (pstr->offsets_needed && !isascii (ch))
86895b7b453SJohn Marino return re_string_peek_byte (pstr, idx);
86995b7b453SJohn Marino #endif
87095b7b453SJohn Marino
87195b7b453SJohn Marino return ch;
87295b7b453SJohn Marino }
87395b7b453SJohn Marino
87495b7b453SJohn Marino static unsigned char
re_string_fetch_byte_case(re_string_t * pstr)87595b7b453SJohn Marino re_string_fetch_byte_case (re_string_t *pstr)
87695b7b453SJohn Marino {
877*09d4459fSDaniel Fojt if (__glibc_likely (!pstr->mbs_allocated))
87895b7b453SJohn Marino return re_string_fetch_byte (pstr);
87995b7b453SJohn Marino
88095b7b453SJohn Marino #ifdef RE_ENABLE_I18N
88195b7b453SJohn Marino if (pstr->offsets_needed)
88295b7b453SJohn Marino {
88395b7b453SJohn Marino Idx off;
88495b7b453SJohn Marino int ch;
88595b7b453SJohn Marino
88695b7b453SJohn Marino /* For tr_TR.UTF-8 [[:islower:]] there is
88795b7b453SJohn Marino [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
88895b7b453SJohn Marino in that case the whole multi-byte character and return
88995b7b453SJohn Marino the original letter. On the other side, with
89095b7b453SJohn Marino [[: DOTLESS SMALL LETTER I return [[:I, as doing
89195b7b453SJohn Marino anything else would complicate things too much. */
89295b7b453SJohn Marino
89395b7b453SJohn Marino if (!re_string_first_byte (pstr, pstr->cur_idx))
89495b7b453SJohn Marino return re_string_fetch_byte (pstr);
89595b7b453SJohn Marino
89695b7b453SJohn Marino off = pstr->offsets[pstr->cur_idx];
89795b7b453SJohn Marino ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
89895b7b453SJohn Marino
89995b7b453SJohn Marino if (! isascii (ch))
90095b7b453SJohn Marino return re_string_fetch_byte (pstr);
90195b7b453SJohn Marino
90295b7b453SJohn Marino re_string_skip_bytes (pstr,
90395b7b453SJohn Marino re_string_char_size_at (pstr, pstr->cur_idx));
90495b7b453SJohn Marino return ch;
90595b7b453SJohn Marino }
90695b7b453SJohn Marino #endif
90795b7b453SJohn Marino
90895b7b453SJohn Marino return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
90995b7b453SJohn Marino }
91095b7b453SJohn Marino
91195b7b453SJohn Marino static void
re_string_destruct(re_string_t * pstr)91295b7b453SJohn Marino re_string_destruct (re_string_t *pstr)
91395b7b453SJohn Marino {
91495b7b453SJohn Marino #ifdef RE_ENABLE_I18N
91595b7b453SJohn Marino re_free (pstr->wcs);
91695b7b453SJohn Marino re_free (pstr->offsets);
91795b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
91895b7b453SJohn Marino if (pstr->mbs_allocated)
91995b7b453SJohn Marino re_free (pstr->mbs);
92095b7b453SJohn Marino }
92195b7b453SJohn Marino
92295b7b453SJohn Marino /* Return the context at IDX in INPUT. */
92395b7b453SJohn Marino
92495b7b453SJohn Marino static unsigned int
re_string_context_at(const re_string_t * input,Idx idx,int eflags)92595b7b453SJohn Marino re_string_context_at (const re_string_t *input, Idx idx, int eflags)
92695b7b453SJohn Marino {
92795b7b453SJohn Marino int c;
928*09d4459fSDaniel Fojt if (__glibc_unlikely (idx < 0))
92995b7b453SJohn Marino /* In this case, we use the value stored in input->tip_context,
93095b7b453SJohn Marino since we can't know the character in input->mbs[-1] here. */
93195b7b453SJohn Marino return input->tip_context;
932*09d4459fSDaniel Fojt if (__glibc_unlikely (idx == input->len))
93395b7b453SJohn Marino return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
93495b7b453SJohn Marino : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
93595b7b453SJohn Marino #ifdef RE_ENABLE_I18N
93695b7b453SJohn Marino if (input->mb_cur_max > 1)
93795b7b453SJohn Marino {
93895b7b453SJohn Marino wint_t wc;
93995b7b453SJohn Marino Idx wc_idx = idx;
94095b7b453SJohn Marino while(input->wcs[wc_idx] == WEOF)
94195b7b453SJohn Marino {
942*09d4459fSDaniel Fojt DEBUG_ASSERT (wc_idx >= 0);
94395b7b453SJohn Marino --wc_idx;
944*09d4459fSDaniel Fojt if (wc_idx < 0)
94595b7b453SJohn Marino return input->tip_context;
94695b7b453SJohn Marino }
94795b7b453SJohn Marino wc = input->wcs[wc_idx];
948*09d4459fSDaniel Fojt if (__glibc_unlikely (input->word_ops_used != 0)
949*09d4459fSDaniel Fojt && IS_WIDE_WORD_CHAR (wc))
95095b7b453SJohn Marino return CONTEXT_WORD;
95195b7b453SJohn Marino return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
95295b7b453SJohn Marino ? CONTEXT_NEWLINE : 0);
95395b7b453SJohn Marino }
95495b7b453SJohn Marino else
95595b7b453SJohn Marino #endif
95695b7b453SJohn Marino {
95795b7b453SJohn Marino c = re_string_byte_at (input, idx);
95895b7b453SJohn Marino if (bitset_contain (input->word_char, c))
95995b7b453SJohn Marino return CONTEXT_WORD;
96095b7b453SJohn Marino return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
96195b7b453SJohn Marino }
96295b7b453SJohn Marino }
96395b7b453SJohn Marino
96495b7b453SJohn Marino /* Functions for set operation. */
96595b7b453SJohn Marino
96695b7b453SJohn Marino static reg_errcode_t
967*09d4459fSDaniel Fojt __attribute_warn_unused_result__
re_node_set_alloc(re_node_set * set,Idx size)96895b7b453SJohn Marino re_node_set_alloc (re_node_set *set, Idx size)
96995b7b453SJohn Marino {
97095b7b453SJohn Marino set->alloc = size;
97195b7b453SJohn Marino set->nelem = 0;
97295b7b453SJohn Marino set->elems = re_malloc (Idx, size);
973*09d4459fSDaniel Fojt if (__glibc_unlikely (set->elems == NULL)
974*09d4459fSDaniel Fojt && (MALLOC_0_IS_NONNULL || size != 0))
97595b7b453SJohn Marino return REG_ESPACE;
97695b7b453SJohn Marino return REG_NOERROR;
97795b7b453SJohn Marino }
97895b7b453SJohn Marino
97995b7b453SJohn Marino static reg_errcode_t
980*09d4459fSDaniel Fojt __attribute_warn_unused_result__
re_node_set_init_1(re_node_set * set,Idx elem)98195b7b453SJohn Marino re_node_set_init_1 (re_node_set *set, Idx elem)
98295b7b453SJohn Marino {
98395b7b453SJohn Marino set->alloc = 1;
98495b7b453SJohn Marino set->nelem = 1;
98595b7b453SJohn Marino set->elems = re_malloc (Idx, 1);
986*09d4459fSDaniel Fojt if (__glibc_unlikely (set->elems == NULL))
98795b7b453SJohn Marino {
98895b7b453SJohn Marino set->alloc = set->nelem = 0;
98995b7b453SJohn Marino return REG_ESPACE;
99095b7b453SJohn Marino }
99195b7b453SJohn Marino set->elems[0] = elem;
99295b7b453SJohn Marino return REG_NOERROR;
99395b7b453SJohn Marino }
99495b7b453SJohn Marino
99595b7b453SJohn Marino static reg_errcode_t
996*09d4459fSDaniel Fojt __attribute_warn_unused_result__
re_node_set_init_2(re_node_set * set,Idx elem1,Idx elem2)99795b7b453SJohn Marino re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
99895b7b453SJohn Marino {
99995b7b453SJohn Marino set->alloc = 2;
100095b7b453SJohn Marino set->elems = re_malloc (Idx, 2);
1001*09d4459fSDaniel Fojt if (__glibc_unlikely (set->elems == NULL))
100295b7b453SJohn Marino return REG_ESPACE;
100395b7b453SJohn Marino if (elem1 == elem2)
100495b7b453SJohn Marino {
100595b7b453SJohn Marino set->nelem = 1;
100695b7b453SJohn Marino set->elems[0] = elem1;
100795b7b453SJohn Marino }
100895b7b453SJohn Marino else
100995b7b453SJohn Marino {
101095b7b453SJohn Marino set->nelem = 2;
101195b7b453SJohn Marino if (elem1 < elem2)
101295b7b453SJohn Marino {
101395b7b453SJohn Marino set->elems[0] = elem1;
101495b7b453SJohn Marino set->elems[1] = elem2;
101595b7b453SJohn Marino }
101695b7b453SJohn Marino else
101795b7b453SJohn Marino {
101895b7b453SJohn Marino set->elems[0] = elem2;
101995b7b453SJohn Marino set->elems[1] = elem1;
102095b7b453SJohn Marino }
102195b7b453SJohn Marino }
102295b7b453SJohn Marino return REG_NOERROR;
102395b7b453SJohn Marino }
102495b7b453SJohn Marino
102595b7b453SJohn Marino static reg_errcode_t
1026*09d4459fSDaniel Fojt __attribute_warn_unused_result__
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)102795b7b453SJohn Marino re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
102895b7b453SJohn Marino {
102995b7b453SJohn Marino dest->nelem = src->nelem;
103095b7b453SJohn Marino if (src->nelem > 0)
103195b7b453SJohn Marino {
103295b7b453SJohn Marino dest->alloc = dest->nelem;
103395b7b453SJohn Marino dest->elems = re_malloc (Idx, dest->alloc);
1034*09d4459fSDaniel Fojt if (__glibc_unlikely (dest->elems == NULL))
103595b7b453SJohn Marino {
103695b7b453SJohn Marino dest->alloc = dest->nelem = 0;
103795b7b453SJohn Marino return REG_ESPACE;
103895b7b453SJohn Marino }
103995b7b453SJohn Marino memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
104095b7b453SJohn Marino }
104195b7b453SJohn Marino else
104295b7b453SJohn Marino re_node_set_init_empty (dest);
104395b7b453SJohn Marino return REG_NOERROR;
104495b7b453SJohn Marino }
104595b7b453SJohn Marino
104695b7b453SJohn Marino /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
104795b7b453SJohn Marino DEST. Return value indicate the error code or REG_NOERROR if succeeded.
104895b7b453SJohn Marino Note: We assume dest->elems is NULL, when dest->alloc is 0. */
104995b7b453SJohn Marino
105095b7b453SJohn Marino static reg_errcode_t
1051*09d4459fSDaniel Fojt __attribute_warn_unused_result__
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)105295b7b453SJohn Marino re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
105395b7b453SJohn Marino const re_node_set *src2)
105495b7b453SJohn Marino {
105595b7b453SJohn Marino Idx i1, i2, is, id, delta, sbase;
105695b7b453SJohn Marino if (src1->nelem == 0 || src2->nelem == 0)
105795b7b453SJohn Marino return REG_NOERROR;
105895b7b453SJohn Marino
105995b7b453SJohn Marino /* We need dest->nelem + 2 * elems_in_intersection; this is a
106095b7b453SJohn Marino conservative estimate. */
106195b7b453SJohn Marino if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
106295b7b453SJohn Marino {
106395b7b453SJohn Marino Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
106495b7b453SJohn Marino Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1065*09d4459fSDaniel Fojt if (__glibc_unlikely (new_elems == NULL))
106695b7b453SJohn Marino return REG_ESPACE;
106795b7b453SJohn Marino dest->elems = new_elems;
106895b7b453SJohn Marino dest->alloc = new_alloc;
106995b7b453SJohn Marino }
107095b7b453SJohn Marino
107195b7b453SJohn Marino /* Find the items in the intersection of SRC1 and SRC2, and copy
107295b7b453SJohn Marino into the top of DEST those that are not already in DEST itself. */
107395b7b453SJohn Marino sbase = dest->nelem + src1->nelem + src2->nelem;
107495b7b453SJohn Marino i1 = src1->nelem - 1;
107595b7b453SJohn Marino i2 = src2->nelem - 1;
107695b7b453SJohn Marino id = dest->nelem - 1;
107795b7b453SJohn Marino for (;;)
107895b7b453SJohn Marino {
107995b7b453SJohn Marino if (src1->elems[i1] == src2->elems[i2])
108095b7b453SJohn Marino {
108195b7b453SJohn Marino /* Try to find the item in DEST. Maybe we could binary search? */
1082*09d4459fSDaniel Fojt while (id >= 0 && dest->elems[id] > src1->elems[i1])
108395b7b453SJohn Marino --id;
108495b7b453SJohn Marino
1085*09d4459fSDaniel Fojt if (id < 0 || dest->elems[id] != src1->elems[i1])
108695b7b453SJohn Marino dest->elems[--sbase] = src1->elems[i1];
108795b7b453SJohn Marino
1088*09d4459fSDaniel Fojt if (--i1 < 0 || --i2 < 0)
108995b7b453SJohn Marino break;
109095b7b453SJohn Marino }
109195b7b453SJohn Marino
109295b7b453SJohn Marino /* Lower the highest of the two items. */
109395b7b453SJohn Marino else if (src1->elems[i1] < src2->elems[i2])
109495b7b453SJohn Marino {
1095*09d4459fSDaniel Fojt if (--i2 < 0)
109695b7b453SJohn Marino break;
109795b7b453SJohn Marino }
109895b7b453SJohn Marino else
109995b7b453SJohn Marino {
1100*09d4459fSDaniel Fojt if (--i1 < 0)
110195b7b453SJohn Marino break;
110295b7b453SJohn Marino }
110395b7b453SJohn Marino }
110495b7b453SJohn Marino
110595b7b453SJohn Marino id = dest->nelem - 1;
110695b7b453SJohn Marino is = dest->nelem + src1->nelem + src2->nelem - 1;
110795b7b453SJohn Marino delta = is - sbase + 1;
110895b7b453SJohn Marino
110995b7b453SJohn Marino /* Now copy. When DELTA becomes zero, the remaining
111095b7b453SJohn Marino DEST elements are already in place; this is more or
111195b7b453SJohn Marino less the same loop that is in re_node_set_merge. */
111295b7b453SJohn Marino dest->nelem += delta;
1113*09d4459fSDaniel Fojt if (delta > 0 && id >= 0)
111495b7b453SJohn Marino for (;;)
111595b7b453SJohn Marino {
111695b7b453SJohn Marino if (dest->elems[is] > dest->elems[id])
111795b7b453SJohn Marino {
111895b7b453SJohn Marino /* Copy from the top. */
111995b7b453SJohn Marino dest->elems[id + delta--] = dest->elems[is--];
112095b7b453SJohn Marino if (delta == 0)
112195b7b453SJohn Marino break;
112295b7b453SJohn Marino }
112395b7b453SJohn Marino else
112495b7b453SJohn Marino {
112595b7b453SJohn Marino /* Slide from the bottom. */
112695b7b453SJohn Marino dest->elems[id + delta] = dest->elems[id];
1127*09d4459fSDaniel Fojt if (--id < 0)
112895b7b453SJohn Marino break;
112995b7b453SJohn Marino }
113095b7b453SJohn Marino }
113195b7b453SJohn Marino
113295b7b453SJohn Marino /* Copy remaining SRC elements. */
113395b7b453SJohn Marino memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
113495b7b453SJohn Marino
113595b7b453SJohn Marino return REG_NOERROR;
113695b7b453SJohn Marino }
113795b7b453SJohn Marino
113895b7b453SJohn Marino /* Calculate the union set of the sets SRC1 and SRC2. And store it to
113995b7b453SJohn Marino DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
114095b7b453SJohn Marino
114195b7b453SJohn Marino static reg_errcode_t
1142*09d4459fSDaniel Fojt __attribute_warn_unused_result__
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)114395b7b453SJohn Marino re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
114495b7b453SJohn Marino const re_node_set *src2)
114595b7b453SJohn Marino {
114695b7b453SJohn Marino Idx i1, i2, id;
114795b7b453SJohn Marino if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
114895b7b453SJohn Marino {
114995b7b453SJohn Marino dest->alloc = src1->nelem + src2->nelem;
115095b7b453SJohn Marino dest->elems = re_malloc (Idx, dest->alloc);
1151*09d4459fSDaniel Fojt if (__glibc_unlikely (dest->elems == NULL))
115295b7b453SJohn Marino return REG_ESPACE;
115395b7b453SJohn Marino }
115495b7b453SJohn Marino else
115595b7b453SJohn Marino {
115695b7b453SJohn Marino if (src1 != NULL && src1->nelem > 0)
115795b7b453SJohn Marino return re_node_set_init_copy (dest, src1);
115895b7b453SJohn Marino else if (src2 != NULL && src2->nelem > 0)
115995b7b453SJohn Marino return re_node_set_init_copy (dest, src2);
116095b7b453SJohn Marino else
116195b7b453SJohn Marino re_node_set_init_empty (dest);
116295b7b453SJohn Marino return REG_NOERROR;
116395b7b453SJohn Marino }
116495b7b453SJohn Marino for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
116595b7b453SJohn Marino {
116695b7b453SJohn Marino if (src1->elems[i1] > src2->elems[i2])
116795b7b453SJohn Marino {
116895b7b453SJohn Marino dest->elems[id++] = src2->elems[i2++];
116995b7b453SJohn Marino continue;
117095b7b453SJohn Marino }
117195b7b453SJohn Marino if (src1->elems[i1] == src2->elems[i2])
117295b7b453SJohn Marino ++i2;
117395b7b453SJohn Marino dest->elems[id++] = src1->elems[i1++];
117495b7b453SJohn Marino }
117595b7b453SJohn Marino if (i1 < src1->nelem)
117695b7b453SJohn Marino {
117795b7b453SJohn Marino memcpy (dest->elems + id, src1->elems + i1,
117895b7b453SJohn Marino (src1->nelem - i1) * sizeof (Idx));
117995b7b453SJohn Marino id += src1->nelem - i1;
118095b7b453SJohn Marino }
118195b7b453SJohn Marino else if (i2 < src2->nelem)
118295b7b453SJohn Marino {
118395b7b453SJohn Marino memcpy (dest->elems + id, src2->elems + i2,
118495b7b453SJohn Marino (src2->nelem - i2) * sizeof (Idx));
118595b7b453SJohn Marino id += src2->nelem - i2;
118695b7b453SJohn Marino }
118795b7b453SJohn Marino dest->nelem = id;
118895b7b453SJohn Marino return REG_NOERROR;
118995b7b453SJohn Marino }
119095b7b453SJohn Marino
119195b7b453SJohn Marino /* Calculate the union set of the sets DEST and SRC. And store it to
119295b7b453SJohn Marino DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
119395b7b453SJohn Marino
119495b7b453SJohn Marino static reg_errcode_t
1195*09d4459fSDaniel Fojt __attribute_warn_unused_result__
re_node_set_merge(re_node_set * dest,const re_node_set * src)119695b7b453SJohn Marino re_node_set_merge (re_node_set *dest, const re_node_set *src)
119795b7b453SJohn Marino {
119895b7b453SJohn Marino Idx is, id, sbase, delta;
119995b7b453SJohn Marino if (src == NULL || src->nelem == 0)
120095b7b453SJohn Marino return REG_NOERROR;
120195b7b453SJohn Marino if (dest->alloc < 2 * src->nelem + dest->nelem)
120295b7b453SJohn Marino {
120395b7b453SJohn Marino Idx new_alloc = 2 * (src->nelem + dest->alloc);
120495b7b453SJohn Marino Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1205*09d4459fSDaniel Fojt if (__glibc_unlikely (new_buffer == NULL))
120695b7b453SJohn Marino return REG_ESPACE;
120795b7b453SJohn Marino dest->elems = new_buffer;
120895b7b453SJohn Marino dest->alloc = new_alloc;
120995b7b453SJohn Marino }
121095b7b453SJohn Marino
1211*09d4459fSDaniel Fojt if (__glibc_unlikely (dest->nelem == 0))
121295b7b453SJohn Marino {
121395b7b453SJohn Marino dest->nelem = src->nelem;
121495b7b453SJohn Marino memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
121595b7b453SJohn Marino return REG_NOERROR;
121695b7b453SJohn Marino }
121795b7b453SJohn Marino
121895b7b453SJohn Marino /* Copy into the top of DEST the items of SRC that are not
121995b7b453SJohn Marino found in DEST. Maybe we could binary search in DEST? */
122095b7b453SJohn Marino for (sbase = dest->nelem + 2 * src->nelem,
1221*09d4459fSDaniel Fojt is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
122295b7b453SJohn Marino {
122395b7b453SJohn Marino if (dest->elems[id] == src->elems[is])
122495b7b453SJohn Marino is--, id--;
122595b7b453SJohn Marino else if (dest->elems[id] < src->elems[is])
122695b7b453SJohn Marino dest->elems[--sbase] = src->elems[is--];
122795b7b453SJohn Marino else /* if (dest->elems[id] > src->elems[is]) */
122895b7b453SJohn Marino --id;
122995b7b453SJohn Marino }
123095b7b453SJohn Marino
1231*09d4459fSDaniel Fojt if (is >= 0)
123295b7b453SJohn Marino {
123395b7b453SJohn Marino /* If DEST is exhausted, the remaining items of SRC must be unique. */
123495b7b453SJohn Marino sbase -= is + 1;
123595b7b453SJohn Marino memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
123695b7b453SJohn Marino }
123795b7b453SJohn Marino
123895b7b453SJohn Marino id = dest->nelem - 1;
123995b7b453SJohn Marino is = dest->nelem + 2 * src->nelem - 1;
124095b7b453SJohn Marino delta = is - sbase + 1;
124195b7b453SJohn Marino if (delta == 0)
124295b7b453SJohn Marino return REG_NOERROR;
124395b7b453SJohn Marino
124495b7b453SJohn Marino /* Now copy. When DELTA becomes zero, the remaining
124595b7b453SJohn Marino DEST elements are already in place. */
124695b7b453SJohn Marino dest->nelem += delta;
124795b7b453SJohn Marino for (;;)
124895b7b453SJohn Marino {
124995b7b453SJohn Marino if (dest->elems[is] > dest->elems[id])
125095b7b453SJohn Marino {
125195b7b453SJohn Marino /* Copy from the top. */
125295b7b453SJohn Marino dest->elems[id + delta--] = dest->elems[is--];
125395b7b453SJohn Marino if (delta == 0)
125495b7b453SJohn Marino break;
125595b7b453SJohn Marino }
125695b7b453SJohn Marino else
125795b7b453SJohn Marino {
125895b7b453SJohn Marino /* Slide from the bottom. */
125995b7b453SJohn Marino dest->elems[id + delta] = dest->elems[id];
1260*09d4459fSDaniel Fojt if (--id < 0)
126195b7b453SJohn Marino {
126295b7b453SJohn Marino /* Copy remaining SRC elements. */
126395b7b453SJohn Marino memcpy (dest->elems, dest->elems + sbase,
126495b7b453SJohn Marino delta * sizeof (Idx));
126595b7b453SJohn Marino break;
126695b7b453SJohn Marino }
126795b7b453SJohn Marino }
126895b7b453SJohn Marino }
126995b7b453SJohn Marino
127095b7b453SJohn Marino return REG_NOERROR;
127195b7b453SJohn Marino }
127295b7b453SJohn Marino
127395b7b453SJohn Marino /* Insert the new element ELEM to the re_node_set* SET.
127495b7b453SJohn Marino SET should not already have ELEM.
127595b7b453SJohn Marino Return true if successful. */
127695b7b453SJohn Marino
127795b7b453SJohn Marino static bool
1278*09d4459fSDaniel Fojt __attribute_warn_unused_result__
re_node_set_insert(re_node_set * set,Idx elem)127995b7b453SJohn Marino re_node_set_insert (re_node_set *set, Idx elem)
128095b7b453SJohn Marino {
128195b7b453SJohn Marino Idx idx;
128295b7b453SJohn Marino /* In case the set is empty. */
128395b7b453SJohn Marino if (set->alloc == 0)
1284*09d4459fSDaniel Fojt return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
128595b7b453SJohn Marino
1286*09d4459fSDaniel Fojt if (__glibc_unlikely (set->nelem) == 0)
128795b7b453SJohn Marino {
128895b7b453SJohn Marino /* We already guaranteed above that set->alloc != 0. */
128995b7b453SJohn Marino set->elems[0] = elem;
129095b7b453SJohn Marino ++set->nelem;
129195b7b453SJohn Marino return true;
129295b7b453SJohn Marino }
129395b7b453SJohn Marino
129495b7b453SJohn Marino /* Realloc if we need. */
129595b7b453SJohn Marino if (set->alloc == set->nelem)
129695b7b453SJohn Marino {
129795b7b453SJohn Marino Idx *new_elems;
129895b7b453SJohn Marino set->alloc = set->alloc * 2;
129995b7b453SJohn Marino new_elems = re_realloc (set->elems, Idx, set->alloc);
1300*09d4459fSDaniel Fojt if (__glibc_unlikely (new_elems == NULL))
130195b7b453SJohn Marino return false;
130295b7b453SJohn Marino set->elems = new_elems;
130395b7b453SJohn Marino }
130495b7b453SJohn Marino
130595b7b453SJohn Marino /* Move the elements which follows the new element. Test the
130695b7b453SJohn Marino first element separately to skip a check in the inner loop. */
130795b7b453SJohn Marino if (elem < set->elems[0])
130895b7b453SJohn Marino {
130995b7b453SJohn Marino for (idx = set->nelem; idx > 0; idx--)
131095b7b453SJohn Marino set->elems[idx] = set->elems[idx - 1];
131195b7b453SJohn Marino }
131295b7b453SJohn Marino else
131395b7b453SJohn Marino {
131495b7b453SJohn Marino for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
131595b7b453SJohn Marino set->elems[idx] = set->elems[idx - 1];
131695b7b453SJohn Marino }
131795b7b453SJohn Marino
131895b7b453SJohn Marino /* Insert the new element. */
131995b7b453SJohn Marino set->elems[idx] = elem;
132095b7b453SJohn Marino ++set->nelem;
132195b7b453SJohn Marino return true;
132295b7b453SJohn Marino }
132395b7b453SJohn Marino
132495b7b453SJohn Marino /* Insert the new element ELEM to the re_node_set* SET.
132595b7b453SJohn Marino SET should not already have any element greater than or equal to ELEM.
132695b7b453SJohn Marino Return true if successful. */
132795b7b453SJohn Marino
132895b7b453SJohn Marino static bool
1329*09d4459fSDaniel Fojt __attribute_warn_unused_result__
re_node_set_insert_last(re_node_set * set,Idx elem)133095b7b453SJohn Marino re_node_set_insert_last (re_node_set *set, Idx elem)
133195b7b453SJohn Marino {
133295b7b453SJohn Marino /* Realloc if we need. */
133395b7b453SJohn Marino if (set->alloc == set->nelem)
133495b7b453SJohn Marino {
133595b7b453SJohn Marino Idx *new_elems;
133695b7b453SJohn Marino set->alloc = (set->alloc + 1) * 2;
133795b7b453SJohn Marino new_elems = re_realloc (set->elems, Idx, set->alloc);
1338*09d4459fSDaniel Fojt if (__glibc_unlikely (new_elems == NULL))
133995b7b453SJohn Marino return false;
134095b7b453SJohn Marino set->elems = new_elems;
134195b7b453SJohn Marino }
134295b7b453SJohn Marino
134395b7b453SJohn Marino /* Insert the new element. */
134495b7b453SJohn Marino set->elems[set->nelem++] = elem;
134595b7b453SJohn Marino return true;
134695b7b453SJohn Marino }
134795b7b453SJohn Marino
134895b7b453SJohn Marino /* Compare two node sets SET1 and SET2.
134995b7b453SJohn Marino Return true if SET1 and SET2 are equivalent. */
135095b7b453SJohn Marino
135195b7b453SJohn Marino static bool
1352*09d4459fSDaniel Fojt __attribute__ ((pure))
re_node_set_compare(const re_node_set * set1,const re_node_set * set2)135395b7b453SJohn Marino re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
135495b7b453SJohn Marino {
135595b7b453SJohn Marino Idx i;
135695b7b453SJohn Marino if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
135795b7b453SJohn Marino return false;
1358*09d4459fSDaniel Fojt for (i = set1->nelem ; --i >= 0 ; )
135995b7b453SJohn Marino if (set1->elems[i] != set2->elems[i])
136095b7b453SJohn Marino return false;
136195b7b453SJohn Marino return true;
136295b7b453SJohn Marino }
136395b7b453SJohn Marino
136495b7b453SJohn Marino /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
136595b7b453SJohn Marino
136695b7b453SJohn Marino static Idx
1367*09d4459fSDaniel Fojt __attribute__ ((pure))
re_node_set_contains(const re_node_set * set,Idx elem)136895b7b453SJohn Marino re_node_set_contains (const re_node_set *set, Idx elem)
136995b7b453SJohn Marino {
137095b7b453SJohn Marino __re_size_t idx, right, mid;
1371*09d4459fSDaniel Fojt if (set->nelem <= 0)
137295b7b453SJohn Marino return 0;
137395b7b453SJohn Marino
137495b7b453SJohn Marino /* Binary search the element. */
137595b7b453SJohn Marino idx = 0;
137695b7b453SJohn Marino right = set->nelem - 1;
137795b7b453SJohn Marino while (idx < right)
137895b7b453SJohn Marino {
137995b7b453SJohn Marino mid = (idx + right) / 2;
138095b7b453SJohn Marino if (set->elems[mid] < elem)
138195b7b453SJohn Marino idx = mid + 1;
138295b7b453SJohn Marino else
138395b7b453SJohn Marino right = mid;
138495b7b453SJohn Marino }
138595b7b453SJohn Marino return set->elems[idx] == elem ? idx + 1 : 0;
138695b7b453SJohn Marino }
138795b7b453SJohn Marino
138895b7b453SJohn Marino static void
re_node_set_remove_at(re_node_set * set,Idx idx)138995b7b453SJohn Marino re_node_set_remove_at (re_node_set *set, Idx idx)
139095b7b453SJohn Marino {
1391680a9cb8SJohn Marino if (idx < 0 || idx >= set->nelem)
139295b7b453SJohn Marino return;
139395b7b453SJohn Marino --set->nelem;
139495b7b453SJohn Marino for (; idx < set->nelem; idx++)
139595b7b453SJohn Marino set->elems[idx] = set->elems[idx + 1];
139695b7b453SJohn Marino }
139795b7b453SJohn Marino
139895b7b453SJohn Marino
139995b7b453SJohn Marino /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1400*09d4459fSDaniel Fojt Or return -1 if an error occurred. */
140195b7b453SJohn Marino
140295b7b453SJohn Marino static Idx
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)140395b7b453SJohn Marino re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
140495b7b453SJohn Marino {
1405*09d4459fSDaniel Fojt if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
140695b7b453SJohn Marino {
140795b7b453SJohn Marino size_t new_nodes_alloc = dfa->nodes_alloc * 2;
140895b7b453SJohn Marino Idx *new_nexts, *new_indices;
140995b7b453SJohn Marino re_node_set *new_edests, *new_eclosures;
141095b7b453SJohn Marino re_token_t *new_nodes;
1411cf28ed85SJohn Marino
1412cf28ed85SJohn Marino /* Avoid overflows in realloc. */
1413cf28ed85SJohn Marino const size_t max_object_size = MAX (sizeof (re_token_t),
141495b7b453SJohn Marino MAX (sizeof (re_node_set),
141595b7b453SJohn Marino sizeof (Idx)));
1416*09d4459fSDaniel Fojt if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1417*09d4459fSDaniel Fojt < new_nodes_alloc))
1418*09d4459fSDaniel Fojt return -1;
141995b7b453SJohn Marino
142095b7b453SJohn Marino new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1421*09d4459fSDaniel Fojt if (__glibc_unlikely (new_nodes == NULL))
1422*09d4459fSDaniel Fojt return -1;
142395b7b453SJohn Marino dfa->nodes = new_nodes;
142495b7b453SJohn Marino new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
142595b7b453SJohn Marino new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
142695b7b453SJohn Marino new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
142795b7b453SJohn Marino new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1428*09d4459fSDaniel Fojt if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1429*09d4459fSDaniel Fojt || new_edests == NULL || new_eclosures == NULL))
1430*09d4459fSDaniel Fojt {
1431*09d4459fSDaniel Fojt re_free (new_nexts);
1432*09d4459fSDaniel Fojt re_free (new_indices);
1433*09d4459fSDaniel Fojt re_free (new_edests);
1434*09d4459fSDaniel Fojt re_free (new_eclosures);
1435*09d4459fSDaniel Fojt return -1;
1436*09d4459fSDaniel Fojt }
143795b7b453SJohn Marino dfa->nexts = new_nexts;
143895b7b453SJohn Marino dfa->org_indices = new_indices;
143995b7b453SJohn Marino dfa->edests = new_edests;
144095b7b453SJohn Marino dfa->eclosures = new_eclosures;
144195b7b453SJohn Marino dfa->nodes_alloc = new_nodes_alloc;
144295b7b453SJohn Marino }
144395b7b453SJohn Marino dfa->nodes[dfa->nodes_len] = token;
144495b7b453SJohn Marino dfa->nodes[dfa->nodes_len].constraint = 0;
144595b7b453SJohn Marino #ifdef RE_ENABLE_I18N
144695b7b453SJohn Marino dfa->nodes[dfa->nodes_len].accept_mb =
1447680a9cb8SJohn Marino ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1448680a9cb8SJohn Marino || token.type == COMPLEX_BRACKET);
144995b7b453SJohn Marino #endif
1450*09d4459fSDaniel Fojt dfa->nexts[dfa->nodes_len] = -1;
145195b7b453SJohn Marino re_node_set_init_empty (dfa->edests + dfa->nodes_len);
145295b7b453SJohn Marino re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
145395b7b453SJohn Marino return dfa->nodes_len++;
145495b7b453SJohn Marino }
145595b7b453SJohn Marino
1456680a9cb8SJohn Marino static re_hashval_t
calc_state_hash(const re_node_set * nodes,unsigned int context)145795b7b453SJohn Marino calc_state_hash (const re_node_set *nodes, unsigned int context)
145895b7b453SJohn Marino {
145995b7b453SJohn Marino re_hashval_t hash = nodes->nelem + context;
146095b7b453SJohn Marino Idx i;
146195b7b453SJohn Marino for (i = 0 ; i < nodes->nelem ; i++)
146295b7b453SJohn Marino hash += nodes->elems[i];
146395b7b453SJohn Marino return hash;
146495b7b453SJohn Marino }
146595b7b453SJohn Marino
146695b7b453SJohn Marino /* Search for the state whose node_set is equivalent to NODES.
146795b7b453SJohn Marino Return the pointer to the state, if we found it in the DFA.
146895b7b453SJohn Marino Otherwise create the new one and return it. In case of an error
146995b7b453SJohn Marino return NULL and set the error code in ERR.
147095b7b453SJohn Marino Note: - We assume NULL as the invalid state, then it is possible that
147195b7b453SJohn Marino return value is NULL and ERR is REG_NOERROR.
147295b7b453SJohn Marino - We never return non-NULL value in case of any errors, it is for
147395b7b453SJohn Marino optimization. */
147495b7b453SJohn Marino
147595b7b453SJohn Marino static re_dfastate_t *
1476*09d4459fSDaniel Fojt __attribute_warn_unused_result__
re_acquire_state(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes)147795b7b453SJohn Marino re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
147895b7b453SJohn Marino const re_node_set *nodes)
147995b7b453SJohn Marino {
148095b7b453SJohn Marino re_hashval_t hash;
148195b7b453SJohn Marino re_dfastate_t *new_state;
148295b7b453SJohn Marino struct re_state_table_entry *spot;
148395b7b453SJohn Marino Idx i;
1484*09d4459fSDaniel Fojt #if defined GCC_LINT || defined lint
148595b7b453SJohn Marino /* Suppress bogus uninitialized-variable warnings. */
148695b7b453SJohn Marino *err = REG_NOERROR;
148795b7b453SJohn Marino #endif
1488*09d4459fSDaniel Fojt if (__glibc_unlikely (nodes->nelem == 0))
148995b7b453SJohn Marino {
149095b7b453SJohn Marino *err = REG_NOERROR;
149195b7b453SJohn Marino return NULL;
149295b7b453SJohn Marino }
149395b7b453SJohn Marino hash = calc_state_hash (nodes, 0);
149495b7b453SJohn Marino spot = dfa->state_table + (hash & dfa->state_hash_mask);
149595b7b453SJohn Marino
149695b7b453SJohn Marino for (i = 0 ; i < spot->num ; i++)
149795b7b453SJohn Marino {
149895b7b453SJohn Marino re_dfastate_t *state = spot->array[i];
149995b7b453SJohn Marino if (hash != state->hash)
150095b7b453SJohn Marino continue;
150195b7b453SJohn Marino if (re_node_set_compare (&state->nodes, nodes))
150295b7b453SJohn Marino return state;
150395b7b453SJohn Marino }
150495b7b453SJohn Marino
150595b7b453SJohn Marino /* There are no appropriate state in the dfa, create the new one. */
150695b7b453SJohn Marino new_state = create_ci_newstate (dfa, nodes, hash);
1507*09d4459fSDaniel Fojt if (__glibc_unlikely (new_state == NULL))
150895b7b453SJohn Marino *err = REG_ESPACE;
150995b7b453SJohn Marino
151095b7b453SJohn Marino return new_state;
151195b7b453SJohn Marino }
151295b7b453SJohn Marino
151395b7b453SJohn Marino /* Search for the state whose node_set is equivalent to NODES and
151495b7b453SJohn Marino whose context is equivalent to CONTEXT.
151595b7b453SJohn Marino Return the pointer to the state, if we found it in the DFA.
151695b7b453SJohn Marino Otherwise create the new one and return it. In case of an error
151795b7b453SJohn Marino return NULL and set the error code in ERR.
151895b7b453SJohn Marino Note: - We assume NULL as the invalid state, then it is possible that
151995b7b453SJohn Marino return value is NULL and ERR is REG_NOERROR.
152095b7b453SJohn Marino - We never return non-NULL value in case of any errors, it is for
152195b7b453SJohn Marino optimization. */
152295b7b453SJohn Marino
152395b7b453SJohn Marino static re_dfastate_t *
1524*09d4459fSDaniel Fojt __attribute_warn_unused_result__
re_acquire_state_context(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)152595b7b453SJohn Marino re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
152695b7b453SJohn Marino const re_node_set *nodes, unsigned int context)
152795b7b453SJohn Marino {
152895b7b453SJohn Marino re_hashval_t hash;
152995b7b453SJohn Marino re_dfastate_t *new_state;
153095b7b453SJohn Marino struct re_state_table_entry *spot;
153195b7b453SJohn Marino Idx i;
1532*09d4459fSDaniel Fojt #if defined GCC_LINT || defined lint
153395b7b453SJohn Marino /* Suppress bogus uninitialized-variable warnings. */
153495b7b453SJohn Marino *err = REG_NOERROR;
153595b7b453SJohn Marino #endif
153695b7b453SJohn Marino if (nodes->nelem == 0)
153795b7b453SJohn Marino {
153895b7b453SJohn Marino *err = REG_NOERROR;
153995b7b453SJohn Marino return NULL;
154095b7b453SJohn Marino }
154195b7b453SJohn Marino hash = calc_state_hash (nodes, context);
154295b7b453SJohn Marino spot = dfa->state_table + (hash & dfa->state_hash_mask);
154395b7b453SJohn Marino
154495b7b453SJohn Marino for (i = 0 ; i < spot->num ; i++)
154595b7b453SJohn Marino {
154695b7b453SJohn Marino re_dfastate_t *state = spot->array[i];
154795b7b453SJohn Marino if (state->hash == hash
154895b7b453SJohn Marino && state->context == context
154995b7b453SJohn Marino && re_node_set_compare (state->entrance_nodes, nodes))
155095b7b453SJohn Marino return state;
155195b7b453SJohn Marino }
1552cf28ed85SJohn Marino /* There are no appropriate state in 'dfa', create the new one. */
155395b7b453SJohn Marino new_state = create_cd_newstate (dfa, nodes, context, hash);
1554*09d4459fSDaniel Fojt if (__glibc_unlikely (new_state == NULL))
155595b7b453SJohn Marino *err = REG_ESPACE;
155695b7b453SJohn Marino
155795b7b453SJohn Marino return new_state;
155895b7b453SJohn Marino }
155995b7b453SJohn Marino
156095b7b453SJohn Marino /* Finish initialization of the new state NEWSTATE, and using its hash value
156195b7b453SJohn Marino HASH put in the appropriate bucket of DFA's state table. Return value
156295b7b453SJohn Marino indicates the error code if failed. */
156395b7b453SJohn Marino
156495b7b453SJohn Marino static reg_errcode_t
156595b7b453SJohn Marino __attribute_warn_unused_result__
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,re_hashval_t hash)156695b7b453SJohn Marino register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
156795b7b453SJohn Marino re_hashval_t hash)
156895b7b453SJohn Marino {
156995b7b453SJohn Marino struct re_state_table_entry *spot;
157095b7b453SJohn Marino reg_errcode_t err;
157195b7b453SJohn Marino Idx i;
157295b7b453SJohn Marino
157395b7b453SJohn Marino newstate->hash = hash;
157495b7b453SJohn Marino err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1575*09d4459fSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
157695b7b453SJohn Marino return REG_ESPACE;
157795b7b453SJohn Marino for (i = 0; i < newstate->nodes.nelem; i++)
157895b7b453SJohn Marino {
157995b7b453SJohn Marino Idx elem = newstate->nodes.elems[i];
158095b7b453SJohn Marino if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1581cf28ed85SJohn Marino if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
158295b7b453SJohn Marino return REG_ESPACE;
158395b7b453SJohn Marino }
158495b7b453SJohn Marino
158595b7b453SJohn Marino spot = dfa->state_table + (hash & dfa->state_hash_mask);
1586*09d4459fSDaniel Fojt if (__glibc_unlikely (spot->alloc <= spot->num))
158795b7b453SJohn Marino {
158895b7b453SJohn Marino Idx new_alloc = 2 * spot->num + 2;
158995b7b453SJohn Marino re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
159095b7b453SJohn Marino new_alloc);
1591*09d4459fSDaniel Fojt if (__glibc_unlikely (new_array == NULL))
159295b7b453SJohn Marino return REG_ESPACE;
159395b7b453SJohn Marino spot->array = new_array;
159495b7b453SJohn Marino spot->alloc = new_alloc;
159595b7b453SJohn Marino }
159695b7b453SJohn Marino spot->array[spot->num++] = newstate;
159795b7b453SJohn Marino return REG_NOERROR;
159895b7b453SJohn Marino }
159995b7b453SJohn Marino
160095b7b453SJohn Marino static void
free_state(re_dfastate_t * state)160195b7b453SJohn Marino free_state (re_dfastate_t *state)
160295b7b453SJohn Marino {
160395b7b453SJohn Marino re_node_set_free (&state->non_eps_nodes);
160495b7b453SJohn Marino re_node_set_free (&state->inveclosure);
160595b7b453SJohn Marino if (state->entrance_nodes != &state->nodes)
160695b7b453SJohn Marino {
160795b7b453SJohn Marino re_node_set_free (state->entrance_nodes);
160895b7b453SJohn Marino re_free (state->entrance_nodes);
160995b7b453SJohn Marino }
161095b7b453SJohn Marino re_node_set_free (&state->nodes);
161195b7b453SJohn Marino re_free (state->word_trtable);
161295b7b453SJohn Marino re_free (state->trtable);
161395b7b453SJohn Marino re_free (state);
161495b7b453SJohn Marino }
161595b7b453SJohn Marino
1616cf28ed85SJohn Marino /* Create the new state which is independent of contexts.
161795b7b453SJohn Marino Return the new state if succeeded, otherwise return NULL. */
161895b7b453SJohn Marino
161995b7b453SJohn Marino static re_dfastate_t *
1620*09d4459fSDaniel Fojt __attribute_warn_unused_result__
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,re_hashval_t hash)162195b7b453SJohn Marino create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
162295b7b453SJohn Marino re_hashval_t hash)
162395b7b453SJohn Marino {
162495b7b453SJohn Marino Idx i;
162595b7b453SJohn Marino reg_errcode_t err;
162695b7b453SJohn Marino re_dfastate_t *newstate;
162795b7b453SJohn Marino
162895b7b453SJohn Marino newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1629*09d4459fSDaniel Fojt if (__glibc_unlikely (newstate == NULL))
163095b7b453SJohn Marino return NULL;
163195b7b453SJohn Marino err = re_node_set_init_copy (&newstate->nodes, nodes);
1632*09d4459fSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
163395b7b453SJohn Marino {
163495b7b453SJohn Marino re_free (newstate);
163595b7b453SJohn Marino return NULL;
163695b7b453SJohn Marino }
163795b7b453SJohn Marino
163895b7b453SJohn Marino newstate->entrance_nodes = &newstate->nodes;
163995b7b453SJohn Marino for (i = 0 ; i < nodes->nelem ; i++)
164095b7b453SJohn Marino {
164195b7b453SJohn Marino re_token_t *node = dfa->nodes + nodes->elems[i];
164295b7b453SJohn Marino re_token_type_t type = node->type;
164395b7b453SJohn Marino if (type == CHARACTER && !node->constraint)
164495b7b453SJohn Marino continue;
164595b7b453SJohn Marino #ifdef RE_ENABLE_I18N
164695b7b453SJohn Marino newstate->accept_mb |= node->accept_mb;
164795b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
164895b7b453SJohn Marino
164995b7b453SJohn Marino /* If the state has the halt node, the state is a halt state. */
165095b7b453SJohn Marino if (type == END_OF_RE)
165195b7b453SJohn Marino newstate->halt = 1;
165295b7b453SJohn Marino else if (type == OP_BACK_REF)
165395b7b453SJohn Marino newstate->has_backref = 1;
165495b7b453SJohn Marino else if (type == ANCHOR || node->constraint)
165595b7b453SJohn Marino newstate->has_constraint = 1;
165695b7b453SJohn Marino }
165795b7b453SJohn Marino err = register_state (dfa, newstate, hash);
1658*09d4459fSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
165995b7b453SJohn Marino {
166095b7b453SJohn Marino free_state (newstate);
166195b7b453SJohn Marino newstate = NULL;
166295b7b453SJohn Marino }
166395b7b453SJohn Marino return newstate;
166495b7b453SJohn Marino }
166595b7b453SJohn Marino
166695b7b453SJohn Marino /* Create the new state which is depend on the context CONTEXT.
166795b7b453SJohn Marino Return the new state if succeeded, otherwise return NULL. */
166895b7b453SJohn Marino
166995b7b453SJohn Marino static re_dfastate_t *
1670*09d4459fSDaniel Fojt __attribute_warn_unused_result__
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,re_hashval_t hash)167195b7b453SJohn Marino create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
167295b7b453SJohn Marino unsigned int context, re_hashval_t hash)
167395b7b453SJohn Marino {
167495b7b453SJohn Marino Idx i, nctx_nodes = 0;
167595b7b453SJohn Marino reg_errcode_t err;
167695b7b453SJohn Marino re_dfastate_t *newstate;
167795b7b453SJohn Marino
167895b7b453SJohn Marino newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1679*09d4459fSDaniel Fojt if (__glibc_unlikely (newstate == NULL))
168095b7b453SJohn Marino return NULL;
168195b7b453SJohn Marino err = re_node_set_init_copy (&newstate->nodes, nodes);
1682*09d4459fSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
168395b7b453SJohn Marino {
168495b7b453SJohn Marino re_free (newstate);
168595b7b453SJohn Marino return NULL;
168695b7b453SJohn Marino }
168795b7b453SJohn Marino
168895b7b453SJohn Marino newstate->context = context;
168995b7b453SJohn Marino newstate->entrance_nodes = &newstate->nodes;
169095b7b453SJohn Marino
169195b7b453SJohn Marino for (i = 0 ; i < nodes->nelem ; i++)
169295b7b453SJohn Marino {
169395b7b453SJohn Marino re_token_t *node = dfa->nodes + nodes->elems[i];
169495b7b453SJohn Marino re_token_type_t type = node->type;
169595b7b453SJohn Marino unsigned int constraint = node->constraint;
169695b7b453SJohn Marino
169795b7b453SJohn Marino if (type == CHARACTER && !constraint)
169895b7b453SJohn Marino continue;
169995b7b453SJohn Marino #ifdef RE_ENABLE_I18N
170095b7b453SJohn Marino newstate->accept_mb |= node->accept_mb;
170195b7b453SJohn Marino #endif /* RE_ENABLE_I18N */
170295b7b453SJohn Marino
170395b7b453SJohn Marino /* If the state has the halt node, the state is a halt state. */
170495b7b453SJohn Marino if (type == END_OF_RE)
170595b7b453SJohn Marino newstate->halt = 1;
170695b7b453SJohn Marino else if (type == OP_BACK_REF)
170795b7b453SJohn Marino newstate->has_backref = 1;
170895b7b453SJohn Marino
170995b7b453SJohn Marino if (constraint)
171095b7b453SJohn Marino {
171195b7b453SJohn Marino if (newstate->entrance_nodes == &newstate->nodes)
171295b7b453SJohn Marino {
1713*09d4459fSDaniel Fojt re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
1714*09d4459fSDaniel Fojt if (__glibc_unlikely (entrance_nodes == NULL))
171595b7b453SJohn Marino {
171695b7b453SJohn Marino free_state (newstate);
171795b7b453SJohn Marino return NULL;
171895b7b453SJohn Marino }
1719*09d4459fSDaniel Fojt newstate->entrance_nodes = entrance_nodes;
172095b7b453SJohn Marino if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
172195b7b453SJohn Marino != REG_NOERROR)
1722*09d4459fSDaniel Fojt {
1723*09d4459fSDaniel Fojt free_state (newstate);
172495b7b453SJohn Marino return NULL;
1725*09d4459fSDaniel Fojt }
172695b7b453SJohn Marino nctx_nodes = 0;
172795b7b453SJohn Marino newstate->has_constraint = 1;
172895b7b453SJohn Marino }
172995b7b453SJohn Marino
173095b7b453SJohn Marino if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
173195b7b453SJohn Marino {
173295b7b453SJohn Marino re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
173395b7b453SJohn Marino ++nctx_nodes;
173495b7b453SJohn Marino }
173595b7b453SJohn Marino }
173695b7b453SJohn Marino }
173795b7b453SJohn Marino err = register_state (dfa, newstate, hash);
1738*09d4459fSDaniel Fojt if (__glibc_unlikely (err != REG_NOERROR))
173995b7b453SJohn Marino {
174095b7b453SJohn Marino free_state (newstate);
174195b7b453SJohn Marino newstate = NULL;
174295b7b453SJohn Marino }
174395b7b453SJohn Marino return newstate;
174495b7b453SJohn Marino }
1745