195b7b453SJohn Marino /* Iterating through multibyte strings: macros for multi-byte encodings.
2*09d4459fSDaniel Fojt Copyright (C) 2001, 2005, 2007, 2009-2020 Free Software Foundation, Inc.
395b7b453SJohn Marino
495b7b453SJohn Marino This program is free software: you can redistribute it and/or modify
595b7b453SJohn Marino it under the terms of the GNU General Public License as published by
695b7b453SJohn Marino the Free Software Foundation; either version 3 of the License, or
795b7b453SJohn Marino (at your option) any later version.
895b7b453SJohn Marino
995b7b453SJohn Marino This program is distributed in the hope that it will be useful,
1095b7b453SJohn Marino but WITHOUT ANY WARRANTY; without even the implied warranty of
1195b7b453SJohn Marino MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1295b7b453SJohn Marino GNU General Public License for more details.
1395b7b453SJohn Marino
1495b7b453SJohn Marino You should have received a copy of the GNU General Public License
15*09d4459fSDaniel Fojt along with this program. If not, see <https://www.gnu.org/licenses/>. */
1695b7b453SJohn Marino
1795b7b453SJohn Marino /* Written by Bruno Haible <bruno@clisp.org>. */
1895b7b453SJohn Marino
1995b7b453SJohn Marino /* The macros in this file implement forward iteration through a
2095b7b453SJohn Marino multi-byte string, without knowing its length a-priori.
2195b7b453SJohn Marino
2295b7b453SJohn Marino With these macros, an iteration loop that looks like
2395b7b453SJohn Marino
2495b7b453SJohn Marino char *iter;
2595b7b453SJohn Marino for (iter = buf; *iter != '\0'; iter++)
2695b7b453SJohn Marino {
2795b7b453SJohn Marino do_something (*iter);
2895b7b453SJohn Marino }
2995b7b453SJohn Marino
3095b7b453SJohn Marino becomes
3195b7b453SJohn Marino
3295b7b453SJohn Marino mbui_iterator_t iter;
3395b7b453SJohn Marino for (mbui_init (iter, buf); mbui_avail (iter); mbui_advance (iter))
3495b7b453SJohn Marino {
3595b7b453SJohn Marino do_something (mbui_cur_ptr (iter), mb_len (mbui_cur (iter)));
3695b7b453SJohn Marino }
3795b7b453SJohn Marino
3895b7b453SJohn Marino The benefit of these macros over plain use of mbrtowc is:
3995b7b453SJohn Marino - Handling of invalid multibyte sequences is possible without
4095b7b453SJohn Marino making the code more complicated, while still preserving the
4195b7b453SJohn Marino invalid multibyte sequences.
4295b7b453SJohn Marino
4395b7b453SJohn Marino Compared to mbiter.h, the macros here don't need to know the string's
4495b7b453SJohn Marino length a-priori. The downside is that at each step, the look-ahead
4595b7b453SJohn Marino that guards against overrunning the terminating '\0' is more expensive.
4695b7b453SJohn Marino The mbui_* macros are therefore suitable when there is a high probability
4795b7b453SJohn Marino that only the first few multibyte characters need to be inspected.
4895b7b453SJohn Marino Whereas the mbi_* macros are better if usually the iteration runs
4995b7b453SJohn Marino through the entire string.
5095b7b453SJohn Marino
5195b7b453SJohn Marino mbui_iterator_t
5295b7b453SJohn Marino is a type usable for variable declarations.
5395b7b453SJohn Marino
5495b7b453SJohn Marino mbui_init (iter, startptr)
5595b7b453SJohn Marino initializes the iterator, starting at startptr.
5695b7b453SJohn Marino
5795b7b453SJohn Marino mbui_avail (iter)
58cf28ed85SJohn Marino returns true if there are more multibyte characters available before
5995b7b453SJohn Marino the end of string is reached. In this case, mbui_cur (iter) is
60cf28ed85SJohn Marino initialized to the next multibyte character.
6195b7b453SJohn Marino
6295b7b453SJohn Marino mbui_advance (iter)
6395b7b453SJohn Marino advances the iterator by one multibyte character.
6495b7b453SJohn Marino
6595b7b453SJohn Marino mbui_cur (iter)
6695b7b453SJohn Marino returns the current multibyte character, of type mbchar_t. All the
6795b7b453SJohn Marino macros defined in mbchar.h can be used on it.
6895b7b453SJohn Marino
6995b7b453SJohn Marino mbui_cur_ptr (iter)
     returns a pointer to the beginning of the current multibyte character.
7195b7b453SJohn Marino
7295b7b453SJohn Marino mbui_reloc (iter, ptrdiff)
7395b7b453SJohn Marino relocates iterator when the string is moved by ptrdiff bytes.
7495b7b453SJohn Marino
7595b7b453SJohn Marino mbui_copy (&destiter, &srciter)
7695b7b453SJohn Marino copies srciter to destiter.
7795b7b453SJohn Marino
7895b7b453SJohn Marino Here are the function prototypes of the macros.
7995b7b453SJohn Marino
8095b7b453SJohn Marino extern void mbui_init (mbui_iterator_t iter, const char *startptr);
8195b7b453SJohn Marino extern bool mbui_avail (mbui_iterator_t iter);
8295b7b453SJohn Marino extern void mbui_advance (mbui_iterator_t iter);
8395b7b453SJohn Marino extern mbchar_t mbui_cur (mbui_iterator_t iter);
8495b7b453SJohn Marino extern const char * mbui_cur_ptr (mbui_iterator_t iter);
8595b7b453SJohn Marino extern void mbui_reloc (mbui_iterator_t iter, ptrdiff_t ptrdiff);
8695b7b453SJohn Marino extern void mbui_copy (mbui_iterator_t *new, const mbui_iterator_t *old);
8795b7b453SJohn Marino */
8895b7b453SJohn Marino
8995b7b453SJohn Marino #ifndef _MBUITER_H
9095b7b453SJohn Marino #define _MBUITER_H 1
9195b7b453SJohn Marino
9295b7b453SJohn Marino #include <assert.h>
9395b7b453SJohn Marino #include <stdbool.h>
9495b7b453SJohn Marino #include <stddef.h>
9595b7b453SJohn Marino #include <stdlib.h>
9695b7b453SJohn Marino #include <string.h>
9795b7b453SJohn Marino
9895b7b453SJohn Marino /* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
9995b7b453SJohn Marino <wchar.h>.
10095b7b453SJohn Marino BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
10195b7b453SJohn Marino <wchar.h>. */
10295b7b453SJohn Marino #include <stdio.h>
10395b7b453SJohn Marino #include <time.h>
10495b7b453SJohn Marino #include <wchar.h>
10595b7b453SJohn Marino
10695b7b453SJohn Marino #include "mbchar.h"
10795b7b453SJohn Marino #include "strnlen1.h"
10895b7b453SJohn Marino
109680a9cb8SJohn Marino #ifndef _GL_INLINE_HEADER_BEGIN
110680a9cb8SJohn Marino #error "Please include config.h first."
111680a9cb8SJohn Marino #endif
112680a9cb8SJohn Marino _GL_INLINE_HEADER_BEGIN
113680a9cb8SJohn Marino #ifndef MBUITER_INLINE
114680a9cb8SJohn Marino # define MBUITER_INLINE _GL_INLINE
115680a9cb8SJohn Marino #endif
116680a9cb8SJohn Marino
11795b7b453SJohn Marino struct mbuiter_multi
11895b7b453SJohn Marino {
11995b7b453SJohn Marino bool in_shift; /* true if next byte may not be interpreted as ASCII */
12095b7b453SJohn Marino mbstate_t state; /* if in_shift: current shift state */
12195b7b453SJohn Marino bool next_done; /* true if mbui_avail has already filled the following */
12295b7b453SJohn Marino struct mbchar cur; /* the current character:
12395b7b453SJohn Marino const char *cur.ptr pointer to current character
12495b7b453SJohn Marino The following are only valid after mbui_avail.
12595b7b453SJohn Marino size_t cur.bytes number of bytes of current character
12695b7b453SJohn Marino bool cur.wc_valid true if wc is a valid wide character
12795b7b453SJohn Marino wchar_t cur.wc if wc_valid: the current character
12895b7b453SJohn Marino */
12995b7b453SJohn Marino };
13095b7b453SJohn Marino
131680a9cb8SJohn Marino MBUITER_INLINE void
mbuiter_multi_next(struct mbuiter_multi * iter)13295b7b453SJohn Marino mbuiter_multi_next (struct mbuiter_multi *iter)
13395b7b453SJohn Marino {
13495b7b453SJohn Marino if (iter->next_done)
13595b7b453SJohn Marino return;
13695b7b453SJohn Marino if (iter->in_shift)
13795b7b453SJohn Marino goto with_shift;
13895b7b453SJohn Marino /* Handle most ASCII characters quickly, without calling mbrtowc(). */
13995b7b453SJohn Marino if (is_basic (*iter->cur.ptr))
14095b7b453SJohn Marino {
14195b7b453SJohn Marino /* These characters are part of the basic character set. ISO C 99
14295b7b453SJohn Marino guarantees that their wide character code is identical to their
14395b7b453SJohn Marino char code. */
14495b7b453SJohn Marino iter->cur.bytes = 1;
14595b7b453SJohn Marino iter->cur.wc = *iter->cur.ptr;
14695b7b453SJohn Marino iter->cur.wc_valid = true;
14795b7b453SJohn Marino }
14895b7b453SJohn Marino else
14995b7b453SJohn Marino {
15095b7b453SJohn Marino assert (mbsinit (&iter->state));
15195b7b453SJohn Marino iter->in_shift = true;
15295b7b453SJohn Marino with_shift:
15395b7b453SJohn Marino iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr,
15495b7b453SJohn Marino strnlen1 (iter->cur.ptr, MB_CUR_MAX),
15595b7b453SJohn Marino &iter->state);
15695b7b453SJohn Marino if (iter->cur.bytes == (size_t) -1)
15795b7b453SJohn Marino {
15895b7b453SJohn Marino /* An invalid multibyte sequence was encountered. */
15995b7b453SJohn Marino iter->cur.bytes = 1;
16095b7b453SJohn Marino iter->cur.wc_valid = false;
16195b7b453SJohn Marino /* Whether to set iter->in_shift = false and reset iter->state
16295b7b453SJohn Marino or not is not very important; the string is bogus anyway. */
16395b7b453SJohn Marino }
16495b7b453SJohn Marino else if (iter->cur.bytes == (size_t) -2)
16595b7b453SJohn Marino {
16695b7b453SJohn Marino /* An incomplete multibyte character at the end. */
16795b7b453SJohn Marino iter->cur.bytes = strlen (iter->cur.ptr);
16895b7b453SJohn Marino iter->cur.wc_valid = false;
16995b7b453SJohn Marino /* Whether to set iter->in_shift = false and reset iter->state
17095b7b453SJohn Marino or not is not important; the string end is reached anyway. */
17195b7b453SJohn Marino }
17295b7b453SJohn Marino else
17395b7b453SJohn Marino {
17495b7b453SJohn Marino if (iter->cur.bytes == 0)
17595b7b453SJohn Marino {
17695b7b453SJohn Marino /* A null wide character was encountered. */
17795b7b453SJohn Marino iter->cur.bytes = 1;
17895b7b453SJohn Marino assert (*iter->cur.ptr == '\0');
17995b7b453SJohn Marino assert (iter->cur.wc == 0);
18095b7b453SJohn Marino }
18195b7b453SJohn Marino iter->cur.wc_valid = true;
18295b7b453SJohn Marino
18395b7b453SJohn Marino /* When in the initial state, we can go back treating ASCII
18495b7b453SJohn Marino characters more quickly. */
18595b7b453SJohn Marino if (mbsinit (&iter->state))
18695b7b453SJohn Marino iter->in_shift = false;
18795b7b453SJohn Marino }
18895b7b453SJohn Marino }
18995b7b453SJohn Marino iter->next_done = true;
19095b7b453SJohn Marino }
19195b7b453SJohn Marino
192680a9cb8SJohn Marino MBUITER_INLINE void
mbuiter_multi_reloc(struct mbuiter_multi * iter,ptrdiff_t ptrdiff)19395b7b453SJohn Marino mbuiter_multi_reloc (struct mbuiter_multi *iter, ptrdiff_t ptrdiff)
19495b7b453SJohn Marino {
19595b7b453SJohn Marino iter->cur.ptr += ptrdiff;
19695b7b453SJohn Marino }
19795b7b453SJohn Marino
198680a9cb8SJohn Marino MBUITER_INLINE void
mbuiter_multi_copy(struct mbuiter_multi * new_iter,const struct mbuiter_multi * old_iter)19995b7b453SJohn Marino mbuiter_multi_copy (struct mbuiter_multi *new_iter, const struct mbuiter_multi *old_iter)
20095b7b453SJohn Marino {
20195b7b453SJohn Marino if ((new_iter->in_shift = old_iter->in_shift))
20295b7b453SJohn Marino memcpy (&new_iter->state, &old_iter->state, sizeof (mbstate_t));
20395b7b453SJohn Marino else
20495b7b453SJohn Marino memset (&new_iter->state, 0, sizeof (mbstate_t));
20595b7b453SJohn Marino new_iter->next_done = old_iter->next_done;
20695b7b453SJohn Marino mb_copy (&new_iter->cur, &old_iter->cur);
20795b7b453SJohn Marino }
20895b7b453SJohn Marino
/* Iteration macros.
   All of them take the iterator itself (an lvalue), not a pointer to it,
   and may evaluate the ITER argument more than once, so ITER must not
   have side effects.  */
typedef struct mbuiter_multi mbui_iterator_t;

/* Start iterating at STARTPTR: not in a shift sequence, initial
   conversion state, current character not yet decoded.  */
#define mbui_init(iter, startptr) \
  ((iter).cur.ptr = (startptr), \
   (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \
   (iter).next_done = false)

/* Decode the current character if needed, then yield true unless it is
   the terminating null character.  */
#define mbui_avail(iter) \
  (mbuiter_multi_next (&(iter)), !mb_isnul ((iter).cur))

/* Step past the current character and mark the next one as not yet
   decoded.  Relies on cur.bytes, which is only valid after mbui_avail.  */
#define mbui_advance(iter) \
  ((iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false)

/* Access to the current character.  */
#define mbui_cur(iter) (iter).cur
#define mbui_cur_ptr(iter) (iter).cur.ptr

/* Relocation.  The arguments are parenthesized, as in mbui_avail, so that
   any expression is passed through intact.  */
#define mbui_reloc(iter, ptrdiff) mbuiter_multi_reloc (&(iter), (ptrdiff))

/* Copying an iterator.  Unlike the macros above, this one takes pointers:
   mbui_copy (&destiter, &srciter).  */
#define mbui_copy mbuiter_multi_copy
22995b7b453SJohn Marino
230680a9cb8SJohn Marino _GL_INLINE_HEADER_END
231680a9cb8SJohn Marino
23295b7b453SJohn Marino #endif /* _MBUITER_H */
233