144b87433SJohn Marino /* Iterating through multibyte strings: macros for multi-byte encodings.
2*6ea1f93eSDaniel Fojt Copyright (C) 2001, 2005, 2007, 2009-2018 Free Software Foundation, Inc.
344b87433SJohn Marino
444b87433SJohn Marino This program is free software: you can redistribute it and/or modify
544b87433SJohn Marino it under the terms of the GNU General Public License as published by
644b87433SJohn Marino the Free Software Foundation; either version 3 of the License, or
744b87433SJohn Marino (at your option) any later version.
844b87433SJohn Marino
944b87433SJohn Marino This program is distributed in the hope that it will be useful,
1044b87433SJohn Marino but WITHOUT ANY WARRANTY; without even the implied warranty of
1144b87433SJohn Marino MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1244b87433SJohn Marino GNU General Public License for more details.
1344b87433SJohn Marino
1444b87433SJohn Marino You should have received a copy of the GNU General Public License
15*6ea1f93eSDaniel Fojt along with this program. If not, see <https://www.gnu.org/licenses/>. */
1644b87433SJohn Marino
1744b87433SJohn Marino /* Written by Bruno Haible <bruno@clisp.org>. */
1844b87433SJohn Marino
1944b87433SJohn Marino /* The macros in this file implement forward iteration through a
2044b87433SJohn Marino multi-byte string, without knowing its length a-priori.
2144b87433SJohn Marino
2244b87433SJohn Marino With these macros, an iteration loop that looks like
2344b87433SJohn Marino
2444b87433SJohn Marino char *iter;
2544b87433SJohn Marino for (iter = buf; *iter != '\0'; iter++)
2644b87433SJohn Marino {
2744b87433SJohn Marino do_something (*iter);
2844b87433SJohn Marino }
2944b87433SJohn Marino
3044b87433SJohn Marino becomes
3144b87433SJohn Marino
3244b87433SJohn Marino mbui_iterator_t iter;
3344b87433SJohn Marino for (mbui_init (iter, buf); mbui_avail (iter); mbui_advance (iter))
3444b87433SJohn Marino {
3544b87433SJohn Marino do_something (mbui_cur_ptr (iter), mb_len (mbui_cur (iter)));
3644b87433SJohn Marino }
3744b87433SJohn Marino
3844b87433SJohn Marino The benefit of these macros over plain use of mbrtowc is:
3944b87433SJohn Marino - Handling of invalid multibyte sequences is possible without
4044b87433SJohn Marino making the code more complicated, while still preserving the
4144b87433SJohn Marino invalid multibyte sequences.
4244b87433SJohn Marino
4344b87433SJohn Marino Compared to mbiter.h, the macros here don't need to know the string's
4444b87433SJohn Marino length a-priori. The downside is that at each step, the look-ahead
4544b87433SJohn Marino that guards against overrunning the terminating '\0' is more expensive.
4644b87433SJohn Marino The mbui_* macros are therefore suitable when there is a high probability
4744b87433SJohn Marino that only the first few multibyte characters need to be inspected.
4844b87433SJohn Marino Whereas the mbi_* macros are better if usually the iteration runs
4944b87433SJohn Marino through the entire string.
5044b87433SJohn Marino
5144b87433SJohn Marino mbui_iterator_t
5244b87433SJohn Marino is a type usable for variable declarations.
5344b87433SJohn Marino
5444b87433SJohn Marino mbui_init (iter, startptr)
5544b87433SJohn Marino initializes the iterator, starting at startptr.
5644b87433SJohn Marino
5744b87433SJohn Marino mbui_avail (iter)
584536c563SJohn Marino returns true if there are more multibyte characters available before
5944b87433SJohn Marino the end of string is reached. In this case, mbui_cur (iter) is
604536c563SJohn Marino initialized to the next multibyte character.
6144b87433SJohn Marino
6244b87433SJohn Marino mbui_advance (iter)
6344b87433SJohn Marino advances the iterator by one multibyte character.
6444b87433SJohn Marino
6544b87433SJohn Marino mbui_cur (iter)
6644b87433SJohn Marino returns the current multibyte character, of type mbchar_t. All the
6744b87433SJohn Marino macros defined in mbchar.h can be used on it.
6844b87433SJohn Marino
6944b87433SJohn Marino mbui_cur_ptr (iter)
7044b87433SJohn Marino returns a pointer to the beginning of the current multibyte character.
7144b87433SJohn Marino
7244b87433SJohn Marino mbui_reloc (iter, ptrdiff)
7344b87433SJohn Marino relocates iterator when the string is moved by ptrdiff bytes.
7444b87433SJohn Marino
7544b87433SJohn Marino mbui_copy (&destiter, &srciter)
7644b87433SJohn Marino copies srciter to destiter.
7744b87433SJohn Marino
7844b87433SJohn Marino Here are the function prototypes of the macros.
7944b87433SJohn Marino
8044b87433SJohn Marino extern void mbui_init (mbui_iterator_t iter, const char *startptr);
8144b87433SJohn Marino extern bool mbui_avail (mbui_iterator_t iter);
8244b87433SJohn Marino extern void mbui_advance (mbui_iterator_t iter);
8344b87433SJohn Marino extern mbchar_t mbui_cur (mbui_iterator_t iter);
8444b87433SJohn Marino extern const char * mbui_cur_ptr (mbui_iterator_t iter);
8544b87433SJohn Marino extern void mbui_reloc (mbui_iterator_t iter, ptrdiff_t ptrdiff);
8644b87433SJohn Marino extern void mbui_copy (mbui_iterator_t *new, const mbui_iterator_t *old);
8744b87433SJohn Marino */
8844b87433SJohn Marino
8944b87433SJohn Marino #ifndef _MBUITER_H
9044b87433SJohn Marino #define _MBUITER_H 1
9144b87433SJohn Marino
9244b87433SJohn Marino #include <assert.h>
9344b87433SJohn Marino #include <stdbool.h>
9444b87433SJohn Marino #include <stddef.h>
9544b87433SJohn Marino #include <stdlib.h>
9644b87433SJohn Marino #include <string.h>
9744b87433SJohn Marino
9844b87433SJohn Marino /* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
9944b87433SJohn Marino <wchar.h>.
10044b87433SJohn Marino BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
10144b87433SJohn Marino <wchar.h>. */
10244b87433SJohn Marino #include <stdio.h>
10344b87433SJohn Marino #include <time.h>
10444b87433SJohn Marino #include <wchar.h>
10544b87433SJohn Marino
10644b87433SJohn Marino #include "mbchar.h"
10744b87433SJohn Marino #include "strnlen1.h"
10844b87433SJohn Marino
109*6ea1f93eSDaniel Fojt #ifndef _GL_INLINE_HEADER_BEGIN
110*6ea1f93eSDaniel Fojt #error "Please include config.h first."
111*6ea1f93eSDaniel Fojt #endif
1124536c563SJohn Marino _GL_INLINE_HEADER_BEGIN
1134536c563SJohn Marino #ifndef MBUITER_INLINE
1144536c563SJohn Marino # define MBUITER_INLINE _GL_INLINE
1154536c563SJohn Marino #endif
1164536c563SJohn Marino
/* Iterator state for walking a NUL-terminated multibyte string one
   character at a time.  It is set up by mbui_init, filled in lazily by
   mbuiter_multi_next (via mbui_avail), and stepped by mbui_advance.  */
11744b87433SJohn Marino struct mbuiter_multi
11844b87433SJohn Marino {
11944b87433SJohn Marino bool in_shift; /* true if the next byte may not be interpreted as ASCII,
   i.e. decoding must go through mbrtowc with 'state' */
12044b87433SJohn Marino mbstate_t state; /* if in_shift: current shift state; zeroed by mbui_init */
12144b87433SJohn Marino bool next_done; /* true if mbui_avail has already filled the following;
   cleared again by mbui_init and mbui_advance */
12244b87433SJohn Marino struct mbchar cur; /* the current character:
12344b87433SJohn Marino const char *cur.ptr pointer to current character
12444b87433SJohn Marino The following are only valid after mbui_avail.
12544b87433SJohn Marino size_t cur.bytes number of bytes of current character
12644b87433SJohn Marino bool cur.wc_valid true if wc is a valid wide character
12744b87433SJohn Marino wchar_t cur.wc if wc_valid: the current character
12844b87433SJohn Marino */
12944b87433SJohn Marino };
13044b87433SJohn Marino
1314536c563SJohn Marino MBUITER_INLINE void
mbuiter_multi_next(struct mbuiter_multi * iter)13244b87433SJohn Marino mbuiter_multi_next (struct mbuiter_multi *iter)
13344b87433SJohn Marino {
13444b87433SJohn Marino if (iter->next_done)
13544b87433SJohn Marino return;
13644b87433SJohn Marino if (iter->in_shift)
13744b87433SJohn Marino goto with_shift;
13844b87433SJohn Marino /* Handle most ASCII characters quickly, without calling mbrtowc(). */
13944b87433SJohn Marino if (is_basic (*iter->cur.ptr))
14044b87433SJohn Marino {
14144b87433SJohn Marino /* These characters are part of the basic character set. ISO C 99
14244b87433SJohn Marino guarantees that their wide character code is identical to their
14344b87433SJohn Marino char code. */
14444b87433SJohn Marino iter->cur.bytes = 1;
14544b87433SJohn Marino iter->cur.wc = *iter->cur.ptr;
14644b87433SJohn Marino iter->cur.wc_valid = true;
14744b87433SJohn Marino }
14844b87433SJohn Marino else
14944b87433SJohn Marino {
15044b87433SJohn Marino assert (mbsinit (&iter->state));
15144b87433SJohn Marino iter->in_shift = true;
15244b87433SJohn Marino with_shift:
15344b87433SJohn Marino iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr,
15444b87433SJohn Marino strnlen1 (iter->cur.ptr, MB_CUR_MAX),
15544b87433SJohn Marino &iter->state);
15644b87433SJohn Marino if (iter->cur.bytes == (size_t) -1)
15744b87433SJohn Marino {
15844b87433SJohn Marino /* An invalid multibyte sequence was encountered. */
15944b87433SJohn Marino iter->cur.bytes = 1;
16044b87433SJohn Marino iter->cur.wc_valid = false;
16144b87433SJohn Marino /* Whether to set iter->in_shift = false and reset iter->state
16244b87433SJohn Marino or not is not very important; the string is bogus anyway. */
16344b87433SJohn Marino }
16444b87433SJohn Marino else if (iter->cur.bytes == (size_t) -2)
16544b87433SJohn Marino {
16644b87433SJohn Marino /* An incomplete multibyte character at the end. */
16744b87433SJohn Marino iter->cur.bytes = strlen (iter->cur.ptr);
16844b87433SJohn Marino iter->cur.wc_valid = false;
16944b87433SJohn Marino /* Whether to set iter->in_shift = false and reset iter->state
17044b87433SJohn Marino or not is not important; the string end is reached anyway. */
17144b87433SJohn Marino }
17244b87433SJohn Marino else
17344b87433SJohn Marino {
17444b87433SJohn Marino if (iter->cur.bytes == 0)
17544b87433SJohn Marino {
17644b87433SJohn Marino /* A null wide character was encountered. */
17744b87433SJohn Marino iter->cur.bytes = 1;
17844b87433SJohn Marino assert (*iter->cur.ptr == '\0');
17944b87433SJohn Marino assert (iter->cur.wc == 0);
18044b87433SJohn Marino }
18144b87433SJohn Marino iter->cur.wc_valid = true;
18244b87433SJohn Marino
18344b87433SJohn Marino /* When in the initial state, we can go back treating ASCII
18444b87433SJohn Marino characters more quickly. */
18544b87433SJohn Marino if (mbsinit (&iter->state))
18644b87433SJohn Marino iter->in_shift = false;
18744b87433SJohn Marino }
18844b87433SJohn Marino }
18944b87433SJohn Marino iter->next_done = true;
19044b87433SJohn Marino }
19144b87433SJohn Marino
1924536c563SJohn Marino MBUITER_INLINE void
mbuiter_multi_reloc(struct mbuiter_multi * iter,ptrdiff_t ptrdiff)19344b87433SJohn Marino mbuiter_multi_reloc (struct mbuiter_multi *iter, ptrdiff_t ptrdiff)
19444b87433SJohn Marino {
19544b87433SJohn Marino iter->cur.ptr += ptrdiff;
19644b87433SJohn Marino }
19744b87433SJohn Marino
1984536c563SJohn Marino MBUITER_INLINE void
mbuiter_multi_copy(struct mbuiter_multi * new_iter,const struct mbuiter_multi * old_iter)19944b87433SJohn Marino mbuiter_multi_copy (struct mbuiter_multi *new_iter, const struct mbuiter_multi *old_iter)
20044b87433SJohn Marino {
20144b87433SJohn Marino if ((new_iter->in_shift = old_iter->in_shift))
20244b87433SJohn Marino memcpy (&new_iter->state, &old_iter->state, sizeof (mbstate_t));
20344b87433SJohn Marino else
20444b87433SJohn Marino memset (&new_iter->state, 0, sizeof (mbstate_t));
20544b87433SJohn Marino new_iter->next_done = old_iter->next_done;
20644b87433SJohn Marino mb_copy (&new_iter->cur, &old_iter->cur);
20744b87433SJohn Marino }
20844b87433SJohn Marino
/* Iteration macros.
   All of them take the iterator object itself (an lvalue of type
   mbui_iterator_t), not a pointer to it, and may evaluate it more than
   once; do not pass an expression with side effects.  */
typedef struct mbuiter_multi mbui_iterator_t;
/* Start iterating at startptr: initial shift state, nothing decoded yet.  */
#define mbui_init(iter, startptr) \
  ((iter).cur.ptr = (startptr), \
   (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \
   (iter).next_done = false)
/* Decode the current character if needed, then yield true unless it is the
   terminating null character.  */
#define mbui_avail(iter) \
  (mbuiter_multi_next (&(iter)), !mb_isnul ((iter).cur))
/* Step over the current character.  Uses (iter).cur.bytes, which is only
   valid after mbui_avail.  */
#define mbui_advance(iter) \
  ((iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false)

/* Access to the current character.  */
#define mbui_cur(iter) (iter).cur
#define mbui_cur_ptr(iter) (iter).cur.ptr

/* Relocation.  The arguments are parenthesized, like in the macros above,
   so that any lvalue expression can be passed as iter.  */
#define mbui_reloc(iter, ptrdiff) mbuiter_multi_reloc (&(iter), (ptrdiff))

/* Copying an iterator.  Unlike the other macros, this one takes pointers:
   mbui_copy (&destiter, &srciter).  */
#define mbui_copy mbuiter_multi_copy
22944b87433SJohn Marino
2304536c563SJohn Marino _GL_INLINE_HEADER_END
2314536c563SJohn Marino
23244b87433SJohn Marino #endif /* _MBUITER_H */
233