xref: /dragonfly/contrib/grep/lib/mbuiter.h (revision 09d4459f)
195b7b453SJohn Marino /* Iterating through multibyte strings: macros for multi-byte encodings.
2*09d4459fSDaniel Fojt    Copyright (C) 2001, 2005, 2007, 2009-2020 Free Software Foundation, Inc.
395b7b453SJohn Marino 
495b7b453SJohn Marino    This program is free software: you can redistribute it and/or modify
595b7b453SJohn Marino    it under the terms of the GNU General Public License as published by
695b7b453SJohn Marino    the Free Software Foundation; either version 3 of the License, or
795b7b453SJohn Marino    (at your option) any later version.
895b7b453SJohn Marino 
995b7b453SJohn Marino    This program is distributed in the hope that it will be useful,
1095b7b453SJohn Marino    but WITHOUT ANY WARRANTY; without even the implied warranty of
1195b7b453SJohn Marino    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1295b7b453SJohn Marino    GNU General Public License for more details.
1395b7b453SJohn Marino 
1495b7b453SJohn Marino    You should have received a copy of the GNU General Public License
15*09d4459fSDaniel Fojt    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
1695b7b453SJohn Marino 
1795b7b453SJohn Marino /* Written by Bruno Haible <bruno@clisp.org>.  */
1895b7b453SJohn Marino 
1995b7b453SJohn Marino /* The macros in this file implement forward iteration through a
2095b7b453SJohn Marino    multi-byte string, without knowing its length a-priori.
2195b7b453SJohn Marino 
2295b7b453SJohn Marino    With these macros, an iteration loop that looks like
2395b7b453SJohn Marino 
2495b7b453SJohn Marino       char *iter;
2595b7b453SJohn Marino       for (iter = buf; *iter != '\0'; iter++)
2695b7b453SJohn Marino         {
2795b7b453SJohn Marino           do_something (*iter);
2895b7b453SJohn Marino         }
2995b7b453SJohn Marino 
3095b7b453SJohn Marino    becomes
3195b7b453SJohn Marino 
3295b7b453SJohn Marino       mbui_iterator_t iter;
3395b7b453SJohn Marino       for (mbui_init (iter, buf); mbui_avail (iter); mbui_advance (iter))
3495b7b453SJohn Marino         {
3595b7b453SJohn Marino           do_something (mbui_cur_ptr (iter), mb_len (mbui_cur (iter)));
3695b7b453SJohn Marino         }
3795b7b453SJohn Marino 
3895b7b453SJohn Marino    The benefit of these macros over plain use of mbrtowc is:
3995b7b453SJohn Marino    - Handling of invalid multibyte sequences is possible without
4095b7b453SJohn Marino      making the code more complicated, while still preserving the
4195b7b453SJohn Marino      invalid multibyte sequences.
4295b7b453SJohn Marino 
4395b7b453SJohn Marino    Compared to mbiter.h, the macros here don't need to know the string's
4495b7b453SJohn Marino    length a-priori.  The downside is that at each step, the look-ahead
4595b7b453SJohn Marino    that guards against overrunning the terminating '\0' is more expensive.
4695b7b453SJohn Marino    The mbui_* macros are therefore suitable when there is a high probability
4795b7b453SJohn Marino    that only the first few multibyte characters need to be inspected.
4895b7b453SJohn Marino    Whereas the mbi_* macros are better if usually the iteration runs
4995b7b453SJohn Marino    through the entire string.
5095b7b453SJohn Marino 
5195b7b453SJohn Marino    mbui_iterator_t
5295b7b453SJohn Marino      is a type usable for variable declarations.
5395b7b453SJohn Marino 
5495b7b453SJohn Marino    mbui_init (iter, startptr)
5595b7b453SJohn Marino      initializes the iterator, starting at startptr.
5695b7b453SJohn Marino 
5795b7b453SJohn Marino    mbui_avail (iter)
58cf28ed85SJohn Marino      returns true if there are more multibyte characters available before
5995b7b453SJohn Marino      the end of string is reached. In this case, mbui_cur (iter) is
60cf28ed85SJohn Marino      initialized to the next multibyte character.
6195b7b453SJohn Marino 
6295b7b453SJohn Marino    mbui_advance (iter)
6395b7b453SJohn Marino      advances the iterator by one multibyte character.
6495b7b453SJohn Marino 
6595b7b453SJohn Marino    mbui_cur (iter)
6695b7b453SJohn Marino      returns the current multibyte character, of type mbchar_t.  All the
6795b7b453SJohn Marino      macros defined in mbchar.h can be used on it.
6895b7b453SJohn Marino 
6995b7b453SJohn Marino    mbui_cur_ptr (iter)
7095b7b453SJohn Marino      returns a pointer to the beginning of the current multibyte character.
7195b7b453SJohn Marino 
7295b7b453SJohn Marino    mbui_reloc (iter, ptrdiff)
7395b7b453SJohn Marino      relocates iterator when the string is moved by ptrdiff bytes.
7495b7b453SJohn Marino 
7595b7b453SJohn Marino    mbui_copy (&destiter, &srciter)
7695b7b453SJohn Marino      copies srciter to destiter.
7795b7b453SJohn Marino 
7895b7b453SJohn Marino    Here are the function prototypes of the macros.
7995b7b453SJohn Marino 
8095b7b453SJohn Marino    extern void          mbui_init (mbui_iterator_t iter, const char *startptr);
8195b7b453SJohn Marino    extern bool          mbui_avail (mbui_iterator_t iter);
8295b7b453SJohn Marino    extern void          mbui_advance (mbui_iterator_t iter);
8395b7b453SJohn Marino    extern mbchar_t      mbui_cur (mbui_iterator_t iter);
8495b7b453SJohn Marino    extern const char *  mbui_cur_ptr (mbui_iterator_t iter);
8595b7b453SJohn Marino    extern void          mbui_reloc (mbui_iterator_t iter, ptrdiff_t ptrdiff);
8695b7b453SJohn Marino    extern void          mbui_copy (mbui_iterator_t *new, const mbui_iterator_t *old);
8795b7b453SJohn Marino  */
8895b7b453SJohn Marino 
8995b7b453SJohn Marino #ifndef _MBUITER_H
9095b7b453SJohn Marino #define _MBUITER_H 1
9195b7b453SJohn Marino 
9295b7b453SJohn Marino #include <assert.h>
9395b7b453SJohn Marino #include <stdbool.h>
9495b7b453SJohn Marino #include <stddef.h>
9595b7b453SJohn Marino #include <stdlib.h>
9695b7b453SJohn Marino #include <string.h>
9795b7b453SJohn Marino 
9895b7b453SJohn Marino /* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
9995b7b453SJohn Marino    <wchar.h>.
10095b7b453SJohn Marino    BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
10195b7b453SJohn Marino    <wchar.h>.  */
10295b7b453SJohn Marino #include <stdio.h>
10395b7b453SJohn Marino #include <time.h>
10495b7b453SJohn Marino #include <wchar.h>
10595b7b453SJohn Marino 
10695b7b453SJohn Marino #include "mbchar.h"
10795b7b453SJohn Marino #include "strnlen1.h"
10895b7b453SJohn Marino 
109680a9cb8SJohn Marino #ifndef _GL_INLINE_HEADER_BEGIN
110680a9cb8SJohn Marino  #error "Please include config.h first."
111680a9cb8SJohn Marino #endif
112680a9cb8SJohn Marino _GL_INLINE_HEADER_BEGIN
113680a9cb8SJohn Marino #ifndef MBUITER_INLINE
114680a9cb8SJohn Marino # define MBUITER_INLINE _GL_INLINE
115680a9cb8SJohn Marino #endif
116680a9cb8SJohn Marino 
11795b7b453SJohn Marino struct mbuiter_multi
11895b7b453SJohn Marino {
11995b7b453SJohn Marino   bool in_shift;        /* true if next byte may not be interpreted as ASCII */
12095b7b453SJohn Marino   mbstate_t state;      /* if in_shift: current shift state */
12195b7b453SJohn Marino   bool next_done;       /* true if mbui_avail has already filled the following */
12295b7b453SJohn Marino   struct mbchar cur;    /* the current character:
12395b7b453SJohn Marino         const char *cur.ptr             pointer to current character
12495b7b453SJohn Marino         The following are only valid after mbui_avail.
12595b7b453SJohn Marino         size_t cur.bytes                number of bytes of current character
12695b7b453SJohn Marino         bool cur.wc_valid               true if wc is a valid wide character
12795b7b453SJohn Marino         wchar_t cur.wc                  if wc_valid: the current character
12895b7b453SJohn Marino         */
12995b7b453SJohn Marino };
13095b7b453SJohn Marino 
131680a9cb8SJohn Marino MBUITER_INLINE void
mbuiter_multi_next(struct mbuiter_multi * iter)13295b7b453SJohn Marino mbuiter_multi_next (struct mbuiter_multi *iter)
13395b7b453SJohn Marino {
13495b7b453SJohn Marino   if (iter->next_done)
13595b7b453SJohn Marino     return;
13695b7b453SJohn Marino   if (iter->in_shift)
13795b7b453SJohn Marino     goto with_shift;
13895b7b453SJohn Marino   /* Handle most ASCII characters quickly, without calling mbrtowc().  */
13995b7b453SJohn Marino   if (is_basic (*iter->cur.ptr))
14095b7b453SJohn Marino     {
14195b7b453SJohn Marino       /* These characters are part of the basic character set.  ISO C 99
14295b7b453SJohn Marino          guarantees that their wide character code is identical to their
14395b7b453SJohn Marino          char code.  */
14495b7b453SJohn Marino       iter->cur.bytes = 1;
14595b7b453SJohn Marino       iter->cur.wc = *iter->cur.ptr;
14695b7b453SJohn Marino       iter->cur.wc_valid = true;
14795b7b453SJohn Marino     }
14895b7b453SJohn Marino   else
14995b7b453SJohn Marino     {
15095b7b453SJohn Marino       assert (mbsinit (&iter->state));
15195b7b453SJohn Marino       iter->in_shift = true;
15295b7b453SJohn Marino     with_shift:
15395b7b453SJohn Marino       iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr,
15495b7b453SJohn Marino                                  strnlen1 (iter->cur.ptr, MB_CUR_MAX),
15595b7b453SJohn Marino                                  &iter->state);
15695b7b453SJohn Marino       if (iter->cur.bytes == (size_t) -1)
15795b7b453SJohn Marino         {
15895b7b453SJohn Marino           /* An invalid multibyte sequence was encountered.  */
15995b7b453SJohn Marino           iter->cur.bytes = 1;
16095b7b453SJohn Marino           iter->cur.wc_valid = false;
16195b7b453SJohn Marino           /* Whether to set iter->in_shift = false and reset iter->state
16295b7b453SJohn Marino              or not is not very important; the string is bogus anyway.  */
16395b7b453SJohn Marino         }
16495b7b453SJohn Marino       else if (iter->cur.bytes == (size_t) -2)
16595b7b453SJohn Marino         {
16695b7b453SJohn Marino           /* An incomplete multibyte character at the end.  */
16795b7b453SJohn Marino           iter->cur.bytes = strlen (iter->cur.ptr);
16895b7b453SJohn Marino           iter->cur.wc_valid = false;
16995b7b453SJohn Marino           /* Whether to set iter->in_shift = false and reset iter->state
17095b7b453SJohn Marino              or not is not important; the string end is reached anyway.  */
17195b7b453SJohn Marino         }
17295b7b453SJohn Marino       else
17395b7b453SJohn Marino         {
17495b7b453SJohn Marino           if (iter->cur.bytes == 0)
17595b7b453SJohn Marino             {
17695b7b453SJohn Marino               /* A null wide character was encountered.  */
17795b7b453SJohn Marino               iter->cur.bytes = 1;
17895b7b453SJohn Marino               assert (*iter->cur.ptr == '\0');
17995b7b453SJohn Marino               assert (iter->cur.wc == 0);
18095b7b453SJohn Marino             }
18195b7b453SJohn Marino           iter->cur.wc_valid = true;
18295b7b453SJohn Marino 
18395b7b453SJohn Marino           /* When in the initial state, we can go back treating ASCII
18495b7b453SJohn Marino              characters more quickly.  */
18595b7b453SJohn Marino           if (mbsinit (&iter->state))
18695b7b453SJohn Marino             iter->in_shift = false;
18795b7b453SJohn Marino         }
18895b7b453SJohn Marino     }
18995b7b453SJohn Marino   iter->next_done = true;
19095b7b453SJohn Marino }
19195b7b453SJohn Marino 
192680a9cb8SJohn Marino MBUITER_INLINE void
mbuiter_multi_reloc(struct mbuiter_multi * iter,ptrdiff_t ptrdiff)19395b7b453SJohn Marino mbuiter_multi_reloc (struct mbuiter_multi *iter, ptrdiff_t ptrdiff)
19495b7b453SJohn Marino {
19595b7b453SJohn Marino   iter->cur.ptr += ptrdiff;
19695b7b453SJohn Marino }
19795b7b453SJohn Marino 
198680a9cb8SJohn Marino MBUITER_INLINE void
mbuiter_multi_copy(struct mbuiter_multi * new_iter,const struct mbuiter_multi * old_iter)19995b7b453SJohn Marino mbuiter_multi_copy (struct mbuiter_multi *new_iter, const struct mbuiter_multi *old_iter)
20095b7b453SJohn Marino {
20195b7b453SJohn Marino   if ((new_iter->in_shift = old_iter->in_shift))
20295b7b453SJohn Marino     memcpy (&new_iter->state, &old_iter->state, sizeof (mbstate_t));
20395b7b453SJohn Marino   else
20495b7b453SJohn Marino     memset (&new_iter->state, 0, sizeof (mbstate_t));
20595b7b453SJohn Marino   new_iter->next_done = old_iter->next_done;
20695b7b453SJohn Marino   mb_copy (&new_iter->cur, &old_iter->cur);
20795b7b453SJohn Marino }
20895b7b453SJohn Marino 
/* Iteration macros.  */
typedef struct mbuiter_multi mbui_iterator_t;

/* Start iterating at startptr: not in a shift sequence, zeroed conversion
   state, current character not yet decoded.  */
#define mbui_init(iter, startptr) \
  ((iter).cur.ptr = (startptr), \
   (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \
   (iter).next_done = false)

/* Decode the current character if that has not been done yet, then yield
   true unless mb_isnul reports it as the null character.  */
#define mbui_avail(iter) \
  (mbuiter_multi_next (&(iter)), !mb_isnul ((iter).cur))

/* Step over the current character.  cur.bytes must have been filled in,
   i.e. mbui_avail must have been evaluated since the last advance.  */
#define mbui_advance(iter) \
  ((iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false)

/* Access to the current character.  */
#define mbui_cur(iter) (iter).cur
#define mbui_cur_ptr(iter) (iter).cur.ptr

/* Relocation.  Both arguments are parenthesized, like in the other macros,
   so that any expression may be passed.  */
#define mbui_reloc(iter, ptrdiff) mbuiter_multi_reloc (&(iter), (ptrdiff))

/* Copying an iterator.  */
#define mbui_copy mbuiter_multi_copy
22995b7b453SJohn Marino 
230680a9cb8SJohn Marino _GL_INLINE_HEADER_END
231680a9cb8SJohn Marino 
23295b7b453SJohn Marino #endif /* _MBUITER_H */
233