xref: /dragonfly/contrib/diffutils/lib/mbuiter.h (revision 6ea1f93e)
144b87433SJohn Marino /* Iterating through multibyte strings: macros for multi-byte encodings.
2*6ea1f93eSDaniel Fojt    Copyright (C) 2001, 2005, 2007, 2009-2018 Free Software Foundation, Inc.
344b87433SJohn Marino 
444b87433SJohn Marino    This program is free software: you can redistribute it and/or modify
544b87433SJohn Marino    it under the terms of the GNU General Public License as published by
644b87433SJohn Marino    the Free Software Foundation; either version 3 of the License, or
744b87433SJohn Marino    (at your option) any later version.
844b87433SJohn Marino 
944b87433SJohn Marino    This program is distributed in the hope that it will be useful,
1044b87433SJohn Marino    but WITHOUT ANY WARRANTY; without even the implied warranty of
1144b87433SJohn Marino    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1244b87433SJohn Marino    GNU General Public License for more details.
1344b87433SJohn Marino 
1444b87433SJohn Marino    You should have received a copy of the GNU General Public License
15*6ea1f93eSDaniel Fojt    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
1644b87433SJohn Marino 
1744b87433SJohn Marino /* Written by Bruno Haible <bruno@clisp.org>.  */
1844b87433SJohn Marino 
1944b87433SJohn Marino /* The macros in this file implement forward iteration through a
2044b87433SJohn Marino    multi-byte string, without knowing its length a-priori.
2144b87433SJohn Marino 
2244b87433SJohn Marino    With these macros, an iteration loop that looks like
2344b87433SJohn Marino 
2444b87433SJohn Marino       char *iter;
2544b87433SJohn Marino       for (iter = buf; *iter != '\0'; iter++)
2644b87433SJohn Marino         {
2744b87433SJohn Marino           do_something (*iter);
2844b87433SJohn Marino         }
2944b87433SJohn Marino 
3044b87433SJohn Marino    becomes
3144b87433SJohn Marino 
3244b87433SJohn Marino       mbui_iterator_t iter;
3344b87433SJohn Marino       for (mbui_init (iter, buf); mbui_avail (iter); mbui_advance (iter))
3444b87433SJohn Marino         {
3544b87433SJohn Marino           do_something (mbui_cur_ptr (iter), mb_len (mbui_cur (iter)));
3644b87433SJohn Marino         }
3744b87433SJohn Marino 
3844b87433SJohn Marino    The benefit of these macros over plain use of mbrtowc is:
3944b87433SJohn Marino    - Handling of invalid multibyte sequences is possible without
4044b87433SJohn Marino      making the code more complicated, while still preserving the
4144b87433SJohn Marino      invalid multibyte sequences.
4244b87433SJohn Marino 
4344b87433SJohn Marino    Compared to mbiter.h, the macros here don't need to know the string's
4444b87433SJohn Marino    length a-priori.  The downside is that at each step, the look-ahead
4544b87433SJohn Marino    that guards against overrunning the terminating '\0' is more expensive.
4644b87433SJohn Marino    The mbui_* macros are therefore suitable when there is a high probability
4744b87433SJohn Marino    that only the first few multibyte characters need to be inspected.
4844b87433SJohn Marino    Whereas the mbi_* macros are better if usually the iteration runs
4944b87433SJohn Marino    through the entire string.
5044b87433SJohn Marino 
5144b87433SJohn Marino    mbui_iterator_t
5244b87433SJohn Marino      is a type usable for variable declarations.
5344b87433SJohn Marino 
5444b87433SJohn Marino    mbui_init (iter, startptr)
5544b87433SJohn Marino      initializes the iterator, starting at startptr.
5644b87433SJohn Marino 
5744b87433SJohn Marino    mbui_avail (iter)
584536c563SJohn Marino      returns true if there are more multibyte characters available before
5944b87433SJohn Marino      the end of string is reached. In this case, mbui_cur (iter) is
604536c563SJohn Marino      initialized to the next multibyte character.
6144b87433SJohn Marino 
6244b87433SJohn Marino    mbui_advance (iter)
6344b87433SJohn Marino      advances the iterator by one multibyte character.
6444b87433SJohn Marino 
6544b87433SJohn Marino    mbui_cur (iter)
6644b87433SJohn Marino      returns the current multibyte character, of type mbchar_t.  All the
6744b87433SJohn Marino      macros defined in mbchar.h can be used on it.
6844b87433SJohn Marino 
6944b87433SJohn Marino    mbui_cur_ptr (iter)
     returns a pointer to the beginning of the current multibyte character.
7144b87433SJohn Marino 
7244b87433SJohn Marino    mbui_reloc (iter, ptrdiff)
7344b87433SJohn Marino      relocates iterator when the string is moved by ptrdiff bytes.
7444b87433SJohn Marino 
7544b87433SJohn Marino    mbui_copy (&destiter, &srciter)
7644b87433SJohn Marino      copies srciter to destiter.
7744b87433SJohn Marino 
7844b87433SJohn Marino    Here are the function prototypes of the macros.
7944b87433SJohn Marino 
8044b87433SJohn Marino    extern void          mbui_init (mbui_iterator_t iter, const char *startptr);
8144b87433SJohn Marino    extern bool          mbui_avail (mbui_iterator_t iter);
8244b87433SJohn Marino    extern void          mbui_advance (mbui_iterator_t iter);
8344b87433SJohn Marino    extern mbchar_t      mbui_cur (mbui_iterator_t iter);
8444b87433SJohn Marino    extern const char *  mbui_cur_ptr (mbui_iterator_t iter);
8544b87433SJohn Marino    extern void          mbui_reloc (mbui_iterator_t iter, ptrdiff_t ptrdiff);
8644b87433SJohn Marino    extern void          mbui_copy (mbui_iterator_t *new, const mbui_iterator_t *old);
8744b87433SJohn Marino  */
8844b87433SJohn Marino 
8944b87433SJohn Marino #ifndef _MBUITER_H
9044b87433SJohn Marino #define _MBUITER_H 1
9144b87433SJohn Marino 
9244b87433SJohn Marino #include <assert.h>
9344b87433SJohn Marino #include <stdbool.h>
9444b87433SJohn Marino #include <stddef.h>
9544b87433SJohn Marino #include <stdlib.h>
9644b87433SJohn Marino #include <string.h>
9744b87433SJohn Marino 
9844b87433SJohn Marino /* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
9944b87433SJohn Marino    <wchar.h>.
10044b87433SJohn Marino    BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
10144b87433SJohn Marino    <wchar.h>.  */
10244b87433SJohn Marino #include <stdio.h>
10344b87433SJohn Marino #include <time.h>
10444b87433SJohn Marino #include <wchar.h>
10544b87433SJohn Marino 
10644b87433SJohn Marino #include "mbchar.h"
10744b87433SJohn Marino #include "strnlen1.h"
10844b87433SJohn Marino 
109*6ea1f93eSDaniel Fojt #ifndef _GL_INLINE_HEADER_BEGIN
110*6ea1f93eSDaniel Fojt  #error "Please include config.h first."
111*6ea1f93eSDaniel Fojt #endif
1124536c563SJohn Marino _GL_INLINE_HEADER_BEGIN
1134536c563SJohn Marino #ifndef MBUITER_INLINE
1144536c563SJohn Marino # define MBUITER_INLINE _GL_INLINE
1154536c563SJohn Marino #endif
1164536c563SJohn Marino 
11744b87433SJohn Marino struct mbuiter_multi
11844b87433SJohn Marino {
11944b87433SJohn Marino   bool in_shift;        /* true if next byte may not be interpreted as ASCII */
12044b87433SJohn Marino   mbstate_t state;      /* if in_shift: current shift state */
12144b87433SJohn Marino   bool next_done;       /* true if mbui_avail has already filled the following */
12244b87433SJohn Marino   struct mbchar cur;    /* the current character:
12344b87433SJohn Marino         const char *cur.ptr             pointer to current character
12444b87433SJohn Marino         The following are only valid after mbui_avail.
12544b87433SJohn Marino         size_t cur.bytes                number of bytes of current character
12644b87433SJohn Marino         bool cur.wc_valid               true if wc is a valid wide character
12744b87433SJohn Marino         wchar_t cur.wc                  if wc_valid: the current character
12844b87433SJohn Marino         */
12944b87433SJohn Marino };
13044b87433SJohn Marino 
1314536c563SJohn Marino MBUITER_INLINE void
mbuiter_multi_next(struct mbuiter_multi * iter)13244b87433SJohn Marino mbuiter_multi_next (struct mbuiter_multi *iter)
13344b87433SJohn Marino {
13444b87433SJohn Marino   if (iter->next_done)
13544b87433SJohn Marino     return;
13644b87433SJohn Marino   if (iter->in_shift)
13744b87433SJohn Marino     goto with_shift;
13844b87433SJohn Marino   /* Handle most ASCII characters quickly, without calling mbrtowc().  */
13944b87433SJohn Marino   if (is_basic (*iter->cur.ptr))
14044b87433SJohn Marino     {
14144b87433SJohn Marino       /* These characters are part of the basic character set.  ISO C 99
14244b87433SJohn Marino          guarantees that their wide character code is identical to their
14344b87433SJohn Marino          char code.  */
14444b87433SJohn Marino       iter->cur.bytes = 1;
14544b87433SJohn Marino       iter->cur.wc = *iter->cur.ptr;
14644b87433SJohn Marino       iter->cur.wc_valid = true;
14744b87433SJohn Marino     }
14844b87433SJohn Marino   else
14944b87433SJohn Marino     {
15044b87433SJohn Marino       assert (mbsinit (&iter->state));
15144b87433SJohn Marino       iter->in_shift = true;
15244b87433SJohn Marino     with_shift:
15344b87433SJohn Marino       iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr,
15444b87433SJohn Marino                                  strnlen1 (iter->cur.ptr, MB_CUR_MAX),
15544b87433SJohn Marino                                  &iter->state);
15644b87433SJohn Marino       if (iter->cur.bytes == (size_t) -1)
15744b87433SJohn Marino         {
15844b87433SJohn Marino           /* An invalid multibyte sequence was encountered.  */
15944b87433SJohn Marino           iter->cur.bytes = 1;
16044b87433SJohn Marino           iter->cur.wc_valid = false;
16144b87433SJohn Marino           /* Whether to set iter->in_shift = false and reset iter->state
16244b87433SJohn Marino              or not is not very important; the string is bogus anyway.  */
16344b87433SJohn Marino         }
16444b87433SJohn Marino       else if (iter->cur.bytes == (size_t) -2)
16544b87433SJohn Marino         {
16644b87433SJohn Marino           /* An incomplete multibyte character at the end.  */
16744b87433SJohn Marino           iter->cur.bytes = strlen (iter->cur.ptr);
16844b87433SJohn Marino           iter->cur.wc_valid = false;
16944b87433SJohn Marino           /* Whether to set iter->in_shift = false and reset iter->state
17044b87433SJohn Marino              or not is not important; the string end is reached anyway.  */
17144b87433SJohn Marino         }
17244b87433SJohn Marino       else
17344b87433SJohn Marino         {
17444b87433SJohn Marino           if (iter->cur.bytes == 0)
17544b87433SJohn Marino             {
17644b87433SJohn Marino               /* A null wide character was encountered.  */
17744b87433SJohn Marino               iter->cur.bytes = 1;
17844b87433SJohn Marino               assert (*iter->cur.ptr == '\0');
17944b87433SJohn Marino               assert (iter->cur.wc == 0);
18044b87433SJohn Marino             }
18144b87433SJohn Marino           iter->cur.wc_valid = true;
18244b87433SJohn Marino 
18344b87433SJohn Marino           /* When in the initial state, we can go back treating ASCII
18444b87433SJohn Marino              characters more quickly.  */
18544b87433SJohn Marino           if (mbsinit (&iter->state))
18644b87433SJohn Marino             iter->in_shift = false;
18744b87433SJohn Marino         }
18844b87433SJohn Marino     }
18944b87433SJohn Marino   iter->next_done = true;
19044b87433SJohn Marino }
19144b87433SJohn Marino 
1924536c563SJohn Marino MBUITER_INLINE void
mbuiter_multi_reloc(struct mbuiter_multi * iter,ptrdiff_t ptrdiff)19344b87433SJohn Marino mbuiter_multi_reloc (struct mbuiter_multi *iter, ptrdiff_t ptrdiff)
19444b87433SJohn Marino {
19544b87433SJohn Marino   iter->cur.ptr += ptrdiff;
19644b87433SJohn Marino }
19744b87433SJohn Marino 
1984536c563SJohn Marino MBUITER_INLINE void
mbuiter_multi_copy(struct mbuiter_multi * new_iter,const struct mbuiter_multi * old_iter)19944b87433SJohn Marino mbuiter_multi_copy (struct mbuiter_multi *new_iter, const struct mbuiter_multi *old_iter)
20044b87433SJohn Marino {
20144b87433SJohn Marino   if ((new_iter->in_shift = old_iter->in_shift))
20244b87433SJohn Marino     memcpy (&new_iter->state, &old_iter->state, sizeof (mbstate_t));
20344b87433SJohn Marino   else
20444b87433SJohn Marino     memset (&new_iter->state, 0, sizeof (mbstate_t));
20544b87433SJohn Marino   new_iter->next_done = old_iter->next_done;
20644b87433SJohn Marino   mb_copy (&new_iter->cur, &old_iter->cur);
20744b87433SJohn Marino }
20844b87433SJohn Marino 
/* Iteration macros.
   The ITER argument of each macro must be an lvalue of type
   mbui_iterator_t.  mbui_init, mbui_avail and mbui_advance evaluate it
   more than once, so it must not have side effects.  */
typedef struct mbuiter_multi mbui_iterator_t;
#define mbui_init(iter, startptr) \
  ((iter).cur.ptr = (startptr), \
   (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \
   (iter).next_done = false)
#define mbui_avail(iter) \
  (mbuiter_multi_next (&(iter)), !mb_isnul ((iter).cur))
#define mbui_advance(iter) \
  ((iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false)

/* Access to the current character.  */
#define mbui_cur(iter) (iter).cur
#define mbui_cur_ptr(iter) (iter).cur.ptr

/* Relocation.
   Both arguments are parenthesized in the expansion, so that arbitrary
   expressions can be passed for them.  */
#define mbui_reloc(iter, ptrdiff) mbuiter_multi_reloc (&(iter), (ptrdiff))

/* Copying an iterator.  */
#define mbui_copy mbuiter_multi_copy
22944b87433SJohn Marino 
2304536c563SJohn Marino _GL_INLINE_HEADER_END
2314536c563SJohn Marino 
23244b87433SJohn Marino #endif /* _MBUITER_H */
233