1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Convert multibyte character to wide character.
4    Copyright (C) 1999-2002, 2005-2019 Free Software Foundation, Inc.
5    Written by Bruno Haible <bruno@clisp.org>, 2008.
6 
7    This program is free software: you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11 
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
19 
20 #include <config.h>
21 
22 /* Specification.  */
23 #include <wchar.h>
24 
25 #if C_LOCALE_MAYBE_EILSEQ
26 # include "hard-locale.h"
27 # include <locale.h>
28 #endif
29 
30 #if GNULIB_defined_mbstate_t
31 /* Implement mbrtowc() on top of mbtowc().  */
32 
33 # include <errno.h>
34 # include <stdlib.h>
35 
36 # include "localcharset.h"
37 # include "streq.h"
38 # include "verify.h"
39 
/* FALLTHROUGH marks a deliberate fall-through between switch cases.
   Compilers reporting __GNUC__ >= 7 get the fallthrough statement
   attribute; everywhere else it expands to a no-op expression.  An
   existing definition of FALLTHROUGH is left untouched.  */
# if !defined FALLTHROUGH
#  if __GNUC__ >= 7
#   define FALLTHROUGH __attribute__ ((__fallthrough__))
#  else
#   define FALLTHROUGH ((void) 0)
#  endif
# endif
47 
/* Classification of the current locale's encoding, as returned by
   locale_enc (): the encodings that get special treatment, plus a
   catch-all.  */
/* Encoding classes distinguished by the multibyte parser.  The values
   are consecutive, starting at 0, so that a negative int can serve as
   a "not yet determined" marker.  */
typedef enum
{
  enc_other   = 0,  /* anything not listed below */
  enc_utf8    = 1,  /* UTF-8 */
  enc_eucjp   = 2,  /* EUC-JP */
  enc_94      = 3,  /* EUC-KR, GB2312, BIG5 */
  enc_euctw   = 4,  /* EUC-TW */
  enc_gb18030 = 5,  /* GB18030 */
  enc_sjis    = 6   /* SJIS */
} enc_t;
59 static inline enc_t
locale_enc(void)60 locale_enc (void)
61 {
62   const char *encoding = locale_charset ();
63   if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
64     return enc_utf8;
65   if (STREQ_OPT (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
66     return enc_eucjp;
67   if (STREQ_OPT (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
68       || STREQ_OPT (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
69       || STREQ_OPT (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
70     return enc_94;
71   if (STREQ_OPT (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
72     return enc_euctw;
73   if (STREQ_OPT (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
74     return enc_gb18030;
75   if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
76     return enc_sjis;
77   return enc_other;
78 }
79 
#if GNULIB_WCHAR_SINGLE
/* With GNULIB_WCHAR_SINGLE the result of locale_enc () is computed once
   and remembered.  A negative value means "not computed yet".  */
static int cached_locale_enc = -1;

/* Return the classification of the locale encoding, calling
   locale_enc () only on the first invocation.  */
static inline enc_t
locale_enc_cached (void)
{
  int enc = cached_locale_enc;

  if (enc < 0)
    {
      enc = locale_enc ();
      cached_locale_enc = enc;
    }
  return enc;
}
#else
/* Without GNULIB_WCHAR_SINGLE nothing is cached: every use queries
   locale_enc () afresh.  */
# define locale_enc_cached locale_enc
#endif
95 
96 verify (sizeof (mbstate_t) >= 4);
97 
98 static char internal_state[4];
99 
100 size_t
mbrtowc(wchar_t * pwc,const char * s,size_t n,mbstate_t * ps)101 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
102 {
103   char *pstate = (char *)ps;
104 
105   if (s == NULL)
106     {
107       pwc = NULL;
108       s = "";
109       n = 1;
110     }
111 
112   if (n == 0)
113     return (size_t)(-2);
114 
115   /* Here n > 0.  */
116 
117   if (pstate == NULL)
118     pstate = internal_state;
119 
120   {
121     size_t nstate = pstate[0];
122     char buf[4];
123     const char *p;
124     size_t m;
125 
126     switch (nstate)
127       {
128       case 0:
129         p = s;
130         m = n;
131         break;
132       case 3:
133         buf[2] = pstate[3];
134         FALLTHROUGH;
135       case 2:
136         buf[1] = pstate[2];
137         FALLTHROUGH;
138       case 1:
139         buf[0] = pstate[1];
140         p = buf;
141         m = nstate;
142         buf[m++] = s[0];
143         if (n >= 2 && m < 4)
144           {
145             buf[m++] = s[1];
146             if (n >= 3 && m < 4)
147               buf[m++] = s[2];
148           }
149         break;
150       default:
151         errno = EINVAL;
152         return (size_t)(-1);
153       }
154 
155     /* Here m > 0.  */
156 
157 # if __GLIBC__ || defined __UCLIBC__
158     /* Work around bug <https://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
159     mbtowc (NULL, NULL, 0);
160 # endif
161     {
162       int res = mbtowc (pwc, p, m);
163 
164       if (res >= 0)
165         {
166           if (pwc != NULL && ((*pwc == 0) != (res == 0)))
167             abort ();
168           if (nstate >= (res > 0 ? res : 1))
169             abort ();
170           res -= nstate;
171           pstate[0] = 0;
172           return res;
173         }
174 
175       /* mbtowc does not distinguish between invalid and incomplete multibyte
176          sequences.  But mbrtowc needs to make this distinction.
177          There are two possible approaches:
178            - Use iconv() and its return value.
179            - Use built-in knowledge about the possible encodings.
180          Given the low quality of implementation of iconv() on the systems that
181          lack mbrtowc(), we use the second approach.
182          The possible encodings are:
183            - 8-bit encodings,
184            - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
185            - UTF-8.
186          Use specialized code for each.  */
187       if (m >= 4 || m >= MB_CUR_MAX)
188         goto invalid;
189       /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
190       switch (locale_enc_cached ())
191         {
192         case enc_utf8: /* UTF-8 */
193           {
194             /* Cf. unistr/u8-mblen.c.  */
195             unsigned char c = (unsigned char) p[0];
196 
197             if (c >= 0xc2)
198               {
199                 if (c < 0xe0)
200                   {
201                     if (m == 1)
202                       goto incomplete;
203                   }
204                 else if (c < 0xf0)
205                   {
206                     if (m == 1)
207                       goto incomplete;
208                     if (m == 2)
209                       {
210                         unsigned char c2 = (unsigned char) p[1];
211 
212                         if ((c2 ^ 0x80) < 0x40
213                             && (c >= 0xe1 || c2 >= 0xa0)
214                             && (c != 0xed || c2 < 0xa0))
215                           goto incomplete;
216                       }
217                   }
218                 else if (c <= 0xf4)
219                   {
220                     if (m == 1)
221                       goto incomplete;
222                     else /* m == 2 || m == 3 */
223                       {
224                         unsigned char c2 = (unsigned char) p[1];
225 
226                         if ((c2 ^ 0x80) < 0x40
227                             && (c >= 0xf1 || c2 >= 0x90)
228                             && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
229                           {
230                             if (m == 2)
231                               goto incomplete;
232                             else /* m == 3 */
233                               {
234                                 unsigned char c3 = (unsigned char) p[2];
235 
236                                 if ((c3 ^ 0x80) < 0x40)
237                                   goto incomplete;
238                               }
239                           }
240                       }
241                   }
242               }
243             goto invalid;
244           }
245 
246         /* As a reference for this code, you can use the GNU libiconv
247            implementation.  Look for uses of the RET_TOOFEW macro.  */
248 
249         case enc_eucjp: /* EUC-JP */
250           {
251             if (m == 1)
252               {
253                 unsigned char c = (unsigned char) p[0];
254 
255                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
256                   goto incomplete;
257               }
258             if (m == 2)
259               {
260                 unsigned char c = (unsigned char) p[0];
261 
262                 if (c == 0x8f)
263                   {
264                     unsigned char c2 = (unsigned char) p[1];
265 
266                     if (c2 >= 0xa1 && c2 < 0xff)
267                       goto incomplete;
268                   }
269               }
270             goto invalid;
271           }
272 
273         case enc_94: /* EUC-KR, GB2312, BIG5 */
274           {
275             if (m == 1)
276               {
277                 unsigned char c = (unsigned char) p[0];
278 
279                 if (c >= 0xa1 && c < 0xff)
280                   goto incomplete;
281               }
282             goto invalid;
283           }
284 
285         case enc_euctw: /* EUC-TW */
286           {
287             if (m == 1)
288               {
289                 unsigned char c = (unsigned char) p[0];
290 
291                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
292                   goto incomplete;
293               }
294             else /* m == 2 || m == 3 */
295               {
296                 unsigned char c = (unsigned char) p[0];
297 
298                 if (c == 0x8e)
299                   goto incomplete;
300               }
301             goto invalid;
302           }
303 
304         case enc_gb18030: /* GB18030 */
305           {
306             if (m == 1)
307               {
308                 unsigned char c = (unsigned char) p[0];
309 
310                 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
311                   goto incomplete;
312               }
313             else /* m == 2 || m == 3 */
314               {
315                 unsigned char c = (unsigned char) p[0];
316 
317                 if (c >= 0x90 && c <= 0xe3)
318                   {
319                     unsigned char c2 = (unsigned char) p[1];
320 
321                     if (c2 >= 0x30 && c2 <= 0x39)
322                       {
323                         if (m == 2)
324                           goto incomplete;
325                         else /* m == 3 */
326                           {
327                             unsigned char c3 = (unsigned char) p[2];
328 
329                             if (c3 >= 0x81 && c3 <= 0xfe)
330                               goto incomplete;
331                           }
332                       }
333                   }
334               }
335             goto invalid;
336           }
337 
338         case enc_sjis: /* SJIS */
339           {
340             if (m == 1)
341               {
342                 unsigned char c = (unsigned char) p[0];
343 
344                 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
345                     || (c >= 0xf0 && c <= 0xf9))
346                   goto incomplete;
347               }
348             goto invalid;
349           }
350 
351         default:
352           /* An unknown multibyte encoding.  */
353           goto incomplete;
354         }
355 
356      incomplete:
357       {
358         size_t k = nstate;
359         /* Here 0 <= k < m < 4.  */
360         pstate[++k] = s[0];
361         if (k < m)
362           {
363             pstate[++k] = s[1];
364             if (k < m)
365               pstate[++k] = s[2];
366           }
367         if (k != m)
368           abort ();
369       }
370       pstate[0] = m;
371       return (size_t)(-2);
372 
373      invalid:
374       errno = EILSEQ;
375       /* The conversion state is undefined, says POSIX.  */
376       return (size_t)(-1);
377     }
378   }
379 }
380 
381 #else
382 /* Override the system's mbrtowc() function.  */
383 
384 # undef mbrtowc
385 
/* Wrapper around the system's mbrtowc () that compensates for the
   defects selected by the MBRTOWC_*_BUG and C_LOCALE_MAYBE_EILSEQ
   configuration macros.  Arguments and return value are those of
   mbrtowc ().  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  size_t result;
  wchar_t scratch;

# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
  /* A null S is handled like the call (NULL, "", 1, ps).  */
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_EMPTY_INPUT_BUG
  /* No input bytes: report "incomplete" without calling the system.  */
  if (n == 0)
    return (size_t) -2;
# endif

  /* From here on PWC always points to a writable wide character, so the
     code below can read back the converted value.  */
  if (pwc == NULL)
    pwc = &scratch;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t own_state;

    /* Substitute our own state for mbrtowc's hidden internal one: we
       cannot call mbsinit() on the hidden state, but we can on ours.  */
    if (ps == NULL)
      ps = &own_state;

    if (!mbsinit (ps))
      {
        /* A character is pending: feed the remaining input one byte at
           a time and count the bytes ourselves.  */
        size_t consumed = 0;

        while (n > 0)
          {
            result = mbrtowc (&scratch, s, 1, ps);
            if (result == (size_t)(-1))
              return (size_t)(-1);

            consumed++;
            if (result != (size_t)(-2))
              {
                /* The pending character is now complete.  */
                *pwc = scratch;
                return (scratch != 0 ? consumed : 0);
              }

            s++;
            n--;
          }
        return (size_t)(-2);
      }
  }
# endif

  result = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  /* A successfully converted null wide character must yield 0.  */
  if (result < (size_t) -2 && *pwc == 0)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  /* On failure, when hard_locale (LC_CTYPE) is false, treat the first
     input byte as a character by itself.  */
  if (result >= (size_t) -2 && n != 0 && ! hard_locale (LC_CTYPE))
    {
      unsigned char uc = *s;
      *pwc = uc;
      return 1;
    }
# endif

  return result;
}
459 
460 #endif
461