1 /* Convert multibyte character to wide character.
2    Copyright (C) 1999-2002, 2005-2018 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2008.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17 
18 #include <config.h>
19 
20 /* Specification.  */
21 #include <wchar.h>
22 
23 #if C_LOCALE_MAYBE_EILSEQ
24 # include "hard-locale.h"
25 # include <locale.h>
26 #endif
27 
28 #if GNULIB_defined_mbstate_t
29 /* Implement mbrtowc() on top of mbtowc().  */
30 
31 # include <errno.h>
32 # include <stdlib.h>
33 
34 # include "localcharset.h"
35 # include "streq.h"
36 # include "verify.h"
37 
/* Marker for an intentional fall-through between switch cases.  It expands
   to the __fallthrough__ statement attribute when __GNUC__ is at least 7,
   and to a no-op expression otherwise.  A definition supplied earlier
   takes precedence.  */
# ifndef FALLTHROUGH
#  if __GNUC__ >= 7
#   define FALLTHROUGH __attribute__ ((__fallthrough__))
#  else
#   define FALLTHROUGH ((void) 0)
#  endif
# endif
45 
/* Classification of the encoding of the current locale, as produced by
   locale_enc ().  Only the encodings that get dedicated handling are
   distinguished; everything else is enc_other.  */
typedef enum
{
  enc_other = 0,  /* any encoding not listed below */
  enc_utf8,       /* UTF-8 */
  enc_eucjp,      /* EUC-JP */
  enc_94,         /* EUC-KR, GB2312, BIG5 */
  enc_euctw,      /* EUC-TW */
  enc_gb18030,    /* GB18030 */
  enc_sjis        /* SJIS */
} enc_t;
57 static inline enc_t
locale_enc(void)58 locale_enc (void)
59 {
60   const char *encoding = locale_charset ();
61   if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
62     return enc_utf8;
63   if (STREQ_OPT (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
64     return enc_eucjp;
65   if (STREQ_OPT (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
66       || STREQ_OPT (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
67       || STREQ_OPT (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
68     return enc_94;
69   if (STREQ_OPT (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
70     return enc_euctw;
71   if (STREQ_OPT (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
72     return enc_gb18030;
73   if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
74     return enc_sjis;
75   return enc_other;
76 }
77 
#if GNULIB_WCHAR_SINGLE
/* The locale is known not to change, so locale_enc () needs to be evaluated
   only once.  A negative value means "not computed yet".  */
static int cached_locale_enc = -1;

/* Return the classification of the locale encoding, computing it on the
   first call and reusing the stored value afterwards.  */
static inline enc_t
locale_enc_cached (void)
{
  if (cached_locale_enc >= 0)
    return cached_locale_enc;

  cached_locale_enc = locale_enc ();
  return cached_locale_enc;
}
#else
/* No assumption about the locale staying the same: query it every time.  */
# define locale_enc_cached locale_enc
#endif
93 
94 verify (sizeof (mbstate_t) >= 4);
95 
96 static char internal_state[4];
97 
98 size_t
mbrtowc(wchar_t * pwc,const char * s,size_t n,mbstate_t * ps)99 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
100 {
101   char *pstate = (char *)ps;
102 
103   if (s == NULL)
104     {
105       pwc = NULL;
106       s = "";
107       n = 1;
108     }
109 
110   if (n == 0)
111     return (size_t)(-2);
112 
113   /* Here n > 0.  */
114 
115   if (pstate == NULL)
116     pstate = internal_state;
117 
118   {
119     size_t nstate = pstate[0];
120     char buf[4];
121     const char *p;
122     size_t m;
123 
124     switch (nstate)
125       {
126       case 0:
127         p = s;
128         m = n;
129         break;
130       case 3:
131         buf[2] = pstate[3];
132         FALLTHROUGH;
133       case 2:
134         buf[1] = pstate[2];
135         FALLTHROUGH;
136       case 1:
137         buf[0] = pstate[1];
138         p = buf;
139         m = nstate;
140         buf[m++] = s[0];
141         if (n >= 2 && m < 4)
142           {
143             buf[m++] = s[1];
144             if (n >= 3 && m < 4)
145               buf[m++] = s[2];
146           }
147         break;
148       default:
149         errno = EINVAL;
150         return (size_t)(-1);
151       }
152 
153     /* Here m > 0.  */
154 
155 # if __GLIBC__ || defined __UCLIBC__
156     /* Work around bug <https://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
157     mbtowc (NULL, NULL, 0);
158 # endif
159     {
160       int res = mbtowc (pwc, p, m);
161 
162       if (res >= 0)
163         {
164           if (pwc != NULL && ((*pwc == 0) != (res == 0)))
165             abort ();
166           if (nstate >= (res > 0 ? res : 1))
167             abort ();
168           res -= nstate;
169           pstate[0] = 0;
170           return res;
171         }
172 
173       /* mbtowc does not distinguish between invalid and incomplete multibyte
174          sequences.  But mbrtowc needs to make this distinction.
175          There are two possible approaches:
176            - Use iconv() and its return value.
177            - Use built-in knowledge about the possible encodings.
178          Given the low quality of implementation of iconv() on the systems that
179          lack mbrtowc(), we use the second approach.
180          The possible encodings are:
181            - 8-bit encodings,
182            - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
183            - UTF-8.
184          Use specialized code for each.  */
185       if (m >= 4 || m >= MB_CUR_MAX)
186         goto invalid;
187       /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
188       switch (locale_enc_cached ())
189         {
190         case enc_utf8: /* UTF-8 */
191           {
192             /* Cf. unistr/u8-mblen.c.  */
193             unsigned char c = (unsigned char) p[0];
194 
195             if (c >= 0xc2)
196               {
197                 if (c < 0xe0)
198                   {
199                     if (m == 1)
200                       goto incomplete;
201                   }
202                 else if (c < 0xf0)
203                   {
204                     if (m == 1)
205                       goto incomplete;
206                     if (m == 2)
207                       {
208                         unsigned char c2 = (unsigned char) p[1];
209 
210                         if ((c2 ^ 0x80) < 0x40
211                             && (c >= 0xe1 || c2 >= 0xa0)
212                             && (c != 0xed || c2 < 0xa0))
213                           goto incomplete;
214                       }
215                   }
216                 else if (c <= 0xf4)
217                   {
218                     if (m == 1)
219                       goto incomplete;
220                     else /* m == 2 || m == 3 */
221                       {
222                         unsigned char c2 = (unsigned char) p[1];
223 
224                         if ((c2 ^ 0x80) < 0x40
225                             && (c >= 0xf1 || c2 >= 0x90)
226                             && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
227                           {
228                             if (m == 2)
229                               goto incomplete;
230                             else /* m == 3 */
231                               {
232                                 unsigned char c3 = (unsigned char) p[2];
233 
234                                 if ((c3 ^ 0x80) < 0x40)
235                                   goto incomplete;
236                               }
237                           }
238                       }
239                   }
240               }
241             goto invalid;
242           }
243 
244         /* As a reference for this code, you can use the GNU libiconv
245            implementation.  Look for uses of the RET_TOOFEW macro.  */
246 
247         case enc_eucjp: /* EUC-JP */
248           {
249             if (m == 1)
250               {
251                 unsigned char c = (unsigned char) p[0];
252 
253                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
254                   goto incomplete;
255               }
256             if (m == 2)
257               {
258                 unsigned char c = (unsigned char) p[0];
259 
260                 if (c == 0x8f)
261                   {
262                     unsigned char c2 = (unsigned char) p[1];
263 
264                     if (c2 >= 0xa1 && c2 < 0xff)
265                       goto incomplete;
266                   }
267               }
268             goto invalid;
269           }
270 
271         case enc_94: /* EUC-KR, GB2312, BIG5 */
272           {
273             if (m == 1)
274               {
275                 unsigned char c = (unsigned char) p[0];
276 
277                 if (c >= 0xa1 && c < 0xff)
278                   goto incomplete;
279               }
280             goto invalid;
281           }
282 
283         case enc_euctw: /* EUC-TW */
284           {
285             if (m == 1)
286               {
287                 unsigned char c = (unsigned char) p[0];
288 
289                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
290                   goto incomplete;
291               }
292             else /* m == 2 || m == 3 */
293               {
294                 unsigned char c = (unsigned char) p[0];
295 
296                 if (c == 0x8e)
297                   goto incomplete;
298               }
299             goto invalid;
300           }
301 
302         case enc_gb18030: /* GB18030 */
303           {
304             if (m == 1)
305               {
306                 unsigned char c = (unsigned char) p[0];
307 
308                 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
309                   goto incomplete;
310               }
311             else /* m == 2 || m == 3 */
312               {
313                 unsigned char c = (unsigned char) p[0];
314 
315                 if (c >= 0x90 && c <= 0xe3)
316                   {
317                     unsigned char c2 = (unsigned char) p[1];
318 
319                     if (c2 >= 0x30 && c2 <= 0x39)
320                       {
321                         if (m == 2)
322                           goto incomplete;
323                         else /* m == 3 */
324                           {
325                             unsigned char c3 = (unsigned char) p[2];
326 
327                             if (c3 >= 0x81 && c3 <= 0xfe)
328                               goto incomplete;
329                           }
330                       }
331                   }
332               }
333             goto invalid;
334           }
335 
336         case enc_sjis: /* SJIS */
337           {
338             if (m == 1)
339               {
340                 unsigned char c = (unsigned char) p[0];
341 
342                 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
343                     || (c >= 0xf0 && c <= 0xf9))
344                   goto incomplete;
345               }
346             goto invalid;
347           }
348 
349         default:
350           /* An unknown multibyte encoding.  */
351           goto incomplete;
352         }
353 
354      incomplete:
355       {
356         size_t k = nstate;
357         /* Here 0 <= k < m < 4.  */
358         pstate[++k] = s[0];
359         if (k < m)
360           {
361             pstate[++k] = s[1];
362             if (k < m)
363               pstate[++k] = s[2];
364           }
365         if (k != m)
366           abort ();
367       }
368       pstate[0] = m;
369       return (size_t)(-2);
370 
371      invalid:
372       errno = EILSEQ;
373       /* The conversion state is undefined, says POSIX.  */
374       return (size_t)(-1);
375     }
376   }
377 }
378 
379 #else
380 /* Override the system's mbrtowc() function.  */
381 
382 # undef mbrtowc
383 
/* Wrapper around the system's mbrtowc () with the same calling convention.
   Which of the adjustments below are compiled in depends on the
   MBRTOWC_*_BUG and C_LOCALE_MAYBE_EILSEQ macros (presumably set by the
   build configuration).  With none of them set, this is a plain call to
   mbrtowc (), except that a NULL PWC is replaced by the address of a local
   variable.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  wchar_t scratch;
  size_t result;

# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
  /* Turn a call with S == NULL into the call mbrtowc (NULL, "", 1, ps).  */
  if (s == NULL)
    {
      n = 1;
      s = "";
      pwc = NULL;
    }
# endif

# if MBRTOWC_EMPTY_INPUT_BUG
  /* No input bytes: report an incomplete character without calling the
     system function.  */
  if (n == 0)
    return (size_t) -2;
# endif

  /* Make sure there is always a place to store the wide character, so that
     the code below can inspect it.  */
  if (pwc == NULL)
    pwc = &scratch;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* A character is in progress: feed the remaining bytes one at a
           time and count them ourselves.  */
        size_t consumed = 0;

        while (n > 0)
          {
            result = mbrtowc (&scratch, s, 1, ps);
            if (result == (size_t)(-1))
              return (size_t)(-1);

            consumed++;
            if (result != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                *pwc = scratch;
                return (scratch != 0 ? consumed : 0);
              }

            s++;
            n--;
          }
        /* Input exhausted, character still incomplete.  */
        return (size_t)(-2);
      }
  }
# endif

  result = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  /* A successful conversion that produced the null wide character must
     return 0.  */
  if (result < (size_t) -2 && *pwc == 0)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  /* The conversion failed or was incomplete, there is input, and
     hard_locale (LC_CTYPE) is false: hand back the first byte as a wide
     character.  */
  if (result >= (size_t) -2 && n != 0 && ! hard_locale (LC_CTYPE))
    {
      unsigned char byte = *s;

      *pwc = byte;
      return 1;
    }
# endif

  return result;
}
457 
458 #endif
459