/* Convert multibyte character to wide character.
   Copyright (C) 1999-2002, 2005-2017 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2008.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17 
18 #include <config.h>
19 
20 /* Specification.  */
21 #include <wchar.h>
22 
23 #if C_LOCALE_MAYBE_EILSEQ
24 # include "hard-locale.h"
25 # include <locale.h>
26 #endif
27 
28 #if GNULIB_defined_mbstate_t
29 /* Implement mbrtowc() on top of mbtowc().  */
30 
31 # include <errno.h>
32 # include <stdlib.h>
33 
34 # include "localcharset.h"
35 # include "streq.h"
36 # include "verify.h"
37 
38 #ifndef FALLTHROUGH
39 # if __GNUC__ < 7
40 #  define FALLTHROUGH ((void) 0)
41 # else
42 #  define FALLTHROUGH __attribute__ ((__fallthrough__))
43 # endif
44 #endif
45 
46 verify (sizeof (mbstate_t) >= 4);
47 
48 static char internal_state[4];
49 
50 size_t
mbrtowc(wchar_t * pwc,const char * s,size_t n,mbstate_t * ps)51 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
52 {
53   char *pstate = (char *)ps;
54 
55   if (s == NULL)
56     {
57       pwc = NULL;
58       s = "";
59       n = 1;
60     }
61 
62   if (n == 0)
63     return (size_t)(-2);
64 
65   /* Here n > 0.  */
66 
67   if (pstate == NULL)
68     pstate = internal_state;
69 
70   {
71     size_t nstate = pstate[0];
72     char buf[4];
73     const char *p;
74     size_t m;
75 
76     switch (nstate)
77       {
78       case 0:
79         p = s;
80         m = n;
81         break;
82       case 3:
83         buf[2] = pstate[3];
84         FALLTHROUGH;
85       case 2:
86         buf[1] = pstate[2];
87         FALLTHROUGH;
88       case 1:
89         buf[0] = pstate[1];
90         p = buf;
91         m = nstate;
92         buf[m++] = s[0];
93         if (n >= 2 && m < 4)
94           {
95             buf[m++] = s[1];
96             if (n >= 3 && m < 4)
97               buf[m++] = s[2];
98           }
99         break;
100       default:
101         errno = EINVAL;
102         return (size_t)(-1);
103       }
104 
105     /* Here m > 0.  */
106 
107 # if __GLIBC__ || defined __UCLIBC__
108     /* Work around bug <https://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
109     mbtowc (NULL, NULL, 0);
110 # endif
111     {
112       int res = mbtowc (pwc, p, m);
113 
114       if (res >= 0)
115         {
116           if (pwc != NULL && ((*pwc == 0) != (res == 0)))
117             abort ();
118           if (nstate >= (res > 0 ? res : 1))
119             abort ();
120           res -= nstate;
121           pstate[0] = 0;
122           return res;
123         }
124 
125       /* mbtowc does not distinguish between invalid and incomplete multibyte
126          sequences.  But mbrtowc needs to make this distinction.
127          There are two possible approaches:
128            - Use iconv() and its return value.
129            - Use built-in knowledge about the possible encodings.
130          Given the low quality of implementation of iconv() on the systems that
131          lack mbrtowc(), we use the second approach.
132          The possible encodings are:
133            - 8-bit encodings,
134            - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
135            - UTF-8.
136          Use specialized code for each.  */
137       if (m >= 4 || m >= MB_CUR_MAX)
138         goto invalid;
139       /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
140       {
141         const char *encoding = locale_charset ();
142 
143         if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
144           {
145             /* Cf. unistr/u8-mblen.c.  */
146             unsigned char c = (unsigned char) p[0];
147 
148             if (c >= 0xc2)
149               {
150                 if (c < 0xe0)
151                   {
152                     if (m == 1)
153                       goto incomplete;
154                   }
155                 else if (c < 0xf0)
156                   {
157                     if (m == 1)
158                       goto incomplete;
159                     if (m == 2)
160                       {
161                         unsigned char c2 = (unsigned char) p[1];
162 
163                         if ((c2 ^ 0x80) < 0x40
164                             && (c >= 0xe1 || c2 >= 0xa0)
165                             && (c != 0xed || c2 < 0xa0))
166                           goto incomplete;
167                       }
168                   }
169                 else if (c <= 0xf4)
170                   {
171                     if (m == 1)
172                       goto incomplete;
173                     else /* m == 2 || m == 3 */
174                       {
175                         unsigned char c2 = (unsigned char) p[1];
176 
177                         if ((c2 ^ 0x80) < 0x40
178                             && (c >= 0xf1 || c2 >= 0x90)
179                             && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
180                           {
181                             if (m == 2)
182                               goto incomplete;
183                             else /* m == 3 */
184                               {
185                                 unsigned char c3 = (unsigned char) p[2];
186 
187                                 if ((c3 ^ 0x80) < 0x40)
188                                   goto incomplete;
189                               }
190                           }
191                       }
192                   }
193               }
194             goto invalid;
195           }
196 
197         /* As a reference for this code, you can use the GNU libiconv
198            implementation.  Look for uses of the RET_TOOFEW macro.  */
199 
200         if (STREQ_OPT (encoding,
201                        "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
202           {
203             if (m == 1)
204               {
205                 unsigned char c = (unsigned char) p[0];
206 
207                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
208                   goto incomplete;
209               }
210             if (m == 2)
211               {
212                 unsigned char c = (unsigned char) p[0];
213 
214                 if (c == 0x8f)
215                   {
216                     unsigned char c2 = (unsigned char) p[1];
217 
218                     if (c2 >= 0xa1 && c2 < 0xff)
219                       goto incomplete;
220                   }
221               }
222             goto invalid;
223           }
224         if (STREQ_OPT (encoding,
225                        "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
226             || STREQ_OPT (encoding,
227                           "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
228             || STREQ_OPT (encoding,
229                           "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
230           {
231             if (m == 1)
232               {
233                 unsigned char c = (unsigned char) p[0];
234 
235                 if (c >= 0xa1 && c < 0xff)
236                   goto incomplete;
237               }
238             goto invalid;
239           }
240         if (STREQ_OPT (encoding,
241                        "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
242           {
243             if (m == 1)
244               {
245                 unsigned char c = (unsigned char) p[0];
246 
247                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
248                   goto incomplete;
249               }
250             else /* m == 2 || m == 3 */
251               {
252                 unsigned char c = (unsigned char) p[0];
253 
254                 if (c == 0x8e)
255                   goto incomplete;
256               }
257             goto invalid;
258           }
259         if (STREQ_OPT (encoding,
260                        "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
261           {
262             if (m == 1)
263               {
264                 unsigned char c = (unsigned char) p[0];
265 
266                 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
267                   goto incomplete;
268               }
269             else /* m == 2 || m == 3 */
270               {
271                 unsigned char c = (unsigned char) p[0];
272 
273                 if (c >= 0x90 && c <= 0xe3)
274                   {
275                     unsigned char c2 = (unsigned char) p[1];
276 
277                     if (c2 >= 0x30 && c2 <= 0x39)
278                       {
279                         if (m == 2)
280                           goto incomplete;
281                         else /* m == 3 */
282                           {
283                             unsigned char c3 = (unsigned char) p[2];
284 
285                             if (c3 >= 0x81 && c3 <= 0xfe)
286                               goto incomplete;
287                           }
288                       }
289                   }
290               }
291             goto invalid;
292           }
293         if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
294           {
295             if (m == 1)
296               {
297                 unsigned char c = (unsigned char) p[0];
298 
299                 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
300                     || (c >= 0xf0 && c <= 0xf9))
301                   goto incomplete;
302               }
303             goto invalid;
304           }
305 
306         /* An unknown multibyte encoding.  */
307         goto incomplete;
308       }
309 
310      incomplete:
311       {
312         size_t k = nstate;
313         /* Here 0 <= k < m < 4.  */
314         pstate[++k] = s[0];
315         if (k < m)
316           {
317             pstate[++k] = s[1];
318             if (k < m)
319               pstate[++k] = s[2];
320           }
321         if (k != m)
322           abort ();
323       }
324       pstate[0] = m;
325       return (size_t)(-2);
326 
327      invalid:
328       errno = EILSEQ;
329       /* The conversion state is undefined, says POSIX.  */
330       return (size_t)(-1);
331     }
332   }
333 }
334 
#else
/* Override the system's mbrtowc() function.  */
337 
/* Make sure the calls below reach the system's mbrtowc() function and not
   a macro of the same name.  */
# undef mbrtowc

/* Wrapper around the system's mbrtowc() that applies the workarounds
   selected at configure time.

   PWC  where to store the wide character, or NULL to discard it.
   S    the input bytes, or NULL.
   N    the number of bytes available at S.
   PS   the conversion state, or NULL.

   Returns what the system's mbrtowc() returns, except where one of the
   following workarounds applies:
     - MBRTOWC_NULL_ARG2_BUG, MBRTOWC_RETVAL_BUG, MBRTOWC_EMPTY_INPUT_BUG:
       S == NULL is rewritten to PWC = NULL, S = "", N = 1 before calling
       the system function.
     - MBRTOWC_EMPTY_INPUT_BUG: N == 0 yields (size_t) -2 right away.
     - MBRTOWC_RETVAL_BUG: if the state is not the initial state, the input
       is fed to the system function one byte at a time, and the number of
       bytes fed is returned once the character is complete.
     - MBRTOWC_NUL_RETVAL_BUG: 0 is returned whenever the converted
       character is the null wide character.
     - C_LOCALE_MAYBE_EILSEQ: if the system function fails although
       hard_locale (LC_CTYPE) is false, the first byte of S is returned
       as a wide character of its own.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  size_t ret;
  wchar_t wc;

# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_EMPTY_INPUT_BUG
  if (n == 0)
    return (size_t) -2;
# endif

  /* The code below stores through and reads from *pwc, so give it a
     target even when the caller does not want the character.  */
  if (! pwc)
    pwc = &wc;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        return (size_t)(-2);
      }
  }
# endif

  ret = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  if (ret < (size_t) -2 && !*pwc)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  /* S can still be NULL here when none of the workarounds that rewrite it
     is enabled; in that case there is no byte to fall back to, so the
     system function's result is returned unchanged.  */
  if ((size_t) -2 <= ret && n != 0 && s != NULL && ! hard_locale (LC_CTYPE))
    {
      unsigned char uc = *s;
      *pwc = uc;
      return 1;
    }
# endif

  return ret;
}
413 
#endif
415