1 /* Convert multibyte character to wide character.
2    Copyright (C) 1999-2002, 2005-2018 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2008.
4 
5    This program is free software: you can redistribute it and/or
6    modify it under the terms of either:
7 
8      * the GNU Lesser General Public License as published by the Free
9        Software Foundation; either version 3 of the License, or (at your
10        option) any later version.
11 
12    or
13 
14      * the GNU General Public License as published by the Free
15        Software Foundation; either version 2 of the License, or (at your
16        option) any later version.
17 
18    or both in parallel, as here.
19    This program is distributed in the hope that it will be useful,
20    but WITHOUT ANY WARRANTY; without even the implied warranty of
21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22    GNU General Public License for more details.
23 
24    You should have received a copy of the GNU General Public License
25    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
26 
27 #include <config.h>
28 
29 /* Specification.  */
30 #include <wchar.h>
31 
32 #if C_LOCALE_MAYBE_EILSEQ
33 # include "hard-locale.h"
34 # include <locale.h>
35 #endif
36 
37 #if GNULIB_defined_mbstate_t
38 /* Implement mbrtowc() on top of mbtowc().  */
39 
40 # include <errno.h>
41 # include <stdlib.h>
42 
43 # include "localcharset.h"
44 # include "streq.h"
45 # include "verify.h"
46 
/* FALLTHROUGH marks an intentional fall-through between two 'case' labels.
   It expands to the fallthrough statement attribute on compilers known to
   support it (GCC >= 7, clang >= 10; clang reports __GNUC__ as 4, so it has
   to be tested separately) and to a no-op expression everywhere else.
   A definition provided earlier (e.g. by an included header) wins.  */
# ifndef FALLTHROUGH
#  if (__GNUC__ >= 7) || (__clang_major__ >= 10)
#   define FALLTHROUGH __attribute__ ((__fallthrough__))
#  else
#   define FALLTHROUGH ((void) 0)
#  endif
# endif
54 
55 verify (sizeof (mbstate_t) >= 4);
56 
57 static char internal_state[4];
58 
59 size_t
mbrtowc(wchar_t * pwc,const char * s,size_t n,mbstate_t * ps)60 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
61 {
62   char *pstate = (char *)ps;
63 
64   if (s == NULL)
65     {
66       pwc = NULL;
67       s = "";
68       n = 1;
69     }
70 
71   if (n == 0)
72     return (size_t)(-2);
73 
74   /* Here n > 0.  */
75 
76   if (pstate == NULL)
77     pstate = internal_state;
78 
79   {
80     size_t nstate = pstate[0];
81     char buf[4];
82     const char *p;
83     size_t m;
84 
85     switch (nstate)
86       {
87       case 0:
88         p = s;
89         m = n;
90         break;
91       case 3:
92         buf[2] = pstate[3];
93         FALLTHROUGH;
94       case 2:
95         buf[1] = pstate[2];
96         FALLTHROUGH;
97       case 1:
98         buf[0] = pstate[1];
99         p = buf;
100         m = nstate;
101         buf[m++] = s[0];
102         if (n >= 2 && m < 4)
103           {
104             buf[m++] = s[1];
105             if (n >= 3 && m < 4)
106               buf[m++] = s[2];
107           }
108         break;
109       default:
110         errno = EINVAL;
111         return (size_t)(-1);
112       }
113 
114     /* Here m > 0.  */
115 
116 # if __GLIBC__ || defined __UCLIBC__
117     /* Work around bug <https://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
118     mbtowc (NULL, NULL, 0);
119 # endif
120     {
121       int res = mbtowc (pwc, p, m);
122 
123       if (res >= 0)
124         {
125           if (pwc != NULL && ((*pwc == 0) != (res == 0)))
126             abort ();
127           if (nstate >= (res > 0 ? res : 1))
128             abort ();
129           res -= nstate;
130           pstate[0] = 0;
131           return res;
132         }
133 
134       /* mbtowc does not distinguish between invalid and incomplete multibyte
135          sequences.  But mbrtowc needs to make this distinction.
136          There are two possible approaches:
137            - Use iconv() and its return value.
138            - Use built-in knowledge about the possible encodings.
139          Given the low quality of implementation of iconv() on the systems that
140          lack mbrtowc(), we use the second approach.
141          The possible encodings are:
142            - 8-bit encodings,
143            - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
144            - UTF-8.
145          Use specialized code for each.  */
146       if (m >= 4 || m >= MB_CUR_MAX)
147         goto invalid;
148       /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
149       {
150         const char *encoding = locale_charset ();
151 
152         if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
153           {
154             /* Cf. unistr/u8-mblen.c.  */
155             unsigned char c = (unsigned char) p[0];
156 
157             if (c >= 0xc2)
158               {
159                 if (c < 0xe0)
160                   {
161                     if (m == 1)
162                       goto incomplete;
163                   }
164                 else if (c < 0xf0)
165                   {
166                     if (m == 1)
167                       goto incomplete;
168                     if (m == 2)
169                       {
170                         unsigned char c2 = (unsigned char) p[1];
171 
172                         if ((c2 ^ 0x80) < 0x40
173                             && (c >= 0xe1 || c2 >= 0xa0)
174                             && (c != 0xed || c2 < 0xa0))
175                           goto incomplete;
176                       }
177                   }
178                 else if (c <= 0xf4)
179                   {
180                     if (m == 1)
181                       goto incomplete;
182                     else /* m == 2 || m == 3 */
183                       {
184                         unsigned char c2 = (unsigned char) p[1];
185 
186                         if ((c2 ^ 0x80) < 0x40
187                             && (c >= 0xf1 || c2 >= 0x90)
188                             && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
189                           {
190                             if (m == 2)
191                               goto incomplete;
192                             else /* m == 3 */
193                               {
194                                 unsigned char c3 = (unsigned char) p[2];
195 
196                                 if ((c3 ^ 0x80) < 0x40)
197                                   goto incomplete;
198                               }
199                           }
200                       }
201                   }
202               }
203             goto invalid;
204           }
205 
206         /* As a reference for this code, you can use the GNU libiconv
207            implementation.  Look for uses of the RET_TOOFEW macro.  */
208 
209         if (STREQ_OPT (encoding,
210                        "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
211           {
212             if (m == 1)
213               {
214                 unsigned char c = (unsigned char) p[0];
215 
216                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
217                   goto incomplete;
218               }
219             if (m == 2)
220               {
221                 unsigned char c = (unsigned char) p[0];
222 
223                 if (c == 0x8f)
224                   {
225                     unsigned char c2 = (unsigned char) p[1];
226 
227                     if (c2 >= 0xa1 && c2 < 0xff)
228                       goto incomplete;
229                   }
230               }
231             goto invalid;
232           }
233         if (STREQ_OPT (encoding,
234                        "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
235             || STREQ_OPT (encoding,
236                           "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
237             || STREQ_OPT (encoding,
238                           "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
239           {
240             if (m == 1)
241               {
242                 unsigned char c = (unsigned char) p[0];
243 
244                 if (c >= 0xa1 && c < 0xff)
245                   goto incomplete;
246               }
247             goto invalid;
248           }
249         if (STREQ_OPT (encoding,
250                        "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
251           {
252             if (m == 1)
253               {
254                 unsigned char c = (unsigned char) p[0];
255 
256                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
257                   goto incomplete;
258               }
259             else /* m == 2 || m == 3 */
260               {
261                 unsigned char c = (unsigned char) p[0];
262 
263                 if (c == 0x8e)
264                   goto incomplete;
265               }
266             goto invalid;
267           }
268         if (STREQ_OPT (encoding,
269                        "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
270           {
271             if (m == 1)
272               {
273                 unsigned char c = (unsigned char) p[0];
274 
275                 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
276                   goto incomplete;
277               }
278             else /* m == 2 || m == 3 */
279               {
280                 unsigned char c = (unsigned char) p[0];
281 
282                 if (c >= 0x90 && c <= 0xe3)
283                   {
284                     unsigned char c2 = (unsigned char) p[1];
285 
286                     if (c2 >= 0x30 && c2 <= 0x39)
287                       {
288                         if (m == 2)
289                           goto incomplete;
290                         else /* m == 3 */
291                           {
292                             unsigned char c3 = (unsigned char) p[2];
293 
294                             if (c3 >= 0x81 && c3 <= 0xfe)
295                               goto incomplete;
296                           }
297                       }
298                   }
299               }
300             goto invalid;
301           }
302         if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
303           {
304             if (m == 1)
305               {
306                 unsigned char c = (unsigned char) p[0];
307 
308                 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
309                     || (c >= 0xf0 && c <= 0xf9))
310                   goto incomplete;
311               }
312             goto invalid;
313           }
314 
315         /* An unknown multibyte encoding.  */
316         goto incomplete;
317       }
318 
319      incomplete:
320       {
321         size_t k = nstate;
322         /* Here 0 <= k < m < 4.  */
323         pstate[++k] = s[0];
324         if (k < m)
325           {
326             pstate[++k] = s[1];
327             if (k < m)
328               pstate[++k] = s[2];
329           }
330         if (k != m)
331           abort ();
332       }
333       pstate[0] = m;
334       return (size_t)(-2);
335 
336      invalid:
337       errno = EILSEQ;
338       /* The conversion state is undefined, says POSIX.  */
339       return (size_t)(-1);
340     }
341   }
342 }
343 
344 #else
345 /* Override the system's mbrtowc() function.  */
346 
347 # undef mbrtowc
348 
/* Wrapper around the system's mbrtowc() that compensates for the platform
   bugs selected at configure time.  Arguments and return value are those of
   mbrtowc(); each workaround below is compiled in only when the
   corresponding macro is nonzero.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  size_t ret;
  wchar_t wc;

# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
  /* Spell out the meaning of a NULL s, so that neither the system function
     nor the code below has to deal with it.  */
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_EMPTY_INPUT_BUG
  if (n == 0)
    return (size_t) -2;
# endif

  /* Always give the system function somewhere to store the result, so that
     the checks below can inspect it.  */
  if (! pwc)
    pwc = &wc;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte, counting
           the bytes ourselves instead of trusting the return value.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        return (size_t)(-2);
      }
  }
# endif

  ret = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  /* A converted null character must yield 0, not its byte length.  */
  if (ret < (size_t) -2 && !*pwc)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  /* When hard_locale (LC_CTYPE) is false, turn a failure into a successful
     conversion of the single byte *s.  s can still be NULL here when none
     of the workarounds that normalize it is enabled; it must not be
     dereferenced then, so the system function's result is returned as is.  */
  if ((size_t) -2 <= ret && n != 0 && s != NULL && ! hard_locale (LC_CTYPE))
    {
      unsigned char uc = *s;
      *pwc = uc;
      return 1;
    }
# endif

  return ret;
}
422 
423 #endif
424