1 /* Convert multibyte character to wide character.
2    Copyright (C) 1999-2002, 2005-2012 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2008.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17 
18 #include <config.h>
19 
20 /* Specification.  */
21 #include <wchar.h>
22 
23 #if GNULIB_defined_mbstate_t
24 /* Implement mbrtowc() on top of mbtowc().  */
25 
26 # include <errno.h>
27 # include <stdlib.h>
28 
29 # include "localcharset.h"
30 # include "streq.h"
31 # include "verify.h"
32 
33 
34 verify (sizeof (mbstate_t) >= 4);
35 
36 static char internal_state[4];
37 
38 size_t
mbrtowc(wchar_t * pwc,const char * s,size_t n,mbstate_t * ps)39 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
40 {
41   char *pstate = (char *)ps;
42 
43   if (s == NULL)
44     {
45       pwc = NULL;
46       s = "";
47       n = 1;
48     }
49 
50   if (n == 0)
51     return (size_t)(-2);
52 
53   /* Here n > 0.  */
54 
55   if (pstate == NULL)
56     pstate = internal_state;
57 
58   {
59     size_t nstate = pstate[0];
60     char buf[4];
61     const char *p;
62     size_t m;
63 
64     switch (nstate)
65       {
66       case 0:
67         p = s;
68         m = n;
69         break;
70       case 3:
71         buf[2] = pstate[3];
72         /*FALLTHROUGH*/
73       case 2:
74         buf[1] = pstate[2];
75         /*FALLTHROUGH*/
76       case 1:
77         buf[0] = pstate[1];
78         p = buf;
79         m = nstate;
80         buf[m++] = s[0];
81         if (n >= 2 && m < 4)
82           {
83             buf[m++] = s[1];
84             if (n >= 3 && m < 4)
85               buf[m++] = s[2];
86           }
87         break;
88       default:
89         errno = EINVAL;
90         return (size_t)(-1);
91       }
92 
93     /* Here m > 0.  */
94 
95 # if __GLIBC__ || defined __UCLIBC__
96     /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
97     mbtowc (NULL, NULL, 0);
98 # endif
99     {
100       int res = mbtowc (pwc, p, m);
101 
102       if (res >= 0)
103         {
104           if (pwc != NULL && ((*pwc == 0) != (res == 0)))
105             abort ();
106           if (nstate >= (res > 0 ? res : 1))
107             abort ();
108           res -= nstate;
109           pstate[0] = 0;
110           return res;
111         }
112 
113       /* mbtowc does not distinguish between invalid and incomplete multibyte
114          sequences.  But mbrtowc needs to make this distinction.
115          There are two possible approaches:
116            - Use iconv() and its return value.
117            - Use built-in knowledge about the possible encodings.
118          Given the low quality of implementation of iconv() on the systems that
119          lack mbrtowc(), we use the second approach.
120          The possible encodings are:
121            - 8-bit encodings,
122            - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
123            - UTF-8.
124          Use specialized code for each.  */
125       if (m >= 4 || m >= MB_CUR_MAX)
126         goto invalid;
127       /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
128       {
129         const char *encoding = locale_charset ();
130 
131         if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
132           {
133             /* Cf. unistr/u8-mblen.c.  */
134             unsigned char c = (unsigned char) p[0];
135 
136             if (c >= 0xc2)
137               {
138                 if (c < 0xe0)
139                   {
140                     if (m == 1)
141                       goto incomplete;
142                   }
143                 else if (c < 0xf0)
144                   {
145                     if (m == 1)
146                       goto incomplete;
147                     if (m == 2)
148                       {
149                         unsigned char c2 = (unsigned char) p[1];
150 
151                         if ((c2 ^ 0x80) < 0x40
152                             && (c >= 0xe1 || c2 >= 0xa0)
153                             && (c != 0xed || c2 < 0xa0))
154                           goto incomplete;
155                       }
156                   }
157                 else if (c <= 0xf4)
158                   {
159                     if (m == 1)
160                       goto incomplete;
161                     else /* m == 2 || m == 3 */
162                       {
163                         unsigned char c2 = (unsigned char) p[1];
164 
165                         if ((c2 ^ 0x80) < 0x40
166                             && (c >= 0xf1 || c2 >= 0x90)
167                             && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
168                           {
169                             if (m == 2)
170                               goto incomplete;
171                             else /* m == 3 */
172                               {
173                                 unsigned char c3 = (unsigned char) p[2];
174 
175                                 if ((c3 ^ 0x80) < 0x40)
176                                   goto incomplete;
177                               }
178                           }
179                       }
180                   }
181               }
182             goto invalid;
183           }
184 
185         /* As a reference for this code, you can use the GNU libiconv
186            implementation.  Look for uses of the RET_TOOFEW macro.  */
187 
188         if (STREQ_OPT (encoding,
189                        "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
190           {
191             if (m == 1)
192               {
193                 unsigned char c = (unsigned char) p[0];
194 
195                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
196                   goto incomplete;
197               }
198             if (m == 2)
199               {
200                 unsigned char c = (unsigned char) p[0];
201 
202                 if (c == 0x8f)
203                   {
204                     unsigned char c2 = (unsigned char) p[1];
205 
206                     if (c2 >= 0xa1 && c2 < 0xff)
207                       goto incomplete;
208                   }
209               }
210             goto invalid;
211           }
212         if (STREQ_OPT (encoding,
213                        "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
214             || STREQ_OPT (encoding,
215                           "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
216             || STREQ_OPT (encoding,
217                           "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
218           {
219             if (m == 1)
220               {
221                 unsigned char c = (unsigned char) p[0];
222 
223                 if (c >= 0xa1 && c < 0xff)
224                   goto incomplete;
225               }
226             goto invalid;
227           }
228         if (STREQ_OPT (encoding,
229                        "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
230           {
231             if (m == 1)
232               {
233                 unsigned char c = (unsigned char) p[0];
234 
235                 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
236                   goto incomplete;
237               }
238             else /* m == 2 || m == 3 */
239               {
240                 unsigned char c = (unsigned char) p[0];
241 
242                 if (c == 0x8e)
243                   goto incomplete;
244               }
245             goto invalid;
246           }
247         if (STREQ_OPT (encoding,
248                        "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
249           {
250             if (m == 1)
251               {
252                 unsigned char c = (unsigned char) p[0];
253 
254                 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
255                   goto incomplete;
256               }
257             else /* m == 2 || m == 3 */
258               {
259                 unsigned char c = (unsigned char) p[0];
260 
261                 if (c >= 0x90 && c <= 0xe3)
262                   {
263                     unsigned char c2 = (unsigned char) p[1];
264 
265                     if (c2 >= 0x30 && c2 <= 0x39)
266                       {
267                         if (m == 2)
268                           goto incomplete;
269                         else /* m == 3 */
270                           {
271                             unsigned char c3 = (unsigned char) p[2];
272 
273                             if (c3 >= 0x81 && c3 <= 0xfe)
274                               goto incomplete;
275                           }
276                       }
277                   }
278               }
279             goto invalid;
280           }
281         if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
282           {
283             if (m == 1)
284               {
285                 unsigned char c = (unsigned char) p[0];
286 
287                 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
288                     || (c >= 0xf0 && c <= 0xf9))
289                   goto incomplete;
290               }
291             goto invalid;
292           }
293 
294         /* An unknown multibyte encoding.  */
295         goto incomplete;
296       }
297 
298      incomplete:
299       {
300         size_t k = nstate;
301         /* Here 0 <= k < m < 4.  */
302         pstate[++k] = s[0];
303         if (k < m)
304           {
305             pstate[++k] = s[1];
306             if (k < m)
307               pstate[++k] = s[2];
308           }
309         if (k != m)
310           abort ();
311       }
312       pstate[0] = m;
313       return (size_t)(-2);
314 
315      invalid:
316       errno = EILSEQ;
317       /* The conversion state is undefined, says POSIX.  */
318       return (size_t)(-1);
319     }
320   }
321 }
322 
323 #else
324 /* Override the system's mbrtowc() function.  */
325 
326 # undef mbrtowc
327 
/* Replacement for a system mbrtowc() that has known bugs.  Takes the same
   arguments as mbrtowc() and forwards to it, applying whichever of the
   following workarounds are enabled at configure time:

     MBRTOWC_NULL_ARG2_BUG  - a NULL S is rewritten to
                              mbrtowc (NULL, "", 1, PS) before the call;
     MBRTOWC_RETVAL_BUG     - when the state is not initial, the rest of
                              the character is parsed one byte at a time
                              and the byte count is computed here;
     MBRTOWC_NUL_RETVAL_BUG - the return value is forced to 0 when the
                              converted wide character is null;
     MBRTOWC_NULL_ARG1_BUG  - a NULL PWC is replaced by the address of a
                              dummy variable.

   With none of these defined, this is a plain call to mbrtowc().  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            wchar_t wc;
            size_t ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                if (pwc != NULL)
                  *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        /* All N bytes were consumed without completing the character.  */
        return (size_t)(-2);
      }
  }
# endif

# if MBRTOWC_NUL_RETVAL_BUG
  {
    /* Always convert into a local variable, so that the result can be
       inspected even when the caller passed PWC == NULL.  */
    wchar_t wc;
    size_t ret = mbrtowc (&wc, s, n, ps);

    if (ret != (size_t)(-1) && ret != (size_t)(-2))
      {
        if (pwc != NULL)
          *pwc = wc;
        if (wc == 0)
          ret = 0;
      }
    return ret;
  }
# else
  {
#   if MBRTOWC_NULL_ARG1_BUG
    wchar_t dummy;

    if (pwc == NULL)
      pwc = &dummy;
#   endif

    return mbrtowc (pwc, s, n, ps);
  }
# endif
}
401 
402 #endif
403