1 /* Convert multibyte character to wide character.
2    Copyright (C) 1999-2002, 2005-2020 Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
16 
17 /* Written by Bruno Haible <bruno@clisp.org>, 2008.  */
18 
19 /* This file contains the body of the mbrtowc and mbrtoc32 functions,
20    when GNULIB_defined_mbstate_t is defined.  */
21 
22   char *pstate = (char *)ps;
23 
24   if (s == NULL)
25     {
26       pwc = NULL;
27       s = "";
28       n = 1;
29     }
30 
31   if (n == 0)
32     return (size_t)(-2);
33 
34   /* Here n > 0.  */
35 
36   if (pstate == NULL)
37     pstate = internal_state;
38 
39   {
40     size_t nstate = pstate[0];
41     char buf[4];
42     const char *p;
43     size_t m;
44     enc_t enc;
45     int res;
46 
47     switch (nstate)
48       {
49       case 0:
50         p = s;
51         m = n;
52         break;
53       case 3:
54         buf[2] = pstate[3];
55         FALLTHROUGH;
56       case 2:
57         buf[1] = pstate[2];
58         FALLTHROUGH;
59       case 1:
60         buf[0] = pstate[1];
61         p = buf;
62         m = nstate;
63         buf[m++] = s[0];
64         if (n >= 2 && m < 4)
65           {
66             buf[m++] = s[1];
67             if (n >= 3 && m < 4)
68               buf[m++] = s[2];
69           }
70         break;
71       default:
72         errno = EINVAL;
73         return (size_t)(-1);
74       }
75 
76     /* Here m > 0.  */
77 
78     enc = locale_encoding_classification ();
79 
80     if (enc == enc_utf8) /* UTF-8 */
81       {
82         /* Achieve
83              - multi-thread safety and
84              - the ability to produce wide character values > WCHAR_MAX
85            by not calling mbtowc() at all.  */
86 #include "mbrtowc-impl-utf8.h"
87       }
88     else
89       {
90         /* The hidden internal state of mbtowc would make this function not
91            multi-thread safe.  Achieve multi-thread safety through a lock.  */
92         wchar_t wc;
93         res = mbtowc_with_lock (&wc, p, m);
94 
95         if (res >= 0)
96           {
97             if ((wc == 0) != (res == 0))
98               abort ();
99             if (pwc != NULL)
100               *pwc = wc;
101             goto success;
102           }
103 
104         /* mbtowc does not distinguish between invalid and incomplete multibyte
105            sequences.  But mbrtowc needs to make this distinction.
106            There are two possible approaches:
107              - Use iconv() and its return value.
108              - Use built-in knowledge about the possible encodings.
109            Given the low quality of implementation of iconv() on the systems
110            that lack mbrtowc(), we use the second approach.
111            The possible encodings are:
112              - 8-bit encodings,
113              - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
114              - UTF-8 (already handled above).
115            Use specialized code for each.  */
116         if (m >= 4 || m >= MB_CUR_MAX)
117           goto invalid;
118         /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
119         switch (enc)
120           {
121           /* As a reference for this code, you can use the GNU libiconv
122              implementation.  Look for uses of the RET_TOOFEW macro.  */
123 
124           case enc_eucjp: /* EUC-JP */
125             {
126               if (m == 1)
127                 {
128                   unsigned char c = (unsigned char) p[0];
129 
130                   if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
131                     goto incomplete;
132                 }
133               if (m == 2)
134                 {
135                   unsigned char c = (unsigned char) p[0];
136 
137                   if (c == 0x8f)
138                     {
139                       unsigned char c2 = (unsigned char) p[1];
140 
141                       if (c2 >= 0xa1 && c2 < 0xff)
142                         goto incomplete;
143                     }
144                 }
145               goto invalid;
146             }
147 
148           case enc_94: /* EUC-KR, GB2312, BIG5 */
149             {
150               if (m == 1)
151                 {
152                   unsigned char c = (unsigned char) p[0];
153 
154                   if (c >= 0xa1 && c < 0xff)
155                     goto incomplete;
156                 }
157               goto invalid;
158             }
159 
160           case enc_euctw: /* EUC-TW */
161             {
162               if (m == 1)
163                 {
164                   unsigned char c = (unsigned char) p[0];
165 
166                   if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
167                     goto incomplete;
168                 }
169               else /* m == 2 || m == 3 */
170                 {
171                   unsigned char c = (unsigned char) p[0];
172 
173                   if (c == 0x8e)
174                     goto incomplete;
175                 }
176               goto invalid;
177             }
178 
179           case enc_gb18030: /* GB18030 */
180             {
181               if (m == 1)
182                 {
183                   unsigned char c = (unsigned char) p[0];
184 
185                   if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
186                     goto incomplete;
187                 }
188               else /* m == 2 || m == 3 */
189                 {
190                   unsigned char c = (unsigned char) p[0];
191 
192                   if (c >= 0x90 && c <= 0xe3)
193                     {
194                       unsigned char c2 = (unsigned char) p[1];
195 
196                       if (c2 >= 0x30 && c2 <= 0x39)
197                         {
198                           if (m == 2)
199                             goto incomplete;
200                           else /* m == 3 */
201                             {
202                               unsigned char c3 = (unsigned char) p[2];
203 
204                               if (c3 >= 0x81 && c3 <= 0xfe)
205                                 goto incomplete;
206                             }
207                         }
208                     }
209                 }
210               goto invalid;
211             }
212 
213           case enc_sjis: /* SJIS */
214             {
215               if (m == 1)
216                 {
217                   unsigned char c = (unsigned char) p[0];
218 
219                   if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
220                       || (c >= 0xf0 && c <= 0xf9))
221                     goto incomplete;
222                 }
223               goto invalid;
224             }
225 
226           default:
227             /* An unknown multibyte encoding.  */
228             goto incomplete;
229           }
230       }
231 
232    success:
233     /* res >= 0 is the corrected return value of
234        mbtowc_with_lock (&wc, p, m).  */
235     if (nstate >= (res > 0 ? res : 1))
236       abort ();
237     res -= nstate;
238     pstate[0] = 0;
239     return res;
240 
241    incomplete:
242     {
243       size_t k = nstate;
244       /* Here 0 <= k < m < 4.  */
245       pstate[++k] = s[0];
246       if (k < m)
247         {
248           pstate[++k] = s[1];
249           if (k < m)
250             pstate[++k] = s[2];
251         }
252       if (k != m)
253         abort ();
254     }
255     pstate[0] = m;
256     return (size_t)(-2);
257 
258    invalid:
259     errno = EILSEQ;
260     /* The conversion state is undefined, says POSIX.  */
261     return (size_t)(-1);
262   }
263