/* Convert multibyte character to wide character.
   Copyright (C) 1999-2002, 2005-2018 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2008.

   This program is free software: you can redistribute it and/or
   modify it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free
       Software Foundation; either version 2 of the License, or (at your
       option) any later version.

   or both in parallel, as here.
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
26
27 #include <config.h>
28
29 /* Specification. */
30 #include <wchar.h>
31
32 #if C_LOCALE_MAYBE_EILSEQ
33 # include "hard-locale.h"
34 # include <locale.h>
35 #endif
36
37 #if GNULIB_defined_mbstate_t
38 /* Implement mbrtowc() on top of mbtowc(). */
39
40 # include <errno.h>
41 # include <stdlib.h>
42
43 # include "localcharset.h"
44 # include "streq.h"
45 # include "verify.h"
46
/* FALLTHROUGH marks an intentional fall through from one switch case into
   the next.  With compilers that implement the 'fallthrough' statement
   attribute (GCC >= 7, clang >= 10) it expands to that attribute; with
   all other compilers it expands to a no-op expression.  clang is tested
   separately because it reports a __GNUC__ value below 7.
   A definition of FALLTHROUGH that is already in effect is left alone.  */
#ifndef FALLTHROUGH
# if (defined __GNUC__ && __GNUC__ >= 7) \
     || (defined __clang_major__ && __clang_major__ >= 10)
#  define FALLTHROUGH __attribute__ ((__fallthrough__))
# else
#  define FALLTHROUGH ((void) 0)
# endif
#endif
54
55 verify (sizeof (mbstate_t) >= 4);
56
57 static char internal_state[4];
58
59 size_t
mbrtowc(wchar_t * pwc,const char * s,size_t n,mbstate_t * ps)60 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
61 {
62 char *pstate = (char *)ps;
63
64 if (s == NULL)
65 {
66 pwc = NULL;
67 s = "";
68 n = 1;
69 }
70
71 if (n == 0)
72 return (size_t)(-2);
73
74 /* Here n > 0. */
75
76 if (pstate == NULL)
77 pstate = internal_state;
78
79 {
80 size_t nstate = pstate[0];
81 char buf[4];
82 const char *p;
83 size_t m;
84
85 switch (nstate)
86 {
87 case 0:
88 p = s;
89 m = n;
90 break;
91 case 3:
92 buf[2] = pstate[3];
93 FALLTHROUGH;
94 case 2:
95 buf[1] = pstate[2];
96 FALLTHROUGH;
97 case 1:
98 buf[0] = pstate[1];
99 p = buf;
100 m = nstate;
101 buf[m++] = s[0];
102 if (n >= 2 && m < 4)
103 {
104 buf[m++] = s[1];
105 if (n >= 3 && m < 4)
106 buf[m++] = s[2];
107 }
108 break;
109 default:
110 errno = EINVAL;
111 return (size_t)(-1);
112 }
113
114 /* Here m > 0. */
115
116 # if __GLIBC__ || defined __UCLIBC__
117 /* Work around bug <https://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
118 mbtowc (NULL, NULL, 0);
119 # endif
120 {
121 int res = mbtowc (pwc, p, m);
122
123 if (res >= 0)
124 {
125 if (pwc != NULL && ((*pwc == 0) != (res == 0)))
126 abort ();
127 if (nstate >= (res > 0 ? res : 1))
128 abort ();
129 res -= nstate;
130 pstate[0] = 0;
131 return res;
132 }
133
134 /* mbtowc does not distinguish between invalid and incomplete multibyte
135 sequences. But mbrtowc needs to make this distinction.
136 There are two possible approaches:
137 - Use iconv() and its return value.
138 - Use built-in knowledge about the possible encodings.
139 Given the low quality of implementation of iconv() on the systems that
140 lack mbrtowc(), we use the second approach.
141 The possible encodings are:
142 - 8-bit encodings,
143 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
144 - UTF-8.
145 Use specialized code for each. */
146 if (m >= 4 || m >= MB_CUR_MAX)
147 goto invalid;
148 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
149 {
150 const char *encoding = locale_charset ();
151
152 if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
153 {
154 /* Cf. unistr/u8-mblen.c. */
155 unsigned char c = (unsigned char) p[0];
156
157 if (c >= 0xc2)
158 {
159 if (c < 0xe0)
160 {
161 if (m == 1)
162 goto incomplete;
163 }
164 else if (c < 0xf0)
165 {
166 if (m == 1)
167 goto incomplete;
168 if (m == 2)
169 {
170 unsigned char c2 = (unsigned char) p[1];
171
172 if ((c2 ^ 0x80) < 0x40
173 && (c >= 0xe1 || c2 >= 0xa0)
174 && (c != 0xed || c2 < 0xa0))
175 goto incomplete;
176 }
177 }
178 else if (c <= 0xf4)
179 {
180 if (m == 1)
181 goto incomplete;
182 else /* m == 2 || m == 3 */
183 {
184 unsigned char c2 = (unsigned char) p[1];
185
186 if ((c2 ^ 0x80) < 0x40
187 && (c >= 0xf1 || c2 >= 0x90)
188 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
189 {
190 if (m == 2)
191 goto incomplete;
192 else /* m == 3 */
193 {
194 unsigned char c3 = (unsigned char) p[2];
195
196 if ((c3 ^ 0x80) < 0x40)
197 goto incomplete;
198 }
199 }
200 }
201 }
202 }
203 goto invalid;
204 }
205
206 /* As a reference for this code, you can use the GNU libiconv
207 implementation. Look for uses of the RET_TOOFEW macro. */
208
209 if (STREQ_OPT (encoding,
210 "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
211 {
212 if (m == 1)
213 {
214 unsigned char c = (unsigned char) p[0];
215
216 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
217 goto incomplete;
218 }
219 if (m == 2)
220 {
221 unsigned char c = (unsigned char) p[0];
222
223 if (c == 0x8f)
224 {
225 unsigned char c2 = (unsigned char) p[1];
226
227 if (c2 >= 0xa1 && c2 < 0xff)
228 goto incomplete;
229 }
230 }
231 goto invalid;
232 }
233 if (STREQ_OPT (encoding,
234 "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
235 || STREQ_OPT (encoding,
236 "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
237 || STREQ_OPT (encoding,
238 "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
239 {
240 if (m == 1)
241 {
242 unsigned char c = (unsigned char) p[0];
243
244 if (c >= 0xa1 && c < 0xff)
245 goto incomplete;
246 }
247 goto invalid;
248 }
249 if (STREQ_OPT (encoding,
250 "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
251 {
252 if (m == 1)
253 {
254 unsigned char c = (unsigned char) p[0];
255
256 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
257 goto incomplete;
258 }
259 else /* m == 2 || m == 3 */
260 {
261 unsigned char c = (unsigned char) p[0];
262
263 if (c == 0x8e)
264 goto incomplete;
265 }
266 goto invalid;
267 }
268 if (STREQ_OPT (encoding,
269 "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
270 {
271 if (m == 1)
272 {
273 unsigned char c = (unsigned char) p[0];
274
275 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
276 goto incomplete;
277 }
278 else /* m == 2 || m == 3 */
279 {
280 unsigned char c = (unsigned char) p[0];
281
282 if (c >= 0x90 && c <= 0xe3)
283 {
284 unsigned char c2 = (unsigned char) p[1];
285
286 if (c2 >= 0x30 && c2 <= 0x39)
287 {
288 if (m == 2)
289 goto incomplete;
290 else /* m == 3 */
291 {
292 unsigned char c3 = (unsigned char) p[2];
293
294 if (c3 >= 0x81 && c3 <= 0xfe)
295 goto incomplete;
296 }
297 }
298 }
299 }
300 goto invalid;
301 }
302 if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
303 {
304 if (m == 1)
305 {
306 unsigned char c = (unsigned char) p[0];
307
308 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
309 || (c >= 0xf0 && c <= 0xf9))
310 goto incomplete;
311 }
312 goto invalid;
313 }
314
315 /* An unknown multibyte encoding. */
316 goto incomplete;
317 }
318
319 incomplete:
320 {
321 size_t k = nstate;
322 /* Here 0 <= k < m < 4. */
323 pstate[++k] = s[0];
324 if (k < m)
325 {
326 pstate[++k] = s[1];
327 if (k < m)
328 pstate[++k] = s[2];
329 }
330 if (k != m)
331 abort ();
332 }
333 pstate[0] = m;
334 return (size_t)(-2);
335
336 invalid:
337 errno = EILSEQ;
338 /* The conversion state is undefined, says POSIX. */
339 return (size_t)(-1);
340 }
341 }
342 }
343
344 #else
345 /* Override the system's mbrtowc() function. */
346
347 # undef mbrtowc
348
/* Wrapper around the system's mbrtowc() that compensates for known
   defects.  Each workaround is compiled in only if the corresponding
   macro is nonzero:

     MBRTOWC_NULL_ARG2_BUG, MBRTOWC_RETVAL_BUG, MBRTOWC_EMPTY_INPUT_BUG
         A NULL S is rewritten into the equivalent call
         mbrtowc (NULL, "", 1, PS) before anything else happens.
     MBRTOWC_EMPTY_INPUT_BUG
         N == 0 yields (size_t) -2 without calling the system function.
     MBRTOWC_RETVAL_BUG
         When the state is not the initial state, the input is fed to the
         system function one byte at a time and the bytes are counted
         here.  A private static state replaces a NULL PS, so that
         mbsinit() can be applied to it.
     MBRTOWC_NUL_RETVAL_BUG
         A successful conversion to the null wide character yields 0.
     C_LOCALE_MAYBE_EILSEQ
         When hard_locale (LC_CTYPE) is false, a failing or incomplete
         conversion is replaced by treating the first byte of S as a
         character of its own.

   PWC, S, N and PS have the same meaning as for mbrtowc().  A NULL PWC is
   redirected to a local variable, so the code below can always store
   through it.  The return value is that of the system function, unless
   one of the workarounds above overrides it.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  size_t ret;
  wchar_t wc;

# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_EMPTY_INPUT_BUG
  if (n == 0)
    return (size_t) -2;
# endif

  if (! pwc)
    pwc = &wc;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        /* All n bytes were consumed without completing the character.  */
        return (size_t)(-2);
      }
  }
# endif

  ret = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  if (ret < (size_t) -2 && !*pwc)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  /* S is still NULL here if the caller passed NULL and none of the
     workarounds that rewrite a NULL S is compiled in.  It must not be
     dereferenced in that case; the system's result is returned as is.  */
  if ((size_t) -2 <= ret && n != 0 && s != NULL && ! hard_locale (LC_CTYPE))
    {
      unsigned char uc = *s;
      *pwc = uc;
      return 1;
    }
# endif

  return ret;
}
422
423 #endif
424