/* Convert multibyte character to wide character.
   Copyright (C) 1999-2002, 2005-2017 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2008.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

#include <config.h>

/* Specification.  */
#include <wchar.h>

#if C_LOCALE_MAYBE_EILSEQ
# include "hard-locale.h"
# include <locale.h>
#endif

#if GNULIB_defined_mbstate_t
/* Implement mbrtowc() on top of mbtowc().  */

# include <errno.h>
# include <stdlib.h>

# include "localcharset.h"
# include "streq.h"
# include "verify.h"

/* FALLTHROUGH marks an intentional fall-through from one switch case into
   the next.  It expands to the 'fallthrough' statement attribute on
   compilers that support it (GCC >= 7, clang >= 10) and to a no-op
   expression elsewhere.  An earlier definition, if any, is left untouched.
   Note that clang reports __GNUC__ as 4, hence the separate clang test.  */
# ifndef FALLTHROUGH
#  if 7 <= __GNUC__ || (defined __clang_major__ && 10 <= __clang_major__)
#   define FALLTHROUGH __attribute__ ((__fallthrough__))
#  else
#   define FALLTHROUGH ((void) 0)
#  endif
# endif

46 verify (sizeof (mbstate_t) >= 4);
47
48 static char internal_state[4];
49
50 size_t
mbrtowc(wchar_t * pwc,const char * s,size_t n,mbstate_t * ps)51 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
52 {
53 char *pstate = (char *)ps;
54
55 if (s == NULL)
56 {
57 pwc = NULL;
58 s = "";
59 n = 1;
60 }
61
62 if (n == 0)
63 return (size_t)(-2);
64
65 /* Here n > 0. */
66
67 if (pstate == NULL)
68 pstate = internal_state;
69
70 {
71 size_t nstate = pstate[0];
72 char buf[4];
73 const char *p;
74 size_t m;
75
76 switch (nstate)
77 {
78 case 0:
79 p = s;
80 m = n;
81 break;
82 case 3:
83 buf[2] = pstate[3];
84 FALLTHROUGH;
85 case 2:
86 buf[1] = pstate[2];
87 FALLTHROUGH;
88 case 1:
89 buf[0] = pstate[1];
90 p = buf;
91 m = nstate;
92 buf[m++] = s[0];
93 if (n >= 2 && m < 4)
94 {
95 buf[m++] = s[1];
96 if (n >= 3 && m < 4)
97 buf[m++] = s[2];
98 }
99 break;
100 default:
101 errno = EINVAL;
102 return (size_t)(-1);
103 }
104
105 /* Here m > 0. */
106
107 # if __GLIBC__ || defined __UCLIBC__
108 /* Work around bug <https://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
109 mbtowc (NULL, NULL, 0);
110 # endif
111 {
112 int res = mbtowc (pwc, p, m);
113
114 if (res >= 0)
115 {
116 if (pwc != NULL && ((*pwc == 0) != (res == 0)))
117 abort ();
118 if (nstate >= (res > 0 ? res : 1))
119 abort ();
120 res -= nstate;
121 pstate[0] = 0;
122 return res;
123 }
124
125 /* mbtowc does not distinguish between invalid and incomplete multibyte
126 sequences. But mbrtowc needs to make this distinction.
127 There are two possible approaches:
128 - Use iconv() and its return value.
129 - Use built-in knowledge about the possible encodings.
130 Given the low quality of implementation of iconv() on the systems that
131 lack mbrtowc(), we use the second approach.
132 The possible encodings are:
133 - 8-bit encodings,
134 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
135 - UTF-8.
136 Use specialized code for each. */
137 if (m >= 4 || m >= MB_CUR_MAX)
138 goto invalid;
139 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
140 {
141 const char *encoding = locale_charset ();
142
143 if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
144 {
145 /* Cf. unistr/u8-mblen.c. */
146 unsigned char c = (unsigned char) p[0];
147
148 if (c >= 0xc2)
149 {
150 if (c < 0xe0)
151 {
152 if (m == 1)
153 goto incomplete;
154 }
155 else if (c < 0xf0)
156 {
157 if (m == 1)
158 goto incomplete;
159 if (m == 2)
160 {
161 unsigned char c2 = (unsigned char) p[1];
162
163 if ((c2 ^ 0x80) < 0x40
164 && (c >= 0xe1 || c2 >= 0xa0)
165 && (c != 0xed || c2 < 0xa0))
166 goto incomplete;
167 }
168 }
169 else if (c <= 0xf4)
170 {
171 if (m == 1)
172 goto incomplete;
173 else /* m == 2 || m == 3 */
174 {
175 unsigned char c2 = (unsigned char) p[1];
176
177 if ((c2 ^ 0x80) < 0x40
178 && (c >= 0xf1 || c2 >= 0x90)
179 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
180 {
181 if (m == 2)
182 goto incomplete;
183 else /* m == 3 */
184 {
185 unsigned char c3 = (unsigned char) p[2];
186
187 if ((c3 ^ 0x80) < 0x40)
188 goto incomplete;
189 }
190 }
191 }
192 }
193 }
194 goto invalid;
195 }
196
197 /* As a reference for this code, you can use the GNU libiconv
198 implementation. Look for uses of the RET_TOOFEW macro. */
199
200 if (STREQ_OPT (encoding,
201 "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
202 {
203 if (m == 1)
204 {
205 unsigned char c = (unsigned char) p[0];
206
207 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
208 goto incomplete;
209 }
210 if (m == 2)
211 {
212 unsigned char c = (unsigned char) p[0];
213
214 if (c == 0x8f)
215 {
216 unsigned char c2 = (unsigned char) p[1];
217
218 if (c2 >= 0xa1 && c2 < 0xff)
219 goto incomplete;
220 }
221 }
222 goto invalid;
223 }
224 if (STREQ_OPT (encoding,
225 "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
226 || STREQ_OPT (encoding,
227 "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
228 || STREQ_OPT (encoding,
229 "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
230 {
231 if (m == 1)
232 {
233 unsigned char c = (unsigned char) p[0];
234
235 if (c >= 0xa1 && c < 0xff)
236 goto incomplete;
237 }
238 goto invalid;
239 }
240 if (STREQ_OPT (encoding,
241 "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
242 {
243 if (m == 1)
244 {
245 unsigned char c = (unsigned char) p[0];
246
247 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
248 goto incomplete;
249 }
250 else /* m == 2 || m == 3 */
251 {
252 unsigned char c = (unsigned char) p[0];
253
254 if (c == 0x8e)
255 goto incomplete;
256 }
257 goto invalid;
258 }
259 if (STREQ_OPT (encoding,
260 "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
261 {
262 if (m == 1)
263 {
264 unsigned char c = (unsigned char) p[0];
265
266 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
267 goto incomplete;
268 }
269 else /* m == 2 || m == 3 */
270 {
271 unsigned char c = (unsigned char) p[0];
272
273 if (c >= 0x90 && c <= 0xe3)
274 {
275 unsigned char c2 = (unsigned char) p[1];
276
277 if (c2 >= 0x30 && c2 <= 0x39)
278 {
279 if (m == 2)
280 goto incomplete;
281 else /* m == 3 */
282 {
283 unsigned char c3 = (unsigned char) p[2];
284
285 if (c3 >= 0x81 && c3 <= 0xfe)
286 goto incomplete;
287 }
288 }
289 }
290 }
291 goto invalid;
292 }
293 if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
294 {
295 if (m == 1)
296 {
297 unsigned char c = (unsigned char) p[0];
298
299 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
300 || (c >= 0xf0 && c <= 0xf9))
301 goto incomplete;
302 }
303 goto invalid;
304 }
305
306 /* An unknown multibyte encoding. */
307 goto incomplete;
308 }
309
310 incomplete:
311 {
312 size_t k = nstate;
313 /* Here 0 <= k < m < 4. */
314 pstate[++k] = s[0];
315 if (k < m)
316 {
317 pstate[++k] = s[1];
318 if (k < m)
319 pstate[++k] = s[2];
320 }
321 if (k != m)
322 abort ();
323 }
324 pstate[0] = m;
325 return (size_t)(-2);
326
327 invalid:
328 errno = EILSEQ;
329 /* The conversion state is undefined, says POSIX. */
330 return (size_t)(-1);
331 }
332 }
333 }
334
335 #else
336 /* Override the system's mbrtowc() function. */
337
338 # undef mbrtowc
339
/* Wrapper around the system's mbrtowc() that compensates for the defects
   selected by the MBRTOWC_*_BUG and C_LOCALE_MAYBE_EILSEQ configuration
   macros.  With none of them set it simply forwards to mbrtowc, except
   that a null PWC is replaced by the address of a local variable.

   PWC  where to store the resulting wide character; may be NULL.
   S    the input bytes; when one of the NULL_ARG2, RETVAL or EMPTY_INPUT
        workarounds is enabled, NULL is handled like
        rpl_mbrtowc (NULL, "", 1, PS).
   N    the number of bytes available at S.
   PS   the conversion state; may be NULL.

   Returns what mbrtowc returns, adjusted by the enabled workarounds.  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  size_t ret;
  wchar_t wc;

# if MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_EMPTY_INPUT_BUG
  if (n == 0)
    return (size_t) -2;
# endif

  /* From here on PWC is never null, so the workarounds below can store
     through it and inspect *pwc unconditionally.  */
  if (! pwc)
    pwc = &wc;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        size_t count = 0;
        for (; n > 0; s++, n--)
          {
            ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        return (size_t)(-2);
      }
  }
# endif

  ret = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  /* A successfully converted null wide character must yield 0.  */
  if (ret < (size_t) -2 && !*pwc)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  /* Outside a hard locale, a failed or incomplete conversion is turned into
     a successful one-byte conversion: the wide character is the value of
     the first byte taken as unsigned char.  */
  if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
    {
      unsigned char uc = *s;
      *pwc = uc;
      return 1;
    }
# endif

  return ret;
}

414 #endif
415