/* -*- buffer-read-only: t -*- vi: set ro: */
/* DO NOT EDIT! GENERATED AUTOMATICALLY! */
/* Convert multibyte character to wide character.
   Copyright (C) 1999-2002, 2005-2019 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2008.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
19
20 #include <config.h>
21
22 /* Specification. */
23 #include <wchar.h>
24
25 #if C_LOCALE_MAYBE_EILSEQ
26 # include "hard-locale.h"
27 # include <locale.h>
28 #endif
29
30 #if GNULIB_defined_mbstate_t
31 /* Implement mbrtowc() on top of mbtowc(). */
32
33 # include <errno.h>
34 # include <stdlib.h>
35
36 # include "localcharset.h"
37 # include "streq.h"
38 # include "verify.h"
39
/* FALLTHROUGH marks an intentional fall-through from one 'switch' case into
   the next.  Where the compiler understands the statement attribute
   __attribute__ ((__fallthrough__)) it expands to that attribute; elsewhere
   it expands to a harmless no-op expression.
   __has_attribute is consulted first so that non-GCC compilers which
   implement the attribute are covered too; the __GNUC__ version test is kept
   as a fallback for compilers without __has_attribute.  */
# ifndef FALLTHROUGH
#  ifdef __has_attribute
#   if __has_attribute (__fallthrough__)
#    define FALLTHROUGH __attribute__ ((__fallthrough__))
#   endif
#  endif
#  ifndef FALLTHROUGH
#   if __GNUC__ >= 7
#    define FALLTHROUGH __attribute__ ((__fallthrough__))
#   else
#    define FALLTHROUGH ((void) 0)
#   endif
#  endif
# endif
47
/* Classification of the locale encodings that need special treatment.
   Each value other than enc_other names one encoding, or a group of
   encodings that share the same lead-byte rules.  */
typedef enum {
  enc_other,      /* other */
  enc_utf8,       /* UTF-8 */
  enc_eucjp,      /* EUC-JP */
  enc_94,         /* EUC-KR, GB2312, BIG5 */
  enc_euctw,      /* EUC-TW */
  enc_gb18030,    /* GB18030 */
  enc_sjis        /* SJIS */
} enc_t;
59 static inline enc_t
locale_enc(void)60 locale_enc (void)
61 {
62 const char *encoding = locale_charset ();
63 if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
64 return enc_utf8;
65 if (STREQ_OPT (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
66 return enc_eucjp;
67 if (STREQ_OPT (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
68 || STREQ_OPT (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
69 || STREQ_OPT (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
70 return enc_94;
71 if (STREQ_OPT (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
72 return enc_euctw;
73 if (STREQ_OPT (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
74 return enc_gb18030;
75 if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
76 return enc_sjis;
77 return enc_other;
78 }
79
#if GNULIB_WCHAR_SINGLE
/* When we know that the locale does not change, provide a speedup by
   caching the value of locale_enc.  */

/* Cached result of locale_enc (), or -1 as long as it has not been
   computed yet.  */
static int cached_locale_enc = -1;

/* Return the classification of the current locale's encoding, calling
   locale_enc () only on the first invocation.  */
static inline enc_t
locale_enc_cached (void)
{
  if (cached_locale_enc < 0)
    cached_locale_enc = locale_enc ();
  return cached_locale_enc;
}
#else
/* By default, don't make assumptions, hence no caching.  */
# define locale_enc_cached locale_enc
#endif
95
96 verify (sizeof (mbstate_t) >= 4);
97
98 static char internal_state[4];
99
100 size_t
mbrtowc(wchar_t * pwc,const char * s,size_t n,mbstate_t * ps)101 mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
102 {
103 char *pstate = (char *)ps;
104
105 if (s == NULL)
106 {
107 pwc = NULL;
108 s = "";
109 n = 1;
110 }
111
112 if (n == 0)
113 return (size_t)(-2);
114
115 /* Here n > 0. */
116
117 if (pstate == NULL)
118 pstate = internal_state;
119
120 {
121 size_t nstate = pstate[0];
122 char buf[4];
123 const char *p;
124 size_t m;
125
126 switch (nstate)
127 {
128 case 0:
129 p = s;
130 m = n;
131 break;
132 case 3:
133 buf[2] = pstate[3];
134 FALLTHROUGH;
135 case 2:
136 buf[1] = pstate[2];
137 FALLTHROUGH;
138 case 1:
139 buf[0] = pstate[1];
140 p = buf;
141 m = nstate;
142 buf[m++] = s[0];
143 if (n >= 2 && m < 4)
144 {
145 buf[m++] = s[1];
146 if (n >= 3 && m < 4)
147 buf[m++] = s[2];
148 }
149 break;
150 default:
151 errno = EINVAL;
152 return (size_t)(-1);
153 }
154
155 /* Here m > 0. */
156
157 # if __GLIBC__ || defined __UCLIBC__
158 /* Work around bug <https://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
159 mbtowc (NULL, NULL, 0);
160 # endif
161 {
162 int res = mbtowc (pwc, p, m);
163
164 if (res >= 0)
165 {
166 if (pwc != NULL && ((*pwc == 0) != (res == 0)))
167 abort ();
168 if (nstate >= (res > 0 ? res : 1))
169 abort ();
170 res -= nstate;
171 pstate[0] = 0;
172 return res;
173 }
174
175 /* mbtowc does not distinguish between invalid and incomplete multibyte
176 sequences. But mbrtowc needs to make this distinction.
177 There are two possible approaches:
178 - Use iconv() and its return value.
179 - Use built-in knowledge about the possible encodings.
180 Given the low quality of implementation of iconv() on the systems that
181 lack mbrtowc(), we use the second approach.
182 The possible encodings are:
183 - 8-bit encodings,
184 - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
185 - UTF-8.
186 Use specialized code for each. */
187 if (m >= 4 || m >= MB_CUR_MAX)
188 goto invalid;
189 /* Here MB_CUR_MAX > 1 and 0 < m < 4. */
190 switch (locale_enc_cached ())
191 {
192 case enc_utf8: /* UTF-8 */
193 {
194 /* Cf. unistr/u8-mblen.c. */
195 unsigned char c = (unsigned char) p[0];
196
197 if (c >= 0xc2)
198 {
199 if (c < 0xe0)
200 {
201 if (m == 1)
202 goto incomplete;
203 }
204 else if (c < 0xf0)
205 {
206 if (m == 1)
207 goto incomplete;
208 if (m == 2)
209 {
210 unsigned char c2 = (unsigned char) p[1];
211
212 if ((c2 ^ 0x80) < 0x40
213 && (c >= 0xe1 || c2 >= 0xa0)
214 && (c != 0xed || c2 < 0xa0))
215 goto incomplete;
216 }
217 }
218 else if (c <= 0xf4)
219 {
220 if (m == 1)
221 goto incomplete;
222 else /* m == 2 || m == 3 */
223 {
224 unsigned char c2 = (unsigned char) p[1];
225
226 if ((c2 ^ 0x80) < 0x40
227 && (c >= 0xf1 || c2 >= 0x90)
228 && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
229 {
230 if (m == 2)
231 goto incomplete;
232 else /* m == 3 */
233 {
234 unsigned char c3 = (unsigned char) p[2];
235
236 if ((c3 ^ 0x80) < 0x40)
237 goto incomplete;
238 }
239 }
240 }
241 }
242 }
243 goto invalid;
244 }
245
246 /* As a reference for this code, you can use the GNU libiconv
247 implementation. Look for uses of the RET_TOOFEW macro. */
248
249 case enc_eucjp: /* EUC-JP */
250 {
251 if (m == 1)
252 {
253 unsigned char c = (unsigned char) p[0];
254
255 if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
256 goto incomplete;
257 }
258 if (m == 2)
259 {
260 unsigned char c = (unsigned char) p[0];
261
262 if (c == 0x8f)
263 {
264 unsigned char c2 = (unsigned char) p[1];
265
266 if (c2 >= 0xa1 && c2 < 0xff)
267 goto incomplete;
268 }
269 }
270 goto invalid;
271 }
272
273 case enc_94: /* EUC-KR, GB2312, BIG5 */
274 {
275 if (m == 1)
276 {
277 unsigned char c = (unsigned char) p[0];
278
279 if (c >= 0xa1 && c < 0xff)
280 goto incomplete;
281 }
282 goto invalid;
283 }
284
285 case enc_euctw: /* EUC-TW */
286 {
287 if (m == 1)
288 {
289 unsigned char c = (unsigned char) p[0];
290
291 if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
292 goto incomplete;
293 }
294 else /* m == 2 || m == 3 */
295 {
296 unsigned char c = (unsigned char) p[0];
297
298 if (c == 0x8e)
299 goto incomplete;
300 }
301 goto invalid;
302 }
303
304 case enc_gb18030: /* GB18030 */
305 {
306 if (m == 1)
307 {
308 unsigned char c = (unsigned char) p[0];
309
310 if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
311 goto incomplete;
312 }
313 else /* m == 2 || m == 3 */
314 {
315 unsigned char c = (unsigned char) p[0];
316
317 if (c >= 0x90 && c <= 0xe3)
318 {
319 unsigned char c2 = (unsigned char) p[1];
320
321 if (c2 >= 0x30 && c2 <= 0x39)
322 {
323 if (m == 2)
324 goto incomplete;
325 else /* m == 3 */
326 {
327 unsigned char c3 = (unsigned char) p[2];
328
329 if (c3 >= 0x81 && c3 <= 0xfe)
330 goto incomplete;
331 }
332 }
333 }
334 }
335 goto invalid;
336 }
337
338 case enc_sjis: /* SJIS */
339 {
340 if (m == 1)
341 {
342 unsigned char c = (unsigned char) p[0];
343
344 if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
345 || (c >= 0xf0 && c <= 0xf9))
346 goto incomplete;
347 }
348 goto invalid;
349 }
350
351 default:
352 /* An unknown multibyte encoding. */
353 goto incomplete;
354 }
355
356 incomplete:
357 {
358 size_t k = nstate;
359 /* Here 0 <= k < m < 4. */
360 pstate[++k] = s[0];
361 if (k < m)
362 {
363 pstate[++k] = s[1];
364 if (k < m)
365 pstate[++k] = s[2];
366 }
367 if (k != m)
368 abort ();
369 }
370 pstate[0] = m;
371 return (size_t)(-2);
372
373 invalid:
374 errno = EILSEQ;
375 /* The conversion state is undefined, says POSIX. */
376 return (size_t)(-1);
377 }
378 }
379 }
380
381 #else
382 /* Override the system's mbrtowc() function. */
383
384 # undef mbrtowc
385
/* Replacement for the system's mbrtowc() that works around the bugs
   detected at configure time.  Each workaround is compiled in only if the
   corresponding macro is set:
     MBRTOWC_NULL_ARG2_BUG, MBRTOWC_RETVAL_BUG, MBRTOWC_EMPTY_INPUT_BUG,
     C_LOCALE_MAYBE_EILSEQ
                               treat S == NULL like PWC == NULL, S == "",
                               N == 1 before calling the system function,
     MBRTOWC_EMPTY_INPUT_BUG   return (size_t) -2 for N == 0,
     MBRTOWC_RETVAL_BUG        when continuing a character, feed the input
                               byte by byte and count the bytes ourselves,
     MBRTOWC_NUL_RETVAL_BUG    return 0 when the null wide character was
                               produced,
     C_LOCALE_MAYBE_EILSEQ     when the locale is not a hard locale, turn a
                               failure into a successful conversion of the
                               single byte *S.
   Arguments and return value are those of mbrtowc().  */
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  size_t ret;
  wchar_t wc;

  /* C_LOCALE_MAYBE_EILSEQ belongs in this condition because its fallback
     below reads *S, which therefore must not be NULL.  */
# if (MBRTOWC_NULL_ARG2_BUG || MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG \
      || C_LOCALE_MAYBE_EILSEQ)
  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }
# endif

# if MBRTOWC_EMPTY_INPUT_BUG
  if (n == 0)
    return (size_t) -2;
# endif

  /* From here on *PWC can always be written and read.  */
  if (! pwc)
    pwc = &wc;

# if MBRTOWC_RETVAL_BUG
  {
    static mbstate_t internal_state;

    /* Override mbrtowc's internal state.  We cannot call mbsinit() on the
       hidden internal state, but we can call it on our variable.  */
    if (ps == NULL)
      ps = &internal_state;

    if (!mbsinit (ps))
      {
        /* Parse the rest of the multibyte character byte for byte.  */
        size_t count = 0;

        for (; n > 0; s++, n--)
          {
            ret = mbrtowc (&wc, s, 1, ps);

            if (ret == (size_t)(-1))
              return (size_t)(-1);
            count++;
            if (ret != (size_t)(-2))
              {
                /* The multibyte character has been completed.  */
                *pwc = wc;
                return (wc == 0 ? 0 : count);
              }
          }
        /* The input ended before the character did.  */
        return (size_t)(-2);
      }
  }
# endif

  ret = mbrtowc (pwc, s, n, ps);

# if MBRTOWC_NUL_RETVAL_BUG
  if (ret < (size_t) -2 && !*pwc)
    return 0;
# endif

# if C_LOCALE_MAYBE_EILSEQ
  if ((size_t) -2 <= ret && n != 0 && ! hard_locale (LC_CTYPE))
    {
      /* Map the byte to the wide character with the same value.  */
      unsigned char uc = *s;
      *pwc = uc;
      return 1;
    }
# endif

  return ret;
}
459
460 #endif
461