1 /***
2 *mbrtowc.c - Convert multibyte char to wide char.
3 *
4 * Copyright (c) Microsoft Corporation. All rights reserved.
5 *
6 *Purpose:
7 * Convert a multibyte character into the equivalent wide character.
8 *
9 *******************************************************************************/
#include <corecrt_internal_mbstring.h>
#include <corecrt_internal_ptd_propagation.h>
#include <corecrt_internal_securecrt.h>
#include <limits.h>
#include <locale.h>
#include <stdio.h>
#include <string.h>
#include <uchar.h>
#include <wchar.h>
18
19 using namespace __crt_mbstring;
20
21 /***
22 *errno_t _mbrtowc_internal() - Helper function to convert multibyte char to wide character.
23 *
24 *Purpose:
25 * Convert a multi-byte character into the equivalent wide character,
26 * according to the specified LC_CTYPE category, or the current locale.
27 * [ANSI].
28 *
29 * NOTE: Currently, the C libraries support the "C" locale only.
30 * Non-C locale support now available under _INTL switch.
31 *Entry:
32 * wchar_t *dst = pointer to (single) destination wide character
33 * const char *s = pointer to multibyte character
34 * size_t n = maximum length of multibyte character to consider
35 * mbstate_t *pmbst = pointer to state (must be not nullptr)
36 * _locale_t plocinfo = locale info
37 *
38 *Exit:
39 * returns, in *pRetValue:
40 * If s = nullptr, 0, indicating we only use state-independent
41 * character encodings.
42 * If s != nullptr: 0 (if *s = null char)
43 * -1 (if the next n or fewer bytes not valid mbc)
44 * number of bytes comprising converted mbc
45 *
46 *Exceptions:
47 *
48 *******************************************************************************/
49
50 _Success_(return != 0)
51 _Post_satisfies_(*pRetValue <= _String_length_(s))
52 static errno_t __cdecl _mbrtowc_internal(
53 _Inout_ _Out_range_(<=, 1) int * pRetValue,
54 _Pre_maybenull_ _Out_writes_opt_z_(1) wchar_t * dst,
55 _In_opt_z_ const char * s,
56 _In_ size_t n,
57 _Inout_ mbstate_t * pmbst,
58 _Inout_ __crt_cached_ptd_host& ptd
59 ) throw()
60 {
61 _ASSERTE(pmbst != nullptr);
62 _ASSIGN_IF_NOT_NULL(dst, 0);
63
64 if (!s || n == 0)
65 {
66 /* indicate do not have state-dependent encodings,
67 handle zero length string */
68 _ASSIGN_IF_NOT_NULL(pRetValue, 0);
69 return 0;
70 }
71
72 if (!*s)
73 {
74 /* handle nullptr char */
75 _ASSIGN_IF_NOT_NULL(pRetValue, 0);
76 return 0;
77 }
78
79 const _locale_t locale = ptd.get_locale();
80
81 if (locale->locinfo->_public._locale_lc_codepage == CP_UTF8)
82 {
83 const size_t retval = __mbrtowc_utf8(dst, s, n, pmbst, ptd);
84 _ASSIGN_IF_NOT_NULL(pRetValue, static_cast<int>(retval));
85 return ptd.get_errno().value_or(0);
86 }
87
88 const int locale_mb_cur_max = locale->locinfo->_public._locale_mb_cur_max;
89 _ASSERTE(locale_mb_cur_max == 1 || locale_mb_cur_max == 2);
90
91 if (locale->locinfo->locale_name[LC_CTYPE] == nullptr)
92 {
93 _ASSIGN_IF_NOT_NULL(dst, (wchar_t) (unsigned char) *s);
94 _ASSIGN_IF_NOT_NULL(pRetValue, 1);
95 return 0;
96 }
97
98 if (pmbst->_Wchar != 0)
99 {
100 /* complete two-byte multibyte character */
101 ((char *) pmbst)[1] = *s;
102 if (locale_mb_cur_max <= 1 ||
103 (__acrt_MultiByteToWideChar(
104 locale->locinfo->_public._locale_lc_codepage,
105 MB_PRECOMPOSED | MB_ERR_INVALID_CHARS,
106 (char *) pmbst,
107 2,
108 dst,
109 (dst != nullptr ? 1 : 0)) == 0))
110 {
111 /* translation failed */
112 pmbst->_Wchar = 0;
113 _ASSIGN_IF_NOT_NULL(dst, 0);
114 _ASSIGN_IF_NOT_NULL(pRetValue, -1);
115 return ptd.get_errno().set(EILSEQ);
116 }
117 pmbst->_Wchar = 0;
118 _ASSIGN_IF_NOT_NULL(pRetValue, locale_mb_cur_max);
119 return 0;
120 }
121 else if (_isleadbyte_fast_internal((unsigned char) *s, locale))
122 {
123 /* multi-byte char */
124 if (n < (size_t) locale_mb_cur_max)
125 {
126 /* save partial multibyte character */
127 ((char *) pmbst)[0] = *s;
128 _ASSIGN_IF_NOT_NULL(pRetValue, -2);
129 return 0;
130 }
131 else if (locale_mb_cur_max <= 1 ||
132 (__acrt_MultiByteToWideChar(locale->locinfo->_public._locale_lc_codepage,
133 MB_PRECOMPOSED | MB_ERR_INVALID_CHARS,
134 s,
135 static_cast<int>(__min(strlen(s), INT_MAX)),
136 dst,
137 (dst != nullptr ? 1 : 0)) == 0))
138 {
139 /* validate high byte of mbcs char */
140 if (!*(s + 1))
141 {
142 pmbst->_Wchar = 0;
143 _ASSIGN_IF_NOT_NULL(dst, 0);
144 _ASSIGN_IF_NOT_NULL(pRetValue, -1);
145 return ptd.get_errno().set(EILSEQ);
146 }
147 }
148 _ASSIGN_IF_NOT_NULL(pRetValue, locale_mb_cur_max);
149 return 0;
150 }
151 else {
152 /* single byte char */
153 if (__acrt_MultiByteToWideChar(
154 locale->locinfo->_public._locale_lc_codepage,
155 MB_PRECOMPOSED | MB_ERR_INVALID_CHARS,
156 s,
157 1,
158 dst,
159 (dst != nullptr ? 1 : 0)) == 0)
160 {
161 _ASSIGN_IF_NOT_NULL(dst, 0);
162 _ASSIGN_IF_NOT_NULL(pRetValue, -1);
163 return ptd.get_errno().set(EILSEQ);
164 }
165
166 _ASSIGN_IF_NOT_NULL(pRetValue, sizeof(char) );
167 return 0;
168 }
169 }
170
171
172 /***
173 *wint_t btowc(c) - translate single byte to wide char
174 *
175 *Purpose:
176 *
177 *Entry:
178 *
179 *Exit:
180 *
181 *Exceptions:
182 *
183 *******************************************************************************/
184
btowc(int c)185 extern "C" wint_t __cdecl btowc(
186 int c
187 )
188 {
189 if (c == EOF)
190 {
191 return WEOF;
192 }
193 else
194 {
195 /* convert as one-byte string */
196 char ch = (char) c;
197 mbstate_t mbst = {};
198 wchar_t wc = 0;
199 int retValue = -1;
200
201 __crt_cached_ptd_host ptd;
202 _mbrtowc_internal(&retValue, &wc, &ch, 1, &mbst, ptd);
203 return (retValue < 0 ? WEOF : wc);
204 }
205 }
206
207
208 /***
209 *size_t mbrlen(s, n, pst) - determine next multibyte code, restartably
210 *
211 *Purpose:
212 *
213 *Entry:
214 *
215 *Exit:
216 *
217 *Exceptions:
218 *
219 *******************************************************************************/
220
mbrlen(const char * s,size_t n,mbstate_t * pst)221 extern "C" size_t __cdecl mbrlen(
222 const char *s,
223 size_t n,
224 mbstate_t *pst
225 )
226 {
227 static mbstate_t mbst = {};
228 int retValue = -1;
229
230 __crt_cached_ptd_host ptd;
231 _mbrtowc_internal(&retValue, nullptr, s, n, (pst != nullptr ? pst : &mbst), ptd);
232 return retValue;
233 }
234
235
236 /***
237 *size_t mbrtowc(pwc, s, n, pst) - translate multibyte to wchar_t, restartably
238 *
239 *Purpose:
240 *
241 *Entry:
242 *
243 *Exit:
244 *
245 *Exceptions:
246 *
247 *******************************************************************************/
248
mbrtowc(wchar_t * dst,const char * s,size_t n,mbstate_t * pst)249 extern "C" size_t __cdecl mbrtowc(
250 wchar_t *dst,
251 const char *s,
252 size_t n,
253 mbstate_t *pst
254 )
255 {
256 static mbstate_t mbst = {};
257 int retValue = -1;
258
259 __crt_cached_ptd_host ptd;
260
261 if (s != nullptr)
262 {
263 _mbrtowc_internal(&retValue, dst, s, n, (pst != nullptr ? pst : &mbst), ptd);
264 }
265 else
266 {
267 _mbrtowc_internal(&retValue, nullptr, "", 1, (pst != nullptr ? pst : &mbst), ptd);
268 }
269 return retValue;
270 }
271
272
273 /***
274 *size_t mbsrtowcs(wcs, ps, n, pst) - translate multibyte string to wide,
275 * restartably
276 *
277 *Purpose:
278 *
279 *Entry:
280 *
281 *Exit:
282 *
283 *Exceptions:
284 *
285 *******************************************************************************/
286
287 /* Helper function shared by the secure and non-secure versions. */
288
289 _Success_(return == 0)
_mbsrtowcs_helper(_Out_writes_opt_z_ (n)wchar_t * wcs,_Deref_pre_opt_z_ const char ** ps,_In_ size_t n,_Inout_ mbstate_t * pst,_Inout_ __crt_cached_ptd_host & ptd)290 static size_t __cdecl _mbsrtowcs_helper(
291 _Out_writes_opt_z_(n) wchar_t * wcs,
292 _Deref_pre_opt_z_ const char ** ps,
293 _In_ size_t n,
294 _Inout_ mbstate_t * pst,
295 _Inout_ __crt_cached_ptd_host& ptd
296 ) throw()
297 {
298 /* validation section */
299 _UCRT_VALIDATE_RETURN(ptd, ps != nullptr, EINVAL, (size_t) - 1);
300
301 static mbstate_t mbst = {};
302 const char *s = *ps;
303 int i = 0;
304 size_t nwc = 0;
305
306 // Use the static cached state if necessary
307 if (pst == nullptr)
308 {
309 pst = &mbst;
310 }
311
312 const _locale_t locale = ptd.get_locale();
313
314 if (locale->locinfo->_public._locale_lc_codepage == CP_UTF8)
315 {
316 return __mbsrtowcs_utf8(wcs, ps, n, pst, ptd);
317 }
318
319 if (wcs == nullptr)
320 {
321 for (;; ++nwc, s += i)
322 {
323 /* translate but don't store */
324 wchar_t wc;
325 _mbrtowc_internal(&i, &wc, s, INT_MAX, pst, ptd);
326 if (i < 0)
327 {
328 return (size_t) - 1;
329 }
330 else if (i == 0)
331 {
332 return nwc;
333 }
334 }
335 }
336
337 for (; 0 < n; ++nwc, s += i, ++wcs, --n)
338 {
339 /* translate and store */
340 _mbrtowc_internal(&i, wcs, s, INT_MAX, pst, ptd);
341 if (i < 0)
342 {
343 /* encountered invalid sequence */
344 nwc = (size_t) - 1;
345 break;
346 }
347 else if (i == 0)
348 {
349 /* encountered terminating null */
350 s = 0;
351 break;
352 }
353 }
354
355 *ps = s;
356 return nwc;
357 }
358
359 /***
360 *size_t mbsrtowcs() - Convert multibyte char string to wide char string.
361 *
362 *Purpose:
363 * Convert a multi-byte char string into the equivalent wide char string,
364 * according to the LC_CTYPE category of the current locale.
365 * Same as mbsrtowcs_s(), but the destination may not be null terminated.
366 * If there's not enough space, we return EINVAL.
367 *
368 *Entry:
369 * wchar_t *pwcs = pointer to destination wide character string buffer
370 * const char **s = pointer to source multibyte character string
371 * size_t n = maximum number of wide characters to store (not including the terminating null character)
372 * mbstate_t *pst = pointer to the conversion state
373 *
374 *Exit:
375 * The nunber if wide characters written to *wcs, not including any terminating null character)
376 *
377 *Exceptions:
378 * Input parameters are validated. Refer to the validation section of the function.
379 *
380 *******************************************************************************/
mbsrtowcs(wchar_t * wcs,const char ** ps,size_t n,mbstate_t * pst)381 extern "C" size_t __cdecl mbsrtowcs(
382 wchar_t * wcs,
383 const char ** ps,
384 size_t n,
385 mbstate_t * pst
386 )
387 {
388 /* Call a non-deprecated helper to do the work. */
389 __crt_cached_ptd_host ptd;
390 return _mbsrtowcs_helper(wcs, ps, n, pst, ptd);
391 }
392
393
394 /***
395 *errno_t mbsrtowcs_s() - Convert multibyte char string to wide char string.
396 *
397 *Purpose:
398 * Convert a multi-byte char string into the equivalent wide char string,
399 * according to the LC_CTYPE category of the current locale.
400 * Same as mbsrtowcs(), but the destination is ensured to be null terminated.
401 * If there's not enough space, we return EINVAL.
402 *
403 *Entry:
404 * size_t *pRetValue = Number of bytes modified including the terminating nullptr
405 * This pointer can be nullptr.
406 * wchar_t *pwcs = pointer to destination wide character string buffer
407 * size_t sizeInWords = size of the destination buffer
408 * const char **s = pointer to source multibyte character string
409 * size_t n = maximum number of wide characters to store (not including the terminating null character)
410 * mbstate_t *pst = pointer to the conversion state
411 *
412 *Exit:
413 * The error code.
414 *
415 *Exceptions:
416 * Input parameters are validated. Refer to the validation section of the function.
417 *
418 *******************************************************************************/
419
mbsrtowcs_s_internal(size_t * pRetValue,wchar_t * dst,size_t sizeInWords,const char ** ps,size_t n,mbstate_t * pmbst,__crt_cached_ptd_host & ptd)420 static errno_t __cdecl mbsrtowcs_s_internal(
421 size_t * pRetValue,
422 wchar_t * dst,
423 size_t sizeInWords,
424 const char ** ps,
425 size_t n,
426 mbstate_t * pmbst,
427 __crt_cached_ptd_host& ptd
428 )
429 {
430 size_t retsize;
431
432 /* validation section */
433 _ASSIGN_IF_NOT_NULL(pRetValue, (size_t) - 1);
434 _UCRT_VALIDATE_RETURN_ERRCODE(ptd, (dst == nullptr && sizeInWords == 0) || (dst != nullptr && sizeInWords > 0), EINVAL);
435 if (dst != nullptr)
436 {
437 _RESET_STRING(dst, sizeInWords);
438 }
439 _UCRT_VALIDATE_RETURN_ERRCODE(ptd, ps != nullptr, EINVAL);
440
441 /* Call a non-deprecated helper to do the work. */
442
443 retsize = _mbsrtowcs_helper(dst, ps, (n > sizeInWords ? sizeInWords : n), pmbst, ptd);
444
445 if (retsize == (size_t) - 1)
446 {
447 if (dst != nullptr)
448 {
449 _RESET_STRING(dst, sizeInWords);
450 }
451 return ptd.get_errno().value_or(0);
452 }
453
454 /* count the null terminator */
455 retsize++;
456
457 if (dst != nullptr)
458 {
459 /* return error if the string does not fit */
460 if (retsize > sizeInWords)
461 {
462 _RESET_STRING(dst, sizeInWords);
463 _UCRT_VALIDATE_RETURN_ERRCODE(ptd, sizeInWords <= retsize, ERANGE);
464 }
465 else
466 {
467 /* ensure the string is null terminated */
468 dst[retsize - 1] = '\0';
469 }
470 }
471
472 _ASSIGN_IF_NOT_NULL(pRetValue, retsize);
473
474 return 0;
475 }
476
mbsrtowcs_s(size_t * pRetValue,wchar_t * dst,size_t sizeInWords,const char ** ps,size_t n,mbstate_t * pmbst)477 extern "C" errno_t __cdecl mbsrtowcs_s(
478 size_t * pRetValue,
479 wchar_t * dst,
480 size_t sizeInWords,
481 const char ** ps,
482 size_t n,
483 mbstate_t * pmbst
484 )
485 {
486 __crt_cached_ptd_host ptd;
487 return mbsrtowcs_s_internal(pRetValue, dst, sizeInWords, ps, n, pmbst, ptd);
488 }
489
__mbrtowc_utf8(wchar_t * pwc,const char * s,size_t n,mbstate_t * ps,__crt_cached_ptd_host & ptd)490 size_t __cdecl __crt_mbstring::__mbrtowc_utf8(wchar_t* pwc, const char* s, size_t n, mbstate_t* ps, __crt_cached_ptd_host& ptd)
491 {
492 static_assert(sizeof(wchar_t) == 2, "wchar_t is assumed to be 16 bits");
493 char32_t c32;
494 const size_t retval = __mbrtoc32_utf8(&c32, s, n, ps, ptd);
495 // If we succesfully consumed a character, write the result after a quick range check
496 if (retval <= 4)
497 {
498 if (c32 > 0xffff)
499 {
500 // A 4-byte UTF-8 character won't fit into a single UTF-16 wchar
501 // So return the "replacement char"
502 c32 = 0xfffd;
503 }
504 _ASSIGN_IF_NOT_NULL(pwc, static_cast<wchar_t>(c32));
505 }
506 return retval;
507 }
508
__mbsrtowcs_utf8(wchar_t * dst,const char ** src,size_t len,mbstate_t * ps,__crt_cached_ptd_host & ptd)509 size_t __cdecl __crt_mbstring::__mbsrtowcs_utf8(wchar_t* dst, const char** src, size_t len, mbstate_t* ps, __crt_cached_ptd_host& ptd)
510 {
511 const char* current_src = *src;
512
513 auto compute_available = [](const char* s) -> size_t
514 {
515 // We shouldn't just blindly request to read 4 bytes, because there might not be 4 bytes left to read.
516 if (s[0] == '\0')
517 {
518 return 1;
519 }
520 else if (s[1] == '\0')
521 {
522 return 2;
523 }
524 else if (s[2] == '\0')
525 {
526 return 3;
527 }
528 return 4;
529 };
530
531 if (dst != nullptr)
532 {
533 wchar_t* current_dest = dst;
534 for (; len > 0; --len)
535 {
536 const size_t avail = compute_available(current_src);
537 char32_t c32;
538 const size_t retval = __mbrtoc32_utf8(&c32, current_src, avail, ps, ptd);
539 if (retval == __crt_mbstring::INVALID)
540 {
541 // Set src to the beginning of the invalid char
542 *src = current_src;
543 ptd.get_errno().set(EILSEQ);
544 return retval;
545 }
546 else if (retval == 0)
547 {
548 current_src = nullptr;
549 *current_dest = L'\0';
550 break;
551 }
552 else if (c32 > 0xffff)
553 {
554 // This is going to take two output wchars. Make sure we have enough room for this output.
555 if (len > 1)
556 {
557 --len;
558 c32 -= 0x10000;
559 const char16_t high_surrogate = static_cast<char16_t>((c32 >> 10) | 0xd800);
560 const char16_t low_surrogate = static_cast<char16_t>((c32 & 0x03ff) | 0xdc00);
561 *current_dest++ = high_surrogate;
562 *current_dest++ = low_surrogate;
563 }
564 else
565 {
566 break;
567 }
568 }
569 else
570 {
571 *current_dest++ = static_cast<wchar_t>(c32);
572 }
573 current_src += retval;
574 }
575 *src = current_src;
576 return current_dest - dst;
577 }
578 else
579 {
580 size_t total_count = 0;
581 for (;; ++total_count)
582 {
583 const size_t avail = compute_available(current_src);
584
585 const size_t retval = __mbrtoc32_utf8(nullptr, current_src, avail, ps, ptd);
586 if (retval == __crt_mbstring::INVALID)
587 {
588 ptd.get_errno().set(EILSEQ);
589 return retval;
590 }
591 else if (retval == 0)
592 {
593 break;
594 }
595 else if (retval == 4)
596 {
597 // SMP characters take two UTF-16 wide chars
598 ++total_count;
599 }
600 else
601 {
602 // This should be impossible. Means we encountered a multibyte char
603 // that extended past the null terminator, or is more than 4 bytes long
604 _ASSERTE(retval != __crt_mbstring::INCOMPLETE);
605 }
606 current_src += retval;
607 }
608 return total_count;
609 }
610 }
611