xref: /reactos/sdk/lib/ucrt/convert/mbrtowc.cpp (revision 04e0dc4a)
1 /***
2 *mbrtowc.c - Convert multibyte char to wide char.
3 *
4 *       Copyright (c) Microsoft Corporation. All rights reserved.
5 *
6 *Purpose:
7 *       Convert a multibyte character into the equivalent wide character.
8 *
9 *******************************************************************************/
10 #include <corecrt_internal_mbstring.h>
11 #include <corecrt_internal_ptd_propagation.h>
12 #include <corecrt_internal_securecrt.h>
13 #include <limits.h>
14 #include <locale.h>
15 #include <stdio.h>
16 #include <uchar.h>
17 #include <wchar.h>
18 
19 using namespace __crt_mbstring;
20 
21 /***
22 *errno_t _mbrtowc_internal() - Helper function to convert multibyte char to wide character.
23 *
24 *Purpose:
25 *       Convert a multi-byte character into the equivalent wide character,
26 *       according to the specified LC_CTYPE category, or the current locale.
27 *       [ANSI].
28 *
29 *       NOTE:  Currently, the C libraries support the "C" locale only.
30 *              Non-C locale support now available under _INTL switch.
31 *Entry:
32 *       wchar_t *dst       = pointer to (single) destination wide character
33 *       const char *s      = pointer to multibyte character
34 *       size_t n           = maximum length of multibyte character to consider
35 *       mbstate_t *pmbst   = pointer to state (must be not nullptr)
36 *       _locale_t plocinfo = locale info
37 *
38 *Exit:
*       returns, in *pRetValue:
*       If s = nullptr or n = 0:  0, indicating we only use state-independent
*       character encodings.
*       If s != nullptr:  0 (if *s = null char)
*                      -1 (if the next n or fewer bytes not valid mbc)
*                      -2 (if s starts with a lead byte but n is smaller than
*                          the locale's maximum mbc length; the lead byte is
*                          saved in *pmbst)
*                      number of bytes comprising converted mbc
45 *
46 *Exceptions:
47 *
48 *******************************************************************************/
49 
50 _Success_(return != 0)
51 _Post_satisfies_(*pRetValue <= _String_length_(s))
52 static errno_t __cdecl _mbrtowc_internal(
53     _Inout_ _Out_range_(<=, 1)              int *                  pRetValue,
54     _Pre_maybenull_ _Out_writes_opt_z_(1)   wchar_t *              dst,
55     _In_opt_z_                              const char *           s,
56     _In_                                    size_t                 n,
57     _Inout_                                 mbstate_t *            pmbst,
58     _Inout_                                 __crt_cached_ptd_host& ptd
59     ) throw()
60 {
61     _ASSERTE(pmbst != nullptr);
62     _ASSIGN_IF_NOT_NULL(dst, 0);
63 
64     if (!s || n == 0)
65     {
66         /* indicate do not have state-dependent encodings,
67         handle zero length string */
68         _ASSIGN_IF_NOT_NULL(pRetValue, 0);
69         return 0;
70     }
71 
72     if (!*s)
73     {
74         /* handle nullptr char */
75         _ASSIGN_IF_NOT_NULL(pRetValue, 0);
76         return 0;
77     }
78 
79     const _locale_t locale = ptd.get_locale();
80 
81     if (locale->locinfo->_public._locale_lc_codepage == CP_UTF8)
82     {
83         const size_t retval = __mbrtowc_utf8(dst, s, n, pmbst, ptd);
84         _ASSIGN_IF_NOT_NULL(pRetValue, static_cast<int>(retval));
85         return ptd.get_errno().value_or(0);
86     }
87 
88     const int locale_mb_cur_max = locale->locinfo->_public._locale_mb_cur_max;
89     _ASSERTE(locale_mb_cur_max == 1 || locale_mb_cur_max == 2);
90 
91     if (locale->locinfo->locale_name[LC_CTYPE] == nullptr)
92     {
93         _ASSIGN_IF_NOT_NULL(dst, (wchar_t) (unsigned char) *s);
94         _ASSIGN_IF_NOT_NULL(pRetValue, 1);
95         return 0;
96     }
97 
98     if (pmbst->_Wchar != 0)
99     {
100         /* complete two-byte multibyte character */
101         ((char *) pmbst)[1] = *s;
102         if (locale_mb_cur_max <= 1 ||
103             (__acrt_MultiByteToWideChar(
104             locale->locinfo->_public._locale_lc_codepage,
105             MB_PRECOMPOSED | MB_ERR_INVALID_CHARS,
106             (char *) pmbst,
107             2,
108             dst,
109             (dst != nullptr ? 1 : 0)) == 0))
110         {
111             /* translation failed */
112             pmbst->_Wchar = 0;
113             _ASSIGN_IF_NOT_NULL(dst, 0);
114             _ASSIGN_IF_NOT_NULL(pRetValue, -1);
115             return ptd.get_errno().set(EILSEQ);
116         }
117         pmbst->_Wchar = 0;
118         _ASSIGN_IF_NOT_NULL(pRetValue, locale_mb_cur_max);
119         return 0;
120     }
121     else if (_isleadbyte_fast_internal((unsigned char) *s, locale))
122     {
123         /* multi-byte char */
124         if (n < (size_t) locale_mb_cur_max)
125         {
126             /* save partial multibyte character */
127             ((char *) pmbst)[0] = *s;
128             _ASSIGN_IF_NOT_NULL(pRetValue, -2);
129             return 0;
130         }
131         else if (locale_mb_cur_max <= 1 ||
132             (__acrt_MultiByteToWideChar(locale->locinfo->_public._locale_lc_codepage,
133             MB_PRECOMPOSED | MB_ERR_INVALID_CHARS,
134             s,
135             static_cast<int>(__min(strlen(s), INT_MAX)),
136             dst,
137             (dst != nullptr ? 1 : 0)) == 0))
138         {
139             /* validate high byte of mbcs char */
140             if (!*(s + 1))
141             {
142                 pmbst->_Wchar = 0;
143                 _ASSIGN_IF_NOT_NULL(dst, 0);
144                 _ASSIGN_IF_NOT_NULL(pRetValue, -1);
145                 return ptd.get_errno().set(EILSEQ);
146             }
147         }
148         _ASSIGN_IF_NOT_NULL(pRetValue, locale_mb_cur_max);
149         return 0;
150     }
151     else {
152         /* single byte char */
153         if (__acrt_MultiByteToWideChar(
154             locale->locinfo->_public._locale_lc_codepage,
155             MB_PRECOMPOSED | MB_ERR_INVALID_CHARS,
156             s,
157             1,
158             dst,
159             (dst != nullptr ? 1 : 0)) == 0)
160         {
161             _ASSIGN_IF_NOT_NULL(dst, 0);
162             _ASSIGN_IF_NOT_NULL(pRetValue, -1);
163             return ptd.get_errno().set(EILSEQ);
164         }
165 
166         _ASSIGN_IF_NOT_NULL(pRetValue, sizeof(char) );
167         return 0;
168     }
169 }
170 
171 
172 /***
173 *wint_t btowc(c) - translate single byte to wide char
174 *
175 *Purpose:
176 *
177 *Entry:
178 *
179 *Exit:
180 *
181 *Exceptions:
182 *
183 *******************************************************************************/
184 
btowc(int c)185 extern "C" wint_t __cdecl btowc(
186     int c
187     )
188 {
189     if (c == EOF)
190     {
191         return WEOF;
192     }
193     else
194     {
195         /* convert as one-byte string */
196         char ch = (char) c;
197         mbstate_t mbst = {};
198         wchar_t wc = 0;
199         int retValue = -1;
200 
201         __crt_cached_ptd_host ptd;
202         _mbrtowc_internal(&retValue, &wc, &ch, 1, &mbst, ptd);
203         return (retValue < 0 ? WEOF : wc);
204     }
205 }
206 
207 
208 /***
209 *size_t mbrlen(s, n, pst) - determine next multibyte code, restartably
210 *
211 *Purpose:
212 *
213 *Entry:
214 *
215 *Exit:
216 *
217 *Exceptions:
218 *
219 *******************************************************************************/
220 
mbrlen(const char * s,size_t n,mbstate_t * pst)221 extern "C" size_t __cdecl mbrlen(
222     const char *s,
223     size_t n,
224     mbstate_t *pst
225     )
226 {
227     static mbstate_t mbst = {};
228     int retValue = -1;
229 
230     __crt_cached_ptd_host ptd;
231     _mbrtowc_internal(&retValue, nullptr, s, n, (pst != nullptr ? pst : &mbst), ptd);
232     return retValue;
233 }
234 
235 
236 /***
237 *size_t mbrtowc(pwc, s, n, pst) - translate multibyte to wchar_t, restartably
238 *
239 *Purpose:
240 *
241 *Entry:
242 *
243 *Exit:
244 *
245 *Exceptions:
246 *
247 *******************************************************************************/
248 
mbrtowc(wchar_t * dst,const char * s,size_t n,mbstate_t * pst)249 extern "C" size_t __cdecl mbrtowc(
250     wchar_t *dst,
251     const char *s,
252     size_t n,
253     mbstate_t *pst
254     )
255 {
256     static mbstate_t mbst = {};
257     int retValue = -1;
258 
259     __crt_cached_ptd_host ptd;
260 
261     if (s != nullptr)
262     {
263         _mbrtowc_internal(&retValue, dst, s, n, (pst != nullptr ? pst : &mbst), ptd);
264     }
265     else
266     {
267         _mbrtowc_internal(&retValue, nullptr, "", 1, (pst != nullptr ? pst : &mbst), ptd);
268     }
269     return retValue;
270 }
271 
272 
273 /***
274 *size_t mbsrtowcs(wcs, ps, n, pst) - translate multibyte string to wide,
275 *       restartably
276 *
277 *Purpose:
278 *
279 *Entry:
280 *
281 *Exit:
282 *
283 *Exceptions:
284 *
285 *******************************************************************************/
286 
287 /* Helper function shared by the secure and non-secure versions. */
288 
289 _Success_(return == 0)
_mbsrtowcs_helper(_Out_writes_opt_z_ (n)wchar_t * wcs,_Deref_pre_opt_z_ const char ** ps,_In_ size_t n,_Inout_ mbstate_t * pst,_Inout_ __crt_cached_ptd_host & ptd)290 static size_t __cdecl _mbsrtowcs_helper(
291     _Out_writes_opt_z_(n)               wchar_t *              wcs,
292     _Deref_pre_opt_z_                   const char **          ps,
293     _In_                                size_t                 n,
294     _Inout_                             mbstate_t *            pst,
295     _Inout_                             __crt_cached_ptd_host& ptd
296     ) throw()
297 {
298     /* validation section */
299     _UCRT_VALIDATE_RETURN(ptd, ps != nullptr, EINVAL, (size_t) - 1);
300 
301     static mbstate_t mbst = {};
302     const char *s = *ps;
303     int i = 0;
304     size_t nwc = 0;
305 
306     // Use the static cached state if necessary
307     if (pst == nullptr)
308     {
309         pst = &mbst;
310     }
311 
312     const _locale_t locale = ptd.get_locale();
313 
314     if (locale->locinfo->_public._locale_lc_codepage == CP_UTF8)
315     {
316         return __mbsrtowcs_utf8(wcs, ps, n, pst, ptd);
317     }
318 
319     if (wcs == nullptr)
320     {
321         for (;; ++nwc, s += i)
322         {
323             /* translate but don't store */
324             wchar_t wc;
325             _mbrtowc_internal(&i, &wc, s, INT_MAX, pst, ptd);
326             if (i < 0)
327             {
328                 return (size_t) - 1;
329             }
330             else if (i == 0)
331             {
332                 return nwc;
333             }
334         }
335     }
336 
337     for (; 0 < n; ++nwc, s += i, ++wcs, --n)
338     {
339         /* translate and store */
340         _mbrtowc_internal(&i, wcs, s, INT_MAX, pst, ptd);
341         if (i < 0)
342         {
343             /* encountered invalid sequence */
344             nwc = (size_t) - 1;
345             break;
346         }
347         else if (i == 0)
348         {
349             /* encountered terminating null */
350             s = 0;
351             break;
352         }
353     }
354 
355     *ps = s;
356     return nwc;
357 }
358 
359 /***
360 *size_t mbsrtowcs() - Convert multibyte char string to wide char string.
361 *
362 *Purpose:
363 *       Convert a multi-byte char string into the equivalent wide char string,
364 *       according to the LC_CTYPE category of the current locale.
365 *       Same as mbsrtowcs_s(), but the destination may not be null terminated.
366 *       If there's not enough space, we return EINVAL.
367 *
368 *Entry:
369 *       wchar_t *pwcs = pointer to destination wide character string buffer
370 *       const char **s = pointer to source multibyte character string
371 *       size_t n = maximum number of wide characters to store (not including the terminating null character)
372 *       mbstate_t *pst = pointer to the conversion state
373 *
374 *Exit:
*       The number of wide characters written to *wcs (not including any terminating null character)
376 *
377 *Exceptions:
378 *       Input parameters are validated. Refer to the validation section of the function.
379 *
380 *******************************************************************************/
mbsrtowcs(wchar_t * wcs,const char ** ps,size_t n,mbstate_t * pst)381 extern "C" size_t __cdecl mbsrtowcs(
382     wchar_t *     wcs,
383     const char ** ps,
384     size_t        n,
385     mbstate_t *   pst
386     )
387 {
388     /* Call a non-deprecated helper to do the work. */
389     __crt_cached_ptd_host ptd;
390     return _mbsrtowcs_helper(wcs, ps, n, pst, ptd);
391 }
392 
393 
394 /***
395 *errno_t mbsrtowcs_s() - Convert multibyte char string to wide char string.
396 *
397 *Purpose:
398 *       Convert a multi-byte char string into the equivalent wide char string,
399 *       according to the LC_CTYPE category of the current locale.
400 *       Same as mbsrtowcs(), but the destination is ensured to be null terminated.
401 *       If there's not enough space, we return EINVAL.
402 *
403 *Entry:
*       size_t *pRetValue = Number of wide characters converted, including the
*                           terminating null character. This pointer can be nullptr.
406 *       wchar_t *pwcs = pointer to destination wide character string buffer
407 *       size_t sizeInWords = size of the destination buffer
408 *       const char **s = pointer to source multibyte character string
409 *       size_t n = maximum number of wide characters to store (not including the terminating null character)
410 *       mbstate_t *pst = pointer to the conversion state
411 *
412 *Exit:
413 *       The error code.
414 *
415 *Exceptions:
416 *       Input parameters are validated. Refer to the validation section of the function.
417 *
418 *******************************************************************************/
419 
mbsrtowcs_s_internal(size_t * pRetValue,wchar_t * dst,size_t sizeInWords,const char ** ps,size_t n,mbstate_t * pmbst,__crt_cached_ptd_host & ptd)420 static errno_t __cdecl mbsrtowcs_s_internal(
421     size_t *               pRetValue,
422     wchar_t *              dst,
423     size_t                 sizeInWords,
424     const char **          ps,
425     size_t                 n,
426     mbstate_t *            pmbst,
427     __crt_cached_ptd_host& ptd
428     )
429 {
430     size_t retsize;
431 
432     /* validation section */
433     _ASSIGN_IF_NOT_NULL(pRetValue, (size_t) - 1);
434     _UCRT_VALIDATE_RETURN_ERRCODE(ptd, (dst == nullptr && sizeInWords == 0) || (dst != nullptr && sizeInWords > 0), EINVAL);
435     if (dst != nullptr)
436     {
437         _RESET_STRING(dst, sizeInWords);
438     }
439     _UCRT_VALIDATE_RETURN_ERRCODE(ptd, ps != nullptr, EINVAL);
440 
441     /* Call a non-deprecated helper to do the work. */
442 
443     retsize = _mbsrtowcs_helper(dst, ps, (n > sizeInWords ? sizeInWords : n), pmbst, ptd);
444 
445     if (retsize == (size_t) - 1)
446     {
447         if (dst != nullptr)
448         {
449             _RESET_STRING(dst, sizeInWords);
450         }
451         return ptd.get_errno().value_or(0);
452     }
453 
454     /* count the null terminator */
455     retsize++;
456 
457     if (dst != nullptr)
458     {
459         /* return error if the string does not fit */
460         if (retsize > sizeInWords)
461         {
462             _RESET_STRING(dst, sizeInWords);
463             _UCRT_VALIDATE_RETURN_ERRCODE(ptd, sizeInWords <= retsize, ERANGE);
464         }
465         else
466         {
467             /* ensure the string is null terminated */
468             dst[retsize - 1] = '\0';
469         }
470     }
471 
472     _ASSIGN_IF_NOT_NULL(pRetValue, retsize);
473 
474     return 0;
475 }
476 
mbsrtowcs_s(size_t * pRetValue,wchar_t * dst,size_t sizeInWords,const char ** ps,size_t n,mbstate_t * pmbst)477 extern "C" errno_t __cdecl mbsrtowcs_s(
478     size_t *      pRetValue,
479     wchar_t *     dst,
480     size_t        sizeInWords,
481     const char ** ps,
482     size_t        n,
483     mbstate_t *   pmbst
484     )
485 {
486     __crt_cached_ptd_host ptd;
487     return mbsrtowcs_s_internal(pRetValue, dst, sizeInWords, ps, n, pmbst, ptd);
488 }
489 
__mbrtowc_utf8(wchar_t * pwc,const char * s,size_t n,mbstate_t * ps,__crt_cached_ptd_host & ptd)490 size_t __cdecl __crt_mbstring::__mbrtowc_utf8(wchar_t* pwc, const char* s, size_t n, mbstate_t* ps, __crt_cached_ptd_host& ptd)
491 {
492     static_assert(sizeof(wchar_t) == 2, "wchar_t is assumed to be 16 bits");
493     char32_t c32;
494     const size_t retval = __mbrtoc32_utf8(&c32, s, n, ps, ptd);
495     // If we succesfully consumed a character, write the result after a quick range check
496     if (retval <= 4)
497     {
498         if (c32 > 0xffff)
499         {
500             // A 4-byte UTF-8 character won't fit into a single UTF-16 wchar
501             // So return the "replacement char"
502             c32 = 0xfffd;
503         }
504         _ASSIGN_IF_NOT_NULL(pwc, static_cast<wchar_t>(c32));
505     }
506     return retval;
507 }
508 
__mbsrtowcs_utf8(wchar_t * dst,const char ** src,size_t len,mbstate_t * ps,__crt_cached_ptd_host & ptd)509 size_t __cdecl __crt_mbstring::__mbsrtowcs_utf8(wchar_t* dst, const char** src, size_t len, mbstate_t* ps, __crt_cached_ptd_host& ptd)
510 {
511     const char* current_src = *src;
512 
513     auto compute_available = [](const char* s) -> size_t
514     {
515         // We shouldn't just blindly request to read 4 bytes, because there might not be 4 bytes left to read.
516         if (s[0] == '\0')
517         {
518             return 1;
519         }
520         else if (s[1] == '\0')
521         {
522             return 2;
523         }
524         else if (s[2] == '\0')
525         {
526             return 3;
527         }
528         return 4;
529     };
530 
531     if (dst != nullptr)
532     {
533         wchar_t* current_dest = dst;
534         for (; len > 0; --len)
535         {
536             const size_t avail = compute_available(current_src);
537             char32_t c32;
538             const size_t retval = __mbrtoc32_utf8(&c32, current_src, avail, ps, ptd);
539             if (retval == __crt_mbstring::INVALID)
540             {
541                 // Set src to the beginning of the invalid char
542                 *src = current_src;
543                 ptd.get_errno().set(EILSEQ);
544                 return retval;
545             }
546             else if (retval == 0)
547             {
548                 current_src = nullptr;
549                 *current_dest = L'\0';
550                 break;
551             }
552             else if (c32 > 0xffff)
553             {
554                 // This is going to take two output wchars. Make sure we have enough room for this output.
555                 if (len > 1)
556                 {
557                     --len;
558                     c32 -= 0x10000;
559                     const char16_t high_surrogate = static_cast<char16_t>((c32 >> 10) | 0xd800);
560                     const char16_t low_surrogate = static_cast<char16_t>((c32 & 0x03ff) | 0xdc00);
561                     *current_dest++ = high_surrogate;
562                     *current_dest++ = low_surrogate;
563                 }
564                 else
565                 {
566                     break;
567                 }
568             }
569             else
570             {
571                 *current_dest++ = static_cast<wchar_t>(c32);
572             }
573             current_src += retval;
574         }
575         *src = current_src;
576         return current_dest - dst;
577     }
578     else
579     {
580         size_t total_count = 0;
581         for (;; ++total_count)
582         {
583             const size_t avail = compute_available(current_src);
584 
585             const size_t retval = __mbrtoc32_utf8(nullptr, current_src, avail, ps, ptd);
586             if (retval == __crt_mbstring::INVALID)
587             {
588                 ptd.get_errno().set(EILSEQ);
589                 return retval;
590             }
591             else if (retval == 0)
592             {
593                 break;
594             }
595             else if (retval == 4)
596             {
597                 // SMP characters take two UTF-16 wide chars
598                 ++total_count;
599             }
600             else
601             {
602                 // This should be impossible. Means we encountered a multibyte char
603                 // that extended past the null terminator, or is more than 4 bytes long
604                 _ASSERTE(retval != __crt_mbstring::INCOMPLETE);
605             }
606             current_src += retval;
607         }
608         return total_count;
609     }
610 }
611