xref: /reactos/sdk/lib/ucrt/convert/mbrtoc32.cpp (revision 04e0dc4a)
1 //
2 // mbrtoc32.cpp
3 //
4 //      Copyright (c) Microsoft Corporation. All rights reserved.
5 //
6 
7 #include <corecrt_internal_mbstring.h>
8 #include <corecrt_internal_ptd_propagation.h>
9 #include <errno.h>
10 #include <stdint.h>
11 #include <uchar.h>
12 
13 using namespace __crt_mbstring;
14 
mbrtoc32(char32_t * pc32,const char * s,size_t n,mbstate_t * ps)15 extern "C" size_t __cdecl mbrtoc32(char32_t* pc32, const char* s, size_t n, mbstate_t* ps)
16 {
17     // TODO: Bug 13307590 says this is always assuming UTF-8.
18     __crt_cached_ptd_host ptd;
19     return __mbrtoc32_utf8(pc32, s, n, ps, ptd);
20 }
21 
__mbrtoc32_utf8(char32_t * pc32,const char * s,size_t n,mbstate_t * ps,__crt_cached_ptd_host & ptd)22 size_t __cdecl __crt_mbstring::__mbrtoc32_utf8(char32_t* pc32, const char* s, size_t n, mbstate_t* ps, __crt_cached_ptd_host& ptd)
23 {
24     const char* begin = s;
25     static mbstate_t internal_pst{};
26     if (ps == nullptr)
27     {
28         ps = &internal_pst;
29     }
30 
31     if (!s)
32     {
33         s = "";
34         n = 1;
35         pc32 = nullptr;
36     }
37 
38     if (n == 0)
39     {
40         return INCOMPLETE;
41     }
42 
43     // Retrieve the first byte from the string, or from the previous state
44     uint8_t length;
45     uint8_t bytes_needed;
46     char32_t c32;
47     const bool init_state = (ps->_State == 0);
48     if (init_state)
49     {
50         const uint8_t first_byte = static_cast<uint8_t>(*s++);
51 
52         // Optimize for ASCII if in initial state
53         if ((first_byte & 0x80) == 0)
54         {
55             if (pc32 != nullptr)
56             {
57                 *pc32 = first_byte;
58             }
59             return first_byte != '\0' ? 1 : 0;
60         }
61 
62         if ((first_byte & 0xe0) == 0xc0)
63         {
64             length = 2;
65         }
66         else if ((first_byte & 0xf0) == 0xe0)
67         {
68             length = 3;
69         }
70         else if ((first_byte & 0xf8) == 0xf0)
71         {
72             length = 4;
73         }
74         else
75         {
76             return return_illegal_sequence(ps, ptd);
77         }
78         bytes_needed = length;
79         // Mask out the length bits
80         c32 = first_byte & ((1 << (7 - length)) - 1);
81     }
82     else
83     {
84         c32 = ps->_Wchar;
85         length = static_cast<uint8_t>(ps->_Byte);
86         bytes_needed = static_cast<uint8_t>(ps->_State);
87 
88         // Make sure we don't have some sort of invalid/corrupted state.
89         // Any input that left behind state would have been more than one byte long
90         // and the first byte should have been processed already.
91         if (length < 2 || length > 4 || bytes_needed < 1 || bytes_needed >= length)
92         {
93             return return_illegal_sequence(ps, ptd);
94         }
95     }
96 
97     // Don't read more bytes than we're allowed
98     if (bytes_needed < n)
99     {
100         n = bytes_needed;
101     }
102 
103     // We've already read the first byte.
104     // All remaining bytes should be continuation bytes
105     while (static_cast<size_t>(s - begin) < n)
106     {
107         uint8_t current_byte = static_cast<uint8_t>(*s++);
108         if ((current_byte & 0xc0) != 0x80)
109         {
110             // Not a continuation character
111             return return_illegal_sequence(ps, ptd);
112         }
113         c32 = (c32 << 6) | (current_byte & 0x3f);
114     }
115 
116     if (n < bytes_needed)
117     {
118         // Store state and return incomplete
119         auto bytes_remaining = static_cast<uint8_t>(bytes_needed - n);
120         static_assert(sizeof(mbstate_t::_Wchar) >= sizeof(char32_t), "mbstate_t has broken mbrtoc32");
121         ps->_Wchar = c32;
122         ps->_Byte = length;
123         ps->_State = bytes_remaining;
124         return INCOMPLETE;
125     }
126 
127     if ((0xd800 <= c32 && c32 <= 0xdfff) || (0x10ffff < c32))
128     {
129         // Invalid code point (surrogate or out of range)
130         return return_illegal_sequence(ps, ptd);
131     }
132 
133     constexpr char32_t min_legal[3]{ 0x80, 0x800, 0x10000 };
134     if (c32 < min_legal[length - 2])
135     {
136         // Overlong encoding
137         return return_illegal_sequence(ps, ptd);
138     }
139 
140     // Success! Store results
141     if (pc32 != nullptr)
142     {
143         *pc32 = c32;
144     }
145 
146     return reset_and_return(c32 == U'\0' ? 0 : bytes_needed, ps);
147 }
148 
149