1 // 2 // mbrtoc32.cpp 3 // 4 // Copyright (c) Microsoft Corporation. All rights reserved. 5 // 6 7 #include <corecrt_internal_mbstring.h> 8 #include <corecrt_internal_ptd_propagation.h> 9 #include <errno.h> 10 #include <stdint.h> 11 #include <uchar.h> 12 13 using namespace __crt_mbstring; 14 15 extern "C" size_t __cdecl mbrtoc32(char32_t* pc32, const char* s, size_t n, mbstate_t* ps) 16 { 17 // TODO: Bug 13307590 says this is always assuming UTF-8. 18 __crt_cached_ptd_host ptd; 19 return __mbrtoc32_utf8(pc32, s, n, ps, ptd); 20 } 21 22 size_t __cdecl __crt_mbstring::__mbrtoc32_utf8(char32_t* pc32, const char* s, size_t n, mbstate_t* ps, __crt_cached_ptd_host& ptd) 23 { 24 const char* begin = s; 25 static mbstate_t internal_pst{}; 26 if (ps == nullptr) 27 { 28 ps = &internal_pst; 29 } 30 31 if (!s) 32 { 33 s = ""; 34 n = 1; 35 pc32 = nullptr; 36 } 37 38 if (n == 0) 39 { 40 return INCOMPLETE; 41 } 42 43 // Retrieve the first byte from the string, or from the previous state 44 uint8_t length; 45 uint8_t bytes_needed; 46 char32_t c32; 47 const bool init_state = (ps->_State == 0); 48 if (init_state) 49 { 50 const uint8_t first_byte = static_cast<uint8_t>(*s++); 51 52 // Optimize for ASCII if in initial state 53 if ((first_byte & 0x80) == 0) 54 { 55 if (pc32 != nullptr) 56 { 57 *pc32 = first_byte; 58 } 59 return first_byte != '\0' ? 1 : 0; 60 } 61 62 if ((first_byte & 0xe0) == 0xc0) 63 { 64 length = 2; 65 } 66 else if ((first_byte & 0xf0) == 0xe0) 67 { 68 length = 3; 69 } 70 else if ((first_byte & 0xf8) == 0xf0) 71 { 72 length = 4; 73 } 74 else 75 { 76 return return_illegal_sequence(ps, ptd); 77 } 78 bytes_needed = length; 79 // Mask out the length bits 80 c32 = first_byte & ((1 << (7 - length)) - 1); 81 } 82 else 83 { 84 c32 = ps->_Wchar; 85 length = static_cast<uint8_t>(ps->_Byte); 86 bytes_needed = static_cast<uint8_t>(ps->_State); 87 88 // Make sure we don't have some sort of invalid/corrupted state. 89 // Any input that left behind state would have been more than one byte long 90 // and the first byte should have been processed already. 91 if (length < 2 || length > 4 || bytes_needed < 1 || bytes_needed >= length) 92 { 93 return return_illegal_sequence(ps, ptd); 94 } 95 } 96 97 // Don't read more bytes than we're allowed 98 if (bytes_needed < n) 99 { 100 n = bytes_needed; 101 } 102 103 // We've already read the first byte. 104 // All remaining bytes should be continuation bytes 105 while (static_cast<size_t>(s - begin) < n) 106 { 107 uint8_t current_byte = static_cast<uint8_t>(*s++); 108 if ((current_byte & 0xc0) != 0x80) 109 { 110 // Not a continuation character 111 return return_illegal_sequence(ps, ptd); 112 } 113 c32 = (c32 << 6) | (current_byte & 0x3f); 114 } 115 116 if (n < bytes_needed) 117 { 118 // Store state and return incomplete 119 auto bytes_remaining = static_cast<uint8_t>(bytes_needed - n); 120 static_assert(sizeof(mbstate_t::_Wchar) >= sizeof(char32_t), "mbstate_t has broken mbrtoc32"); 121 ps->_Wchar = c32; 122 ps->_Byte = length; 123 ps->_State = bytes_remaining; 124 return INCOMPLETE; 125 } 126 127 if ((0xd800 <= c32 && c32 <= 0xdfff) || (0x10ffff < c32)) 128 { 129 // Invalid code point (surrogate or out of range) 130 return return_illegal_sequence(ps, ptd); 131 } 132 133 constexpr char32_t min_legal[3]{ 0x80, 0x800, 0x10000 }; 134 if (c32 < min_legal[length - 2]) 135 { 136 // Overlong encoding 137 return return_illegal_sequence(ps, ptd); 138 } 139 140 // Success! Store results 141 if (pc32 != nullptr) 142 { 143 *pc32 = c32; 144 } 145 146 return reset_and_return(c32 == U'\0' ? 0 : bytes_needed, ps); 147 } 148 149