1 /* Distributed under the OSI-approved BSD 3-Clause License. See accompanying
2 file Copyright.txt or https://cmake.org/licensing for details. */
3 #include "cm_codecvt.hxx"
4
5 #if defined(_WIN32)
6 # include <windows.h>
7
8 # include <assert.h>
9 # include <string.h>
10 # undef max
11 # include "cmsys/Encoding.hxx"
12 #endif
13
14 #if defined(_WIN32)
15 /* Number of leading ones before a zero in the byte (see cm_utf8.c). */
16 extern "C" unsigned char const cm_utf8_ones[256];
17 #endif
18
codecvt(Encoding e)19 codecvt::codecvt(Encoding e)
20 #if defined(_WIN32)
21 : m_codepage(0)
22 #endif
23 {
24 switch (e) {
25 case codecvt::ANSI:
26 #if defined(_WIN32)
27 m_noconv = false;
28 m_codepage = CP_ACP;
29 break;
30 #endif
31 // We don't know which ANSI encoding to use for other platforms than
32 // Windows so we don't do any conversion there
33 case codecvt::UTF8:
34 case codecvt::UTF8_WITH_BOM:
35 // Assume internal encoding is UTF-8
36 case codecvt::None:
37 // No encoding
38 default:
39 this->m_noconv = true;
40 }
41 }
42
43 codecvt::~codecvt() = default;
44
do_always_noconv() const45 bool codecvt::do_always_noconv() const throw()
46 {
47 return this->m_noconv;
48 }
49
do_out(mbstate_t & state,const char * from,const char * from_end,const char * & from_next,char * to,char * to_end,char * & to_next) const50 std::codecvt_base::result codecvt::do_out(mbstate_t& state, const char* from,
51 const char* from_end,
52 const char*& from_next, char* to,
53 char* to_end, char*& to_next) const
54 {
55 from_next = from;
56 to_next = to;
57 if (this->m_noconv) {
58 return std::codecvt_base::noconv;
59 }
60 #if defined(_WIN32)
61 // Use a const view of the state because we should not modify it until we
62 // have fully processed and consume a byte (with sufficient space in the
63 // output buffer). We call helpers to re-cast and modify the state
64 State const& lstate = reinterpret_cast<State&>(state);
65
66 while (from_next != from_end) {
67 // Count leading ones in the bits of the next byte.
68 unsigned char const ones =
69 cm_utf8_ones[static_cast<unsigned char>(*from_next)];
70
71 if (ones != 1 && lstate.buffered != 0) {
72 // We have a buffered partial codepoint that we never completed.
73 return std::codecvt_base::error;
74 } else if (ones == 1 && lstate.buffered == 0) {
75 // This is a continuation of a codepoint that never started.
76 return std::codecvt_base::error;
77 }
78
79 // Compute the number of bytes in the current codepoint.
80 int need = 0;
81 switch (ones) {
82 case 0: // 0xxx xxxx: new codepoint of size 1
83 need = 1;
84 break;
85 case 1: // 10xx xxxx: continues a codepoint
86 assert(lstate.size != 0);
87 need = lstate.size;
88 break;
89 case 2: // 110x xxxx: new codepoint of size 2
90 need = 2;
91 break;
92 case 3: // 1110 xxxx: new codepoint of size 3
93 need = 3;
94 break;
95 case 4: // 1111 0xxx: new codepoint of size 4
96 need = 4;
97 break;
98 default: // invalid byte
99 return std::codecvt_base::error;
100 }
101 assert(need > 0);
102
103 if (lstate.buffered + 1 == need) {
104 // This byte completes a codepoint.
105 std::codecvt_base::result decode_result =
106 this->Decode(state, need, from_next, to_next, to_end);
107 if (decode_result != std::codecvt_base::ok) {
108 return decode_result;
109 }
110 } else {
111 // This byte does not complete a codepoint.
112 this->BufferPartial(state, need, from_next);
113 }
114 }
115
116 return std::codecvt_base::ok;
117 #else
118 static_cast<void>(state);
119 static_cast<void>(from);
120 static_cast<void>(from_end);
121 static_cast<void>(from_next);
122 static_cast<void>(to);
123 static_cast<void>(to_end);
124 static_cast<void>(to_next);
125 return std::codecvt_base::noconv;
126 #endif
127 }
128
do_unshift(mbstate_t & state,char * to,char * to_end,char * & to_next) const129 std::codecvt_base::result codecvt::do_unshift(mbstate_t& state, char* to,
130 char* to_end,
131 char*& to_next) const
132 {
133 to_next = to;
134 if (this->m_noconv) {
135 return std::codecvt_base::noconv;
136 }
137 #if defined(_WIN32)
138 State& lstate = reinterpret_cast<State&>(state);
139 if (lstate.buffered != 0) {
140 return this->DecodePartial(state, to_next, to_end);
141 }
142 return std::codecvt_base::ok;
143 #else
144 static_cast<void>(state);
145 static_cast<void>(to_end);
146 return std::codecvt_base::ok;
147 #endif
148 }
149
150 #if defined(_WIN32)
Decode(mbstate_t & state,int size,const char * & from_next,char * & to_next,char * to_end) const151 std::codecvt_base::result codecvt::Decode(mbstate_t& state, int size,
152 const char*& from_next,
153 char*& to_next, char* to_end) const
154 {
155 State& lstate = reinterpret_cast<State&>(state);
156
157 // Collect all the bytes for this codepoint.
158 char buf[4];
159 memcpy(buf, lstate.partial, lstate.buffered);
160 buf[lstate.buffered] = *from_next;
161
162 // Convert the encoding.
163 wchar_t wbuf[2];
164 int wlen =
165 MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, buf, size, wbuf, 2);
166 if (wlen <= 0) {
167 return std::codecvt_base::error;
168 }
169
170 int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next,
171 to_end - to_next, NULL, NULL);
172 if (tlen <= 0) {
173 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
174 return std::codecvt_base::partial;
175 }
176 return std::codecvt_base::error;
177 }
178
179 // Move past the now-consumed byte in the input buffer.
180 ++from_next;
181
182 // Move past the converted codepoint in the output buffer.
183 to_next += tlen;
184
185 // Re-initialize the state for the next codepoint to start.
186 lstate = State();
187
188 return std::codecvt_base::ok;
189 }
190
DecodePartial(mbstate_t & state,char * & to_next,char * to_end) const191 std::codecvt_base::result codecvt::DecodePartial(mbstate_t& state,
192 char*& to_next,
193 char* to_end) const
194 {
195 State& lstate = reinterpret_cast<State&>(state);
196
197 // Try converting the partial codepoint.
198 wchar_t wbuf[2];
199 int wlen = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, lstate.partial,
200 lstate.buffered, wbuf, 2);
201 if (wlen <= 0) {
202 return std::codecvt_base::error;
203 }
204
205 int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next,
206 to_end - to_next, NULL, NULL);
207 if (tlen <= 0) {
208 if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
209 return std::codecvt_base::partial;
210 }
211 return std::codecvt_base::error;
212 }
213
214 // Move past the converted codepoint in the output buffer.
215 to_next += tlen;
216
217 // Re-initialize the state for the next codepoint to start.
218 lstate = State();
219
220 return std::codecvt_base::ok;
221 }
222
BufferPartial(mbstate_t & state,int size,const char * & from_next) const223 void codecvt::BufferPartial(mbstate_t& state, int size,
224 const char*& from_next) const
225 {
226 State& lstate = reinterpret_cast<State&>(state);
227
228 // Save the byte in our buffer for later.
229 lstate.partial[lstate.buffered++] = *from_next;
230 lstate.size = size;
231
232 // Move past the now-consumed byte in the input buffer.
233 ++from_next;
234 }
235 #endif
236
do_max_length() const237 int codecvt::do_max_length() const throw()
238 {
239 return 4;
240 }
241
do_encoding() const242 int codecvt::do_encoding() const throw()
243 {
244 return 0;
245 }
246