1 /* Distributed under the OSI-approved BSD 3-Clause License.  See accompanying
2    file Copyright.txt or https://cmake.org/licensing for details.  */
3 #include "cm_codecvt.hxx"
4 
5 #if defined(_WIN32)
6 #  include <windows.h>
7 
8 #  include <assert.h>
9 #  include <string.h>
10 #  undef max
11 #  include "cmsys/Encoding.hxx"
12 #endif
13 
14 #if defined(_WIN32)
15 /* Number of leading ones before a zero in the byte (see cm_utf8.c).  */
16 extern "C" unsigned char const cm_utf8_ones[256];
17 #endif
18 
codecvt(Encoding e)19 codecvt::codecvt(Encoding e)
20 #if defined(_WIN32)
21   : m_codepage(0)
22 #endif
23 {
24   switch (e) {
25     case codecvt::ANSI:
26 #if defined(_WIN32)
27       m_noconv = false;
28       m_codepage = CP_ACP;
29       break;
30 #endif
31     // We don't know which ANSI encoding to use for other platforms than
32     // Windows so we don't do any conversion there
33     case codecvt::UTF8:
34     case codecvt::UTF8_WITH_BOM:
35     // Assume internal encoding is UTF-8
36     case codecvt::None:
37     // No encoding
38     default:
39       this->m_noconv = true;
40   }
41 }
42 
// Out-of-line defaulted destructor: performs no work beyond the
// compiler-generated member and base-class destruction.
codecvt::~codecvt() = default;
44 
do_always_noconv() const45 bool codecvt::do_always_noconv() const throw()
46 {
47   return this->m_noconv;
48 }
49 
do_out(mbstate_t & state,const char * from,const char * from_end,const char * & from_next,char * to,char * to_end,char * & to_next) const50 std::codecvt_base::result codecvt::do_out(mbstate_t& state, const char* from,
51                                           const char* from_end,
52                                           const char*& from_next, char* to,
53                                           char* to_end, char*& to_next) const
54 {
55   from_next = from;
56   to_next = to;
57   if (this->m_noconv) {
58     return std::codecvt_base::noconv;
59   }
60 #if defined(_WIN32)
61   // Use a const view of the state because we should not modify it until we
62   // have fully processed and consume a byte (with sufficient space in the
63   // output buffer).  We call helpers to re-cast and modify the state
64   State const& lstate = reinterpret_cast<State&>(state);
65 
66   while (from_next != from_end) {
67     // Count leading ones in the bits of the next byte.
68     unsigned char const ones =
69       cm_utf8_ones[static_cast<unsigned char>(*from_next)];
70 
71     if (ones != 1 && lstate.buffered != 0) {
72       // We have a buffered partial codepoint that we never completed.
73       return std::codecvt_base::error;
74     } else if (ones == 1 && lstate.buffered == 0) {
75       // This is a continuation of a codepoint that never started.
76       return std::codecvt_base::error;
77     }
78 
79     // Compute the number of bytes in the current codepoint.
80     int need = 0;
81     switch (ones) {
82       case 0: // 0xxx xxxx: new codepoint of size 1
83         need = 1;
84         break;
85       case 1: // 10xx xxxx: continues a codepoint
86         assert(lstate.size != 0);
87         need = lstate.size;
88         break;
89       case 2: // 110x xxxx: new codepoint of size 2
90         need = 2;
91         break;
92       case 3: // 1110 xxxx: new codepoint of size 3
93         need = 3;
94         break;
95       case 4: // 1111 0xxx: new codepoint of size 4
96         need = 4;
97         break;
98       default: // invalid byte
99         return std::codecvt_base::error;
100     }
101     assert(need > 0);
102 
103     if (lstate.buffered + 1 == need) {
104       // This byte completes a codepoint.
105       std::codecvt_base::result decode_result =
106         this->Decode(state, need, from_next, to_next, to_end);
107       if (decode_result != std::codecvt_base::ok) {
108         return decode_result;
109       }
110     } else {
111       // This byte does not complete a codepoint.
112       this->BufferPartial(state, need, from_next);
113     }
114   }
115 
116   return std::codecvt_base::ok;
117 #else
118   static_cast<void>(state);
119   static_cast<void>(from);
120   static_cast<void>(from_end);
121   static_cast<void>(from_next);
122   static_cast<void>(to);
123   static_cast<void>(to_end);
124   static_cast<void>(to_next);
125   return std::codecvt_base::noconv;
126 #endif
127 }
128 
do_unshift(mbstate_t & state,char * to,char * to_end,char * & to_next) const129 std::codecvt_base::result codecvt::do_unshift(mbstate_t& state, char* to,
130                                               char* to_end,
131                                               char*& to_next) const
132 {
133   to_next = to;
134   if (this->m_noconv) {
135     return std::codecvt_base::noconv;
136   }
137 #if defined(_WIN32)
138   State& lstate = reinterpret_cast<State&>(state);
139   if (lstate.buffered != 0) {
140     return this->DecodePartial(state, to_next, to_end);
141   }
142   return std::codecvt_base::ok;
143 #else
144   static_cast<void>(state);
145   static_cast<void>(to_end);
146   return std::codecvt_base::ok;
147 #endif
148 }
149 
150 #if defined(_WIN32)
Decode(mbstate_t & state,int size,const char * & from_next,char * & to_next,char * to_end) const151 std::codecvt_base::result codecvt::Decode(mbstate_t& state, int size,
152                                           const char*& from_next,
153                                           char*& to_next, char* to_end) const
154 {
155   State& lstate = reinterpret_cast<State&>(state);
156 
157   // Collect all the bytes for this codepoint.
158   char buf[4];
159   memcpy(buf, lstate.partial, lstate.buffered);
160   buf[lstate.buffered] = *from_next;
161 
162   // Convert the encoding.
163   wchar_t wbuf[2];
164   int wlen =
165     MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, buf, size, wbuf, 2);
166   if (wlen <= 0) {
167     return std::codecvt_base::error;
168   }
169 
170   int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next,
171                                  to_end - to_next, NULL, NULL);
172   if (tlen <= 0) {
173     if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
174       return std::codecvt_base::partial;
175     }
176     return std::codecvt_base::error;
177   }
178 
179   // Move past the now-consumed byte in the input buffer.
180   ++from_next;
181 
182   // Move past the converted codepoint in the output buffer.
183   to_next += tlen;
184 
185   // Re-initialize the state for the next codepoint to start.
186   lstate = State();
187 
188   return std::codecvt_base::ok;
189 }
190 
DecodePartial(mbstate_t & state,char * & to_next,char * to_end) const191 std::codecvt_base::result codecvt::DecodePartial(mbstate_t& state,
192                                                  char*& to_next,
193                                                  char* to_end) const
194 {
195   State& lstate = reinterpret_cast<State&>(state);
196 
197   // Try converting the partial codepoint.
198   wchar_t wbuf[2];
199   int wlen = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, lstate.partial,
200                                  lstate.buffered, wbuf, 2);
201   if (wlen <= 0) {
202     return std::codecvt_base::error;
203   }
204 
205   int tlen = WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next,
206                                  to_end - to_next, NULL, NULL);
207   if (tlen <= 0) {
208     if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
209       return std::codecvt_base::partial;
210     }
211     return std::codecvt_base::error;
212   }
213 
214   // Move past the converted codepoint in the output buffer.
215   to_next += tlen;
216 
217   // Re-initialize the state for the next codepoint to start.
218   lstate = State();
219 
220   return std::codecvt_base::ok;
221 }
222 
BufferPartial(mbstate_t & state,int size,const char * & from_next) const223 void codecvt::BufferPartial(mbstate_t& state, int size,
224                             const char*& from_next) const
225 {
226   State& lstate = reinterpret_cast<State&>(state);
227 
228   // Save the byte in our buffer for later.
229   lstate.partial[lstate.buffered++] = *from_next;
230   lstate.size = size;
231 
232   // Move past the now-consumed byte in the input buffer.
233   ++from_next;
234 }
235 #endif
236 
do_max_length() const237 int codecvt::do_max_length() const throw()
238 {
239   return 4;
240 }
241 
do_encoding() const242 int codecvt::do_encoding() const throw()
243 {
244   return 0;
245 }
246