1 // text.cpp
2
3
4 /**
5 * Copyright (C) 2018-present MongoDB, Inc.
6 *
7 * This program is free software: you can redistribute it and/or modify
8 * it under the terms of the Server Side Public License, version 1,
9 * as published by MongoDB, Inc.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * Server Side Public License for more details.
15 *
16 * You should have received a copy of the Server Side Public License
17 * along with this program. If not, see
18 * <http://www.mongodb.com/licensing/server-side-public-license>.
19 *
20 * As a special exception, the copyright holders give permission to link the
21 * code of portions of this program with the OpenSSL library under certain
22 * conditions as described in each individual source file and distribute
23 * linked combinations including the program with the OpenSSL library. You
24 * must comply with the Server Side Public License in all respects for
25 * all of the code used other than as permitted herein. If you modify file(s)
26 * with this exception, you may extend this exception to your version of the
27 * file(s), but you are not obligated to do so. If you do not wish to do so,
28 * delete this exception statement from your version. If you delete this
29 * exception statement from all source files in the program, then also delete
30 * it in the license file.
31 */
32
33 #include "mongo/platform/basic.h"
34
35 #include "mongo/util/text.h"
36
37 #include <boost/integer_traits.hpp>
38 #include <errno.h>
39 #include <iostream>
40 #include <memory>
41 #include <sstream>
42
43 #ifdef _WIN32
44 #include <io.h>
45 #endif
46
47 #include "mongo/platform/basic.h"
48 #include "mongo/util/allocator.h"
49 #include "mongo/util/mongoutils/str.h"
50
51 using namespace std;
52
53 namespace mongo {
54
55 // --- StringSplitter ----
56
57 /** get next split string fragment */
next()58 string StringSplitter::next() {
59 const char* foo = strstr(_big, _splitter);
60 if (foo) {
61 string s(_big, foo - _big);
62 _big = foo + strlen(_splitter);
63 while (*_big && strstr(_big, _splitter) == _big)
64 _big++;
65 return s;
66 }
67
68 string s = _big;
69 _big += strlen(_big);
70 return s;
71 }
72
73
split(vector<string> & l)74 void StringSplitter::split(vector<string>& l) {
75 while (more()) {
76 l.push_back(next());
77 }
78 }
79
split()80 vector<string> StringSplitter::split() {
81 vector<string> l;
82 split(l);
83 return l;
84 }
85
join(const vector<string> & l,const string & split)86 string StringSplitter::join(const vector<string>& l, const string& split) {
87 stringstream ss;
88 for (unsigned i = 0; i < l.size(); i++) {
89 if (i > 0)
90 ss << split;
91 ss << l[i];
92 }
93 return ss.str();
94 }
95
split(const string & big,const string & splitter)96 vector<string> StringSplitter::split(const string& big, const string& splitter) {
97 StringSplitter ss(big.c_str(), splitter.c_str());
98 return ss.split();
99 }
100
101
102 // --- utf8 utils ------
103
/**
 * Returns the number of consecutive one-bits at the most significant end of `c`
 * (0 for ASCII bytes below 0x80, up to 8 for 0xFF).
 */
inline int leadingOnes(unsigned char c) {
    if (c < 0x80)
        return 0;
    // Indexed by the low seven bits, i.e. by (c - 0x80).
    static const char _leadingOnes[128] = {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x80 - 0x8F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x90 - 0x9F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0xA0 - 0xAF
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0xB0 - 0xBF
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xC0 - 0xCF
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xD0 - 0xDF
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xE0 - 0xEF
        4, 4, 4, 4, 4, 4, 4, 4,                          // 0xF0 - 0xF7
        5, 5, 5, 5,                                      // 0xF8 - 0xFB
        6, 6,                                            // 0xFC - 0xFD
        7,                                               // 0xFE
        8,                                               // 0xFF
    };
    return _leadingOnes[c & 0x7f];
}

/**
 * Checks the bytes in [begin, end) for well-formed UTF-8 sequence structure.
 *
 * A lead byte must be followed by the number of continuation bytes it announces. Lead bytes
 * above 0xF4 and the lead bytes 0xC0 and 0xC1 are rejected, as are stray continuation bytes and
 * input that ends in the middle of a sequence. NUL bytes are treated as ordinary ASCII. No
 * further checks (such as the value ranges of continuation bytes) are made.
 */
static bool isValidUTF8Range(const char* begin, const char* end) {
    int left = 0;  // how many continuation bytes the current codepoint still needs
    for (const char* p = begin; p != end; ++p) {
        const unsigned char c = static_cast<unsigned char>(*p);
        const int ones = leadingOnes(c);
        if (left) {
            if (ones != 1)
                return false;  // should be a continuation byte
            --left;
            continue;
        }
        if (ones == 0)
            continue;  // ASCII byte
        if (ones == 1)
            return false;  // unexpected continuation byte
        if (c > 0xF4)
            return false;  // codepoint would exceed 0x10FFFF
        if (c == 0xC0 || c == 0xC1)
            return false;  // codepoints <= 0x7F shouldn't be 2 bytes

        // still valid
        left = ones - 1;
    }
    return left == 0;  // false when the input ended mid-codepoint
}

/**
 * Returns true if every byte of `s` forms valid UTF-8 according to isValidUTF8Range().
 * The whole string is examined, including any bytes that follow an embedded NUL.
 */
bool isValidUTF8(const std::string& s) {
    return isValidUTF8Range(s.data(), s.data() + s.size());
}

/**
 * Returns true if the NUL-terminated string `s` forms valid UTF-8 according to
 * isValidUTF8Range(). Validation stops at the terminating NUL.
 */
bool isValidUTF8(const char* s) {
    return isValidUTF8Range(s, s + strlen(s));
}
155
156 #if defined(_WIN32)
157
toUtf8String(const std::wstring & wide)158 std::string toUtf8String(const std::wstring& wide) {
159 if (wide.size() > boost::integer_traits<int>::const_max)
160 throw std::length_error("Wide string cannot be more than INT_MAX characters long.");
161 if (wide.size() == 0)
162 return "";
163
164 // Calculate necessary buffer size
165 int len = ::WideCharToMultiByte(
166 CP_UTF8, 0, wide.c_str(), static_cast<int>(wide.size()), NULL, 0, NULL, NULL);
167
168 // Perform actual conversion
169 if (len > 0) {
170 std::vector<char> buffer(len);
171 len = ::WideCharToMultiByte(CP_UTF8,
172 0,
173 wide.c_str(),
174 static_cast<int>(wide.size()),
175 &buffer[0],
176 static_cast<int>(buffer.size()),
177 NULL,
178 NULL);
179 if (len > 0) {
180 verify(len == static_cast<int>(buffer.size()));
181 return std::string(&buffer[0], buffer.size());
182 }
183 }
184
185 msgasserted(16091, mongoutils::str::stream() << "can't wstring to utf8: " << ::GetLastError());
186 return "";
187 }
188
toWideString(const char * utf8String)189 std::wstring toWideString(const char* utf8String) {
190 int bufferSize = MultiByteToWideChar(CP_UTF8, // Code page
191 0, // Flags
192 utf8String, // Input string
193 -1, // Count, -1 for NUL-terminated
194 NULL, // No output buffer
195 0 // Zero means "compute required size"
196 );
197 if (bufferSize == 0) {
198 return std::wstring();
199 }
200 std::unique_ptr<wchar_t[]> tempBuffer(new wchar_t[bufferSize]);
201 tempBuffer[0] = 0;
202 MultiByteToWideChar(CP_UTF8, // Code page
203 0, // Flags
204 utf8String, // Input string
205 -1, // Count, -1 for NUL-terminated
206 tempBuffer.get(), // UTF-16 output buffer
207 bufferSize // Buffer size in wide characters
208 );
209 return std::wstring(tempBuffer.get());
210 }
211
212 /**
213 * Write a UTF-8 string to the Windows console in Unicode (UTF-16)
214 *
215 * @param utf8String UTF-8 input string
216 * @param utf8StringSize Number of bytes in UTF-8 string, no NUL terminator assumed
217 * @return true if all characters were displayed (including zero characters)
218 */
writeUtf8ToWindowsConsole(const char * utf8String,unsigned int utf8StringSize)219 bool writeUtf8ToWindowsConsole(const char* utf8String, unsigned int utf8StringSize) {
220 int bufferSize = MultiByteToWideChar(CP_UTF8, // Code page
221 0, // Flags
222 utf8String, // Input string
223 utf8StringSize, // Input string length
224 NULL, // No output buffer
225 0 // Zero means "compute required size"
226 );
227 if (bufferSize == 0) {
228 return true;
229 }
230 std::unique_ptr<wchar_t[]> utf16String(new wchar_t[bufferSize]);
231 MultiByteToWideChar(CP_UTF8, // Code page
232 0, // Flags
233 utf8String, // Input string
234 utf8StringSize, // Input string length
235 utf16String.get(), // UTF-16 output buffer
236 bufferSize // Buffer size in wide characters
237 );
238 const wchar_t* utf16Pointer = utf16String.get();
239 size_t numberOfCharactersToWrite = bufferSize;
240 HANDLE consoleHandle = GetStdHandle(STD_OUTPUT_HANDLE);
241 while (numberOfCharactersToWrite > 0) {
242 static const DWORD MAXIMUM_CHARACTERS_PER_PASS = 8 * 1024;
243 DWORD numberOfCharactersThisPass = static_cast<DWORD>(numberOfCharactersToWrite);
244 if (numberOfCharactersThisPass > MAXIMUM_CHARACTERS_PER_PASS) {
245 numberOfCharactersThisPass = MAXIMUM_CHARACTERS_PER_PASS;
246 }
247 DWORD numberOfCharactersWritten;
248 BOOL success = WriteConsoleW(consoleHandle,
249 utf16Pointer,
250 numberOfCharactersThisPass,
251 &numberOfCharactersWritten,
252 NULL);
253 if (0 == success) {
254 DWORD dosError = GetLastError();
255 static bool errorMessageShown = false;
256 if (ERROR_GEN_FAILURE == dosError) {
257 if (!errorMessageShown) {
258 std::cout << "\n---\nUnicode text could not be correctly displayed.\n"
259 "Please change your console font to a Unicode font "
260 "(e.g. Lucida Console).\n---\n"
261 << std::endl;
262 errorMessageShown = true;
263 }
264 // we can't display the text properly using a raster font,
265 // but we can display the bits that will display ...
266 _write(1, utf8String, utf8StringSize);
267 }
268 return false;
269 }
270 numberOfCharactersToWrite -= numberOfCharactersWritten;
271 utf16Pointer += numberOfCharactersWritten;
272 }
273 return true;
274 }
275
WindowsCommandLine(int argc,wchar_t * argvW[],wchar_t * envpW[])276 WindowsCommandLine::WindowsCommandLine(int argc, wchar_t* argvW[], wchar_t* envpW[])
277 : _argv(NULL), _envp(NULL) {
278 // Construct UTF-8 copy of arguments
279 vector<string> utf8args;
280 vector<size_t> utf8argLength;
281 size_t blockSize = argc * sizeof(char*);
282 size_t blockPtr = blockSize;
283 for (int i = 0; i < argc; ++i) {
284 utf8args.push_back(toUtf8String(argvW[i]));
285 size_t argLength = utf8args[i].length() + 1;
286 utf8argLength.push_back(argLength);
287 blockSize += argLength;
288 }
289 _argv = static_cast<char**>(mongoMalloc(blockSize));
290 for (int i = 0; i < argc; ++i) {
291 _argv[i] = reinterpret_cast<char*>(_argv) + blockPtr;
292 strcpy_s(_argv[i], utf8argLength[i], utf8args[i].c_str());
293 blockPtr += utf8argLength[i];
294 }
295
296 // Construct UTF-8 copy of environment strings
297 size_t envCount = 0;
298 wchar_t** envpWptr = &envpW[0];
299 while (*envpWptr++) {
300 ++envCount;
301 }
302 vector<string> utf8envs;
303 vector<size_t> utf8envLength;
304 blockSize = (envCount + 1) * sizeof(char*);
305 blockPtr = blockSize;
306 for (size_t i = 0; i < envCount; ++i) {
307 utf8envs.push_back(toUtf8String(envpW[i]));
308 size_t envLength = utf8envs[i].length() + 1;
309 utf8envLength.push_back(envLength);
310 blockSize += envLength;
311 }
312 _envp = static_cast<char**>(mongoMalloc(blockSize));
313 size_t i;
314 for (i = 0; i < envCount; ++i) {
315 _envp[i] = reinterpret_cast<char*>(_envp) + blockPtr;
316 strcpy_s(_envp[i], utf8envLength[i], utf8envs[i].c_str());
317 blockPtr += utf8envLength[i];
318 }
319 _envp[i] = NULL;
320 }
321
~WindowsCommandLine()322 WindowsCommandLine::~WindowsCommandLine() {
323 free(_argv);
324 free(_envp);
325 }
326
327 #endif // #if defined(_WIN32)
328
// See "Parsing C++ Command-Line Arguments (C++)"
// http://msdn.microsoft.com/en-us/library/windows/desktop/17w5ykft(v=vs.85).aspx
/**
 * Writes `arg` to `os`, quoted as needed so that it reads back as a single argument.
 *
 * - An empty argument is written as "".
 * - An argument without spaces, tabs or double quotes is written unchanged.
 * - Anything else is wrapped in double quotes. Embedded double quotes are escaped with a
 *   backslash, and any run of backslashes directly before a double quote (embedded or the
 *   closing one) is doubled. Backslashes elsewhere are written as they are.
 */
static void quoteForWindowsCommandLine(const std::string& arg, std::ostream& os) {
    if (arg.empty()) {
        os << "\"\"";
        return;
    }
    if (arg.find_first_of(" \t\"") == std::string::npos) {
        os << arg;
        return;
    }

    os << '"';
    // Backslashes are held back until the character after the run is known, because that
    // character decides whether the run must be doubled.
    std::string backslashes;
    for (std::string::const_iterator iter = arg.begin(), end = arg.end(); iter != end; ++iter) {
        switch (*iter) {
            case '\\':
                backslashes.push_back(*iter);
                break;
            case '"':
                os << backslashes << backslashes << "\\\"";
                // The run has been written; clearing it keeps it from being emitted a second
                // time in front of the next character.
                backslashes.clear();
                break;
            default:
                os << backslashes << *iter;
                backslashes.clear();
                break;
        }
    }
    // A trailing run sits directly before the closing quote, so it is doubled as well.
    os << backslashes << backslashes << '"';
}
358
constructUtf8WindowsCommandLine(const std::vector<std::string> & argv)359 std::string constructUtf8WindowsCommandLine(const std::vector<std::string>& argv) {
360 if (argv.empty())
361 return "";
362
363 std::ostringstream commandLine;
364 std::vector<std::string>::const_iterator iter = argv.begin();
365 std::vector<std::string>::const_iterator end = argv.end();
366 quoteForWindowsCommandLine(*iter, commandLine);
367 ++iter;
368 for (; iter != end; ++iter) {
369 commandLine << ' ';
370 quoteForWindowsCommandLine(*iter, commandLine);
371 }
372 return commandLine.str();
373 }
374 }
375