1 // text.cpp
2 
3 
4 /**
5  *    Copyright (C) 2018-present MongoDB, Inc.
6  *
7  *    This program is free software: you can redistribute it and/or modify
8  *    it under the terms of the Server Side Public License, version 1,
9  *    as published by MongoDB, Inc.
10  *
11  *    This program is distributed in the hope that it will be useful,
12  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *    Server Side Public License for more details.
15  *
16  *    You should have received a copy of the Server Side Public License
17  *    along with this program. If not, see
18  *    <http://www.mongodb.com/licensing/server-side-public-license>.
19  *
20  *    As a special exception, the copyright holders give permission to link the
21  *    code of portions of this program with the OpenSSL library under certain
22  *    conditions as described in each individual source file and distribute
23  *    linked combinations including the program with the OpenSSL library. You
24  *    must comply with the Server Side Public License in all respects for
25  *    all of the code used other than as permitted herein. If you modify file(s)
26  *    with this exception, you may extend this exception to your version of the
27  *    file(s), but you are not obligated to do so. If you do not wish to do so,
28  *    delete this exception statement from your version. If you delete this
29  *    exception statement from all source files in the program, then also delete
30  *    it in the license file.
31  */
32 
33 #include "mongo/platform/basic.h"
34 
35 #include "mongo/util/text.h"
36 
37 #include <boost/integer_traits.hpp>
38 #include <errno.h>
39 #include <iostream>
40 #include <memory>
41 #include <sstream>
42 
43 #ifdef _WIN32
44 #include <io.h>
45 #endif
46 
47 #include "mongo/platform/basic.h"
48 #include "mongo/util/allocator.h"
49 #include "mongo/util/mongoutils/str.h"
50 
51 using namespace std;
52 
53 namespace mongo {
54 
55 // --- StringSplitter ----
56 
57 /** get next split string fragment */
next()58 string StringSplitter::next() {
59     const char* foo = strstr(_big, _splitter);
60     if (foo) {
61         string s(_big, foo - _big);
62         _big = foo + strlen(_splitter);
63         while (*_big && strstr(_big, _splitter) == _big)
64             _big++;
65         return s;
66     }
67 
68     string s = _big;
69     _big += strlen(_big);
70     return s;
71 }
72 
73 
split(vector<string> & l)74 void StringSplitter::split(vector<string>& l) {
75     while (more()) {
76         l.push_back(next());
77     }
78 }
79 
split()80 vector<string> StringSplitter::split() {
81     vector<string> l;
82     split(l);
83     return l;
84 }
85 
join(const vector<string> & l,const string & split)86 string StringSplitter::join(const vector<string>& l, const string& split) {
87     stringstream ss;
88     for (unsigned i = 0; i < l.size(); i++) {
89         if (i > 0)
90             ss << split;
91         ss << l[i];
92     }
93     return ss.str();
94 }
95 
split(const string & big,const string & splitter)96 vector<string> StringSplitter::split(const string& big, const string& splitter) {
97     StringSplitter ss(big.c_str(), splitter.c_str());
98     return ss.split();
99 }
100 
101 
102 // --- utf8 utils ------
103 
/**
 * Returns how many consecutive 1 bits 'c' has, counted from its most significant bit:
 * 0 for ASCII bytes, 1 for bytes of the form 10xxxxxx, 2 for 110xxxxx, and so on up
 * to 8 for 0xFF.
 */
inline int leadingOnes(unsigned char c) {
    if (c < 0x80)
        return 0;
    // Indexed by the low seven bits, i.e. entry 0 corresponds to byte 0x80.
    static const char _leadingOnes[128] = {
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x80 - 0x8F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x90 - 0x9F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0xA0 - 0xAF
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0xB0 - 0xBF
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xC0 - 0xCF
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xD0 - 0xDF
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xE0 - 0xEF
        4, 4, 4, 4, 4, 4, 4, 4,                          // 0xF0 - 0xF7
        5, 5, 5, 5,                                      // 0xF8 - 0xFB
        6, 6,                                            // 0xFC - 0xFD
        7,                                               // 0xFE
        8,                                               // 0xFF
    };
    return _leadingOnes[c & 0x7f];
}

/**
 * Feeds a single byte to the UTF-8 structure check shared by both isValidUTF8 overloads.
 *
 * @param c     the byte to examine
 * @param left  number of continuation bytes the current code point still needs;
 *              updated in place
 * @return      false as soon as 'c' makes the sequence invalid, true otherwise
 */
static bool consumeUtf8Byte(unsigned char c, int& left) {
    const int ones = leadingOnes(c);
    if (left) {
        if (ones != 1)
            return false;  // should be a continuation byte
        --left;
        return true;
    }

    if (ones == 0)
        return true;  // ASCII byte
    if (ones == 1)
        return false;  // unexpected continuation byte
    if (c > 0xF4)
        return false;  // lead byte would encode a codepoint > 0x10FFFF
    if (c == 0xC0 || c == 0xC1)
        return false;  // codepoints <= 0x7F shouldn't be 2 bytes

    // still valid: a lead byte with N leading ones is followed by N - 1 continuation bytes
    left = ones - 1;
    return true;
}

/**
 * Checks all s.size() bytes of 's', including any bytes that follow an embedded NUL
 * (a NUL byte is itself treated as an ordinary ASCII byte).
 *
 * Only the lead-byte / continuation-byte structure is checked, with the lead-byte
 * restrictions listed in consumeUtf8Byte; continuation byte values are not range-checked.
 */
bool isValidUTF8(const std::string& s) {
    int left = 0;  // how many bytes are left in the current codepoint
    for (const char ch : s) {
        if (!consumeUtf8Byte(static_cast<unsigned char>(ch), left))
            return false;
    }
    return left == 0;  // non-zero means the string ended mid-codepoint
}

/**
 * Same check as the std::string overload, applied to the bytes of 's' up to (not
 * including) its terminating NUL.
 */
bool isValidUTF8(const char* s) {
    int left = 0;  // how many bytes are left in the current codepoint
    for (; *s; ++s) {
        if (!consumeUtf8Byte(static_cast<unsigned char>(*s), left))
            return false;
    }
    return left == 0;  // non-zero means the string ended mid-codepoint
}
155 
156 #if defined(_WIN32)
157 
toUtf8String(const std::wstring & wide)158 std::string toUtf8String(const std::wstring& wide) {
159     if (wide.size() > boost::integer_traits<int>::const_max)
160         throw std::length_error("Wide string cannot be more than INT_MAX characters long.");
161     if (wide.size() == 0)
162         return "";
163 
164     // Calculate necessary buffer size
165     int len = ::WideCharToMultiByte(
166         CP_UTF8, 0, wide.c_str(), static_cast<int>(wide.size()), NULL, 0, NULL, NULL);
167 
168     // Perform actual conversion
169     if (len > 0) {
170         std::vector<char> buffer(len);
171         len = ::WideCharToMultiByte(CP_UTF8,
172                                     0,
173                                     wide.c_str(),
174                                     static_cast<int>(wide.size()),
175                                     &buffer[0],
176                                     static_cast<int>(buffer.size()),
177                                     NULL,
178                                     NULL);
179         if (len > 0) {
180             verify(len == static_cast<int>(buffer.size()));
181             return std::string(&buffer[0], buffer.size());
182         }
183     }
184 
185     msgasserted(16091, mongoutils::str::stream() << "can't wstring to utf8: " << ::GetLastError());
186     return "";
187 }
188 
toWideString(const char * utf8String)189 std::wstring toWideString(const char* utf8String) {
190     int bufferSize = MultiByteToWideChar(CP_UTF8,     // Code page
191                                          0,           // Flags
192                                          utf8String,  // Input string
193                                          -1,          // Count, -1 for NUL-terminated
194                                          NULL,        // No output buffer
195                                          0            // Zero means "compute required size"
196                                          );
197     if (bufferSize == 0) {
198         return std::wstring();
199     }
200     std::unique_ptr<wchar_t[]> tempBuffer(new wchar_t[bufferSize]);
201     tempBuffer[0] = 0;
202     MultiByteToWideChar(CP_UTF8,           // Code page
203                         0,                 // Flags
204                         utf8String,        // Input string
205                         -1,                // Count, -1 for NUL-terminated
206                         tempBuffer.get(),  // UTF-16 output buffer
207                         bufferSize         // Buffer size in wide characters
208                         );
209     return std::wstring(tempBuffer.get());
210 }
211 
212 /**
213  * Write a UTF-8 string to the Windows console in Unicode (UTF-16)
214  *
215  * @param utf8String        UTF-8 input string
216  * @param utf8StringSize    Number of bytes in UTF-8 string, no NUL terminator assumed
217  * @return                  true if all characters were displayed (including zero characters)
218  */
writeUtf8ToWindowsConsole(const char * utf8String,unsigned int utf8StringSize)219 bool writeUtf8ToWindowsConsole(const char* utf8String, unsigned int utf8StringSize) {
220     int bufferSize = MultiByteToWideChar(CP_UTF8,         // Code page
221                                          0,               // Flags
222                                          utf8String,      // Input string
223                                          utf8StringSize,  // Input string length
224                                          NULL,            // No output buffer
225                                          0                // Zero means "compute required size"
226                                          );
227     if (bufferSize == 0) {
228         return true;
229     }
230     std::unique_ptr<wchar_t[]> utf16String(new wchar_t[bufferSize]);
231     MultiByteToWideChar(CP_UTF8,            // Code page
232                         0,                  // Flags
233                         utf8String,         // Input string
234                         utf8StringSize,     // Input string length
235                         utf16String.get(),  // UTF-16 output buffer
236                         bufferSize          // Buffer size in wide characters
237                         );
238     const wchar_t* utf16Pointer = utf16String.get();
239     size_t numberOfCharactersToWrite = bufferSize;
240     HANDLE consoleHandle = GetStdHandle(STD_OUTPUT_HANDLE);
241     while (numberOfCharactersToWrite > 0) {
242         static const DWORD MAXIMUM_CHARACTERS_PER_PASS = 8 * 1024;
243         DWORD numberOfCharactersThisPass = static_cast<DWORD>(numberOfCharactersToWrite);
244         if (numberOfCharactersThisPass > MAXIMUM_CHARACTERS_PER_PASS) {
245             numberOfCharactersThisPass = MAXIMUM_CHARACTERS_PER_PASS;
246         }
247         DWORD numberOfCharactersWritten;
248         BOOL success = WriteConsoleW(consoleHandle,
249                                      utf16Pointer,
250                                      numberOfCharactersThisPass,
251                                      &numberOfCharactersWritten,
252                                      NULL);
253         if (0 == success) {
254             DWORD dosError = GetLastError();
255             static bool errorMessageShown = false;
256             if (ERROR_GEN_FAILURE == dosError) {
257                 if (!errorMessageShown) {
258                     std::cout << "\n---\nUnicode text could not be correctly displayed.\n"
259                                  "Please change your console font to a Unicode font "
260                                  "(e.g. Lucida Console).\n---\n"
261                               << std::endl;
262                     errorMessageShown = true;
263                 }
264                 // we can't display the text properly using a raster font,
265                 // but we can display the bits that will display ...
266                 _write(1, utf8String, utf8StringSize);
267             }
268             return false;
269         }
270         numberOfCharactersToWrite -= numberOfCharactersWritten;
271         utf16Pointer += numberOfCharactersWritten;
272     }
273     return true;
274 }
275 
WindowsCommandLine(int argc,wchar_t * argvW[],wchar_t * envpW[])276 WindowsCommandLine::WindowsCommandLine(int argc, wchar_t* argvW[], wchar_t* envpW[])
277     : _argv(NULL), _envp(NULL) {
278     // Construct UTF-8 copy of arguments
279     vector<string> utf8args;
280     vector<size_t> utf8argLength;
281     size_t blockSize = argc * sizeof(char*);
282     size_t blockPtr = blockSize;
283     for (int i = 0; i < argc; ++i) {
284         utf8args.push_back(toUtf8String(argvW[i]));
285         size_t argLength = utf8args[i].length() + 1;
286         utf8argLength.push_back(argLength);
287         blockSize += argLength;
288     }
289     _argv = static_cast<char**>(mongoMalloc(blockSize));
290     for (int i = 0; i < argc; ++i) {
291         _argv[i] = reinterpret_cast<char*>(_argv) + blockPtr;
292         strcpy_s(_argv[i], utf8argLength[i], utf8args[i].c_str());
293         blockPtr += utf8argLength[i];
294     }
295 
296     // Construct UTF-8 copy of environment strings
297     size_t envCount = 0;
298     wchar_t** envpWptr = &envpW[0];
299     while (*envpWptr++) {
300         ++envCount;
301     }
302     vector<string> utf8envs;
303     vector<size_t> utf8envLength;
304     blockSize = (envCount + 1) * sizeof(char*);
305     blockPtr = blockSize;
306     for (size_t i = 0; i < envCount; ++i) {
307         utf8envs.push_back(toUtf8String(envpW[i]));
308         size_t envLength = utf8envs[i].length() + 1;
309         utf8envLength.push_back(envLength);
310         blockSize += envLength;
311     }
312     _envp = static_cast<char**>(mongoMalloc(blockSize));
313     size_t i;
314     for (i = 0; i < envCount; ++i) {
315         _envp[i] = reinterpret_cast<char*>(_envp) + blockPtr;
316         strcpy_s(_envp[i], utf8envLength[i], utf8envs[i].c_str());
317         blockPtr += utf8envLength[i];
318     }
319     _envp[i] = NULL;
320 }
321 
~WindowsCommandLine()322 WindowsCommandLine::~WindowsCommandLine() {
323     free(_argv);
324     free(_envp);
325 }
326 
327 #endif  // #if defined(_WIN32)
328 
// See "Parsing C++ Command-Line Arguments (C++)"
// http://msdn.microsoft.com/en-us/library/windows/desktop/17w5ykft(v=vs.85).aspx
//
// Writes 'arg' to 'os' in a form that parses back to 'arg' under those rules:
//   - an empty argument becomes "" (two double quotes);
//   - an argument without space, tab or double quote is written unchanged;
//   - anything else is wrapped in double quotes, each embedded double quote is written
//     as \" and every run of backslashes that directly precedes a double quote
//     (embedded or the closing one) is doubled. Other backslashes are written as-is.
static void quoteForWindowsCommandLine(const std::string& arg, std::ostream& os) {
    if (arg.empty()) {
        os << "\"\"";
        return;
    }
    if (arg.find_first_of(" \t\"") == std::string::npos) {
        os << arg;
        return;
    }

    os << '"';
    // Backslashes seen since the last non-backslash character. They are held back
    // because whether they must be doubled depends on what follows them.
    std::string backslashes;
    for (const char ch : arg) {
        if (ch == '\\') {
            backslashes.push_back(ch);
            continue;
        }
        if (ch == '"') {
            // Run precedes a quote: double it, then escape the quote itself.
            os << backslashes << backslashes << "\\\"";
        } else {
            // Run precedes an ordinary character: backslashes are literal.
            os << backslashes << ch;
        }
        // The run has been written in both cases; forgetting to reset it after a
        // quote would emit the same backslashes a second time.
        backslashes.clear();
    }
    // A run at the very end precedes the closing quote, so it is doubled as well.
    os << backslashes << backslashes << '"';
}
358 
constructUtf8WindowsCommandLine(const std::vector<std::string> & argv)359 std::string constructUtf8WindowsCommandLine(const std::vector<std::string>& argv) {
360     if (argv.empty())
361         return "";
362 
363     std::ostringstream commandLine;
364     std::vector<std::string>::const_iterator iter = argv.begin();
365     std::vector<std::string>::const_iterator end = argv.end();
366     quoteForWindowsCommandLine(*iter, commandLine);
367     ++iter;
368     for (; iter != end; ++iter) {
369         commandLine << ' ';
370         quoteForWindowsCommandLine(*iter, commandLine);
371     }
372     return commandLine.str();
373 }
374 }
375