1 /*
2 * StringUtils.cpp
3 *
4 * Copyright (C) 2021 by RStudio, PBC
5 *
6 * Unless you have received this program directly from RStudio pursuant
7 * to the terms of a commercial license agreement with RStudio, then
8 * this program is licensed to you under the terms of version 3 of the
9 * GNU Affero General Public License. This program is distributed WITHOUT
10 * ANY EXPRESS OR IMPLIED WARRANTY, INCLUDING THOSE OF NON-INFRINGEMENT,
11 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Please refer to the
12 * AGPL (http://www.gnu.org/licenses/agpl-3.0.txt) for more details.
13 *
14 */
15
16 #include <core/StringUtils.hpp>
17
18 #include <stdarg.h>
19
20 #include <algorithm>
21 #include <map>
22 #include <ostream>
23 #include <gsl/gsl>
24
25 #include <boost/algorithm/string.hpp>
26 #include <boost/algorithm/string/case_conv.hpp>
27 #include <boost/algorithm/string/classification.hpp>
28 #include <boost/algorithm/string/replace.hpp>
29 #include <boost/algorithm/string/split.hpp>
30 #include <boost/regex.hpp>
31
32 #include <core/Algorithm.hpp>
33 #include <core/Log.hpp>
34 #include <shared_core/SafeConvert.hpp>
35 #include <shared_core/json/Json.hpp>
36
37 #ifdef _WIN32
38 #include <windows.h>
39 #include <winnls.h>
40 #endif
41
42 #ifndef CP_ACP
43 # define CP_ACP 0
44 #endif
45
46 namespace rstudio {
47 namespace core {
48 namespace string_utils {
49
isTruthy(const std::string & string,bool valueIfEmpty)50 bool isTruthy(const std::string& string,
51 bool valueIfEmpty)
52 {
53 // allow user-configurable behavior for empty strings
54 if (string.empty())
55 return valueIfEmpty;
56
57 // check for special 'falsy' values
58 std::string lower = toLower(string);
59 if (lower == "0" || lower == "false")
60 return false;
61
62 // assume all other values are 'truthy'
63 return true;
64 }
65
// Tests whether the first 'other_n' characters of 'other' occur, in order
// (but not necessarily contiguously), within 'self'.
//
// 'other_n' is clamped to the length of 'other'. An empty query (including
// one that only becomes empty after clamping) is a subsequence of anything.
bool isSubsequence(std::string const& self,
                   std::string const& other,
                   std::string::size_type other_n)
{
   // clamp first, so that an empty 'other' paired with a non-zero 'other_n'
   // is handled by the empty-query check below rather than indexing past
   // the end of 'other'
   other_n = std::min(other_n, other.length());

   if (other_n == 0)
      return true;

   // a query longer than the haystack can never match
   if (other_n > self.length())
      return false;

   // walk 'self' once, consuming one query character on each match
   std::string::size_type matched = 0;
   for (char ch : self)
   {
      if (ch == other[matched] && ++matched == other_n)
         return true;
   }

   return false;
}
101
102
isSubsequence(std::string const & self,std::string const & other,std::string::size_type other_n,bool caseInsensitive)103 bool isSubsequence(std::string const& self,
104 std::string const& other,
105 std::string::size_type other_n,
106 bool caseInsensitive)
107 {
108 return caseInsensitive ?
109 isSubsequence(boost::algorithm::to_lower_copy(self),
110 boost::algorithm::to_lower_copy(other),
111 other_n) :
112 isSubsequence(self, other, other_n)
113 ;
114 }
115
isSubsequence(std::string const & self,std::string const & other)116 bool isSubsequence(std::string const& self,
117 std::string const& other)
118 {
119 return isSubsequence(self, other, other.length());
120 }
121
isSubsequence(std::string const & self,std::string const & other,bool caseInsensitive)122 bool isSubsequence(std::string const& self,
123 std::string const& other,
124 bool caseInsensitive)
125 {
126 return isSubsequence(self, other, other.length(), caseInsensitive);
127 }
128
subsequenceIndices(std::string const & sequence,std::string const & query)129 std::vector<int> subsequenceIndices(std::string const& sequence,
130 std::string const& query)
131 {
132 std::string::size_type querySize = query.length();
133 std::vector<int> result;
134 result.reserve(querySize);
135
136 std::string::size_type prevMatchIndex = -1;
137 for (std::string::size_type i = 0; i < querySize; i++)
138 {
139 std::string::size_type index = sequence.find(query[i], prevMatchIndex + 1);
140 if (index == std::string::npos)
141 continue;
142
143 result.push_back(gsl::narrow_cast<int>(index));
144 prevMatchIndex = index;
145 }
146
147 return result;
148 }
149
subsequenceIndices(std::string const & sequence,std::string const & query,std::vector<int> * pIndices)150 bool subsequenceIndices(std::string const& sequence,
151 std::string const& query,
152 std::vector<int> *pIndices)
153 {
154 pIndices->clear();
155 pIndices->reserve(query.length());
156
157 int query_n = gsl::narrow_cast<int>(query.length());
158 int prevMatchIndex = -1;
159
160 for (int i = 0; i < query_n; i++)
161 {
162 int index = gsl::narrow_cast<int>(sequence.find(query[i], prevMatchIndex + 1));
163 if (index == -1)
164 return false;
165
166 pIndices->push_back(index);
167 prevMatchIndex = index;
168 }
169
170 return true;
171 }
172
// Returns the extension of 'x' -- everything from the last '.' onwards,
// including the dot itself -- or an empty string when 'x' contains no dot.
std::string getExtension(std::string const& x)
{
   const std::string::size_type dot = x.find_last_of('.');
   return (dot == std::string::npos) ? std::string() : x.substr(dot);
}
181
convertLineEndings(std::string * pStr,LineEnding type)182 void convertLineEndings(std::string* pStr, LineEnding type)
183 {
184 std::string replacement;
185 switch (type)
186 {
187 case LineEndingWindows:
188 replacement = "\r\n";
189 break;
190 case LineEndingPosix:
191 replacement = "\n";
192 break;
193 case LineEndingNative:
194 #if _WIN32
195 replacement = "\r\n";
196 #else
197 replacement = "\n";
198 #endif
199 break;
200 case LineEndingPassthrough:
201 default:
202 return;
203 }
204
205 *pStr = boost::regex_replace(*pStr, boost::regex("\\r?\\n|\\r|\\xE2\\x80[\\xA8\\xA9]"), replacement);
206 }
207
// Detects the line-ending convention of a file by scanning for its first
// line break.
//
// On success, writes LineEndingPosix (first break is a bare '\n') or
// LineEndingWindows (first break is '\r' immediately followed by '\n') to
// '*pType' and returns true.
//
// Returns false, leaving '*pType' untouched, when the file does not exist,
// cannot be opened (the error is logged), or the scan ends without finding
// a line break.
bool detectLineEndings(const FilePath& filePath, LineEnding* pType)
{
   if (!filePath.exists())
      return false;

   std::shared_ptr<std::istream> pIfs;
   Error error = filePath.openForRead(pIfs);
   if (error)
   {
      LOG_ERROR(error);
      return false;
   }

   // read file character-by-character using a streambuf
   try
   {
      // second argument (noskipws = true) keeps the sentry from consuming
      // leading whitespace before we start reading
      std::istream::sentry se(*pIfs, true);
      std::streambuf* sb = pIfs->rdbuf();

      while(true)
      {
         // consume the next character (or EOF)
         int ch = sb->sbumpc();

         if (ch == '\n')
         {
            // using posix line endings
            *pType = string_utils::LineEndingPosix;
            return true;
         }
         // sgetc() peeks at the following character without consuming it
         else if (ch == '\r' && sb->sgetc() == '\n')
         {
            // using windows line endings
            *pType = string_utils::LineEndingWindows;
            return true;
         }
         else if (ch == EOF)
         {
            break;
         }
         else if (pIfs->fail())
         {
            LOG_WARNING_MESSAGE("I/O Error reading file " +
                                   filePath.getAbsolutePath());
            break;
         }
      }
   }
   // NOTE(review): presumably catches and logs any exception thrown above --
   // confirm against the macro's definition
   CATCH_UNEXPECTED_EXCEPTION

   // no detection possible (perhaps the file is empty or has only one line)
   return false;
}
260
// Converts a UTF-8 string to the system (locale) multibyte encoding.
//
// On Windows the input is widened and each wide character is converted with
// wctomb(). Characters that cannot be represented are written as a "\u"
// escape followed by the hex code when 'escapeInvalidChars' is true, or as
// "?" otherwise. If the initial widening fails, the error is logged and the
// input is returned unchanged.
//
// On other platforms the string is returned as-is.
std::string utf8ToSystem(const std::string& str,
                         bool escapeInvalidChars)
{
   if (str.empty())
      return std::string();

#ifdef _WIN32

   // one wide char per input byte, plus room for the terminator
   std::vector<wchar_t> wide(str.length() + 1);
   int chars = ::MultiByteToWideChar(
            CP_UTF8, 0,
            str.c_str(), -1,
            &wide[0], gsl::narrow_cast<int>(wide.size()));

   // MultiByteToWideChar signals failure by returning 0 (it never returns a
   // negative value), so treat anything non-positive as an error
   if (chars <= 0)
   {
      LOG_ERROR(LAST_SYSTEM_ERROR());
      return str;
   }

   std::ostringstream output;
   char buffer[16];

   // Only go up to chars - 1 because last char is \0
   for (int i = 0; i < chars - 1; i++)
   {
      int n = wctomb(buffer, wide[i]);

      if (n == -1)
      {
         if (escapeInvalidChars)
         {
            // NOTE: in R, both '\u{1234}' and '\u1234' are valid
            // ways of specifying a unicode literal, but only the
            // latter is accepted by Python, and since the reticulate
            // REPL uses the same conversion routines we prefer the
            // format compatible with both parsers
            //
            // the explicit cast prints the code unit as a number; streaming
            // a wchar_t into a narrow stream is ill-formed as of C++20
            output << "\\u" << std::hex << static_cast<unsigned int>(wide[i]);
         }
         else
         {
            output << "?"; // TODO: Use GetCPInfo()
         }
      }
      else
      {
         output.write(buffer, n);
      }
   }
   return output.str();
#else
   // Assumes that UTF8 is the locale on POSIX
   return str;
#endif
}
316
// Converts a string encoded in the given Windows code page to UTF-8.
//
// On Windows the input is widened with MultiByteToWideChar() and re-encoded
// with WideCharToMultiByte(CP_UTF8). If any conversion step fails, the error
// is logged and the input is returned unchanged.
//
// On other platforms 'codepage' is ignored and the string is returned as-is.
std::string systemToUtf8(const std::string& str, int codepage)
{
   if (str.empty())
      return std::string();

#ifdef _WIN32
   std::vector<wchar_t> wide(str.length() + 1);
   int chars = ::MultiByteToWideChar(codepage,
                                     0,
                                     str.c_str(),
                                     gsl::narrow_cast<int>(str.length()),
                                     &wide[0],
                                     gsl::narrow_cast<int>(wide.size()));

   // MultiByteToWideChar signals failure by returning 0, never a negative
   // value; the input is non-empty so a successful call yields at least 1
   if (chars <= 0)
   {
      LOG_ERROR(LAST_SYSTEM_ERROR());
      return str;
   }

   // first call: ask how many UTF-8 bytes are needed
   int bytesRequired = ::WideCharToMultiByte(CP_UTF8, 0, &wide[0], chars,
                                             nullptr, 0,
                                             nullptr, nullptr);
   if (bytesRequired <= 0)
   {
      LOG_ERROR(LAST_SYSTEM_ERROR());
      return str;
   }

   // second call: perform the conversion
   std::vector<char> buf(bytesRequired, 0);
   int bytesWritten = ::WideCharToMultiByte(CP_UTF8, 0, &wide[0], chars,
                                            &(buf[0]), static_cast<int>(buf.size()),
                                            nullptr, nullptr);
   if (bytesWritten <= 0)
   {
      // don't hand back a zero-filled buffer when the conversion failed
      LOG_ERROR(LAST_SYSTEM_ERROR());
      return str;
   }

   // only return the bytes that were actually produced
   return std::string(buf.begin(), buf.begin() + bytesWritten);
#else
   return str;
#endif
}
353
systemToUtf8(const std::string & str)354 std::string systemToUtf8(const std::string& str)
355 {
356 return systemToUtf8(str, CP_ACP);
357 }
358
// Returns a copy of 'str' with each byte upper-cased via ::toupper
// (which follows the current C locale).
std::string toUpper(const std::string& str)
{
   std::string upper = str;

   // ::toupper has undefined behavior for negative arguments other than EOF,
   // so each char must be passed as an unsigned char; this matters for any
   // byte >= 0x80 on platforms where plain char is signed
   std::transform(upper.begin(), upper.end(), upper.begin(),
                  [](unsigned char ch) { return static_cast<char>(::toupper(ch)); });
   return upper;
}
365
toLower(const std::string & str)366 std::string toLower(const std::string& str)
367 {
368 std::string lower = str;
369 std::transform(lower.begin(), lower.end(), lower.begin(), core::tolower);
370 return lower;
371 }
372
// Minimal text-to-HTML conversion: replaces '&' with "&amp;" and '<' with
// "&lt;" so the text cannot open a tag or an entity. All other characters
// are copied through unchanged (use htmlEscape() for fuller escaping).
std::string textToHtml(const std::string& str)
{
   std::string html;
   html.reserve(str.size());

   // a single pass means ampersands introduced by a replacement are never
   // themselves re-escaped
   for (char ch : str)
   {
      switch (ch)
      {
      case '&':
         html += "&amp;";
         break;
      case '<':
         html += "&lt;";
         break;
      default:
         html += ch;
         break;
      }
   }

   return html;
}
380
namespace {

// Returns a copy of 'str' in which every character that appears in
// 'specialChars' has been replaced by its entry in 'replacements'.
//
// Every character listed in 'specialChars' must have an entry in
// 'replacements'; the lookup result is dereferenced without being checked.
std::string escape(std::string specialChars,
                   const std::map<char, std::string>& replacements,
                   std::string str)
{
   std::string result;
   result.reserve(static_cast<size_t>(str.size() * 1.2));

   // start of the not-yet-copied run of ordinary characters
   size_t copiedUpTo = 0;

   size_t hit = str.find_first_of(specialChars);
   while (hit != std::string::npos)
   {
      // copy the ordinary characters preceding the special one...
      result.append(str, copiedUpTo, hit - copiedUpTo);

      // ...then emit the replacement in place of the special character
      result.append(replacements.find(str[hit])->second);

      copiedUpTo = hit + 1;
      hit = str.find_first_of(specialChars, copiedUpTo);
   }

   // copy whatever follows the last special character
   result.append(str, copiedUpTo, std::string::npos);

   return result;
}

} // anonymous namespace
408
htmlEscape(const std::string & str,bool isAttributeValue)409 std::string htmlEscape(const std::string& str, bool isAttributeValue)
410 {
411 std::string escapes = isAttributeValue ?
412 "<>&'\"/\r\n" :
413 "<>&'\"/";
414
415 std::map<char, std::string> subs;
416 subs['<'] = "<";
417 subs['>'] = ">";
418 subs['&'] = "&";
419 subs['\''] = "'";
420 subs['"'] = """;
421 subs['/'] = "/";
422 if (isAttributeValue)
423 {
424 subs['\r'] = " ";
425 subs['\n'] = " ";
426 }
427
428 return escape(escapes, subs, str);
429 }
430
jsLiteralEscape(const std::string & str)431 std::string jsLiteralEscape(const std::string& str)
432 {
433 std::string escapes = "\\'\"\r\n<";
434
435 std::map<char, std::string> subs;
436 subs['\\'] = "\\\\";
437 subs['\''] = "\\'";
438 subs['"'] = "\\\"";
439 subs['\r'] = "\\r";
440 subs['\n'] = "\\n";
441 subs['<'] = "\\074";
442
443 return escape(escapes, subs, str);
444 }
445
jsonLiteralEscape(const std::string & str)446 std::string jsonLiteralEscape(const std::string& str)
447 {
448 std::string escapes = "\\\"\r\n";
449
450 std::map<char, std::string> subs;
451 subs['\\'] = "\\\\";
452 subs['"'] = "\\\"";
453 subs['\r'] = "\\r";
454 subs['\n'] = "\\n";
455
456 return escape(escapes, subs, str);
457 }
458 // The str that is passed in should INCLUDE the " " around the value!
459 // (Sorry this is inconsistent with jsonLiteralEscape, but it's more efficient
460 // than adding double-quotes in this function)
jsonLiteralUnescape(const std::string & str)461 std::string jsonLiteralUnescape(const std::string& str)
462 {
463 json::Value value;
464 if (value.parse(str) || !json::isType<std::string>(value))
465 {
466 LOG_ERROR_MESSAGE("Failed to unescape JS literal");
467 return str;
468 }
469
470 return value.getString();
471 }
472
singleQuotedStrEscape(const std::string & str)473 std::string singleQuotedStrEscape(const std::string& str)
474 {
475 std::string escapes = "'\\";
476
477 std::map<char, std::string> subs;
478 subs['\\'] = "\\\\";
479 subs['\''] = "\\'";
480
481 return escape(escapes, subs, str);
482 }
483
filterControlChars(const std::string & str)484 std::string filterControlChars(const std::string& str)
485 {
486 // Delete control chars, which can cause errors in JSON parsing (especially
487 // \0003)
488 return boost::regex_replace(str,
489 boost::regex("[\\0000-\\0010\\0016-\\0037]+"),
490 "");
491 }
492
493 namespace {
494
// Builds a membership table of 0xFFFF entries from a list of inclusive
// [first, last] code ranges: every index covered by some range is set to
// true, all others stay false.
//
// Range bounds are used as indices without checking, so every bound must be
// below 0xFFFF.
std::vector<bool> initLookupTable(wchar_t ranges[][2], size_t rangeCount)
{
   std::vector<bool> table(0xFFFF, false);

   for (size_t row = 0; row != rangeCount; ++row)
   {
      const wchar_t first = ranges[row][0];
      const wchar_t last = ranges[row][1];

      // both ends of the range are included
      for (wchar_t code = first; code <= last; ++code)
         table[code] = true;
   }

   return table;
}
505
506 // See https://gist.github.com/1110629 for range generating script
507
// Builds the lookup table consulted by isalnum().
//
// Each entry of 'ranges' is an inclusive {first, last} pair of code values;
// the list begins with the ASCII digits (0x30-0x39) and the ASCII letters
// and ends at 0xFFDC. The pairs are expanded into a per-code boolean table
// by initLookupTable().
std::vector<bool> initAlnumLookupTable()
{
   wchar_t ranges[][2] = {
      {0x30, 0x39}, {0x41, 0x5A}, {0x61, 0x7A}, {0xAA, 0xAA}, {0xB5, 0xB5}, {0xBA, 0xBA}, {0xC0, 0xD6}, {0xD8, 0xF6}, {0xF8, 0x2C1}, {0x2C6, 0x2D1}, {0x2E0, 0x2E4}, {0x2EC, 0x2EC}, {0x2EE, 0x2EE}, {0x370, 0x374}, {0x376, 0x37D}, {0x386, 0x386}, {0x388, 0x3F5}, {0x3F7, 0x481}, {0x48A, 0x559}, {0x561, 0x587}, {0x5D0, 0x5F2}, {0x620, 0x64A}, {0x660, 0x669}, {0x66E, 0x66F}, {0x671, 0x6D3}, {0x6D5, 0x6D5}, {0x6E5, 0x6E6}, {0x6EE, 0x6FC}, {0x6FF, 0x6FF}, {0x710, 0x710}, {0x712, 0x72F}, {0x74D, 0x7A5}, {0x7B1, 0x7EA}, {0x7F4, 0x7F5}, {0x7FA, 0x815}, {0x81A, 0x81A}, {0x824, 0x824}, {0x828, 0x828}, {0x840, 0x858}, {0x904, 0x939}, {0x93D, 0x93D}, {0x950, 0x950}, {0x958, 0x961}, {0x966, 0x96F}, {0x971, 0x97F}, {0x985, 0x9B9}, {0x9BD, 0x9BD}, {0x9CE, 0x9CE}, {0x9DC, 0x9E1}, {0x9E6, 0x9F1}, {0xA05, 0xA39}, {0xA59, 0xA6F}, {0xA72, 0xA74}, {0xA85, 0xAB9}, {0xABD, 0xABD}, {0xAD0, 0xAE1}, {0xAE6, 0xAEF}, {0xB05, 0xB39}, {0xB3D, 0xB3D}, {0xB5C, 0xB61}, {0xB66, 0xB6F}, {0xB71, 0xB71}, {0xB83, 0xBB9}, {0xBD0, 0xBD0}, {0xBE6, 0xBEF}, {0xC05, 0xC3D}, {0xC58, 0xC61}, {0xC66, 0xC6F}, {0xC85, 0xCB9}, {0xCBD, 0xCBD}, {0xCDE, 0xCE1}, {0xCE6, 0xCF2}, {0xD05, 0xD3D}, {0xD4E, 0xD4E}, {0xD60, 0xD61}, {0xD66, 0xD6F}, {0xD7A, 0xD7F}, {0xD85, 0xDC6}, {0xE01, 0xE30}, {0xE32, 0xE33}, {0xE40, 0xE46}, {0xE50, 0xE59}, {0xE81, 0xEB0}, {0xEB2, 0xEB3}, {0xEBD, 0xEC6}, {0xED0, 0xF00}, {0xF20, 0xF29}, {0xF40, 0xF6C}, {0xF88, 0xF8C}, {0x1000, 0x102A}, {0x103F, 0x1049}, {0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, {0x108E, 0x108E}, {0x1090, 0x1099}, {0x10A0, 0x10FA}, {0x10FC, 0x135A}, {0x1380, 0x138F}, {0x13A0, 0x13F4}, {0x1401, 0x166C}, {0x166F, 0x167F}, {0x1681, 0x169A}, {0x16A0, 0x16EA}, {0x16EE, 0x1711}, {0x1720, 0x1731}, {0x1740, 0x1751}, {0x1760, 0x1770}, {0x1780, 0x17B3}, {0x17D7, 0x17D7}, {0x17DC, 0x17DC}, {0x17E0, 0x17E9}, {0x1810, 0x18A8}, {0x18AA, 0x191C}, {0x1946, 0x19AB}, {0x19C1, 0x19C7}, {0x19D0, 0x19D9}, {0x1A00, 0x1A16}, {0x1A20, 0x1A54}, {0x1A80, 0x1A99}, {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, {0x1B45, 0x1B59}, {0x1B83, 0x1BA0}, {0x1BAE, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C40, 0x1C7D}, {0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF1}, {0x1D00, 0x1DBF}, {0x1E00, 0x1FBC}, {0x1FBE, 0x1FBE}, {0x1FC2, 0x1FCC}, {0x1FD0, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FFC}, {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C}, {0x2102, 0x2102}, {0x2107, 0x2107}, {0x210A, 0x2113}, {0x2115, 0x2115}, {0x2119, 0x211D}, {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, {0x212A, 0x212D}, {0x212F, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, {0x214E, 0x214E}, {0x2160, 0x2188}, {0x2C00, 0x2CE4}, {0x2CEB, 0x2CEE}, {0x2D00, 0x2D6F}, {0x2D80, 0x2DDE}, {0x2E2F, 0x2E2F}, {0x3005, 0x3007}, {0x3021, 0x3029}, {0x3031, 0x3035}, {0x3038, 0x303C}, {0x3041, 0x3096}, {0x309D, 0x309F}, {0x30A1, 0x30FA}, {0x30FC, 0x318E}, {0x31A0, 0x31BA}, {0x31F0, 0x31FF}, {0x3400, 0x4DB5}, {0x4E00, 0xA48C}, {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA66E}, {0xA67F, 0xA6EF}, {0xA717, 0xA71F}, {0xA722, 0xA788}, {0xA78B, 0xA801}, {0xA803, 0xA805}, {0xA807, 0xA80A}, {0xA80C, 0xA822}, {0xA840, 0xA873}, {0xA882, 0xA8B3}, {0xA8D0, 0xA8D9}, {0xA8F2, 0xA8F7}, {0xA8FB, 0xA925}, {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, {0xA9CF, 0xA9D9}, {0xAA00, 0xAA28}, {0xAA40, 0xAA42}, {0xAA44, 0xAA4B}, {0xAA50, 0xAA59}, {0xAA60, 0xAA76}, {0xAA7A, 0xAA7A}, {0xAA80, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, {0xAAB9, 0xAABD}, {0xAAC0, 0xAAC0}, {0xAAC2, 0xAADD}, {0xAB01, 0xABE2}, {0xABF0, 0xD7FB}, {0xF900, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFBB1}, {0xFBD3, 0xFD3D}, {0xFD50, 0xFDFB}, {0xFE70, 0xFEFC}, {0xFF10, 0xFF19}, {0xFF21, 0xFF3A}, {0xFF41, 0xFF5A}, {0xFF66, 0xFFDC}
   };

   // element count = total array size / size of one {first, last} pair
   return initLookupTable(ranges, sizeof(ranges) / sizeof(ranges[0]));
}
516
// Builds the lookup table consulted by isalpha().
//
// Each entry of 'ranges' is an inclusive {first, last} pair of code values;
// the list begins with the ASCII letters (0x41-0x5A, 0x61-0x7A) and ends at
// 0xFFDC. The pairs are expanded into a per-code boolean table by
// initLookupTable().
std::vector<bool> initAlphaLookupTable()
{
   wchar_t ranges[][2] = {
      {0x41, 0x5A}, {0x61, 0x7A}, {0xAA, 0xAA}, {0xB5, 0xB5}, {0xBA, 0xBA}, {0xC0, 0xD6}, {0xD8, 0xF6}, {0xF8, 0x2C1}, {0x2C6, 0x2D1}, {0x2E0, 0x2E4}, {0x2EC, 0x2EC}, {0x2EE, 0x2EE}, {0x370, 0x374}, {0x376, 0x37D}, {0x386, 0x386}, {0x388, 0x3F5}, {0x3F7, 0x481}, {0x48A, 0x559}, {0x561, 0x587}, {0x5D0, 0x5F2}, {0x620, 0x64A}, {0x66E, 0x66F}, {0x671, 0x6D3}, {0x6D5, 0x6D5}, {0x6E5, 0x6E6}, {0x6EE, 0x6EF}, {0x6FA, 0x6FC}, {0x6FF, 0x6FF}, {0x710, 0x710}, {0x712, 0x72F}, {0x74D, 0x7A5}, {0x7B1, 0x7B1}, {0x7CA, 0x7EA}, {0x7F4, 0x7F5}, {0x7FA, 0x815}, {0x81A, 0x81A}, {0x824, 0x824}, {0x828, 0x828}, {0x840, 0x858}, {0x904, 0x939}, {0x93D, 0x93D}, {0x950, 0x950}, {0x958, 0x961}, {0x971, 0x97F}, {0x985, 0x9B9}, {0x9BD, 0x9BD}, {0x9CE, 0x9CE}, {0x9DC, 0x9E1}, {0x9F0, 0x9F1}, {0xA05, 0xA39}, {0xA59, 0xA5E}, {0xA72, 0xA74}, {0xA85, 0xAB9}, {0xABD, 0xABD}, {0xAD0, 0xAE1}, {0xB05, 0xB39}, {0xB3D, 0xB3D}, {0xB5C, 0xB61}, {0xB71, 0xB71}, {0xB83, 0xBB9}, {0xBD0, 0xBD0}, {0xC05, 0xC3D}, {0xC58, 0xC61}, {0xC85, 0xCB9}, {0xCBD, 0xCBD}, {0xCDE, 0xCE1}, {0xCF1, 0xCF2}, {0xD05, 0xD3D}, {0xD4E, 0xD4E}, {0xD60, 0xD61}, {0xD7A, 0xD7F}, {0xD85, 0xDC6}, {0xE01, 0xE30}, {0xE32, 0xE33}, {0xE40, 0xE46}, {0xE81, 0xEB0}, {0xEB2, 0xEB3}, {0xEBD, 0xEC6}, {0xEDC, 0xF00}, {0xF40, 0xF6C}, {0xF88, 0xF8C}, {0x1000, 0x102A}, {0x103F, 0x103F}, {0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, {0x108E, 0x108E}, {0x10A0, 0x10FA}, {0x10FC, 0x135A}, {0x1380, 0x138F}, {0x13A0, 0x13F4}, {0x1401, 0x166C}, {0x166F, 0x167F}, {0x1681, 0x169A}, {0x16A0, 0x16EA}, {0x16EE, 0x1711}, {0x1720, 0x1731}, {0x1740, 0x1751}, {0x1760, 0x1770}, {0x1780, 0x17B3}, {0x17D7, 0x17D7}, {0x17DC, 0x17DC}, {0x1820, 0x18A8}, {0x18AA, 0x191C}, {0x1950, 0x19AB}, {0x19C1, 0x19C7}, {0x1A00, 0x1A16}, {0x1A20, 0x1A54}, {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, {0x1B45, 0x1B4B}, {0x1B83, 0x1BA0}, {0x1BAE, 0x1BAF}, {0x1BC0, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C4D, 0x1C4F}, {0x1C5A, 0x1C7D}, {0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF1}, {0x1D00, 0x1DBF}, {0x1E00, 0x1FBC}, {0x1FBE, 0x1FBE}, {0x1FC2, 0x1FCC}, {0x1FD0, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FFC}, {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C}, {0x2102, 0x2102}, {0x2107, 0x2107}, {0x210A, 0x2113}, {0x2115, 0x2115}, {0x2119, 0x211D}, {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, {0x212A, 0x212D}, {0x212F, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, {0x214E, 0x214E}, {0x2160, 0x2188}, {0x2C00, 0x2CE4}, {0x2CEB, 0x2CEE}, {0x2D00, 0x2D6F}, {0x2D80, 0x2DDE}, {0x2E2F, 0x2E2F}, {0x3005, 0x3007}, {0x3021, 0x3029}, {0x3031, 0x3035}, {0x3038, 0x303C}, {0x3041, 0x3096}, {0x309D, 0x309F}, {0x30A1, 0x30FA}, {0x30FC, 0x318E}, {0x31A0, 0x31BA}, {0x31F0, 0x31FF}, {0x3400, 0x4DB5}, {0x4E00, 0xA48C}, {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA61F}, {0xA62A, 0xA66E}, {0xA67F, 0xA6EF}, {0xA717, 0xA71F}, {0xA722, 0xA788}, {0xA78B, 0xA801}, {0xA803, 0xA805}, {0xA807, 0xA80A}, {0xA80C, 0xA822}, {0xA840, 0xA873}, {0xA882, 0xA8B3}, {0xA8F2, 0xA8F7}, {0xA8FB, 0xA8FB}, {0xA90A, 0xA925}, {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, {0xA9CF, 0xA9CF}, {0xAA00, 0xAA28}, {0xAA40, 0xAA42}, {0xAA44, 0xAA4B}, {0xAA60, 0xAA76}, {0xAA7A, 0xAA7A}, {0xAA80, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, {0xAAB9, 0xAABD}, {0xAAC0, 0xAAC0}, {0xAAC2, 0xAADD}, {0xAB01, 0xABE2}, {0xAC00, 0xD7FB}, {0xF900, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFBB1}, {0xFBD3, 0xFD3D}, {0xFD50, 0xFDFB}, {0xFE70, 0xFEFC}, {0xFF21, 0xFF3A}, {0xFF41, 0xFF5A}, {0xFF66, 0xFFDC}
   };

   // element count = total array size / size of one {first, last} pair
   return initLookupTable(ranges, sizeof(ranges) / sizeof(ranges[0]));
}
525
526 } // anonymous namespace
527
isalpha(wchar_t c)528 bool isalpha(wchar_t c)
529 {
530 static std::vector<bool> lookup = initAlphaLookupTable();
531 if (c >= 0xFFFF)
532 return false; // This function only supports BMP
533 return lookup.at(c);
534 }
535
isalnum(wchar_t c)536 bool isalnum(wchar_t c)
537 {
538 static std::vector<bool> lookup;
539 if (lookup.empty())
540 lookup = initAlnumLookupTable();
541
542 if (c >= 0xFFFF)
543 return false; // This function only supports BMP
544 return lookup.at(c);
545 }
546
parseVersion(const std::string & str,uint64_t * pVersion)547 bool parseVersion(const std::string& str, uint64_t* pVersion)
548 {
549 uint64_t version = 0;
550
551 std::vector<std::string> chunks;
552 boost::algorithm::split(chunks, str, boost::algorithm::is_any_of("."));
553
554 if (chunks.empty())
555 return false;
556
557 for (size_t i = 0; i < chunks.size() && i < 4; i++)
558 {
559 boost::optional<uint16_t> value = core::safe_convert::stringTo<uint16_t>(chunks[i]);
560 if (!value)
561 return false;
562 version += static_cast<uint64_t>(value.get()) << ((3-i) * 16);
563 }
564 if (pVersion)
565 *pVersion = version;
566 return true;
567 }
568
// Trims '*pLines' down to its trailing lines.
//
// Scans backwards counting '\n' characters; once more than 'maxLines' have
// been seen, everything before that newline is erased (the newline itself is
// kept as the first character). Strings no longer than 'maxLines * 2'
// characters are never scanned.
//
// Returns true if anything was erased.
bool trimLeadingLines(int maxLines, std::string* pLines)
{
   // cheap pre-check before scanning the string
   const std::size_t threshold = static_cast<unsigned int>(maxLines * 2);
   if (pLines->length() <= threshold)
      return false;

   int newlinesSeen = 0;
   for (std::size_t i = pLines->length(); i-- > 0; )
   {
      if ((*pLines)[i] != '\n')
         continue;

      if (++newlinesSeen > maxLines)
      {
         // drop everything that precedes this newline
         pLines->erase(0, i);
         return true;
      }
   }

   return false;
}
598
// Returns 'string' with a leading and/or trailing backtick removed.
//
// The two ends are handled independently, so an unmatched backtick at either
// end is stripped too. Strings shorter than two characters are returned
// unchanged.
std::string strippedOfBackQuotes(const std::string& string)
{
   const std::size_t length = string.length();
   if (length < 2)
      return string;

   const std::size_t skipFront = (string.front() == '`') ? 1 : 0;
   const std::size_t skipBack = (string.back() == '`') ? 1 : 0;

   return string.substr(skipFront, length - skipFront - skipBack);
}
613
// Removes, in place, one leading and one trailing quote character (either
// ' or ") from '*pStr'. The two ends are checked independently and need not
// match each other.
void stripQuotes(std::string* pStr)
{
   std::string& text = *pStr;
   const auto isQuote = [](char ch) { return ch == '\'' || ch == '"'; };

   if (!text.empty() && isQuote(text.front()))
      text.erase(0, 1);

   // re-check emptiness: the string may have consisted of a single quote
   if (!text.empty() && isQuote(text.back()))
      text.pop_back();
}
624
// Returns 'string' without its surrounding quotes when it both starts and
// ends with the same quote character (', " or `). Any other string --
// including one shorter than two characters -- is returned unchanged.
std::string strippedOfQuotes(const std::string& string)
{
   std::string::size_type n = string.length();
   if (n < 2) return string;

   char first = string[0];
   char last = string[n - 1];

   // only a matching pair of one of the three quote characters is stripped
   const bool isQuoteChar = (first == '\'') || (first == '"') || (first == '`');
   if (isQuoteChar && first == last)
      return string.substr(1, n - 2);

   return string;
}
642
// Counts the line breaks in [begin, end).
//
// A CR immediately followed by LF counts as one break, as does a bare LF;
// a CR that is not followed by LF is not counted. The count is written to
// '*pNewlineCount'.
//
// Returns an iterator to the start of the last break found (the CR of a
// CR-LF pair, or the bare LF), or 'end' if there was none.
template <typename Iter, typename U>
Iter countNewlinesImpl(Iter begin,
                       Iter end,
                       const U& CR,
                       const U& LF,
                       std::size_t* pNewlineCount)
{
   std::size_t total = 0;
   Iter lastBreak = end;

   Iter cursor = begin;
   while (cursor != end)
   {
      // CR-LF pair: record it and step over both characters
      const bool isCrLf = (*cursor == CR) &&
                          (cursor + 1 != end) &&
                          (*(cursor + 1) == LF);
      if (isCrLf)
      {
         lastBreak = cursor;
         ++total;
         cursor += 2;
         continue;
      }

      // bare LF
      if (*cursor == LF)
      {
         lastBreak = cursor;
         ++total;
      }

      ++cursor;
   }

   *pNewlineCount = total;
   return lastBreak;
}
681
countNewlines(const std::wstring & string)682 std::size_t countNewlines(const std::wstring& string)
683 {
684 std::size_t count = 0;
685 countNewlinesImpl(string.begin(), string.end(), L'\r', L'\n', &count);
686 return count;
687 }
688
countNewlines(const std::string & string)689 std::size_t countNewlines(const std::string& string)
690 {
691 std::size_t count = 0;
692 countNewlinesImpl(string.begin(), string.end(), '\r', '\n', &count);
693 return count;
694 }
695
countNewlines(std::string::iterator begin,std::string::iterator end)696 std::size_t countNewlines(std::string::iterator begin,
697 std::string::iterator end)
698 {
699 std::size_t count = 0;
700 countNewlinesImpl(begin, end, '\r', '\n', &count);
701 return count;
702 }
703
countNewlines(std::wstring::iterator begin,std::wstring::iterator end)704 std::size_t countNewlines(std::wstring::iterator begin,
705 std::wstring::iterator end)
706 {
707 std::size_t count = 0;
708 countNewlinesImpl(begin, end, '\r', '\n', &count);
709 return count;
710 }
711
countNewlines(std::wstring::const_iterator begin,std::wstring::const_iterator end,std::size_t * pCount)712 std::wstring::const_iterator countNewlines(std::wstring::const_iterator begin,
713 std::wstring::const_iterator end,
714 std::size_t* pCount)
715 {
716 return countNewlinesImpl(begin, end, '\r', '\n', pCount);
717 }
718
isPrefixOf(const std::string & self,const std::string & prefix)719 bool isPrefixOf(const std::string& self, const std::string& prefix)
720 {
721 return boost::algorithm::starts_with(self, prefix);
722 }
723
// Returns a string of 'n' pseudo-random bytes drawn from ::rand().
//
// NOTE(review): ::rand() is not a cryptographically secure source; confirm
// that no caller relies on these bytes being unpredictable.
std::string makeRandomByteString(std::size_t n)
{
   std::string result(n, '\0');

   // reduce modulo UCHAR_MAX + 1 so that every byte value, including 0xFF,
   // can be produced
   for (char& byte : result)
   {
      const unsigned char value =
            static_cast<unsigned char>(::rand() % (UCHAR_MAX + 1));
      byte = static_cast<char>(value);
   }

   return result;
}
732
extractCommentHeader(const std::string & contents,const std::string & reCommentPrefix,std::string * pHeader)733 bool extractCommentHeader(const std::string& contents,
734 const std::string& reCommentPrefix,
735 std::string* pHeader)
736 {
737 // construct newline-based token iterator
738 boost::regex reNewline("(?:\\r?\\n|$)");
739 boost::sregex_token_iterator it(
740 contents.begin(),
741 contents.end(),
742 reNewline,
743 -1);
744 boost::sregex_token_iterator end;
745
746 // first, skip blank lines
747 boost::regex reWhitespace("^\\s*$");
748 while (it != end)
749 {
750 if (boost::regex_match(it->begin(), it->end(), reWhitespace))
751 {
752 ++it;
753 continue;
754 }
755
756 break;
757 }
758
759 // if we're at the end now, bail
760 if (it == end)
761 return false;
762
763 // check to see if we landed on our comment prefix and
764 // quit early if we haven't
765 boost::regex rePrefix(reCommentPrefix);
766 if (!boost::regex_search(it->begin(), it->end(), rePrefix))
767 return false;
768
769 // we have a prefix: start iterating and extracting these
770 for (; it != end; ++it)
771 {
772 boost::smatch match;
773 if (!boost::regex_search(it->begin(), it->end(), match, rePrefix))
774 {
775 // this is no longer a commented line; time to go home
776 break;
777 }
778
779 // extract the line (sans prefix)
780 std::string line(it->begin() + match.length(), it->end());
781 pHeader->append(line + "\n");
782 }
783
784 // report success to the user
785 return true;
786 }
787
// Returns the leading run of spaces and tabs in 'line'.
//
// A line consisting solely of spaces and tabs (or an empty line) yields an
// empty string.
std::string extractIndent(const std::string& line)
{
   std::size_t width = 0;
   while (width < line.size() && (line[width] == ' ' || line[width] == '\t'))
      ++width;

   // no non-whitespace character found
   if (width == line.size())
      return std::string();

   return line.substr(0, width);
}
795
// Formats 'd' using the default stream notation with the given precision
// (as set by std::ios_base::precision).
std::string formatDouble(const double d, const int precision)
{
   std::ostringstream formatted;
   formatted.precision(precision);
   formatted << d;
   return formatted.str();
}
803
sprintf(const char * fmt,...)804 std::string sprintf(const char* fmt, ...)
805 {
806 // note: the semantics for vsnprintf are slightly awkward... when vsnprintf
807 // is called with a null pointer, it returns the number of characters that
808 // would be written, not including the null terminator. however, when called
809 // with a buffer, vsnprintf will write a maximum of n - 1 characters, and
810 // will always write a null terminator at the end! so we need to ensure we
811 // add 1 character to the size returned by vsnprintf(nullptr) to get the
812 // full size of the C string we want to generate
813 std::size_t n = 0;
814 {
815 va_list args;
816 va_start(args, fmt);
817 n = std::vsnprintf(nullptr, 0, fmt, args);
818 va_end(args);
819 }
820
821 if (n == 0)
822 {
823 return std::string();
824 }
825
826 // allocate buffer of required size
827 // (include space for null pointer)
828 std::vector<char> buffer(n + 1);
829
830 // write formatted string to buffer
831 {
832 va_list args;
833 va_start(args, fmt);
834 std::vsnprintf(&buffer[0], buffer.size(), fmt, args);
835 va_end(args);
836 }
837
838 // return as string
839 return std::string(&buffer[0], n);
840 }
841
842 } // namespace string_utils
843 } // namespace core
844 } // namespace rstudio
845
846
847
848