1 /*
2  * StringUtils.cpp
3  *
4  * Copyright (C) 2021 by RStudio, PBC
5  *
6  * Unless you have received this program directly from RStudio pursuant
7  * to the terms of a commercial license agreement with RStudio, then
8  * this program is licensed to you under the terms of version 3 of the
9  * GNU Affero General Public License. This program is distributed WITHOUT
10  * ANY EXPRESS OR IMPLIED WARRANTY, INCLUDING THOSE OF NON-INFRINGEMENT,
11  * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Please refer to the
12  * AGPL (http://www.gnu.org/licenses/agpl-3.0.txt) for more details.
13  *
14  */
15 
16 #include <core/StringUtils.hpp>
17 
18 #include <stdarg.h>
19 
20 #include <algorithm>
21 #include <map>
22 #include <ostream>
23 #include <gsl/gsl>
24 
25 #include <boost/algorithm/string.hpp>
26 #include <boost/algorithm/string/case_conv.hpp>
27 #include <boost/algorithm/string/classification.hpp>
28 #include <boost/algorithm/string/replace.hpp>
29 #include <boost/algorithm/string/split.hpp>
30 #include <boost/regex.hpp>
31 
32 #include <core/Algorithm.hpp>
33 #include <core/Log.hpp>
34 #include <shared_core/SafeConvert.hpp>
35 #include <shared_core/json/Json.hpp>
36 
37 #ifdef _WIN32
38 #include <windows.h>
39 #include <winnls.h>
40 #endif
41 
42 #ifndef CP_ACP
43 # define CP_ACP 0
44 #endif
45 
46 namespace rstudio {
47 namespace core {
48 namespace string_utils {
49 
isTruthy(const std::string & string,bool valueIfEmpty)50 bool isTruthy(const std::string& string,
51               bool valueIfEmpty)
52 {
53    // allow user-configurable behavior for empty strings
54    if (string.empty())
55       return valueIfEmpty;
56 
57    // check for special 'falsy' values
58    std::string lower = toLower(string);
59    if (lower == "0" || lower == "false")
60       return false;
61 
62    // assume all other values are 'truthy'
63    return true;
64 }
65 
// Tests whether the first 'other_n' characters of 'other' occur within
// 'self' in the same order (not necessarily contiguously).
//
// 'other_n' is clamped to the length of 'other'. An empty query is a
// subsequence of every string.
bool isSubsequence(std::string const& self,
                   std::string const& other,
                   std::string::size_type other_n)
{
   // clamp before the emptiness check so that an empty 'other' paired with a
   // non-zero 'other_n' is treated as the empty query rather than scanned
   if (other_n > other.length())
      other_n = other.length();

   if (other_n == 0)
      return true;

   // a query longer than the haystack can never match
   std::string::size_type self_n = self.length();
   if (other_n > self_n)
      return false;

   // walk 'self' once, advancing through 'other' on every matching character
   std::string::size_type other_idx = 0;
   for (std::string::size_type self_idx = 0; self_idx < self_n; ++self_idx)
   {
      if (self[self_idx] != other[other_idx])
         continue;

      if (++other_idx == other_n)
         return true;
   }

   return false;
}
101 
102 
isSubsequence(std::string const & self,std::string const & other,std::string::size_type other_n,bool caseInsensitive)103 bool isSubsequence(std::string const& self,
104                    std::string const& other,
105                    std::string::size_type other_n,
106                    bool caseInsensitive)
107 {
108    return caseInsensitive ?
109             isSubsequence(boost::algorithm::to_lower_copy(self),
110                           boost::algorithm::to_lower_copy(other),
111                           other_n) :
112             isSubsequence(self, other, other_n)
113             ;
114 }
115 
isSubsequence(std::string const & self,std::string const & other)116 bool isSubsequence(std::string const& self,
117                    std::string const& other)
118 {
119    return isSubsequence(self, other, other.length());
120 }
121 
isSubsequence(std::string const & self,std::string const & other,bool caseInsensitive)122 bool isSubsequence(std::string const& self,
123                    std::string const& other,
124                    bool caseInsensitive)
125 {
126    return isSubsequence(self, other, other.length(), caseInsensitive);
127 }
128 
// Returns the positions in 'sequence' at which successive characters of
// 'query' are found, each search starting just after the previous match.
// Query characters that cannot be found are skipped, so the result may hold
// fewer entries than 'query' has characters.
std::vector<int> subsequenceIndices(std::string const& sequence,
                                    std::string const& query)
{
   std::vector<int> matches;
   matches.reserve(query.length());

   // offset at which the next search begins (one past the previous match)
   std::string::size_type searchFrom = 0;
   for (std::string::const_iterator it = query.begin(); it != query.end(); ++it)
   {
      const std::string::size_type found = sequence.find(*it, searchFrom);
      if (found != std::string::npos)
      {
         // intentional narrowing: indices are reported as int
         matches.push_back(static_cast<int>(found));
         searchFrom = found + 1;
      }
   }

   return matches;
}
149 
// Computes the positions in 'sequence' matching successive characters of
// 'query', each search starting just after the previous match.
//
// Returns true when every query character was found. On failure returns
// false, and '*pIndices' holds the indices matched before the miss.
// '*pIndices' is cleared on entry.
bool subsequenceIndices(std::string const& sequence,
                        std::string const& query,
                        std::vector<int> *pIndices)
{
   pIndices->clear();
   pIndices->reserve(query.length());

   std::string::size_type searchFrom = 0;
   for (std::string::size_type i = 0; i < query.length(); i++)
   {
      // test against npos before narrowing, rather than relying on npos
      // happening to convert to -1
      const std::string::size_type index = sequence.find(query[i], searchFrom);
      if (index == std::string::npos)
         return false;

      // intentional narrowing: indices are reported as int
      pIndices->push_back(static_cast<int>(index));
      searchFrom = index + 1;
   }

   return true;
}
172 
// Returns the text from the last '.' in 'x' through the end of the string
// (the dot is included), or an empty string when 'x' contains no '.'.
std::string getExtension(std::string const& x)
{
   const std::size_t dotPos = x.rfind('.');
   return (dotPos == std::string::npos) ? std::string() : x.substr(dotPos);
}
181 
convertLineEndings(std::string * pStr,LineEnding type)182 void convertLineEndings(std::string* pStr, LineEnding type)
183 {
184    std::string replacement;
185    switch (type)
186    {
187    case LineEndingWindows:
188       replacement = "\r\n";
189       break;
190    case LineEndingPosix:
191       replacement = "\n";
192       break;
193    case LineEndingNative:
194 #if _WIN32
195       replacement = "\r\n";
196 #else
197       replacement = "\n";
198 #endif
199       break;
200    case LineEndingPassthrough:
201    default:
202       return;
203    }
204 
205    *pStr = boost::regex_replace(*pStr, boost::regex("\\r?\\n|\\r|\\xE2\\x80[\\xA8\\xA9]"), replacement);
206 }
207 
detectLineEndings(const FilePath & filePath,LineEnding * pType)208 bool detectLineEndings(const FilePath& filePath, LineEnding* pType)
209 {
210    if (!filePath.exists())
211       return false;
212 
213    std::shared_ptr<std::istream> pIfs;
214    Error error = filePath.openForRead(pIfs);
215    if (error)
216    {
217       LOG_ERROR(error);
218       return false;
219    }
220 
221    // read file character-by-character using a streambuf
222    try
223    {
224       std::istream::sentry se(*pIfs, true);
225       std::streambuf* sb = pIfs->rdbuf();
226 
227       while(true)
228       {
229          int ch = sb->sbumpc();
230 
231          if (ch == '\n')
232          {
233             // using posix line endings
234             *pType = string_utils::LineEndingPosix;
235             return true;
236          }
237          else if (ch == '\r' && sb->sgetc() == '\n')
238          {
239             // using windows line endings
240             *pType = string_utils::LineEndingWindows;
241             return true;
242          }
243          else if (ch == EOF)
244          {
245             break;
246          }
247          else if (pIfs->fail())
248          {
249             LOG_WARNING_MESSAGE("I/O Error reading file " +
250                                    filePath.getAbsolutePath());
251             break;
252          }
253       }
254    }
255    CATCH_UNEXPECTED_EXCEPTION
256 
257    // no detection possible (perhaps the file is empty or has only one line)
258    return false;
259 }
260 
// Converts a UTF-8 string to the system (locale) encoding.
//
// On Windows the text is decoded to wide characters and re-encoded one
// character at a time with wctomb(). Characters that cannot be encoded are
// written as "\u<hex>" when 'escapeInvalidChars' is true, or as "?"
// otherwise. If the initial decoding fails, the error is logged and 'str' is
// returned unchanged. On other platforms 'str' is returned as-is.
std::string utf8ToSystem(const std::string& str,
                         bool escapeInvalidChars)
{
   if (str.empty())
      return std::string();

#ifdef _WIN32

   // one wide character per input byte, plus the terminator
   std::vector<wchar_t> wide(str.length() + 1);
   int chars = ::MultiByteToWideChar(
            CP_UTF8, 0,
            str.c_str(), -1,
            &wide[0], gsl::narrow_cast<int>(wide.size()));

   // MultiByteToWideChar reports failure by returning 0, not a negative value
   if (chars <= 0)
   {
      LOG_ERROR(LAST_SYSTEM_ERROR());
      return str;
   }

   std::ostringstream output;
   char buffer[16];

   // Only go up to chars - 1 because last char is \0
   for (int i = 0; i < chars - 1; i++)
   {
      int n = wctomb(buffer, wide[i]);

      if (n == -1)
      {
         if (escapeInvalidChars)
         {
            // NOTE: in R, both '\u{1234}' and '\u1234' are valid
            // ways of specifying a unicode literal, but only the
            // latter is accepted by Python, and since the reticulate
            // REPL uses the same conversion routines we prefer the
            // format compatible with both parsers
            //
            // the explicit integer cast makes it unambiguous that the
            // numeric value (not a character) is being streamed
            //
            // NOTE(review): the value is not zero-padded to four digits;
            // confirm that consumers accept shorter escapes
            output << "\\u" << std::hex << static_cast<unsigned int>(wide[i]);
         }
         else
         {
            output << "?"; // TODO: Use GetCPInfo()
         }
      }
      else
      {
         output.write(buffer, n);
      }
   }
   return output.str();
#else
   // Assumes that UTF8 is the locale on POSIX
   return str;
#endif
}
316 
// Converts 'str' from the given Windows code page to UTF-8.
//
// On Windows the text is decoded to wide characters and re-encoded as UTF-8.
// If any conversion step fails, the error is logged and 'str' is returned
// unchanged. On other platforms 'codepage' is ignored and 'str' is returned
// as-is.
std::string systemToUtf8(const std::string& str, int codepage)
{
   if (str.empty())
      return std::string();

#ifdef _WIN32
   std::vector<wchar_t> wide(str.length() + 1);
   int chars = ::MultiByteToWideChar(codepage,
                                     0,
                                     str.c_str(),
                                     gsl::narrow_cast<int>(str.length()),
                                     &wide[0],
                                     gsl::narrow_cast<int>(wide.size()));

   // MultiByteToWideChar reports failure by returning 0, not a negative value
   if (chars <= 0)
   {
      LOG_ERROR(LAST_SYSTEM_ERROR());
      return str;
   }

   // first pass: ask how many UTF-8 bytes are needed
   int bytesRequired = ::WideCharToMultiByte(CP_UTF8, 0, &wide[0], chars,
                                             nullptr, 0,
                                             nullptr, nullptr);
   if (bytesRequired <= 0)
   {
      LOG_ERROR(LAST_SYSTEM_ERROR());
      return str;
   }

   // second pass: perform the conversion into a buffer of that size
   std::vector<char> buf(bytesRequired, 0);
   int bytesWritten = ::WideCharToMultiByte(CP_UTF8, 0, &wide[0], chars,
                                            &(buf[0]), static_cast<int>(buf.size()),
                                            nullptr, nullptr);
   if (bytesWritten <= 0)
   {
      LOG_ERROR(LAST_SYSTEM_ERROR());
      return str;
   }

   // only hand back the bytes that were actually produced
   return std::string(buf.begin(), buf.begin() + bytesWritten);
#else
   return str;
#endif
}
353 
systemToUtf8(const std::string & str)354 std::string systemToUtf8(const std::string& str)
355 {
356    return systemToUtf8(str, CP_ACP);
357 }
358 
// Returns a copy of 'str' with each character converted by ::toupper.
std::string toUpper(const std::string& str)
{
   std::string upper = str;

   // ::toupper requires a value representable as unsigned char (or EOF);
   // passing a negative char (e.g. a UTF-8 continuation byte where char is
   // signed) is undefined behavior, so route each byte through unsigned char
   std::transform(upper.begin(), upper.end(), upper.begin(),
                  [](unsigned char ch) { return static_cast<char>(::toupper(ch)); });
   return upper;
}
365 
toLower(const std::string & str)366 std::string toLower(const std::string& str)
367 {
368    std::string lower = str;
369    std::transform(lower.begin(), lower.end(), lower.begin(), core::tolower);
370    return lower;
371 }
372 
textToHtml(const std::string & str)373 std::string textToHtml(const std::string& str)
374 {
375    std::string html = str;
376    boost::replace_all(html, "&", "&amp;");
377    boost::replace_all(html, "<", "&lt;");
378    return html;
379 }
380 
namespace {

// Returns a copy of 'str' in which each character listed in 'specialChars'
// is replaced by the string mapped to it in 'replacements'.
//
// A special character with no entry in 'replacements' is copied through
// unchanged (rather than dereferencing the map's end iterator).
std::string escape(const std::string& specialChars,
                   const std::map<char, std::string>& replacements,
                   const std::string& str)
{
   std::string result;

   // leave some headroom for the expansion caused by replacements
   result.reserve(static_cast<size_t>(str.size() * 1.2));

   // 'tail' marks the start of the not-yet-copied text; 'head' is the next
   // special character at or after it
   std::string::size_type tail = 0;
   std::string::size_type head = str.find_first_of(specialChars);
   while (head != std::string::npos)
   {
      // copy the untouched run preceding the special character
      result.append(str, tail, head - tail);

      std::map<char, std::string>::const_iterator it =
            replacements.find(str[head]);
      if (it != replacements.end())
         result.append(it->second);
      else
         result.push_back(str[head]);

      tail = head + 1;
      head = str.find_first_of(specialChars, tail);
   }

   // copy whatever follows the last special character
   result.append(str, tail, std::string::npos);

   return result;
}

} // anonymous namespace
408 
htmlEscape(const std::string & str,bool isAttributeValue)409 std::string htmlEscape(const std::string& str, bool isAttributeValue)
410 {
411    std::string escapes = isAttributeValue ?
412                          "<>&'\"/\r\n" :
413                          "<>&'\"/";
414 
415    std::map<char, std::string> subs;
416    subs['<'] = "&lt;";
417    subs['>'] = "&gt;";
418    subs['&'] = "&amp;";
419    subs['\''] = "&#x27;";
420    subs['"'] = "&quot;";
421    subs['/'] = "&#x2F;";
422    if (isAttributeValue)
423    {
424       subs['\r'] = "&#13;";
425       subs['\n'] = "&#10;";
426    }
427 
428    return escape(escapes, subs, str);
429 }
430 
jsLiteralEscape(const std::string & str)431 std::string jsLiteralEscape(const std::string& str)
432 {
433    std::string escapes = "\\'\"\r\n<";
434 
435    std::map<char, std::string> subs;
436    subs['\\'] = "\\\\";
437    subs['\''] = "\\'";
438    subs['"'] = "\\\"";
439    subs['\r'] = "\\r";
440    subs['\n'] = "\\n";
441    subs['<'] = "\\074";
442 
443    return escape(escapes, subs, str);
444 }
445 
jsonLiteralEscape(const std::string & str)446 std::string jsonLiteralEscape(const std::string& str)
447 {
448    std::string escapes = "\\\"\r\n";
449 
450    std::map<char, std::string> subs;
451    subs['\\'] = "\\\\";
452    subs['"'] = "\\\"";
453    subs['\r'] = "\\r";
454    subs['\n'] = "\\n";
455 
456    return escape(escapes, subs, str);
457 }
458 // The str that is passed in should INCLUDE the " " around the value!
459 // (Sorry this is inconsistent with jsonLiteralEscape, but it's more efficient
460 // than adding double-quotes in this function)
jsonLiteralUnescape(const std::string & str)461 std::string jsonLiteralUnescape(const std::string& str)
462 {
463    json::Value value;
464    if (value.parse(str) || !json::isType<std::string>(value))
465    {
466       LOG_ERROR_MESSAGE("Failed to unescape JS literal");
467       return str;
468    }
469 
470    return value.getString();
471 }
472 
singleQuotedStrEscape(const std::string & str)473 std::string singleQuotedStrEscape(const std::string& str)
474 {
475    std::string escapes = "'\\";
476 
477    std::map<char, std::string> subs;
478    subs['\\'] = "\\\\";
479    subs['\''] = "\\'";
480 
481    return escape(escapes, subs, str);
482 }
483 
filterControlChars(const std::string & str)484 std::string filterControlChars(const std::string& str)
485 {
486    // Delete control chars, which can cause errors in JSON parsing (especially
487    // \0003)
488    return boost::regex_replace(str,
489                                boost::regex("[\\0000-\\0010\\0016-\\0037]+"),
490                                "");
491 }
492 
493 namespace {
494 
// Builds a membership table with 0xFFFF entries in which every index covered
// by one of the inclusive [first, last] pairs in 'ranges' is set to true.
//
// Ranges are clamped to the table, so a pair that reaches or exceeds the
// table size can neither write out of bounds nor overflow the loop counter.
std::vector<bool> initLookupTable(wchar_t ranges[][2], size_t rangeCount)
{
   const long tableSize = 0xFFFF;
   std::vector<bool> results(static_cast<size_t>(tableSize), false);

   for (size_t i = 0; i < rangeCount; i++)
   {
      // iterate in a type wider than wchar_t so the counter cannot wrap
      long first = static_cast<long>(ranges[i][0]);
      long last = static_cast<long>(ranges[i][1]);

      if (first < 0)
         first = 0;
      if (last >= tableSize)
         last = tableSize - 1;

      for (long j = first; j <= last; j++)
         results[static_cast<size_t>(j)] = true;
   }

   return results;
}
505 
506 // See https://gist.github.com/1110629 for range generating script
507 
initAlnumLookupTable()508 std::vector<bool> initAlnumLookupTable()
509 {
510    wchar_t ranges[][2] = {
511       {0x30, 0x39}, {0x41, 0x5A}, {0x61, 0x7A}, {0xAA, 0xAA}, {0xB5, 0xB5}, {0xBA, 0xBA}, {0xC0, 0xD6}, {0xD8, 0xF6}, {0xF8, 0x2C1}, {0x2C6, 0x2D1}, {0x2E0, 0x2E4}, {0x2EC, 0x2EC}, {0x2EE, 0x2EE}, {0x370, 0x374}, {0x376, 0x37D}, {0x386, 0x386}, {0x388, 0x3F5}, {0x3F7, 0x481}, {0x48A, 0x559}, {0x561, 0x587}, {0x5D0, 0x5F2}, {0x620, 0x64A}, {0x660, 0x669}, {0x66E, 0x66F}, {0x671, 0x6D3}, {0x6D5, 0x6D5}, {0x6E5, 0x6E6}, {0x6EE, 0x6FC}, {0x6FF, 0x6FF}, {0x710, 0x710}, {0x712, 0x72F}, {0x74D, 0x7A5}, {0x7B1, 0x7EA}, {0x7F4, 0x7F5}, {0x7FA, 0x815}, {0x81A, 0x81A}, {0x824, 0x824}, {0x828, 0x828}, {0x840, 0x858}, {0x904, 0x939}, {0x93D, 0x93D}, {0x950, 0x950}, {0x958, 0x961}, {0x966, 0x96F}, {0x971, 0x97F}, {0x985, 0x9B9}, {0x9BD, 0x9BD}, {0x9CE, 0x9CE}, {0x9DC, 0x9E1}, {0x9E6, 0x9F1}, {0xA05, 0xA39}, {0xA59, 0xA6F}, {0xA72, 0xA74}, {0xA85, 0xAB9}, {0xABD, 0xABD}, {0xAD0, 0xAE1}, {0xAE6, 0xAEF}, {0xB05, 0xB39}, {0xB3D, 0xB3D}, {0xB5C, 0xB61}, {0xB66, 0xB6F}, {0xB71, 0xB71}, {0xB83, 0xBB9}, {0xBD0, 0xBD0}, {0xBE6, 0xBEF}, {0xC05, 0xC3D}, {0xC58, 0xC61}, {0xC66, 0xC6F}, {0xC85, 0xCB9}, {0xCBD, 0xCBD}, {0xCDE, 0xCE1}, {0xCE6, 0xCF2}, {0xD05, 0xD3D}, {0xD4E, 0xD4E}, {0xD60, 0xD61}, {0xD66, 0xD6F}, {0xD7A, 0xD7F}, {0xD85, 0xDC6}, {0xE01, 0xE30}, {0xE32, 0xE33}, {0xE40, 0xE46}, {0xE50, 0xE59}, {0xE81, 0xEB0}, {0xEB2, 0xEB3}, {0xEBD, 0xEC6}, {0xED0, 0xF00}, {0xF20, 0xF29}, {0xF40, 0xF6C}, {0xF88, 0xF8C}, {0x1000, 0x102A}, {0x103F, 0x1049}, {0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, {0x108E, 0x108E}, {0x1090, 0x1099}, {0x10A0, 0x10FA}, {0x10FC, 0x135A}, {0x1380, 0x138F}, {0x13A0, 0x13F4}, {0x1401, 0x166C}, {0x166F, 0x167F}, {0x1681, 0x169A}, {0x16A0, 0x16EA}, {0x16EE, 0x1711}, {0x1720, 0x1731}, {0x1740, 0x1751}, {0x1760, 0x1770}, {0x1780, 0x17B3}, {0x17D7, 0x17D7}, {0x17DC, 0x17DC}, {0x17E0, 0x17E9}, {0x1810, 0x18A8}, {0x18AA, 0x191C}, {0x1946, 0x19AB}, {0x19C1, 0x19C7}, {0x19D0, 0x19D9}, {0x1A00, 0x1A16}, 
{0x1A20, 0x1A54}, {0x1A80, 0x1A99}, {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, {0x1B45, 0x1B59}, {0x1B83, 0x1BA0}, {0x1BAE, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C40, 0x1C7D}, {0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF1}, {0x1D00, 0x1DBF}, {0x1E00, 0x1FBC}, {0x1FBE, 0x1FBE}, {0x1FC2, 0x1FCC}, {0x1FD0, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FFC}, {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C}, {0x2102, 0x2102}, {0x2107, 0x2107}, {0x210A, 0x2113}, {0x2115, 0x2115}, {0x2119, 0x211D}, {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, {0x212A, 0x212D}, {0x212F, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, {0x214E, 0x214E}, {0x2160, 0x2188}, {0x2C00, 0x2CE4}, {0x2CEB, 0x2CEE}, {0x2D00, 0x2D6F}, {0x2D80, 0x2DDE}, {0x2E2F, 0x2E2F}, {0x3005, 0x3007}, {0x3021, 0x3029}, {0x3031, 0x3035}, {0x3038, 0x303C}, {0x3041, 0x3096}, {0x309D, 0x309F}, {0x30A1, 0x30FA}, {0x30FC, 0x318E}, {0x31A0, 0x31BA}, {0x31F0, 0x31FF}, {0x3400, 0x4DB5}, {0x4E00, 0xA48C}, {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA66E}, {0xA67F, 0xA6EF}, {0xA717, 0xA71F}, {0xA722, 0xA788}, {0xA78B, 0xA801}, {0xA803, 0xA805}, {0xA807, 0xA80A}, {0xA80C, 0xA822}, {0xA840, 0xA873}, {0xA882, 0xA8B3}, {0xA8D0, 0xA8D9}, {0xA8F2, 0xA8F7}, {0xA8FB, 0xA925}, {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, {0xA9CF, 0xA9D9}, {0xAA00, 0xAA28}, {0xAA40, 0xAA42}, {0xAA44, 0xAA4B}, {0xAA50, 0xAA59}, {0xAA60, 0xAA76}, {0xAA7A, 0xAA7A}, {0xAA80, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, {0xAAB9, 0xAABD}, {0xAAC0, 0xAAC0}, {0xAAC2, 0xAADD}, {0xAB01, 0xABE2}, {0xABF0, 0xD7FB}, {0xF900, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFBB1}, {0xFBD3, 0xFD3D}, {0xFD50, 0xFDFB}, {0xFE70, 0xFEFC}, {0xFF10, 0xFF19}, {0xFF21, 0xFF3A}, {0xFF41, 0xFF5A}, {0xFF66, 0xFFDC}
512    };
513 
514    return initLookupTable(ranges, sizeof(ranges) / sizeof(ranges[0]));
515 }
516 
initAlphaLookupTable()517 std::vector<bool> initAlphaLookupTable()
518 {
519    wchar_t ranges[][2] = {
520       {0x41, 0x5A}, {0x61, 0x7A}, {0xAA, 0xAA}, {0xB5, 0xB5}, {0xBA, 0xBA}, {0xC0, 0xD6}, {0xD8, 0xF6}, {0xF8, 0x2C1}, {0x2C6, 0x2D1}, {0x2E0, 0x2E4}, {0x2EC, 0x2EC}, {0x2EE, 0x2EE}, {0x370, 0x374}, {0x376, 0x37D}, {0x386, 0x386}, {0x388, 0x3F5}, {0x3F7, 0x481}, {0x48A, 0x559}, {0x561, 0x587}, {0x5D0, 0x5F2}, {0x620, 0x64A}, {0x66E, 0x66F}, {0x671, 0x6D3}, {0x6D5, 0x6D5}, {0x6E5, 0x6E6}, {0x6EE, 0x6EF}, {0x6FA, 0x6FC}, {0x6FF, 0x6FF}, {0x710, 0x710}, {0x712, 0x72F}, {0x74D, 0x7A5}, {0x7B1, 0x7B1}, {0x7CA, 0x7EA}, {0x7F4, 0x7F5}, {0x7FA, 0x815}, {0x81A, 0x81A}, {0x824, 0x824}, {0x828, 0x828}, {0x840, 0x858}, {0x904, 0x939}, {0x93D, 0x93D}, {0x950, 0x950}, {0x958, 0x961}, {0x971, 0x97F}, {0x985, 0x9B9}, {0x9BD, 0x9BD}, {0x9CE, 0x9CE}, {0x9DC, 0x9E1}, {0x9F0, 0x9F1}, {0xA05, 0xA39}, {0xA59, 0xA5E}, {0xA72, 0xA74}, {0xA85, 0xAB9}, {0xABD, 0xABD}, {0xAD0, 0xAE1}, {0xB05, 0xB39}, {0xB3D, 0xB3D}, {0xB5C, 0xB61}, {0xB71, 0xB71}, {0xB83, 0xBB9}, {0xBD0, 0xBD0}, {0xC05, 0xC3D}, {0xC58, 0xC61}, {0xC85, 0xCB9}, {0xCBD, 0xCBD}, {0xCDE, 0xCE1}, {0xCF1, 0xCF2}, {0xD05, 0xD3D}, {0xD4E, 0xD4E}, {0xD60, 0xD61}, {0xD7A, 0xD7F}, {0xD85, 0xDC6}, {0xE01, 0xE30}, {0xE32, 0xE33}, {0xE40, 0xE46}, {0xE81, 0xEB0}, {0xEB2, 0xEB3}, {0xEBD, 0xEC6}, {0xEDC, 0xF00}, {0xF40, 0xF6C}, {0xF88, 0xF8C}, {0x1000, 0x102A}, {0x103F, 0x103F}, {0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, {0x108E, 0x108E}, {0x10A0, 0x10FA}, {0x10FC, 0x135A}, {0x1380, 0x138F}, {0x13A0, 0x13F4}, {0x1401, 0x166C}, {0x166F, 0x167F}, {0x1681, 0x169A}, {0x16A0, 0x16EA}, {0x16EE, 0x1711}, {0x1720, 0x1731}, {0x1740, 0x1751}, {0x1760, 0x1770}, {0x1780, 0x17B3}, {0x17D7, 0x17D7}, {0x17DC, 0x17DC}, {0x1820, 0x18A8}, {0x18AA, 0x191C}, {0x1950, 0x19AB}, {0x19C1, 0x19C7}, {0x1A00, 0x1A16}, {0x1A20, 0x1A54}, {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, {0x1B45, 0x1B4B}, {0x1B83, 0x1BA0}, {0x1BAE, 0x1BAF}, {0x1BC0, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C4D, 0x1C4F}, {0x1C5A, 0x1C7D}, 
{0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF1}, {0x1D00, 0x1DBF}, {0x1E00, 0x1FBC}, {0x1FBE, 0x1FBE}, {0x1FC2, 0x1FCC}, {0x1FD0, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FFC}, {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C}, {0x2102, 0x2102}, {0x2107, 0x2107}, {0x210A, 0x2113}, {0x2115, 0x2115}, {0x2119, 0x211D}, {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, {0x212A, 0x212D}, {0x212F, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, {0x214E, 0x214E}, {0x2160, 0x2188}, {0x2C00, 0x2CE4}, {0x2CEB, 0x2CEE}, {0x2D00, 0x2D6F}, {0x2D80, 0x2DDE}, {0x2E2F, 0x2E2F}, {0x3005, 0x3007}, {0x3021, 0x3029}, {0x3031, 0x3035}, {0x3038, 0x303C}, {0x3041, 0x3096}, {0x309D, 0x309F}, {0x30A1, 0x30FA}, {0x30FC, 0x318E}, {0x31A0, 0x31BA}, {0x31F0, 0x31FF}, {0x3400, 0x4DB5}, {0x4E00, 0xA48C}, {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA61F}, {0xA62A, 0xA66E}, {0xA67F, 0xA6EF}, {0xA717, 0xA71F}, {0xA722, 0xA788}, {0xA78B, 0xA801}, {0xA803, 0xA805}, {0xA807, 0xA80A}, {0xA80C, 0xA822}, {0xA840, 0xA873}, {0xA882, 0xA8B3}, {0xA8F2, 0xA8F7}, {0xA8FB, 0xA8FB}, {0xA90A, 0xA925}, {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, {0xA9CF, 0xA9CF}, {0xAA00, 0xAA28}, {0xAA40, 0xAA42}, {0xAA44, 0xAA4B}, {0xAA60, 0xAA76}, {0xAA7A, 0xAA7A}, {0xAA80, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, {0xAAB9, 0xAABD}, {0xAAC0, 0xAAC0}, {0xAAC2, 0xAADD}, {0xAB01, 0xABE2}, {0xAC00, 0xD7FB}, {0xF900, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFBB1}, {0xFBD3, 0xFD3D}, {0xFD50, 0xFDFB}, {0xFE70, 0xFEFC}, {0xFF21, 0xFF3A}, {0xFF41, 0xFF5A}, {0xFF66, 0xFFDC}
521    };
522 
523    return initLookupTable(ranges, sizeof(ranges) / sizeof(ranges[0]));
524 }
525 
526 } // anonymous namespace
527 
isalpha(wchar_t c)528 bool isalpha(wchar_t c)
529 {
530    static std::vector<bool> lookup = initAlphaLookupTable();
531    if (c >= 0xFFFF)
532       return false; // This function only supports BMP
533    return lookup.at(c);
534 }
535 
isalnum(wchar_t c)536 bool isalnum(wchar_t c)
537 {
538    static std::vector<bool> lookup;
539    if (lookup.empty())
540       lookup = initAlnumLookupTable();
541 
542    if (c >= 0xFFFF)
543       return false; // This function only supports BMP
544    return lookup.at(c);
545 }
546 
parseVersion(const std::string & str,uint64_t * pVersion)547 bool parseVersion(const std::string& str, uint64_t* pVersion)
548 {
549    uint64_t version = 0;
550 
551    std::vector<std::string> chunks;
552    boost::algorithm::split(chunks, str, boost::algorithm::is_any_of("."));
553 
554    if (chunks.empty())
555       return false;
556 
557    for (size_t i = 0; i < chunks.size() && i < 4; i++)
558    {
559       boost::optional<uint16_t> value = core::safe_convert::stringTo<uint16_t>(chunks[i]);
560       if (!value)
561          return false;
562       version += static_cast<uint64_t>(value.get()) << ((3-i) * 16);
563    }
564    if (pVersion)
565       *pVersion = version;
566    return true;
567 }
568 
// Drops text from the front of '*pLines' so that no more than 'maxLines'
// newline characters precede the retained content.
//
// The string is scanned backwards. When newline number (maxLines + 1) is
// found, everything before that newline is erased (the newline itself is
// kept) and true is returned. Strings no longer than 2 * maxLines characters
// are left alone. Returns false when nothing was erased.
bool trimLeadingLines(int maxLines, std::string* pLines)
{
   // too short to possibly need trimming
   if (pLines->length() <= static_cast<unsigned int>(maxLines * 2))
      return false;

   int newlinesSeen = 0;
   for (std::string::size_type idx = pLines->length(); idx-- > 0; )
   {
      if ((*pLines)[idx] != '\n')
         continue;

      if (++newlinesSeen > maxLines)
      {
         // discard everything ahead of this newline
         pLines->erase(0, idx);
         return true;
      }
   }

   return false;
}
598 
// Returns 'string' with a leading backtick and/or a trailing backtick
// removed (each is stripped independently). Strings shorter than two
// characters are returned unchanged.
std::string strippedOfBackQuotes(const std::string& string)
{
   const std::size_t n = string.length();
   if (n < 2)
      return string;

   const std::size_t first = (string[0] == '`') ? 1 : 0;
   const std::size_t last = (string[n - 1] == '`') ? n - 1 : n;

   return string.substr(first, last - first);
}
613 
// Removes, in place, one leading and one trailing quote character (either
// ' or ") from '*pStr'. The two ends are handled independently and the
// quote characters need not match.
void stripQuotes(std::string* pStr)
{
   std::string& text = *pStr;

   // leading quote
   if (!text.empty())
   {
      const char first = text.at(0);
      if (first == '\'' || first == '"')
         text = text.substr(1);
   }

   // trailing quote (re-checked, since the string may now be empty)
   if (!text.empty())
   {
      const std::string::size_type lastIndex = text.length() - 1;
      const char last = text.at(lastIndex);
      if (last == '\'' || last == '"')
         text = text.substr(0, lastIndex);
   }
}
624 
// Returns 'string' without its surrounding quotes when it both starts and
// ends with the same quote character (', " or `). Otherwise, or when the
// string has fewer than two characters, it is returned unchanged.
std::string strippedOfQuotes(const std::string& string)
{
   std::string::size_type n = string.length();
   if (n < 2) return string;

   char first = string[0];
   char last  = string[n - 1];

   // each alternative is joined with logical OR
   if ((first == '\'' && last == '\'') ||
       (first == '"' && last == '"') ||
       (first == '`' && last == '`'))
   {
      return string.substr(1, n - 2);
   }

   return string;
}
642 
// Counts the line terminators in [begin, end).
//
// A CR immediately followed by LF counts as one terminator, a bare LF counts
// as one, and a bare CR is not counted. The total is written to
// '*pNewlineCount'. Returns an iterator to the start of the last terminator
// found (the CR of a CR+LF pair), or 'end' if there was none.
template <typename Iter, typename U>
Iter countNewlinesImpl(Iter begin,
                       Iter end,
                       const U& CR,
                       const U& LF,
                       std::size_t* pNewlineCount)
{
   std::size_t total = 0;
   Iter lastSeen = end;
   Iter cursor = begin;

   while (cursor != end)
   {
      const bool isCrLf = (*cursor == CR) &&
                          (cursor + 1 != end) &&
                          (*(cursor + 1) == LF);
      if (isCrLf)
      {
         // consume both characters of the pair
         lastSeen = cursor;
         ++total;
         cursor += 2;
      }
      else
      {
         if (*cursor == LF)
         {
            lastSeen = cursor;
            ++total;
         }
         ++cursor;
      }
   }

   *pNewlineCount = total;
   return lastSeen;
}
681 
countNewlines(const std::wstring & string)682 std::size_t countNewlines(const std::wstring& string)
683 {
684    std::size_t count = 0;
685    countNewlinesImpl(string.begin(), string.end(), L'\r', L'\n', &count);
686    return count;
687 }
688 
countNewlines(const std::string & string)689 std::size_t countNewlines(const std::string& string)
690 {
691    std::size_t count = 0;
692    countNewlinesImpl(string.begin(), string.end(), '\r', '\n', &count);
693    return count;
694 }
695 
countNewlines(std::string::iterator begin,std::string::iterator end)696 std::size_t countNewlines(std::string::iterator begin,
697                           std::string::iterator end)
698 {
699    std::size_t count = 0;
700    countNewlinesImpl(begin, end, '\r', '\n', &count);
701    return count;
702 }
703 
countNewlines(std::wstring::iterator begin,std::wstring::iterator end)704 std::size_t countNewlines(std::wstring::iterator begin,
705                           std::wstring::iterator end)
706 {
707    std::size_t count = 0;
708    countNewlinesImpl(begin, end, '\r', '\n', &count);
709    return count;
710 }
711 
countNewlines(std::wstring::const_iterator begin,std::wstring::const_iterator end,std::size_t * pCount)712 std::wstring::const_iterator countNewlines(std::wstring::const_iterator begin,
713                                            std::wstring::const_iterator end,
714                                            std::size_t* pCount)
715 {
716    return countNewlinesImpl(begin, end, '\r', '\n', pCount);
717 }
718 
isPrefixOf(const std::string & self,const std::string & prefix)719 bool isPrefixOf(const std::string& self, const std::string& prefix)
720 {
721    return boost::algorithm::starts_with(self, prefix);
722 }
723 
// Returns a string of 'n' bytes drawn from ::rand().
//
// NOTE(review): ::rand() is a general-purpose generator; presumably callers
// do not rely on this for security-sensitive values -- confirm.
std::string makeRandomByteString(std::size_t n)
{
   std::string result;
   result.resize(n);

   // reduce modulo (UCHAR_MAX + 1) so that every byte value, including
   // UCHAR_MAX itself, can be produced
   for (std::size_t i = 0; i < n; ++i)
      result[i] = static_cast<char>(
               static_cast<unsigned char>(::rand() % (UCHAR_MAX + 1)));

   return result;
}
732 
extractCommentHeader(const std::string & contents,const std::string & reCommentPrefix,std::string * pHeader)733 bool extractCommentHeader(const std::string& contents,
734                           const std::string& reCommentPrefix,
735                           std::string* pHeader)
736 {
737    // construct newline-based token iterator
738    boost::regex reNewline("(?:\\r?\\n|$)");
739    boost::sregex_token_iterator it(
740             contents.begin(),
741             contents.end(),
742             reNewline,
743             -1);
744    boost::sregex_token_iterator end;
745 
746    // first, skip blank lines
747    boost::regex reWhitespace("^\\s*$");
748    while (it != end)
749    {
750       if (boost::regex_match(it->begin(), it->end(), reWhitespace))
751       {
752          ++it;
753          continue;
754       }
755 
756       break;
757    }
758 
759    // if we're at the end now, bail
760    if (it == end)
761       return false;
762 
763    // check to see if we landed on our comment prefix and
764    // quit early if we haven't
765    boost::regex rePrefix(reCommentPrefix);
766    if (!boost::regex_search(it->begin(), it->end(), rePrefix))
767       return false;
768 
769    // we have a prefix: start iterating and extracting these
770    for (; it != end; ++it)
771    {
772       boost::smatch match;
773       if (!boost::regex_search(it->begin(), it->end(), match, rePrefix))
774       {
775          // this is no longer a commented line; time to go home
776          break;
777       }
778 
779       // extract the line (sans prefix)
780       std::string line(it->begin() + match.length(), it->end());
781       pHeader->append(line + "\n");
782    }
783 
784    // report success to the user
785    return true;
786 }
787 
// Returns the leading run of spaces and tabs in 'line'. A line consisting
// solely of spaces and tabs (or an empty line) yields an empty string.
std::string extractIndent(const std::string& line)
{
   const std::string::size_type firstNonBlank = line.find_first_not_of(" \t");
   return (firstNonBlank == std::string::npos)
         ? std::string()
         : line.substr(0, firstNonBlank);
}
795 
// Formats 'd' using default stream formatting with the given precision.
std::string formatDouble(const double d, const int precision)
{
   std::ostringstream formatter;
   formatter.precision(precision);
   formatter << d;
   return formatter.str();
}
803 
// printf-style formatting that returns the result as a std::string.
//
// Returns an empty string when the formatted output is empty or when
// vsnprintf reports an error.
std::string sprintf(const char* fmt, ...)
{
   // note: the semantics for vsnprintf are slightly awkward... when vsnprintf
   // is called with a null buffer, it returns the number of characters that
   // would be written, not including the null terminator. however, when
   // called with a buffer, vsnprintf will write a maximum of n - 1
   // characters, and will always write a null terminator at the end! so we
   // need to add 1 character to the size returned by vsnprintf(nullptr) to
   // get the full size of the C string we want to generate
   //
   // keep the result signed: vsnprintf returns a negative value on error,
   // which must not be reinterpreted as an enormous buffer size
   int n = 0;
   {
      va_list args;
      va_start(args, fmt);
      n = std::vsnprintf(nullptr, 0, fmt, args);
      va_end(args);
   }

   if (n <= 0)
   {
      return std::string();
   }

   // allocate buffer of required size
   // (include space for the null terminator)
   const std::size_t length = static_cast<std::size_t>(n);
   std::vector<char> buffer(length + 1);

   // write formatted string to buffer
   {
      va_list args;
      va_start(args, fmt);
      std::vsnprintf(&buffer[0], buffer.size(), fmt, args);
      va_end(args);
   }

   // return as string (without the terminator)
   return std::string(&buffer[0], length);
}
841 
842 } // namespace string_utils
843 } // namespace core
844 } // namespace rstudio
845 
846 
847 
848