1 /*
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <iterator>
20 #include <stdexcept>
21 
22 #include <folly/CppAttributes.h>
23 
24 #ifndef FOLLY_STRING_H_
25 #error This file may only be included from String.h
26 #endif
27 
28 namespace folly {
29 
30 namespace detail {
31 // Map from character code to value of one-character escape sequence
32 // ('\n' = 10 maps to 'n'), 'O' if the character should be printed as
33 // an octal escape sequence, or 'P' if the character is printable and
34 // should be printed as is.
35 extern const std::array<char, 256> cEscapeTable;
36 } // namespace detail
37 
38 template <class String>
cEscape(StringPiece str,String & out)39 void cEscape(StringPiece str, String& out) {
40   char esc[4];
41   esc[0] = '\\';
42   out.reserve(out.size() + str.size());
43   auto p = str.begin();
44   auto last = p; // last regular character
45   // We advance over runs of regular characters (printable, not double-quote or
46   // backslash) and copy them in one go; this is faster than calling push_back
47   // repeatedly.
48   while (p != str.end()) {
49     char c = *p;
50     unsigned char v = static_cast<unsigned char>(c);
51     char e = detail::cEscapeTable[v];
52     if (e == 'P') { // printable
53       ++p;
54     } else if (e == 'O') { // octal
55       out.append(&*last, size_t(p - last));
56       esc[1] = '0' + ((v >> 6) & 7);
57       esc[2] = '0' + ((v >> 3) & 7);
58       esc[3] = '0' + (v & 7);
59       out.append(esc, 4);
60       ++p;
61       last = p;
62     } else { // special 1-character escape
63       out.append(&*last, size_t(p - last));
64       esc[1] = e;
65       out.append(esc, 2);
66       ++p;
67       last = p;
68     }
69   }
70   out.append(&*last, size_t(p - last));
71 }
72 
73 namespace detail {
74 // Map from the character code of the character following a backslash to
75 // the unescaped character if a valid one-character escape sequence
76 // ('n' maps to 10 = '\n'), 'O' if this is the first character of an
77 // octal escape sequence, 'X' if this is the first character of a
78 // hexadecimal escape sequence, or 'I' if this escape sequence is invalid.
79 extern const std::array<char, 256> cUnescapeTable;
80 
81 // Map from the character code to the hex value, or 16 if invalid hex char.
82 extern const std::array<unsigned char, 256> hexTable;
83 } // namespace detail
84 
85 template <class String>
cUnescape(StringPiece str,String & out,bool strict)86 void cUnescape(StringPiece str, String& out, bool strict) {
87   out.reserve(out.size() + str.size());
88   auto p = str.begin();
89   auto last = p; // last regular character (not part of an escape sequence)
90   // We advance over runs of regular characters (not backslash) and copy them
91   // in one go; this is faster than calling push_back repeatedly.
92   while (p != str.end()) {
93     char c = *p;
94     if (c != '\\') { // normal case
95       ++p;
96       continue;
97     }
98     out.append(&*last, p - last);
99     ++p;
100     if (p == str.end()) { // backslash at end of string
101       if (strict) {
102         throw_exception<std::invalid_argument>("incomplete escape sequence");
103       }
104       out.push_back('\\');
105       last = p;
106       continue;
107     }
108     char e = detail::cUnescapeTable[static_cast<unsigned char>(*p)];
109     if (e == 'O') { // octal
110       unsigned char val = 0;
111       for (int i = 0; i < 3 && p != str.end() && *p >= '0' && *p <= '7';
112            ++i, ++p) {
113         val <<= 3;
114         val |= (*p - '0');
115       }
116       out.push_back(val);
117       last = p;
118     } else if (e == 'X') { // hex
119       ++p;
120       if (p == str.end()) { // \x at end of string
121         if (strict) {
122           throw_exception<std::invalid_argument>(
123               "incomplete hex escape sequence");
124         }
125         out.append("\\x");
126         last = p;
127         continue;
128       }
129       unsigned char val = 0;
130       unsigned char h;
131       for (; (p != str.end() &&
132               (h = detail::hexTable[static_cast<unsigned char>(*p)]) < 16);
133            ++p) {
134         val <<= 4;
135         val |= h;
136       }
137       out.push_back(val);
138       last = p;
139     } else if (e == 'I') { // invalid
140       if (strict) {
141         throw_exception<std::invalid_argument>("invalid escape sequence");
142       }
143       out.push_back('\\');
144       out.push_back(*p);
145       ++p;
146       last = p;
147     } else { // standard escape sequence, \' etc
148       out.push_back(e);
149       ++p;
150       last = p;
151     }
152   }
153   out.append(&*last, p - last);
154 }
155 
156 namespace detail {
157 // Map from character code to escape mode:
158 // 0 = pass through
159 // 1 = unused
160 // 2 = pass through in PATH mode
161 // 3 = space, replace with '+' in QUERY mode
162 // 4 = percent-encode
163 extern const std::array<unsigned char, 256> uriEscapeTable;
164 } // namespace detail
165 
166 template <class String>
uriEscape(StringPiece str,String & out,UriEscapeMode mode)167 void uriEscape(StringPiece str, String& out, UriEscapeMode mode) {
168   static const char hexValues[] = "0123456789abcdef";
169   char esc[3];
170   esc[0] = '%';
171   // Preallocate assuming that 25% of the input string will be escaped
172   out.reserve(out.size() + str.size() + 3 * (str.size() / 4));
173   auto p = str.begin();
174   auto last = p; // last regular character
175   // We advance over runs of passthrough characters and copy them in one go;
176   // this is faster than calling push_back repeatedly.
177   unsigned char minEncode = static_cast<unsigned char>(mode);
178   while (p != str.end()) {
179     char c = *p;
180     unsigned char v = static_cast<unsigned char>(c);
181     unsigned char discriminator = detail::uriEscapeTable[v];
182     if (LIKELY(discriminator <= minEncode)) {
183       ++p;
184     } else if (mode == UriEscapeMode::QUERY && discriminator == 3) {
185       out.append(&*last, size_t(p - last));
186       out.push_back('+');
187       ++p;
188       last = p;
189     } else {
190       out.append(&*last, size_t(p - last));
191       esc[1] = hexValues[v >> 4];
192       esc[2] = hexValues[v & 0x0f];
193       out.append(esc, 3);
194       ++p;
195       last = p;
196     }
197   }
198   out.append(&*last, size_t(p - last));
199 }
200 
201 template <class String>
uriUnescape(StringPiece str,String & out,UriEscapeMode mode)202 void uriUnescape(StringPiece str, String& out, UriEscapeMode mode) {
203   out.reserve(out.size() + str.size());
204   auto p = str.begin();
205   auto last = p;
206   // We advance over runs of passthrough characters and copy them in one go;
207   // this is faster than calling push_back repeatedly.
208   while (p != str.end()) {
209     char c = *p;
210     switch (c) {
211       case '%': {
212         if (UNLIKELY(std::distance(p, str.end()) < 3)) {
213           throw_exception<std::invalid_argument>(
214               "incomplete percent encode sequence");
215         }
216         auto h1 = detail::hexTable[static_cast<unsigned char>(p[1])];
217         auto h2 = detail::hexTable[static_cast<unsigned char>(p[2])];
218         if (UNLIKELY(h1 == 16 || h2 == 16)) {
219           throw_exception<std::invalid_argument>(
220               "invalid percent encode sequence");
221         }
222         out.append(&*last, size_t(p - last));
223         out.push_back((h1 << 4) | h2);
224         p += 3;
225         last = p;
226         break;
227       }
228       case '+':
229         if (mode == UriEscapeMode::QUERY) {
230           out.append(&*last, size_t(p - last));
231           out.push_back(' ');
232           ++p;
233           last = p;
234           break;
235         }
236         // else fallthrough
237         FOLLY_FALLTHROUGH;
238       default:
239         ++p;
240         break;
241     }
242   }
243   out.append(&*last, size_t(p - last));
244 }
245 
246 namespace detail {
247 
248 /*
249  * The following functions are type-overloaded helpers for
250  * internalSplit().
251  */
delimSize(char)252 inline size_t delimSize(char) {
253   return 1;
254 }
delimSize(StringPiece s)255 inline size_t delimSize(StringPiece s) {
256   return s.size();
257 }
atDelim(const char * s,char c)258 inline bool atDelim(const char* s, char c) {
259   return *s == c;
260 }
atDelim(const char * s,StringPiece sp)261 inline bool atDelim(const char* s, StringPiece sp) {
262   return !std::memcmp(s, sp.start(), sp.size());
263 }
264 
265 // These are used to short-circuit internalSplit() in the case of
266 // 1-character strings.
delimFront(char c)267 inline char delimFront(char c) {
268   // This one exists only for compile-time; it should never be called.
269   std::abort();
270   return c;
271 }
delimFront(StringPiece s)272 inline char delimFront(StringPiece s) {
273   assert(!s.empty() && s.start() != nullptr);
274   return *s.start();
275 }
276 
277 /*
278  * Shared implementation for all the split() overloads.
279  *
280  * This uses some external helpers that are overloaded to let this
281  * algorithm be more performant if the deliminator is a single
282  * character instead of a whole string.
283  *
284  * @param ignoreEmpty iff true, don't copy empty segments to output
285  */
286 template <class OutStringT, class DelimT, class OutputIterator>
internalSplit(DelimT delim,StringPiece sp,OutputIterator out,bool ignoreEmpty)287 void internalSplit(
288     DelimT delim, StringPiece sp, OutputIterator out, bool ignoreEmpty) {
289   assert(sp.empty() || sp.start() != nullptr);
290 
291   const char* s = sp.start();
292   const size_t strSize = sp.size();
293   const size_t dSize = delimSize(delim);
294 
295   if (dSize > strSize || dSize == 0) {
296     if (!ignoreEmpty || strSize > 0) {
297       *out++ = to<OutStringT>(sp);
298     }
299     return;
300   }
301   if (std::is_same<DelimT, StringPiece>::value && dSize == 1) {
302     // Call the char version because it is significantly faster.
303     return internalSplit<OutStringT>(delimFront(delim), sp, out, ignoreEmpty);
304   }
305 
306   size_t tokenStartPos = 0;
307   size_t tokenSize = 0;
308   for (size_t i = 0; i <= strSize - dSize; ++i) {
309     if (atDelim(&s[i], delim)) {
310       if (!ignoreEmpty || tokenSize > 0) {
311         *out++ = to<OutStringT>(sp.subpiece(tokenStartPos, tokenSize));
312       }
313 
314       tokenStartPos = i + dSize;
315       tokenSize = 0;
316       i += dSize - 1;
317     } else {
318       ++tokenSize;
319     }
320   }
321   tokenSize = strSize - tokenStartPos;
322   if (!ignoreEmpty || tokenSize > 0) {
323     *out++ = to<OutStringT>(sp.subpiece(tokenStartPos, tokenSize));
324   }
325 }
326 
327 template <class String>
prepareDelim(const String & s)328 StringPiece prepareDelim(const String& s) {
329   return StringPiece(s);
330 }
prepareDelim(char c)331 inline char prepareDelim(char c) {
332   return c;
333 }
334 
335 template <class OutputType>
toOrIgnore(StringPiece input,OutputType & output)336 void toOrIgnore(StringPiece input, OutputType& output) {
337   output = folly::to<OutputType>(input);
338 }
339 
toOrIgnore(StringPiece,decltype (std::ignore)&)340 inline void toOrIgnore(StringPiece, decltype(std::ignore)&) {}
341 
342 template <bool exact, class Delim, class OutputType>
splitFixed(const Delim & delimiter,StringPiece input,OutputType & output)343 bool splitFixed(const Delim& delimiter, StringPiece input, OutputType& output) {
344   static_assert(
345       exact || std::is_same<OutputType, StringPiece>::value ||
346           IsSomeString<OutputType>::value ||
347           std::is_same<OutputType, decltype(std::ignore)>::value,
348       "split<false>() requires that the last argument be a string type "
349       "or std::ignore");
350   if (exact && UNLIKELY(std::string::npos != input.find(delimiter))) {
351     return false;
352   }
353   toOrIgnore(input, output);
354   return true;
355 }
356 
357 template <bool exact, class Delim, class OutputType, class... OutputTypes>
splitFixed(const Delim & delimiter,StringPiece input,OutputType & outHead,OutputTypes &...outTail)358 bool splitFixed(
359     const Delim& delimiter,
360     StringPiece input,
361     OutputType& outHead,
362     OutputTypes&... outTail) {
363   size_t cut = input.find(delimiter);
364   if (UNLIKELY(cut == std::string::npos)) {
365     return false;
366   }
367   StringPiece head(input.begin(), input.begin() + cut);
368   StringPiece tail(
369       input.begin() + cut + detail::delimSize(delimiter), input.end());
370   if (LIKELY(splitFixed<exact>(delimiter, tail, outTail...))) {
371     toOrIgnore(head, outHead);
372     return true;
373   }
374   return false;
375 }
376 
377 } // namespace detail
378 
379 //////////////////////////////////////////////////////////////////////
380 
381 template <class Delim, class String, class OutputType>
split(const Delim & delimiter,const String & input,std::vector<OutputType> & out,bool ignoreEmpty)382 void split(
383     const Delim& delimiter,
384     const String& input,
385     std::vector<OutputType>& out,
386     bool ignoreEmpty) {
387   detail::internalSplit<OutputType>(
388       detail::prepareDelim(delimiter),
389       StringPiece(input),
390       std::back_inserter(out),
391       ignoreEmpty);
392 }
393 
394 template <class Delim, class String, class OutputType>
split(const Delim & delimiter,const String & input,fbvector<OutputType,std::allocator<OutputType>> & out,bool ignoreEmpty)395 void split(
396     const Delim& delimiter,
397     const String& input,
398     fbvector<OutputType, std::allocator<OutputType>>& out,
399     bool ignoreEmpty) {
400   detail::internalSplit<OutputType>(
401       detail::prepareDelim(delimiter),
402       StringPiece(input),
403       std::back_inserter(out),
404       ignoreEmpty);
405 }
406 
407 template <
408     class OutputValueType,
409     class Delim,
410     class String,
411     class OutputIterator>
splitTo(const Delim & delimiter,const String & input,OutputIterator out,bool ignoreEmpty)412 void splitTo(
413     const Delim& delimiter,
414     const String& input,
415     OutputIterator out,
416     bool ignoreEmpty) {
417   detail::internalSplit<OutputValueType>(
418       detail::prepareDelim(delimiter), StringPiece(input), out, ignoreEmpty);
419 }
420 
421 template <bool exact, class Delim, class... OutputTypes>
422 typename std::enable_if<
423     StrictConjunction<IsConvertible<OutputTypes>...>::value &&
424         sizeof...(OutputTypes) >= 1,
425     bool>::type
426 split(const Delim& delimiter, StringPiece input, OutputTypes&... outputs) {
427   return detail::splitFixed<exact>(
428       detail::prepareDelim(delimiter), input, outputs...);
429 }
430 
431 namespace detail {
432 
433 /*
434  * If a type can have its string size determined cheaply, we can more
435  * efficiently append it in a loop (see internalJoinAppend). Note that the
436  * struct need not conform to the std::string api completely (ex. does not need
437  * to implement append()).
438  */
439 template <class T>
440 struct IsSizableString {
441   enum {
442     value = IsSomeString<T>::value || std::is_same<T, StringPiece>::value
443   };
444 };
445 
446 template <class Iterator>
447 struct IsSizableStringContainerIterator
448     : IsSizableString<typename std::iterator_traits<Iterator>::value_type> {};
449 
450 template <class Delim, class Iterator, class String>
internalJoinAppend(Delim delimiter,Iterator begin,Iterator end,String & output)451 void internalJoinAppend(
452     Delim delimiter, Iterator begin, Iterator end, String& output) {
453   assert(begin != end);
454   if (std::is_same<Delim, StringPiece>::value && delimSize(delimiter) == 1) {
455     internalJoinAppend(delimFront(delimiter), begin, end, output);
456     return;
457   }
458   toAppend(*begin, &output);
459   while (++begin != end) {
460     toAppend(delimiter, *begin, &output);
461   }
462 }
463 
464 template <class Delim, class Iterator, class String>
465 typename std::enable_if<IsSizableStringContainerIterator<Iterator>::value>::type
internalJoin(Delim delimiter,Iterator begin,Iterator end,String & output)466 internalJoin(Delim delimiter, Iterator begin, Iterator end, String& output) {
467   output.clear();
468   if (begin == end) {
469     return;
470   }
471   const size_t dsize = delimSize(delimiter);
472   Iterator it = begin;
473   size_t size = it->size();
474   while (++it != end) {
475     size += dsize + it->size();
476   }
477   output.reserve(size);
478   internalJoinAppend(delimiter, begin, end, output);
479 }
480 
481 template <class Delim, class Iterator, class String>
482 typename std::enable_if<
483     !IsSizableStringContainerIterator<Iterator>::value>::type
internalJoin(Delim delimiter,Iterator begin,Iterator end,String & output)484 internalJoin(Delim delimiter, Iterator begin, Iterator end, String& output) {
485   output.clear();
486   if (begin == end) {
487     return;
488   }
489   internalJoinAppend(delimiter, begin, end, output);
490 }
491 
492 } // namespace detail
493 
494 template <class Delim, class Iterator, class String>
join(const Delim & delimiter,Iterator begin,Iterator end,String & output)495 void join(
496     const Delim& delimiter, Iterator begin, Iterator end, String& output) {
497   detail::internalJoin(detail::prepareDelim(delimiter), begin, end, output);
498 }
499 
500 template <class OutputString>
backslashify(folly::StringPiece input,OutputString & output,bool hex_style)501 void backslashify(
502     folly::StringPiece input, OutputString& output, bool hex_style) {
503   static const char hexValues[] = "0123456789abcdef";
504   output.clear();
505   output.reserve(3 * input.size());
506   for (unsigned char c : input) {
507     // less than space or greater than '~' are considered unprintable
508     if (c < 0x20 || c > 0x7e || c == '\\') {
509       bool hex_append = false;
510       output.push_back('\\');
511       if (hex_style) {
512         hex_append = true;
513       } else {
514         if (c == '\r') {
515           output += 'r';
516         } else if (c == '\n') {
517           output += 'n';
518         } else if (c == '\t') {
519           output += 't';
520         } else if (c == '\a') {
521           output += 'a';
522         } else if (c == '\b') {
523           output += 'b';
524         } else if (c == '\0') {
525           output += '0';
526         } else if (c == '\\') {
527           output += '\\';
528         } else {
529           hex_append = true;
530         }
531       }
532       if (hex_append) {
533         output.push_back('x');
534         output.push_back(hexValues[(c >> 4) & 0xf]);
535         output.push_back(hexValues[c & 0xf]);
536       }
537     } else {
538       output += c;
539     }
540   }
541 }
542 
543 template <class String1, class String2>
humanify(const String1 & input,String2 & output)544 void humanify(const String1& input, String2& output) {
545   size_t numUnprintable = 0;
546   size_t numPrintablePrefix = 0;
547   for (unsigned char c : input) {
548     if (c < 0x20 || c > 0x7e || c == '\\') {
549       ++numUnprintable;
550     }
551     if (numUnprintable == 0) {
552       ++numPrintablePrefix;
553     }
554   }
555 
556   // hexlify doubles a string's size; backslashify can potentially
557   // explode it by 4x.  Now, the printable range of the ascii
558   // "spectrum" is around 95 out of 256 values, so a "random" binary
559   // string should be around 60% unprintable.  We use a 50% heuristic
560   // here, so if a string is 60% unprintable, then we just use hex
561   // output.  Otherwise we backslash.
562   //
563   // UTF8 is completely ignored; as a result, utf8 characters will
564   // likely be \x escaped (since most common glyphs fit in two bytes).
565   // This is a tradeoff of complexity/speed instead of a convenience
566   // that likely would rarely matter.  Moreover, this function is more
567   // about displaying underlying bytes, not about displaying glyphs
568   // from languages.
569   if (numUnprintable == 0) {
570     output = input;
571   } else if (5 * numUnprintable >= 3 * input.size()) {
572     // However!  If we have a "meaningful" prefix of printable
573     // characters, say 20% of the string, we backslashify under the
574     // assumption viewing the prefix as ascii is worth blowing the
575     // output size up a bit.
576     if (5 * numPrintablePrefix >= input.size()) {
577       backslashify(input, output);
578     } else {
579       output = "0x";
580       hexlify(input, output, true /* append output */);
581     }
582   } else {
583     backslashify(input, output);
584   }
585 }
586 
587 template <class InputString, class OutputString>
hexlify(const InputString & input,OutputString & output,bool append_output)588 bool hexlify(
589     const InputString& input, OutputString& output, bool append_output) {
590   if (!append_output) {
591     output.clear();
592   }
593 
594   static char hexValues[] = "0123456789abcdef";
595   auto j = output.size();
596   output.resize(2 * input.size() + output.size());
597   for (size_t i = 0; i < input.size(); ++i) {
598     int ch = input[i];
599     output[j++] = hexValues[(ch >> 4) & 0xf];
600     output[j++] = hexValues[ch & 0xf];
601   }
602   return true;
603 }
604 
605 template <class InputString, class OutputString>
unhexlify(const InputString & input,OutputString & output)606 bool unhexlify(const InputString& input, OutputString& output) {
607   if (input.size() % 2 != 0) {
608     return false;
609   }
610   output.resize(input.size() / 2);
611   int j = 0;
612 
613   for (size_t i = 0; i < input.size(); i += 2) {
614     int highBits = detail::hexTable[static_cast<uint8_t>(input[i])];
615     int lowBits = detail::hexTable[static_cast<uint8_t>(input[i + 1])];
616     if ((highBits | lowBits) & 0x10) {
617       // One of the characters wasn't a hex digit
618       return false;
619     }
620     output[j++] = (highBits << 4) + lowBits;
621   }
622   return true;
623 }
624 
625 namespace detail {
626 /**
627  * Hex-dump at most 16 bytes starting at offset from a memory area of size
628  * bytes.  Return the number of bytes actually dumped.
629  */
630 size_t hexDumpLine(
631     const void* ptr, size_t offset, size_t size, std::string& line);
632 } // namespace detail
633 
634 template <class OutIt>
hexDump(const void * ptr,size_t size,OutIt out)635 void hexDump(const void* ptr, size_t size, OutIt out) {
636   size_t offset = 0;
637   std::string line;
638   while (offset < size) {
639     offset += detail::hexDumpLine(ptr, offset, size, line);
640     *out++ = line;
641   }
642 }
643 
644 } // namespace folly
645