1 /*
2 * Copyright (c) Facebook, Inc. and its affiliates.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #pragma once
18
19 #include <iterator>
20 #include <stdexcept>
21
22 #include <folly/CppAttributes.h>
23
24 #ifndef FOLLY_STRING_H_
25 #error This file may only be included from String.h
26 #endif
27
28 namespace folly {
29
30 namespace detail {
31 // Map from character code to value of one-character escape sequence
32 // ('\n' = 10 maps to 'n'), 'O' if the character should be printed as
33 // an octal escape sequence, or 'P' if the character is printable and
34 // should be printed as is.
35 extern const std::array<char, 256> cEscapeTable;
36 } // namespace detail
37
38 template <class String>
cEscape(StringPiece str,String & out)39 void cEscape(StringPiece str, String& out) {
40 char esc[4];
41 esc[0] = '\\';
42 out.reserve(out.size() + str.size());
43 auto p = str.begin();
44 auto last = p; // last regular character
45 // We advance over runs of regular characters (printable, not double-quote or
46 // backslash) and copy them in one go; this is faster than calling push_back
47 // repeatedly.
48 while (p != str.end()) {
49 char c = *p;
50 unsigned char v = static_cast<unsigned char>(c);
51 char e = detail::cEscapeTable[v];
52 if (e == 'P') { // printable
53 ++p;
54 } else if (e == 'O') { // octal
55 out.append(&*last, size_t(p - last));
56 esc[1] = '0' + ((v >> 6) & 7);
57 esc[2] = '0' + ((v >> 3) & 7);
58 esc[3] = '0' + (v & 7);
59 out.append(esc, 4);
60 ++p;
61 last = p;
62 } else { // special 1-character escape
63 out.append(&*last, size_t(p - last));
64 esc[1] = e;
65 out.append(esc, 2);
66 ++p;
67 last = p;
68 }
69 }
70 out.append(&*last, size_t(p - last));
71 }
72
73 namespace detail {
74 // Map from the character code of the character following a backslash to
75 // the unescaped character if a valid one-character escape sequence
76 // ('n' maps to 10 = '\n'), 'O' if this is the first character of an
77 // octal escape sequence, 'X' if this is the first character of a
78 // hexadecimal escape sequence, or 'I' if this escape sequence is invalid.
79 extern const std::array<char, 256> cUnescapeTable;
80
81 // Map from the character code to the hex value, or 16 if invalid hex char.
82 extern const std::array<unsigned char, 256> hexTable;
83 } // namespace detail
84
85 template <class String>
cUnescape(StringPiece str,String & out,bool strict)86 void cUnescape(StringPiece str, String& out, bool strict) {
87 out.reserve(out.size() + str.size());
88 auto p = str.begin();
89 auto last = p; // last regular character (not part of an escape sequence)
90 // We advance over runs of regular characters (not backslash) and copy them
91 // in one go; this is faster than calling push_back repeatedly.
92 while (p != str.end()) {
93 char c = *p;
94 if (c != '\\') { // normal case
95 ++p;
96 continue;
97 }
98 out.append(&*last, p - last);
99 ++p;
100 if (p == str.end()) { // backslash at end of string
101 if (strict) {
102 throw_exception<std::invalid_argument>("incomplete escape sequence");
103 }
104 out.push_back('\\');
105 last = p;
106 continue;
107 }
108 char e = detail::cUnescapeTable[static_cast<unsigned char>(*p)];
109 if (e == 'O') { // octal
110 unsigned char val = 0;
111 for (int i = 0; i < 3 && p != str.end() && *p >= '0' && *p <= '7';
112 ++i, ++p) {
113 val <<= 3;
114 val |= (*p - '0');
115 }
116 out.push_back(val);
117 last = p;
118 } else if (e == 'X') { // hex
119 ++p;
120 if (p == str.end()) { // \x at end of string
121 if (strict) {
122 throw_exception<std::invalid_argument>(
123 "incomplete hex escape sequence");
124 }
125 out.append("\\x");
126 last = p;
127 continue;
128 }
129 unsigned char val = 0;
130 unsigned char h;
131 for (; (p != str.end() &&
132 (h = detail::hexTable[static_cast<unsigned char>(*p)]) < 16);
133 ++p) {
134 val <<= 4;
135 val |= h;
136 }
137 out.push_back(val);
138 last = p;
139 } else if (e == 'I') { // invalid
140 if (strict) {
141 throw_exception<std::invalid_argument>("invalid escape sequence");
142 }
143 out.push_back('\\');
144 out.push_back(*p);
145 ++p;
146 last = p;
147 } else { // standard escape sequence, \' etc
148 out.push_back(e);
149 ++p;
150 last = p;
151 }
152 }
153 out.append(&*last, p - last);
154 }
155
156 namespace detail {
157 // Map from character code to escape mode:
158 // 0 = pass through
159 // 1 = unused
160 // 2 = pass through in PATH mode
161 // 3 = space, replace with '+' in QUERY mode
162 // 4 = percent-encode
163 extern const std::array<unsigned char, 256> uriEscapeTable;
164 } // namespace detail
165
166 template <class String>
uriEscape(StringPiece str,String & out,UriEscapeMode mode)167 void uriEscape(StringPiece str, String& out, UriEscapeMode mode) {
168 static const char hexValues[] = "0123456789abcdef";
169 char esc[3];
170 esc[0] = '%';
171 // Preallocate assuming that 25% of the input string will be escaped
172 out.reserve(out.size() + str.size() + 3 * (str.size() / 4));
173 auto p = str.begin();
174 auto last = p; // last regular character
175 // We advance over runs of passthrough characters and copy them in one go;
176 // this is faster than calling push_back repeatedly.
177 unsigned char minEncode = static_cast<unsigned char>(mode);
178 while (p != str.end()) {
179 char c = *p;
180 unsigned char v = static_cast<unsigned char>(c);
181 unsigned char discriminator = detail::uriEscapeTable[v];
182 if (LIKELY(discriminator <= minEncode)) {
183 ++p;
184 } else if (mode == UriEscapeMode::QUERY && discriminator == 3) {
185 out.append(&*last, size_t(p - last));
186 out.push_back('+');
187 ++p;
188 last = p;
189 } else {
190 out.append(&*last, size_t(p - last));
191 esc[1] = hexValues[v >> 4];
192 esc[2] = hexValues[v & 0x0f];
193 out.append(esc, 3);
194 ++p;
195 last = p;
196 }
197 }
198 out.append(&*last, size_t(p - last));
199 }
200
201 template <class String>
uriUnescape(StringPiece str,String & out,UriEscapeMode mode)202 void uriUnescape(StringPiece str, String& out, UriEscapeMode mode) {
203 out.reserve(out.size() + str.size());
204 auto p = str.begin();
205 auto last = p;
206 // We advance over runs of passthrough characters and copy them in one go;
207 // this is faster than calling push_back repeatedly.
208 while (p != str.end()) {
209 char c = *p;
210 switch (c) {
211 case '%': {
212 if (UNLIKELY(std::distance(p, str.end()) < 3)) {
213 throw_exception<std::invalid_argument>(
214 "incomplete percent encode sequence");
215 }
216 auto h1 = detail::hexTable[static_cast<unsigned char>(p[1])];
217 auto h2 = detail::hexTable[static_cast<unsigned char>(p[2])];
218 if (UNLIKELY(h1 == 16 || h2 == 16)) {
219 throw_exception<std::invalid_argument>(
220 "invalid percent encode sequence");
221 }
222 out.append(&*last, size_t(p - last));
223 out.push_back((h1 << 4) | h2);
224 p += 3;
225 last = p;
226 break;
227 }
228 case '+':
229 if (mode == UriEscapeMode::QUERY) {
230 out.append(&*last, size_t(p - last));
231 out.push_back(' ');
232 ++p;
233 last = p;
234 break;
235 }
236 // else fallthrough
237 FOLLY_FALLTHROUGH;
238 default:
239 ++p;
240 break;
241 }
242 }
243 out.append(&*last, size_t(p - last));
244 }
245
246 namespace detail {
247
248 /*
249 * The following functions are type-overloaded helpers for
250 * internalSplit().
251 */
delimSize(char)252 inline size_t delimSize(char) {
253 return 1;
254 }
delimSize(StringPiece s)255 inline size_t delimSize(StringPiece s) {
256 return s.size();
257 }
atDelim(const char * s,char c)258 inline bool atDelim(const char* s, char c) {
259 return *s == c;
260 }
atDelim(const char * s,StringPiece sp)261 inline bool atDelim(const char* s, StringPiece sp) {
262 return !std::memcmp(s, sp.start(), sp.size());
263 }
264
265 // These are used to short-circuit internalSplit() in the case of
266 // 1-character strings.
delimFront(char c)267 inline char delimFront(char c) {
268 // This one exists only for compile-time; it should never be called.
269 std::abort();
270 return c;
271 }
delimFront(StringPiece s)272 inline char delimFront(StringPiece s) {
273 assert(!s.empty() && s.start() != nullptr);
274 return *s.start();
275 }
276
277 /*
278 * Shared implementation for all the split() overloads.
279 *
280 * This uses some external helpers that are overloaded to let this
281 * algorithm be more performant if the deliminator is a single
282 * character instead of a whole string.
283 *
284 * @param ignoreEmpty iff true, don't copy empty segments to output
285 */
286 template <class OutStringT, class DelimT, class OutputIterator>
internalSplit(DelimT delim,StringPiece sp,OutputIterator out,bool ignoreEmpty)287 void internalSplit(
288 DelimT delim, StringPiece sp, OutputIterator out, bool ignoreEmpty) {
289 assert(sp.empty() || sp.start() != nullptr);
290
291 const char* s = sp.start();
292 const size_t strSize = sp.size();
293 const size_t dSize = delimSize(delim);
294
295 if (dSize > strSize || dSize == 0) {
296 if (!ignoreEmpty || strSize > 0) {
297 *out++ = to<OutStringT>(sp);
298 }
299 return;
300 }
301 if (std::is_same<DelimT, StringPiece>::value && dSize == 1) {
302 // Call the char version because it is significantly faster.
303 return internalSplit<OutStringT>(delimFront(delim), sp, out, ignoreEmpty);
304 }
305
306 size_t tokenStartPos = 0;
307 size_t tokenSize = 0;
308 for (size_t i = 0; i <= strSize - dSize; ++i) {
309 if (atDelim(&s[i], delim)) {
310 if (!ignoreEmpty || tokenSize > 0) {
311 *out++ = to<OutStringT>(sp.subpiece(tokenStartPos, tokenSize));
312 }
313
314 tokenStartPos = i + dSize;
315 tokenSize = 0;
316 i += dSize - 1;
317 } else {
318 ++tokenSize;
319 }
320 }
321 tokenSize = strSize - tokenStartPos;
322 if (!ignoreEmpty || tokenSize > 0) {
323 *out++ = to<OutStringT>(sp.subpiece(tokenStartPos, tokenSize));
324 }
325 }
326
327 template <class String>
prepareDelim(const String & s)328 StringPiece prepareDelim(const String& s) {
329 return StringPiece(s);
330 }
prepareDelim(char c)331 inline char prepareDelim(char c) {
332 return c;
333 }
334
335 template <class OutputType>
toOrIgnore(StringPiece input,OutputType & output)336 void toOrIgnore(StringPiece input, OutputType& output) {
337 output = folly::to<OutputType>(input);
338 }
339
toOrIgnore(StringPiece,decltype (std::ignore)&)340 inline void toOrIgnore(StringPiece, decltype(std::ignore)&) {}
341
342 template <bool exact, class Delim, class OutputType>
splitFixed(const Delim & delimiter,StringPiece input,OutputType & output)343 bool splitFixed(const Delim& delimiter, StringPiece input, OutputType& output) {
344 static_assert(
345 exact || std::is_same<OutputType, StringPiece>::value ||
346 IsSomeString<OutputType>::value ||
347 std::is_same<OutputType, decltype(std::ignore)>::value,
348 "split<false>() requires that the last argument be a string type "
349 "or std::ignore");
350 if (exact && UNLIKELY(std::string::npos != input.find(delimiter))) {
351 return false;
352 }
353 toOrIgnore(input, output);
354 return true;
355 }
356
357 template <bool exact, class Delim, class OutputType, class... OutputTypes>
splitFixed(const Delim & delimiter,StringPiece input,OutputType & outHead,OutputTypes &...outTail)358 bool splitFixed(
359 const Delim& delimiter,
360 StringPiece input,
361 OutputType& outHead,
362 OutputTypes&... outTail) {
363 size_t cut = input.find(delimiter);
364 if (UNLIKELY(cut == std::string::npos)) {
365 return false;
366 }
367 StringPiece head(input.begin(), input.begin() + cut);
368 StringPiece tail(
369 input.begin() + cut + detail::delimSize(delimiter), input.end());
370 if (LIKELY(splitFixed<exact>(delimiter, tail, outTail...))) {
371 toOrIgnore(head, outHead);
372 return true;
373 }
374 return false;
375 }
376
377 } // namespace detail
378
379 //////////////////////////////////////////////////////////////////////
380
381 template <class Delim, class String, class OutputType>
split(const Delim & delimiter,const String & input,std::vector<OutputType> & out,bool ignoreEmpty)382 void split(
383 const Delim& delimiter,
384 const String& input,
385 std::vector<OutputType>& out,
386 bool ignoreEmpty) {
387 detail::internalSplit<OutputType>(
388 detail::prepareDelim(delimiter),
389 StringPiece(input),
390 std::back_inserter(out),
391 ignoreEmpty);
392 }
393
394 template <class Delim, class String, class OutputType>
split(const Delim & delimiter,const String & input,fbvector<OutputType,std::allocator<OutputType>> & out,bool ignoreEmpty)395 void split(
396 const Delim& delimiter,
397 const String& input,
398 fbvector<OutputType, std::allocator<OutputType>>& out,
399 bool ignoreEmpty) {
400 detail::internalSplit<OutputType>(
401 detail::prepareDelim(delimiter),
402 StringPiece(input),
403 std::back_inserter(out),
404 ignoreEmpty);
405 }
406
407 template <
408 class OutputValueType,
409 class Delim,
410 class String,
411 class OutputIterator>
splitTo(const Delim & delimiter,const String & input,OutputIterator out,bool ignoreEmpty)412 void splitTo(
413 const Delim& delimiter,
414 const String& input,
415 OutputIterator out,
416 bool ignoreEmpty) {
417 detail::internalSplit<OutputValueType>(
418 detail::prepareDelim(delimiter), StringPiece(input), out, ignoreEmpty);
419 }
420
421 template <bool exact, class Delim, class... OutputTypes>
422 typename std::enable_if<
423 StrictConjunction<IsConvertible<OutputTypes>...>::value &&
424 sizeof...(OutputTypes) >= 1,
425 bool>::type
426 split(const Delim& delimiter, StringPiece input, OutputTypes&... outputs) {
427 return detail::splitFixed<exact>(
428 detail::prepareDelim(delimiter), input, outputs...);
429 }
430
431 namespace detail {
432
433 /*
434 * If a type can have its string size determined cheaply, we can more
435 * efficiently append it in a loop (see internalJoinAppend). Note that the
436 * struct need not conform to the std::string api completely (ex. does not need
437 * to implement append()).
438 */
439 template <class T>
440 struct IsSizableString {
441 enum {
442 value = IsSomeString<T>::value || std::is_same<T, StringPiece>::value
443 };
444 };
445
446 template <class Iterator>
447 struct IsSizableStringContainerIterator
448 : IsSizableString<typename std::iterator_traits<Iterator>::value_type> {};
449
450 template <class Delim, class Iterator, class String>
internalJoinAppend(Delim delimiter,Iterator begin,Iterator end,String & output)451 void internalJoinAppend(
452 Delim delimiter, Iterator begin, Iterator end, String& output) {
453 assert(begin != end);
454 if (std::is_same<Delim, StringPiece>::value && delimSize(delimiter) == 1) {
455 internalJoinAppend(delimFront(delimiter), begin, end, output);
456 return;
457 }
458 toAppend(*begin, &output);
459 while (++begin != end) {
460 toAppend(delimiter, *begin, &output);
461 }
462 }
463
464 template <class Delim, class Iterator, class String>
465 typename std::enable_if<IsSizableStringContainerIterator<Iterator>::value>::type
internalJoin(Delim delimiter,Iterator begin,Iterator end,String & output)466 internalJoin(Delim delimiter, Iterator begin, Iterator end, String& output) {
467 output.clear();
468 if (begin == end) {
469 return;
470 }
471 const size_t dsize = delimSize(delimiter);
472 Iterator it = begin;
473 size_t size = it->size();
474 while (++it != end) {
475 size += dsize + it->size();
476 }
477 output.reserve(size);
478 internalJoinAppend(delimiter, begin, end, output);
479 }
480
481 template <class Delim, class Iterator, class String>
482 typename std::enable_if<
483 !IsSizableStringContainerIterator<Iterator>::value>::type
internalJoin(Delim delimiter,Iterator begin,Iterator end,String & output)484 internalJoin(Delim delimiter, Iterator begin, Iterator end, String& output) {
485 output.clear();
486 if (begin == end) {
487 return;
488 }
489 internalJoinAppend(delimiter, begin, end, output);
490 }
491
492 } // namespace detail
493
494 template <class Delim, class Iterator, class String>
join(const Delim & delimiter,Iterator begin,Iterator end,String & output)495 void join(
496 const Delim& delimiter, Iterator begin, Iterator end, String& output) {
497 detail::internalJoin(detail::prepareDelim(delimiter), begin, end, output);
498 }
499
500 template <class OutputString>
backslashify(folly::StringPiece input,OutputString & output,bool hex_style)501 void backslashify(
502 folly::StringPiece input, OutputString& output, bool hex_style) {
503 static const char hexValues[] = "0123456789abcdef";
504 output.clear();
505 output.reserve(3 * input.size());
506 for (unsigned char c : input) {
507 // less than space or greater than '~' are considered unprintable
508 if (c < 0x20 || c > 0x7e || c == '\\') {
509 bool hex_append = false;
510 output.push_back('\\');
511 if (hex_style) {
512 hex_append = true;
513 } else {
514 if (c == '\r') {
515 output += 'r';
516 } else if (c == '\n') {
517 output += 'n';
518 } else if (c == '\t') {
519 output += 't';
520 } else if (c == '\a') {
521 output += 'a';
522 } else if (c == '\b') {
523 output += 'b';
524 } else if (c == '\0') {
525 output += '0';
526 } else if (c == '\\') {
527 output += '\\';
528 } else {
529 hex_append = true;
530 }
531 }
532 if (hex_append) {
533 output.push_back('x');
534 output.push_back(hexValues[(c >> 4) & 0xf]);
535 output.push_back(hexValues[c & 0xf]);
536 }
537 } else {
538 output += c;
539 }
540 }
541 }
542
543 template <class String1, class String2>
humanify(const String1 & input,String2 & output)544 void humanify(const String1& input, String2& output) {
545 size_t numUnprintable = 0;
546 size_t numPrintablePrefix = 0;
547 for (unsigned char c : input) {
548 if (c < 0x20 || c > 0x7e || c == '\\') {
549 ++numUnprintable;
550 }
551 if (numUnprintable == 0) {
552 ++numPrintablePrefix;
553 }
554 }
555
556 // hexlify doubles a string's size; backslashify can potentially
557 // explode it by 4x. Now, the printable range of the ascii
558 // "spectrum" is around 95 out of 256 values, so a "random" binary
559 // string should be around 60% unprintable. We use a 50% heuristic
560 // here, so if a string is 60% unprintable, then we just use hex
561 // output. Otherwise we backslash.
562 //
563 // UTF8 is completely ignored; as a result, utf8 characters will
564 // likely be \x escaped (since most common glyphs fit in two bytes).
565 // This is a tradeoff of complexity/speed instead of a convenience
566 // that likely would rarely matter. Moreover, this function is more
567 // about displaying underlying bytes, not about displaying glyphs
568 // from languages.
569 if (numUnprintable == 0) {
570 output = input;
571 } else if (5 * numUnprintable >= 3 * input.size()) {
572 // However! If we have a "meaningful" prefix of printable
573 // characters, say 20% of the string, we backslashify under the
574 // assumption viewing the prefix as ascii is worth blowing the
575 // output size up a bit.
576 if (5 * numPrintablePrefix >= input.size()) {
577 backslashify(input, output);
578 } else {
579 output = "0x";
580 hexlify(input, output, true /* append output */);
581 }
582 } else {
583 backslashify(input, output);
584 }
585 }
586
587 template <class InputString, class OutputString>
hexlify(const InputString & input,OutputString & output,bool append_output)588 bool hexlify(
589 const InputString& input, OutputString& output, bool append_output) {
590 if (!append_output) {
591 output.clear();
592 }
593
594 static char hexValues[] = "0123456789abcdef";
595 auto j = output.size();
596 output.resize(2 * input.size() + output.size());
597 for (size_t i = 0; i < input.size(); ++i) {
598 int ch = input[i];
599 output[j++] = hexValues[(ch >> 4) & 0xf];
600 output[j++] = hexValues[ch & 0xf];
601 }
602 return true;
603 }
604
605 template <class InputString, class OutputString>
unhexlify(const InputString & input,OutputString & output)606 bool unhexlify(const InputString& input, OutputString& output) {
607 if (input.size() % 2 != 0) {
608 return false;
609 }
610 output.resize(input.size() / 2);
611 int j = 0;
612
613 for (size_t i = 0; i < input.size(); i += 2) {
614 int highBits = detail::hexTable[static_cast<uint8_t>(input[i])];
615 int lowBits = detail::hexTable[static_cast<uint8_t>(input[i + 1])];
616 if ((highBits | lowBits) & 0x10) {
617 // One of the characters wasn't a hex digit
618 return false;
619 }
620 output[j++] = (highBits << 4) + lowBits;
621 }
622 return true;
623 }
624
625 namespace detail {
626 /**
627 * Hex-dump at most 16 bytes starting at offset from a memory area of size
628 * bytes. Return the number of bytes actually dumped.
629 */
630 size_t hexDumpLine(
631 const void* ptr, size_t offset, size_t size, std::string& line);
632 } // namespace detail
633
634 template <class OutIt>
hexDump(const void * ptr,size_t size,OutIt out)635 void hexDump(const void* ptr, size_t size, OutIt out) {
636 size_t offset = 0;
637 std::string line;
638 while (offset < size) {
639 offset += detail::hexDumpLine(ptr, offset, size, line);
640 *out++ = line;
641 }
642 }
643
644 } // namespace folly
645