1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "absl/strings/escaping.h"
16 
17 #include <algorithm>
18 #include <cassert>
19 #include <cstdint>
20 #include <cstring>
21 #include <iterator>
22 #include <limits>
23 #include <string>
24 
25 #include "absl/base/internal/endian.h"
26 #include "absl/base/internal/raw_logging.h"
27 #include "absl/base/internal/unaligned_access.h"
28 #include "absl/strings/internal/char_map.h"
29 #include "absl/strings/internal/escaping.h"
30 #include "absl/strings/internal/resize_uninitialized.h"
31 #include "absl/strings/internal/utf8.h"
32 #include "absl/strings/str_cat.h"
33 #include "absl/strings/str_join.h"
34 #include "absl/strings/string_view.h"
35 
36 namespace absl {
37 ABSL_NAMESPACE_BEGIN
38 namespace {
39 
// Value for the leave_nulls_escaped argument to CUnescapeInternal(): with
// false, escape sequences whose value is zero (e.g. "\0") are decoded to a NUL
// byte like any other sequence instead of being copied through still escaped.
constexpr bool kUnescapeNulls = false;
42 
// Returns true iff `c` is one of the ASCII octal digits '0' through '7'.
inline bool is_octal_digit(char c) {
  if (c < '0') return false;
  return c <= '7';
}
44 
hex_digit_to_int(char c)45 inline int hex_digit_to_int(char c) {
46   static_assert('0' == 0x30 && 'A' == 0x41 && 'a' == 0x61,
47                 "Character set must be ASCII.");
48   assert(absl::ascii_isxdigit(c));
49   int x = static_cast<unsigned char>(c);
50   if (x > '9') {
51     x += 9;
52   }
53   return x & 0xf;
54 }
55 
IsSurrogate(char32_t c,absl::string_view src,std::string * error)56 inline bool IsSurrogate(char32_t c, absl::string_view src, std::string* error) {
57   if (c >= 0xD800 && c <= 0xDFFF) {
58     if (error) {
59       *error = absl::StrCat("invalid surrogate character (0xD800-DFFF): \\",
60                             src);
61     }
62     return true;
63   }
64   return false;
65 }
66 
67 // ----------------------------------------------------------------------
68 // CUnescapeInternal()
69 //    Implements both CUnescape() and CUnescapeForNullTerminatedString().
70 //
71 //    Unescapes C escape sequences and is the reverse of CEscape().
72 //
73 //    If 'source' is valid, stores the unescaped string and its size in
74 //    'dest' and 'dest_len' respectively, and returns true. Otherwise
75 //    returns false and optionally stores the error description in
76 //    'error'. Set 'error' to nullptr to disable error reporting.
77 //
78 //    'dest' should point to a buffer that is at least as big as 'source'.
79 //    'source' and 'dest' may be the same.
80 //
81 //     NOTE: any changes to this function must also be reflected in the older
82 //     UnescapeCEscapeSequences().
83 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,char * dest,ptrdiff_t * dest_len,std::string * error)84 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
85                        char* dest, ptrdiff_t* dest_len, std::string* error) {
86   char* d = dest;
87   const char* p = source.data();
88   const char* end = p + source.size();
89   const char* last_byte = end - 1;
90 
91   // Small optimization for case where source = dest and there's no escaping
92   while (p == d && p < end && *p != '\\') p++, d++;
93 
94   while (p < end) {
95     if (*p != '\\') {
96       *d++ = *p++;
97     } else {
98       if (++p > last_byte) {  // skip past the '\\'
99         if (error) *error = "String cannot end with \\";
100         return false;
101       }
102       switch (*p) {
103         case 'a':  *d++ = '\a';  break;
104         case 'b':  *d++ = '\b';  break;
105         case 'f':  *d++ = '\f';  break;
106         case 'n':  *d++ = '\n';  break;
107         case 'r':  *d++ = '\r';  break;
108         case 't':  *d++ = '\t';  break;
109         case 'v':  *d++ = '\v';  break;
110         case '\\': *d++ = '\\';  break;
111         case '?':  *d++ = '\?';  break;    // \?  Who knew?
112         case '\'': *d++ = '\'';  break;
113         case '"':  *d++ = '\"';  break;
114         case '0':
115         case '1':
116         case '2':
117         case '3':
118         case '4':
119         case '5':
120         case '6':
121         case '7': {
122           // octal digit: 1 to 3 digits
123           const char* octal_start = p;
124           unsigned int ch = *p - '0';
125           if (p < last_byte && is_octal_digit(p[1])) ch = ch * 8 + *++p - '0';
126           if (p < last_byte && is_octal_digit(p[1]))
127             ch = ch * 8 + *++p - '0';      // now points at last digit
128           if (ch > 0xff) {
129             if (error) {
130               *error = "Value of \\" +
131                        std::string(octal_start, p + 1 - octal_start) +
132                        " exceeds 0xff";
133             }
134             return false;
135           }
136           if ((ch == 0) && leave_nulls_escaped) {
137             // Copy the escape sequence for the null character
138             const ptrdiff_t octal_size = p + 1 - octal_start;
139             *d++ = '\\';
140             memmove(d, octal_start, octal_size);
141             d += octal_size;
142             break;
143           }
144           *d++ = ch;
145           break;
146         }
147         case 'x':
148         case 'X': {
149           if (p >= last_byte) {
150             if (error) *error = "String cannot end with \\x";
151             return false;
152           } else if (!absl::ascii_isxdigit(p[1])) {
153             if (error) *error = "\\x cannot be followed by a non-hex digit";
154             return false;
155           }
156           unsigned int ch = 0;
157           const char* hex_start = p;
158           while (p < last_byte && absl::ascii_isxdigit(p[1]))
159             // Arbitrarily many hex digits
160             ch = (ch << 4) + hex_digit_to_int(*++p);
161           if (ch > 0xFF) {
162             if (error) {
163               *error = "Value of \\" +
164                        std::string(hex_start, p + 1 - hex_start) +
165                        " exceeds 0xff";
166             }
167             return false;
168           }
169           if ((ch == 0) && leave_nulls_escaped) {
170             // Copy the escape sequence for the null character
171             const ptrdiff_t hex_size = p + 1 - hex_start;
172             *d++ = '\\';
173             memmove(d, hex_start, hex_size);
174             d += hex_size;
175             break;
176           }
177           *d++ = ch;
178           break;
179         }
180         case 'u': {
181           // \uhhhh => convert 4 hex digits to UTF-8
182           char32_t rune = 0;
183           const char* hex_start = p;
184           if (p + 4 >= end) {
185             if (error) {
186               *error = "\\u must be followed by 4 hex digits: \\" +
187                        std::string(hex_start, p + 1 - hex_start);
188             }
189             return false;
190           }
191           for (int i = 0; i < 4; ++i) {
192             // Look one char ahead.
193             if (absl::ascii_isxdigit(p[1])) {
194               rune = (rune << 4) + hex_digit_to_int(*++p);  // Advance p.
195             } else {
196               if (error) {
197                 *error = "\\u must be followed by 4 hex digits: \\" +
198                          std::string(hex_start, p + 1 - hex_start);
199               }
200               return false;
201             }
202           }
203           if ((rune == 0) && leave_nulls_escaped) {
204             // Copy the escape sequence for the null character
205             *d++ = '\\';
206             memmove(d, hex_start, 5);  // u0000
207             d += 5;
208             break;
209           }
210           if (IsSurrogate(rune, absl::string_view(hex_start, 5), error)) {
211             return false;
212           }
213           d += strings_internal::EncodeUTF8Char(d, rune);
214           break;
215         }
216         case 'U': {
217           // \Uhhhhhhhh => convert 8 hex digits to UTF-8
218           char32_t rune = 0;
219           const char* hex_start = p;
220           if (p + 8 >= end) {
221             if (error) {
222               *error = "\\U must be followed by 8 hex digits: \\" +
223                        std::string(hex_start, p + 1 - hex_start);
224             }
225             return false;
226           }
227           for (int i = 0; i < 8; ++i) {
228             // Look one char ahead.
229             if (absl::ascii_isxdigit(p[1])) {
230               // Don't change rune until we're sure this
231               // is within the Unicode limit, but do advance p.
232               uint32_t newrune = (rune << 4) + hex_digit_to_int(*++p);
233               if (newrune > 0x10FFFF) {
234                 if (error) {
235                   *error = "Value of \\" +
236                            std::string(hex_start, p + 1 - hex_start) +
237                            " exceeds Unicode limit (0x10FFFF)";
238                 }
239                 return false;
240               } else {
241                 rune = newrune;
242               }
243             } else {
244               if (error) {
245                 *error = "\\U must be followed by 8 hex digits: \\" +
246                          std::string(hex_start, p + 1 - hex_start);
247               }
248               return false;
249             }
250           }
251           if ((rune == 0) && leave_nulls_escaped) {
252             // Copy the escape sequence for the null character
253             *d++ = '\\';
254             memmove(d, hex_start, 9);  // U00000000
255             d += 9;
256             break;
257           }
258           if (IsSurrogate(rune, absl::string_view(hex_start, 9), error)) {
259             return false;
260           }
261           d += strings_internal::EncodeUTF8Char(d, rune);
262           break;
263         }
264         default: {
265           if (error) *error = std::string("Unknown escape sequence: \\") + *p;
266           return false;
267         }
268       }
269       p++;                                 // read past letter we escaped
270     }
271   }
272   *dest_len = d - dest;
273   return true;
274 }
275 
276 // ----------------------------------------------------------------------
277 // CUnescapeInternal()
278 //
279 //    Same as above but uses a std::string for output. 'source' and 'dest'
280 //    may be the same.
281 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,std::string * dest,std::string * error)282 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
283                        std::string* dest, std::string* error) {
284   strings_internal::STLStringResizeUninitialized(dest, source.size());
285 
286   ptrdiff_t dest_size;
287   if (!CUnescapeInternal(source,
288                          leave_nulls_escaped,
289                          &(*dest)[0],
290                          &dest_size,
291                          error)) {
292     return false;
293   }
294   dest->erase(dest_size);
295   return true;
296 }
297 
298 // ----------------------------------------------------------------------
299 // CEscape()
300 // CHexEscape()
301 // Utf8SafeCEscape()
302 // Utf8SafeCHexEscape()
303 //    Escapes 'src' using C-style escape sequences.  This is useful for
304 //    preparing query flags.  The 'Hex' version uses hexadecimal rather than
305 //    octal sequences.  The 'Utf8Safe' version does not touch UTF-8 bytes.
306 //
307 //    Escaped chars: \n, \r, \t, ", ', \, and !absl::ascii_isprint().
308 // ----------------------------------------------------------------------
CEscapeInternal(absl::string_view src,bool use_hex,bool utf8_safe)309 std::string CEscapeInternal(absl::string_view src, bool use_hex,
310                             bool utf8_safe) {
311   std::string dest;
312   bool last_hex_escape = false;  // true if last output char was \xNN.
313 
314   for (unsigned char c : src) {
315     bool is_hex_escape = false;
316     switch (c) {
317       case '\n': dest.append("\\" "n"); break;
318       case '\r': dest.append("\\" "r"); break;
319       case '\t': dest.append("\\" "t"); break;
320       case '\"': dest.append("\\" "\""); break;
321       case '\'': dest.append("\\" "'"); break;
322       case '\\': dest.append("\\" "\\"); break;
323       default:
324         // Note that if we emit \xNN and the src character after that is a hex
325         // digit then that digit must be escaped too to prevent it being
326         // interpreted as part of the character code by C.
327         if ((!utf8_safe || c < 0x80) &&
328             (!absl::ascii_isprint(c) ||
329              (last_hex_escape && absl::ascii_isxdigit(c)))) {
330           if (use_hex) {
331             dest.append("\\" "x");
332             dest.push_back(numbers_internal::kHexChar[c / 16]);
333             dest.push_back(numbers_internal::kHexChar[c % 16]);
334             is_hex_escape = true;
335           } else {
336             dest.append("\\");
337             dest.push_back(numbers_internal::kHexChar[c / 64]);
338             dest.push_back(numbers_internal::kHexChar[(c % 64) / 8]);
339             dest.push_back(numbers_internal::kHexChar[c % 8]);
340           }
341         } else {
342           dest.push_back(c);
343           break;
344         }
345     }
346     last_hex_escape = is_hex_escape;
347   }
348 
349   return dest;
350 }
351 
// Number of output bytes CEscapeAndAppendInternal() writes for each possible
// input byte value: 1 when the byte is copied unchanged, 2 for a backslash
// followed by a single character (\t, \n, \r, \", \', \\), and 4 for a
// backslash followed by three octal digits.
/* clang-format off */
constexpr char c_escaped_len[256] = {
    4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,  // \t, \n, \r
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,  // ", '
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // '0'..'9'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'A'..'O'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,  // 'P'..'Z', '\'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'a'..'o'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,  // 'p'..'z', DEL
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
};
/* clang-format on */
372 
373 // Calculates the length of the C-style escaped version of 'src'.
374 // Assumes that non-printable characters are escaped using octal sequences, and
375 // that UTF-8 bytes are not handled specially.
CEscapedLength(absl::string_view src)376 inline size_t CEscapedLength(absl::string_view src) {
377   size_t escaped_len = 0;
378   for (unsigned char c : src) escaped_len += c_escaped_len[c];
379   return escaped_len;
380 }
381 
CEscapeAndAppendInternal(absl::string_view src,std::string * dest)382 void CEscapeAndAppendInternal(absl::string_view src, std::string* dest) {
383   size_t escaped_len = CEscapedLength(src);
384   if (escaped_len == src.size()) {
385     dest->append(src.data(), src.size());
386     return;
387   }
388 
389   size_t cur_dest_len = dest->size();
390   strings_internal::STLStringResizeUninitialized(dest,
391                                                  cur_dest_len + escaped_len);
392   char* append_ptr = &(*dest)[cur_dest_len];
393 
394   for (unsigned char c : src) {
395     int char_len = c_escaped_len[c];
396     if (char_len == 1) {
397       *append_ptr++ = c;
398     } else if (char_len == 2) {
399       switch (c) {
400         case '\n':
401           *append_ptr++ = '\\';
402           *append_ptr++ = 'n';
403           break;
404         case '\r':
405           *append_ptr++ = '\\';
406           *append_ptr++ = 'r';
407           break;
408         case '\t':
409           *append_ptr++ = '\\';
410           *append_ptr++ = 't';
411           break;
412         case '\"':
413           *append_ptr++ = '\\';
414           *append_ptr++ = '\"';
415           break;
416         case '\'':
417           *append_ptr++ = '\\';
418           *append_ptr++ = '\'';
419           break;
420         case '\\':
421           *append_ptr++ = '\\';
422           *append_ptr++ = '\\';
423           break;
424       }
425     } else {
426       *append_ptr++ = '\\';
427       *append_ptr++ = '0' + c / 64;
428       *append_ptr++ = '0' + (c % 64) / 8;
429       *append_ptr++ = '0' + c % 8;
430     }
431   }
432 }
433 
// Decodes the base64 text held in the 'szsrc' bytes starting at 'src_param'.
//
// 'unbase64' is a table indexed by byte value that yields the 6-bit value of a
// data character, or a negative number for any byte that is not a data
// character. ASCII whitespace in the input is skipped. The data may be
// followed by padding made of '=' or '.' characters: either none at all, or
// exactly the number implied by the amount of leftover data.
//
// If 'dest' is non-null, the decoded bytes are written to it; 'szdest' is its
// capacity, and the function returns false rather than write past it. If
// 'dest' is null, the input is only validated and measured.
//
// On success, stores the decoded length in '*len' and returns true. On
// malformed input returns false and leaves '*len' untouched.
bool Base64UnescapeInternal(const char* src_param, size_t szsrc, char* dest,
                            size_t szdest, const signed char* unbase64,
                            size_t* len) {
  static const char kPad64Equals = '=';
  static const char kPad64Dot = '.';

  size_t destidx = 0;     // number of output bytes produced so far
  int decode = 0;         // table value of the most recently read character
  int state = 0;          // data characters accumulated in 'temp' (0..3)
  unsigned int ch = 0;    // most recently read input character
  unsigned int temp = 0;  // bit accumulator, 6 bits per data character

  // If "char" is signed by default, using *src as an array index results in
  // accessing negative array elements. Treat the input as a pointer to
  // unsigned char to avoid this.
  const unsigned char* src = reinterpret_cast<const unsigned char*>(src_param);

  // The GET_INPUT macro gets the next input character, skipping
  // over any whitespace, and stopping when we reach the end of the
  // string or when we read any non-data character.  The arguments are
  // an arbitrary identifier (used as a label for goto) and the number
  // of data bytes that must remain in the input to avoid aborting the
  // loop.
#define GET_INPUT(label, remain)                                \
  label:                                                        \
  --szsrc;                                                      \
  ch = *src++;                                                  \
  decode = unbase64[ch];                                        \
  if (decode < 0) {                                             \
    if (absl::ascii_isspace(ch) && szsrc >= remain) goto label; \
    state = 4 - remain;                                         \
    break;                                                      \
  }

  // if dest is null, we're just checking to see if it's legal input
  // rather than producing output.  (I suspect this could just be done
  // with a regexp...).  We duplicate the loop so this test can be
  // outside it instead of in every iteration.

  if (dest) {
    // This loop consumes 4 input bytes and produces 3 output bytes
    // per iteration.  We can't know at the start that there is enough
    // data left in the string for a full iteration, so the loop may
    // break out in the middle; if so 'state' will be set to the
    // number of input bytes read.

    while (szsrc >= 4) {
      // We'll start by optimistically assuming that the next four
      // bytes of the string (src[0..3]) are four good data bytes
      // (that is, no nulls, whitespace, padding chars, or illegal
      // chars).  We need to test src[0..2] for nulls individually
      // before constructing temp to preserve the property that we
      // never read past a null in the string (no matter how long
      // szsrc claims the string is).

      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        // Iff any of those four characters was bad (null, illegal,
        // whitespace, padding), then temp's high bit will be set
        // (because unbase64[] is -1 for all bad characters).
        //
        // We'll back up and resort to the slower decoder, which knows
        // how to handle those cases.

        GET_INPUT(first, 4);
        temp = decode;
        GET_INPUT(second, 3);
        temp = (temp << 6) | decode;
        GET_INPUT(third, 2);
        temp = (temp << 6) | decode;
        GET_INPUT(fourth, 1);
        temp = (temp << 6) | decode;
      } else {
        // We really did have four good data bytes, so advance four
        // characters in the string.

        szsrc -= 4;
        src += 4;
      }

      // temp has 24 bits of input, so write that out as three bytes.

      // Refuse to write past the capacity the caller gave us.
      if (destidx + 3 > szdest) return false;
      dest[destidx + 2] = temp;
      temp >>= 8;
      dest[destidx + 1] = temp;
      temp >>= 8;
      dest[destidx] = temp;
      destidx += 3;
    }
  } else {
    // Validation-only variant of the loop above: the input is scanned the
    // same way and the output length is counted, but nothing is written.
    while (szsrc >= 4) {
      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        GET_INPUT(first_no_dest, 4);
        GET_INPUT(second_no_dest, 3);
        GET_INPUT(third_no_dest, 2);
        GET_INPUT(fourth_no_dest, 1);
      } else {
        szsrc -= 4;
        src += 4;
      }
      destidx += 3;
    }
  }

#undef GET_INPUT

  // if the loop terminated because we read a bad character, return
  // now.
  if (decode < 0 && ch != kPad64Equals && ch != kPad64Dot &&
      !absl::ascii_isspace(ch))
    return false;

  if (ch == kPad64Equals || ch == kPad64Dot) {
    // if we stopped by hitting an '=' or '.', un-read that character -- we'll
    // look at it again when we count to check for the proper number of
    // equals signs at the end.
    ++szsrc;
    --src;
  } else {
    // This loop consumes 1 input byte per iteration.  It's used to
    // clean up the 0-3 input bytes remaining when the first, faster
    // loop finishes.  'temp' contains the data from 'state' input
    // characters read by the first loop.
    while (szsrc > 0) {
      --szsrc;
      ch = *src++;
      decode = unbase64[ch];
      if (decode < 0) {
        if (absl::ascii_isspace(ch)) {
          continue;
        } else if (ch == kPad64Equals || ch == kPad64Dot) {
          // back up one character; we'll read it again when we check
          // for the correct number of pad characters at the end.
          ++szsrc;
          --src;
          break;
        } else {
          return false;
        }
      }

      // Each input character gives us six bits of output.
      temp = (temp << 6) | decode;
      ++state;
      if (state == 4) {
        // If we've accumulated 24 bits of output, write that out as
        // three bytes.
        if (dest) {
          if (destidx + 3 > szdest) return false;
          dest[destidx + 2] = temp;
          temp >>= 8;
          dest[destidx + 1] = temp;
          temp >>= 8;
          dest[destidx] = temp;
        }
        destidx += 3;
        state = 0;
        temp = 0;
      }
    }
  }

  // Process the leftover data contained in 'temp' at the end of the input.
  int expected_equals = 0;
  switch (state) {
    case 0:
      // Nothing left over; output is a multiple of 3 bytes.
      break;

    case 1:
      // Bad input; we have 6 bits left over.
      return false;

    case 2:
      // Produce one more output byte from the 12 input bits we have left.
      if (dest) {
        if (destidx + 1 > szdest) return false;
        temp >>= 4;
        dest[destidx] = temp;
      }
      ++destidx;
      expected_equals = 2;
      break;

    case 3:
      // Produce two more output bytes from the 18 input bits we have left.
      if (dest) {
        if (destidx + 2 > szdest) return false;
        temp >>= 2;
        dest[destidx + 1] = temp;
        temp >>= 8;
        dest[destidx] = temp;
      }
      destidx += 2;
      expected_equals = 1;
      break;

    default:
      // state should have no other values at this point.
      ABSL_RAW_LOG(FATAL, "This can't happen; base64 decoder state = %d",
                   state);
  }

  // The remainder of the string should be all whitespace, mixed with
  // exactly 0 equals signs, or exactly 'expected_equals' equals
  // signs.  (Always accepting 0 equals signs is an Abseil extension
  // not covered in the RFC, as is accepting dot as the pad character.)

  int equals = 0;
  while (szsrc > 0) {
    if (*src == kPad64Equals || *src == kPad64Dot)
      ++equals;
    else if (!absl::ascii_isspace(*src))
      return false;
    --szsrc;
    ++src;
  }

  // '*len' is only written when the padding count is acceptable.
  const bool ok = (equals == 0 || equals == expected_equals);
  if (ok) *len = destidx;
  return ok;
}
666 
667 // The arrays below were generated by the following code
668 // #include <sys/time.h>
669 // #include <stdlib.h>
670 // #include <string.h>
671 // main()
672 // {
673 //   static const char Base64[] =
674 //     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
675 //   char* pos;
676 //   int idx, i, j;
677 //   printf("    ");
678 //   for (i = 0; i < 255; i += 8) {
679 //     for (j = i; j < i + 8; j++) {
680 //       pos = strchr(Base64, j);
681 //       if ((pos == nullptr) || (j == 0))
682 //         idx = -1;
683 //       else
684 //         idx = pos - Base64;
685 //       if (idx == -1)
686 //         printf(" %2d,     ", idx);
687 //       else
688 //         printf(" %2d/*%c*/,", idx, j);
689 //     }
690 //     printf("\n    ");
691 //   }
692 // }
693 //
694 // where the value of "Base64[]" was replaced by one of the base-64 conversion
695 // tables from the functions below.
696 /* clang-format off */
// Decoding table for the standard base64 alphabet (A-Z, a-z, 0-9, '+', '/'):
// entry [b] is the 6-bit value of byte b, or -1 if b is not in the alphabet.
constexpr signed char kUnBase64[] = {
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      62/*+*/, -1,      -1,      -1,      63/*/ */,
    52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
    60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
    -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
    07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
    15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
    23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      -1,
    -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
    33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
    41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
    49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
};
731 
// Decoding table for the web-safe base64 alphabet (A-Z, a-z, 0-9, '-', '_'):
// entry [b] is the 6-bit value of byte b, or -1 if b is not in the alphabet.
constexpr signed char kUnWebSafeBase64[] = {
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      62/*-*/, -1,      -1,
    52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
    60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
    -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
    07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
    15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
    23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      63/*_*/,
    -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
    33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
    41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
    49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
};
766 /* clang-format on */
767 
// Encoding alphabet for the web-safe Base64 variant, indexed by 6-bit value:
// 'A'-'Z' (0-25), 'a'-'z' (26-51), '0'-'9' (52-61), then '-' (62), '_' (63).
constexpr char kWebSafeBase64Chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-_";
770 
771 template <typename String>
Base64UnescapeInternal(const char * src,size_t slen,String * dest,const signed char * unbase64)772 bool Base64UnescapeInternal(const char* src, size_t slen, String* dest,
773                             const signed char* unbase64) {
774   // Determine the size of the output string.  Base64 encodes every 3 bytes into
775   // 4 characters.  any leftover chars are added directly for good measure.
776   // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548
777   const size_t dest_len = 3 * (slen / 4) + (slen % 4);
778 
779   strings_internal::STLStringResizeUninitialized(dest, dest_len);
780 
781   // We are getting the destination buffer by getting the beginning of the
782   // string and converting it into a char *.
783   size_t len;
784   const bool ok =
785       Base64UnescapeInternal(src, slen, &(*dest)[0], dest_len, unbase64, &len);
786   if (!ok) {
787     dest->clear();
788     return false;
789   }
790 
791   // could be shorter if there was padding
792   assert(len <= dest_len);
793   dest->erase(len);
794 
795   return true;
796 }
797 
/* clang-format off */
// Lenient hex-digit lookup table, indexed by unsigned byte value: '0'-'9',
// 'A'-'F' and 'a'-'f' map to 0-15; every other byte maps to 0.
constexpr char kHexValueLenient[256] = {
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  1,  2,  3,  4,  5,  6, 7, 8, 9, 0, 0, 0, 0, 0, 0,  // '0'..'9'
    0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 'A'..'F'
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 'a'..'f'
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};

/* clang-format on */

// Writes `num` bytes to `to`, decoding each one from two consecutive hex
// characters of `from` (high nibble first). `from` must therefore provide at
// least `2 * num` characters. Characters that are not hex digits decode as 0
// (see kHexValueLenient).
//
// This is a templated function so that T can be either a char*
// or a string.  This works because we use the [] operator to access
// individual characters at a time.
template <typename T>
void HexStringToBytesInternal(const char* from, T to, ptrdiff_t num) {
  // The counter has the same type as `num`: an `int` counter would overflow
  // (undefined behavior) in `i` or `i * 2` on very large inputs.
  for (ptrdiff_t i = 0; i < num; ++i) {
    // `& 0xFF` maps a possibly-negative char onto a valid table index.
    const int high_nibble = kHexValueLenient[from[i * 2] & 0xFF];
    const int low_nibble = kHexValueLenient[from[i * 2 + 1] & 0xFF];
    to[i] = static_cast<char>((high_nibble << 4) + low_nibble);
  }
}
830 
831 // This is a templated function so that T can be either a char* or a
832 // std::string.
833 template <typename T>
BytesToHexStringInternal(const unsigned char * src,T dest,ptrdiff_t num)834 void BytesToHexStringInternal(const unsigned char* src, T dest, ptrdiff_t num) {
835   auto dest_ptr = &dest[0];
836   for (auto src_ptr = src; src_ptr != (src + num); ++src_ptr, dest_ptr += 2) {
837     const char* hex_p = &numbers_internal::kHexTable[*src_ptr * 2];
838     std::copy(hex_p, hex_p + 2, dest_ptr);
839   }
840 }
841 
842 }  // namespace
843 
844 // ----------------------------------------------------------------------
845 // CUnescape()
846 //
847 // See CUnescapeInternal() for implementation details.
848 // ----------------------------------------------------------------------
CUnescape(absl::string_view source,std::string * dest,std::string * error)849 bool CUnescape(absl::string_view source, std::string* dest,
850                std::string* error) {
851   return CUnescapeInternal(source, kUnescapeNulls, dest, error);
852 }
853 
CEscape(absl::string_view src)854 std::string CEscape(absl::string_view src) {
855   std::string dest;
856   CEscapeAndAppendInternal(src, &dest);
857   return dest;
858 }
859 
CHexEscape(absl::string_view src)860 std::string CHexEscape(absl::string_view src) {
861   return CEscapeInternal(src, true, false);
862 }
863 
Utf8SafeCEscape(absl::string_view src)864 std::string Utf8SafeCEscape(absl::string_view src) {
865   return CEscapeInternal(src, false, true);
866 }
867 
Utf8SafeCHexEscape(absl::string_view src)868 std::string Utf8SafeCHexEscape(absl::string_view src) {
869   return CEscapeInternal(src, true, true);
870 }
871 
872 // ----------------------------------------------------------------------
873 // Base64Unescape() - base64 decoder
874 // Base64Escape() - base64 encoder
875 // WebSafeBase64Unescape() - Google's variation of base64 decoder
876 // WebSafeBase64Escape() - Google's variation of base64 encoder
877 //
// See http://tools.ietf.org/html/rfc2045 for a formal description; the parts
// that matter here are:
881 //   Take the encoded stuff in groups of 4 characters and turn each
882 //   character into a code 0 to 63 thus:
883 //           A-Z map to 0 to 25
884 //           a-z map to 26 to 51
885 //           0-9 map to 52 to 61
886 //           +(- for WebSafe) maps to 62
887 //           /(_ for WebSafe) maps to 63
888 //   There will be four numbers, all less than 64 which can be represented
889 //   by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
890 //   Arrange the 6 digit binary numbers into three bytes as such:
891 //   aaaaaabb bbbbcccc ccdddddd
892 //   Equals signs (one or two) are used at the end of the encoded block to
893 //   indicate that the text was not an integer multiple of three bytes long.
894 // ----------------------------------------------------------------------
895 
Base64Unescape(absl::string_view src,std::string * dest)896 bool Base64Unescape(absl::string_view src, std::string* dest) {
897   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
898 }
899 
WebSafeBase64Unescape(absl::string_view src,std::string * dest)900 bool WebSafeBase64Unescape(absl::string_view src, std::string* dest) {
901   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
902 }
903 
Base64Escape(absl::string_view src,std::string * dest)904 void Base64Escape(absl::string_view src, std::string* dest) {
905   strings_internal::Base64EscapeInternal(
906       reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
907       true, strings_internal::kBase64Chars);
908 }
909 
WebSafeBase64Escape(absl::string_view src,std::string * dest)910 void WebSafeBase64Escape(absl::string_view src, std::string* dest) {
911   strings_internal::Base64EscapeInternal(
912       reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
913       false, kWebSafeBase64Chars);
914 }
915 
Base64Escape(absl::string_view src)916 std::string Base64Escape(absl::string_view src) {
917   std::string dest;
918   strings_internal::Base64EscapeInternal(
919       reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
920       true, strings_internal::kBase64Chars);
921   return dest;
922 }
923 
WebSafeBase64Escape(absl::string_view src)924 std::string WebSafeBase64Escape(absl::string_view src) {
925   std::string dest;
926   strings_internal::Base64EscapeInternal(
927       reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
928       false, kWebSafeBase64Chars);
929   return dest;
930 }
931 
HexStringToBytes(absl::string_view from)932 std::string HexStringToBytes(absl::string_view from) {
933   std::string result;
934   const auto num = from.size() / 2;
935   strings_internal::STLStringResizeUninitialized(&result, num);
936   absl::HexStringToBytesInternal<std::string&>(from.data(), result, num);
937   return result;
938 }
939 
BytesToHexString(absl::string_view from)940 std::string BytesToHexString(absl::string_view from) {
941   std::string result;
942   strings_internal::STLStringResizeUninitialized(&result, 2 * from.size());
943   absl::BytesToHexStringInternal<std::string&>(
944       reinterpret_cast<const unsigned char*>(from.data()), result, from.size());
945   return result;
946 }
947 
948 ABSL_NAMESPACE_END
949 }  // namespace absl
950