1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 // from google3/strings/strutil.cc
32 
33 #include <google/protobuf/stubs/strutil.h>
34 
35 #include <errno.h>
36 #include <float.h>    // FLT_DIG and DBL_DIG
37 #include <limits.h>
38 #include <stdio.h>
39 #include <cmath>
40 #include <iterator>
41 #include <limits>
42 
43 #include <google/protobuf/stubs/logging.h>
44 #include <google/protobuf/stubs/stl_util.h>
45 #include <google/protobuf/io/strtod.h>
46 
47 #ifdef _WIN32
48 // MSVC has only _snprintf, not snprintf.
49 //
50 // MinGW has both snprintf and _snprintf, but they appear to be different
51 // functions.  The former is buggy.  When invoked like so:
52 //   char buffer[32];
53 //   snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f);
54 // it prints "1.23000e+10".  This is plainly wrong:  %g should never print
55 // trailing zeros after the decimal point.  For some reason this bug only
56 // occurs with some input values, not all.  In any case, _snprintf does the
57 // right thing, so we use it.
58 #define snprintf _snprintf
59 #endif
60 
61 namespace google {
62 namespace protobuf {
63 
64 // These are defined as macros on some platforms.  #undef them so that we can
65 // redefine them.
66 #undef isxdigit
67 #undef isprint
68 
69 // The definitions of these in ctype.h change based on locale.  Since our
70 // string manipulation is all in relation to the protocol buffer and C++
71 // languages, we always want to use the C locale.  So, we re-define these
72 // exactly as we want them.
isxdigit(char c)73 inline bool isxdigit(char c) {
74   return ('0' <= c && c <= '9') ||
75          ('a' <= c && c <= 'f') ||
76          ('A' <= c && c <= 'F');
77 }
78 
isprint(char c)79 inline bool isprint(char c) {
80   return c >= 0x20 && c <= 0x7E;
81 }
82 
83 // ----------------------------------------------------------------------
84 // StripString
85 //    Replaces any occurrence of the character 'remove' (or the characters
86 //    in 'remove') with the character 'replacewith'.
87 // ----------------------------------------------------------------------
StripString(string * s,const char * remove,char replacewith)88 void StripString(string* s, const char* remove, char replacewith) {
89   const char * str_start = s->c_str();
90   const char * str = str_start;
91   for (str = strpbrk(str, remove);
92        str != nullptr;
93        str = strpbrk(str + 1, remove)) {
94     (*s)[str - str_start] = replacewith;
95   }
96 }
97 
98 // ----------------------------------------------------------------------
99 // ReplaceCharacters
100 //    Replaces any occurrence of the character 'remove' (or the characters
101 //    in 'remove') with the character 'replacewith'.
102 // ----------------------------------------------------------------------
ReplaceCharacters(string * s,const char * remove,char replacewith)103 void ReplaceCharacters(string *s, const char *remove, char replacewith) {
104   const char *str_start = s->c_str();
105   const char *str = str_start;
106   for (str = strpbrk(str, remove);
107        str != nullptr;
108        str = strpbrk(str + 1, remove)) {
109     (*s)[str - str_start] = replacewith;
110   }
111 }
112 
StripWhitespace(string * str)113 void StripWhitespace(string* str) {
114   int str_length = str->length();
115 
116   // Strip off leading whitespace.
117   int first = 0;
118   while (first < str_length && ascii_isspace(str->at(first))) {
119     ++first;
120   }
121   // If entire string is white space.
122   if (first == str_length) {
123     str->clear();
124     return;
125   }
126   if (first > 0) {
127     str->erase(0, first);
128     str_length -= first;
129   }
130 
131   // Strip off trailing whitespace.
132   int last = str_length - 1;
133   while (last >= 0 && ascii_isspace(str->at(last))) {
134     --last;
135   }
136   if (last != (str_length - 1) && last >= 0) {
137     str->erase(last + 1, string::npos);
138   }
139 }
140 
141 // ----------------------------------------------------------------------
142 // StringReplace()
143 //    Replace the "old" pattern with the "new" pattern in a string,
144 //    and append the result to "res".  If replace_all is false,
145 //    it only replaces the first instance of "old."
146 // ----------------------------------------------------------------------
147 
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all,string * res)148 void StringReplace(const string& s, const string& oldsub,
149                    const string& newsub, bool replace_all,
150                    string* res) {
151   if (oldsub.empty()) {
152     res->append(s);  // if empty, append the given string.
153     return;
154   }
155 
156   string::size_type start_pos = 0;
157   string::size_type pos;
158   do {
159     pos = s.find(oldsub, start_pos);
160     if (pos == string::npos) {
161       break;
162     }
163     res->append(s, start_pos, pos - start_pos);
164     res->append(newsub);
165     start_pos = pos + oldsub.size();  // start searching again after the "old"
166   } while (replace_all);
167   res->append(s, start_pos, s.length() - start_pos);
168 }
169 
170 // ----------------------------------------------------------------------
171 // StringReplace()
172 //    Give me a string and two patterns "old" and "new", and I replace
173 //    the first instance of "old" in the string with "new", if it
174 //    exists.  If "global" is true; call this repeatedly until it
175 //    fails.  RETURN a new string, regardless of whether the replacement
176 //    happened or not.
177 // ----------------------------------------------------------------------
178 
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all)179 string StringReplace(const string& s, const string& oldsub,
180                      const string& newsub, bool replace_all) {
181   string ret;
182   StringReplace(s, oldsub, newsub, replace_all, &ret);
183   return ret;
184 }
185 
186 // ----------------------------------------------------------------------
187 // SplitStringUsing()
188 //    Split a string using a character delimiter. Append the components
189 //    to 'result'.
190 //
191 // Note: For multi-character delimiters, this routine will split on *ANY* of
192 // the characters in the string, not the entire string as a single delimiter.
193 // ----------------------------------------------------------------------
194 template <typename ITR>
195 static inline
SplitStringToIteratorUsing(const string & full,const char * delim,ITR & result)196 void SplitStringToIteratorUsing(const string& full,
197                                 const char* delim,
198                                 ITR& result) {
199   // Optimize the common case where delim is a single character.
200   if (delim[0] != '\0' && delim[1] == '\0') {
201     char c = delim[0];
202     const char* p = full.data();
203     const char* end = p + full.size();
204     while (p != end) {
205       if (*p == c) {
206         ++p;
207       } else {
208         const char* start = p;
209         while (++p != end && *p != c);
210         *result++ = string(start, p - start);
211       }
212     }
213     return;
214   }
215 
216   string::size_type begin_index, end_index;
217   begin_index = full.find_first_not_of(delim);
218   while (begin_index != string::npos) {
219     end_index = full.find_first_of(delim, begin_index);
220     if (end_index == string::npos) {
221       *result++ = full.substr(begin_index);
222       return;
223     }
224     *result++ = full.substr(begin_index, (end_index - begin_index));
225     begin_index = full.find_first_not_of(delim, end_index);
226   }
227 }
228 
SplitStringUsing(const string & full,const char * delim,std::vector<string> * result)229 void SplitStringUsing(const string& full,
230                       const char* delim,
231                       std::vector<string>* result) {
232   std::back_insert_iterator< std::vector<string> > it(*result);
233   SplitStringToIteratorUsing(full, delim, it);
234 }
235 
236 // Split a string using a character delimiter. Append the components
237 // to 'result'.  If there are consecutive delimiters, this function
238 // will return corresponding empty strings. The string is split into
239 // at most the specified number of pieces greedily. This means that the
240 // last piece may possibly be split further. To split into as many pieces
241 // as possible, specify 0 as the number of pieces.
242 //
243 // If "full" is the empty string, yields an empty string as the only value.
244 //
245 // If "pieces" is negative for some reason, it returns the whole string
246 // ----------------------------------------------------------------------
247 template <typename StringType, typename ITR>
248 static inline
SplitStringToIteratorAllowEmpty(const StringType & full,const char * delim,int pieces,ITR & result)249 void SplitStringToIteratorAllowEmpty(const StringType& full,
250                                      const char* delim,
251                                      int pieces,
252                                      ITR& result) {
253   string::size_type begin_index, end_index;
254   begin_index = 0;
255 
256   for (int i = 0; (i < pieces-1) || (pieces == 0); i++) {
257     end_index = full.find_first_of(delim, begin_index);
258     if (end_index == string::npos) {
259       *result++ = full.substr(begin_index);
260       return;
261     }
262     *result++ = full.substr(begin_index, (end_index - begin_index));
263     begin_index = end_index + 1;
264   }
265   *result++ = full.substr(begin_index);
266 }
267 
SplitStringAllowEmpty(const string & full,const char * delim,std::vector<string> * result)268 void SplitStringAllowEmpty(const string& full, const char* delim,
269                            std::vector<string>* result) {
270   std::back_insert_iterator<std::vector<string> > it(*result);
271   SplitStringToIteratorAllowEmpty(full, delim, 0, it);
272 }
273 
274 // ----------------------------------------------------------------------
275 // JoinStrings()
276 //    This merges a vector of string components with delim inserted
277 //    as separaters between components.
278 //
279 // ----------------------------------------------------------------------
280 template <class ITERATOR>
JoinStringsIterator(const ITERATOR & start,const ITERATOR & end,const char * delim,string * result)281 static void JoinStringsIterator(const ITERATOR& start,
282                                 const ITERATOR& end,
283                                 const char* delim,
284                                 string* result) {
285   GOOGLE_CHECK(result != nullptr);
286   result->clear();
287   int delim_length = strlen(delim);
288 
289   // Precompute resulting length so we can reserve() memory in one shot.
290   int length = 0;
291   for (ITERATOR iter = start; iter != end; ++iter) {
292     if (iter != start) {
293       length += delim_length;
294     }
295     length += iter->size();
296   }
297   result->reserve(length);
298 
299   // Now combine everything.
300   for (ITERATOR iter = start; iter != end; ++iter) {
301     if (iter != start) {
302       result->append(delim, delim_length);
303     }
304     result->append(iter->data(), iter->size());
305   }
306 }
307 
JoinStrings(const std::vector<string> & components,const char * delim,string * result)308 void JoinStrings(const std::vector<string>& components,
309                  const char* delim,
310                  string * result) {
311   JoinStringsIterator(components.begin(), components.end(), delim, result);
312 }
313 
314 // ----------------------------------------------------------------------
315 // UnescapeCEscapeSequences()
316 //    This does all the unescaping that C does: \ooo, \r, \n, etc
317 //    Returns length of resulting string.
318 //    The implementation of \x parses any positive number of hex digits,
319 //    but it is an error if the value requires more than 8 bits, and the
320 //    result is truncated to 8 bits.
321 //
322 //    The second call stores its errors in a supplied string vector.
323 //    If the string vector pointer is nullptr, it reports the errors with LOG().
324 // ----------------------------------------------------------------------
325 
326 #define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7'))
327 
328 // Protocol buffers doesn't ever care about errors, but I don't want to remove
329 // the code.
330 #define LOG_STRING(LEVEL, VECTOR) GOOGLE_LOG_IF(LEVEL, false)
331 
UnescapeCEscapeSequences(const char * source,char * dest)332 int UnescapeCEscapeSequences(const char* source, char* dest) {
333   return UnescapeCEscapeSequences(source, dest, nullptr);
334 }
335 
UnescapeCEscapeSequences(const char * source,char * dest,std::vector<string> * errors)336 int UnescapeCEscapeSequences(const char* source, char* dest,
337                              std::vector<string> *errors) {
338   GOOGLE_DCHECK(errors == nullptr) << "Error reporting not implemented.";
339 
340   char* d = dest;
341   const char* p = source;
342 
343   // Small optimization for case where source = dest and there's no escaping
344   while ( p == d && *p != '\0' && *p != '\\' )
345     p++, d++;
346 
347   while (*p != '\0') {
348     if (*p != '\\') {
349       *d++ = *p++;
350     } else {
351       switch ( *++p ) {                    // skip past the '\\'
352         case '\0':
353           LOG_STRING(ERROR, errors) << "String cannot end with \\";
354           *d = '\0';
355           return d - dest;   // we're done with p
356         case 'a':  *d++ = '\a';  break;
357         case 'b':  *d++ = '\b';  break;
358         case 'f':  *d++ = '\f';  break;
359         case 'n':  *d++ = '\n';  break;
360         case 'r':  *d++ = '\r';  break;
361         case 't':  *d++ = '\t';  break;
362         case 'v':  *d++ = '\v';  break;
363         case '\\': *d++ = '\\';  break;
364         case '?':  *d++ = '\?';  break;    // \?  Who knew?
365         case '\'': *d++ = '\'';  break;
366         case '"':  *d++ = '\"';  break;
367         case '0': case '1': case '2': case '3':  // octal digit: 1 to 3 digits
368         case '4': case '5': case '6': case '7': {
369           char ch = *p - '0';
370           if ( IS_OCTAL_DIGIT(p[1]) )
371             ch = ch * 8 + *++p - '0';
372           if ( IS_OCTAL_DIGIT(p[1]) )      // safe (and easy) to do this twice
373             ch = ch * 8 + *++p - '0';      // now points at last digit
374           *d++ = ch;
375           break;
376         }
377         case 'x': case 'X': {
378           if (!isxdigit(p[1])) {
379             if (p[1] == '\0') {
380               LOG_STRING(ERROR, errors) << "String cannot end with \\x";
381             } else {
382               LOG_STRING(ERROR, errors) <<
383                 "\\x cannot be followed by non-hex digit: \\" << *p << p[1];
384             }
385             break;
386           }
387           unsigned int ch = 0;
388           const char *hex_start = p;
389           while (isxdigit(p[1]))  // arbitrarily many hex digits
390             ch = (ch << 4) + hex_digit_to_int(*++p);
391           if (ch > 0xFF)
392             LOG_STRING(ERROR, errors) << "Value of " <<
393               "\\" << string(hex_start, p+1-hex_start) << " exceeds 8 bits";
394           *d++ = ch;
395           break;
396         }
397 #if 0  // TODO(kenton):  Support \u and \U?  Requires runetochar().
398         case 'u': {
399           // \uhhhh => convert 4 hex digits to UTF-8
400           char32 rune = 0;
401           const char *hex_start = p;
402           for (int i = 0; i < 4; ++i) {
403             if (isxdigit(p[1])) {  // Look one char ahead.
404               rune = (rune << 4) + hex_digit_to_int(*++p);  // Advance p.
405             } else {
406               LOG_STRING(ERROR, errors)
407                 << "\\u must be followed by 4 hex digits: \\"
408                 <<  string(hex_start, p+1-hex_start);
409               break;
410             }
411           }
412           d += runetochar(d, &rune);
413           break;
414         }
415         case 'U': {
416           // \Uhhhhhhhh => convert 8 hex digits to UTF-8
417           char32 rune = 0;
418           const char *hex_start = p;
419           for (int i = 0; i < 8; ++i) {
420             if (isxdigit(p[1])) {  // Look one char ahead.
421               // Don't change rune until we're sure this
422               // is within the Unicode limit, but do advance p.
423               char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
424               if (newrune > 0x10FFFF) {
425                 LOG_STRING(ERROR, errors)
426                   << "Value of \\"
427                   << string(hex_start, p + 1 - hex_start)
428                   << " exceeds Unicode limit (0x10FFFF)";
429                 break;
430               } else {
431                 rune = newrune;
432               }
433             } else {
434               LOG_STRING(ERROR, errors)
435                 << "\\U must be followed by 8 hex digits: \\"
436                 <<  string(hex_start, p+1-hex_start);
437               break;
438             }
439           }
440           d += runetochar(d, &rune);
441           break;
442         }
443 #endif
444         default:
445           LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p;
446       }
447       p++;                                 // read past letter we escaped
448     }
449   }
450   *d = '\0';
451   return d - dest;
452 }
453 
454 // ----------------------------------------------------------------------
455 // UnescapeCEscapeString()
456 //    This does the same thing as UnescapeCEscapeSequences, but creates
457 //    a new string. The caller does not need to worry about allocating
458 //    a dest buffer. This should be used for non performance critical
459 //    tasks such as printing debug messages. It is safe for src and dest
460 //    to be the same.
461 //
462 //    The second call stores its errors in a supplied string vector.
463 //    If the string vector pointer is nullptr, it reports the errors with LOG().
464 //
465 //    In the first and second calls, the length of dest is returned. In the
466 //    the third call, the new string is returned.
467 // ----------------------------------------------------------------------
UnescapeCEscapeString(const string & src,string * dest)468 int UnescapeCEscapeString(const string& src, string* dest) {
469   return UnescapeCEscapeString(src, dest, nullptr);
470 }
471 
UnescapeCEscapeString(const string & src,string * dest,std::vector<string> * errors)472 int UnescapeCEscapeString(const string& src, string* dest,
473                           std::vector<string> *errors) {
474   std::unique_ptr<char[]> unescaped(new char[src.size() + 1]);
475   int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), errors);
476   GOOGLE_CHECK(dest);
477   dest->assign(unescaped.get(), len);
478   return len;
479 }
480 
UnescapeCEscapeString(const string & src)481 string UnescapeCEscapeString(const string& src) {
482   std::unique_ptr<char[]> unescaped(new char[src.size() + 1]);
483   int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), nullptr);
484   return string(unescaped.get(), len);
485 }
486 
487 // ----------------------------------------------------------------------
488 // CEscapeString()
489 // CHexEscapeString()
490 //    Copies 'src' to 'dest', escaping dangerous characters using
491 //    C-style escape sequences. This is very useful for preparing query
492 //    flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
493 //    hexadecimal rather than octal sequences.
494 //    Returns the number of bytes written to 'dest' (not including the \0)
495 //    or -1 if there was insufficient space.
496 //
497 //    Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped.
498 // ----------------------------------------------------------------------
CEscapeInternal(const char * src,int src_len,char * dest,int dest_len,bool use_hex,bool utf8_safe)499 int CEscapeInternal(const char* src, int src_len, char* dest,
500                     int dest_len, bool use_hex, bool utf8_safe) {
501   const char* src_end = src + src_len;
502   int used = 0;
503   bool last_hex_escape = false; // true if last output char was \xNN
504 
505   for (; src < src_end; src++) {
506     if (dest_len - used < 2)   // Need space for two letter escape
507       return -1;
508 
509     bool is_hex_escape = false;
510     switch (*src) {
511       case '\n': dest[used++] = '\\'; dest[used++] = 'n';  break;
512       case '\r': dest[used++] = '\\'; dest[used++] = 'r';  break;
513       case '\t': dest[used++] = '\\'; dest[used++] = 't';  break;
514       case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
515       case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
516       case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
517       default:
518         // Note that if we emit \xNN and the src character after that is a hex
519         // digit then that digit must be escaped too to prevent it being
520         // interpreted as part of the character code by C.
521         if ((!utf8_safe || static_cast<uint8>(*src) < 0x80) &&
522             (!isprint(*src) ||
523              (last_hex_escape && isxdigit(*src)))) {
524           if (dest_len - used < 4) // need space for 4 letter escape
525             return -1;
526           sprintf(dest + used, (use_hex ? "\\x%02x" : "\\%03o"),
527                   static_cast<uint8>(*src));
528           is_hex_escape = use_hex;
529           used += 4;
530         } else {
531           dest[used++] = *src; break;
532         }
533     }
534     last_hex_escape = is_hex_escape;
535   }
536 
537   if (dest_len - used < 1)   // make sure that there is room for \0
538     return -1;
539 
540   dest[used] = '\0';   // doesn't count towards return value though
541   return used;
542 }
543 
544 // Calculates the length of the C-style escaped version of 'src'.
545 // Assumes that non-printable characters are escaped using octal sequences, and
546 // that UTF-8 bytes are not handled specially.
CEscapedLength(StringPiece src)547 static inline size_t CEscapedLength(StringPiece src) {
548   static char c_escaped_len[256] = {
549     4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,  // \t, \n, \r
550     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
551     1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,  // ", '
552     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // '0'..'9'
553     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'A'..'O'
554     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,  // 'P'..'Z', '\'
555     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'a'..'o'
556     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,  // 'p'..'z', DEL
557     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
558     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
559     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
560     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
561     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
562     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
563     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
564     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
565   };
566 
567   size_t escaped_len = 0;
568   for (int i = 0; i < src.size(); ++i) {
569     unsigned char c = static_cast<unsigned char>(src[i]);
570     escaped_len += c_escaped_len[c];
571   }
572   return escaped_len;
573 }
574 
575 // ----------------------------------------------------------------------
576 // Escapes 'src' using C-style escape sequences, and appends the escaped string
577 // to 'dest'. This version is faster than calling CEscapeInternal as it computes
578 // the required space using a lookup table, and also does not do any special
579 // handling for Hex or UTF-8 characters.
580 // ----------------------------------------------------------------------
CEscapeAndAppend(StringPiece src,string * dest)581 void CEscapeAndAppend(StringPiece src, string* dest) {
582   size_t escaped_len = CEscapedLength(src);
583   if (escaped_len == src.size()) {
584     dest->append(src.data(), src.size());
585     return;
586   }
587 
588   size_t cur_dest_len = dest->size();
589   dest->resize(cur_dest_len + escaped_len);
590   char* append_ptr = &(*dest)[cur_dest_len];
591 
592   for (int i = 0; i < src.size(); ++i) {
593     unsigned char c = static_cast<unsigned char>(src[i]);
594     switch (c) {
595       case '\n': *append_ptr++ = '\\'; *append_ptr++ = 'n'; break;
596       case '\r': *append_ptr++ = '\\'; *append_ptr++ = 'r'; break;
597       case '\t': *append_ptr++ = '\\'; *append_ptr++ = 't'; break;
598       case '\"': *append_ptr++ = '\\'; *append_ptr++ = '\"'; break;
599       case '\'': *append_ptr++ = '\\'; *append_ptr++ = '\''; break;
600       case '\\': *append_ptr++ = '\\'; *append_ptr++ = '\\'; break;
601       default:
602         if (!isprint(c)) {
603           *append_ptr++ = '\\';
604           *append_ptr++ = '0' + c / 64;
605           *append_ptr++ = '0' + (c % 64) / 8;
606           *append_ptr++ = '0' + c % 8;
607         } else {
608           *append_ptr++ = c;
609         }
610         break;
611     }
612   }
613 }
614 
CEscape(const string & src)615 string CEscape(const string& src) {
616   string dest;
617   CEscapeAndAppend(src, &dest);
618   return dest;
619 }
620 
621 namespace strings {
622 
Utf8SafeCEscape(const string & src)623 string Utf8SafeCEscape(const string& src) {
624   const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
625   std::unique_ptr<char[]> dest(new char[dest_length]);
626   const int len = CEscapeInternal(src.data(), src.size(),
627                                   dest.get(), dest_length, false, true);
628   GOOGLE_DCHECK_GE(len, 0);
629   return string(dest.get(), len);
630 }
631 
CHexEscape(const string & src)632 string CHexEscape(const string& src) {
633   const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
634   std::unique_ptr<char[]> dest(new char[dest_length]);
635   const int len = CEscapeInternal(src.data(), src.size(),
636                                   dest.get(), dest_length, true, false);
637   GOOGLE_DCHECK_GE(len, 0);
638   return string(dest.get(), len);
639 }
640 
641 }  // namespace strings
642 
643 // ----------------------------------------------------------------------
644 // strto32_adaptor()
645 // strtou32_adaptor()
646 //    Implementation of strto[u]l replacements that have identical
647 //    overflow and underflow characteristics for both ILP-32 and LP-64
648 //    platforms, including errno preservation in error-free calls.
649 // ----------------------------------------------------------------------
650 
strto32_adaptor(const char * nptr,char ** endptr,int base)651 int32 strto32_adaptor(const char *nptr, char **endptr, int base) {
652   const int saved_errno = errno;
653   errno = 0;
654   const long result = strtol(nptr, endptr, base);
655   if (errno == ERANGE && result == LONG_MIN) {
656     return kint32min;
657   } else if (errno == ERANGE && result == LONG_MAX) {
658     return kint32max;
659   } else if (errno == 0 && result < kint32min) {
660     errno = ERANGE;
661     return kint32min;
662   } else if (errno == 0 && result > kint32max) {
663     errno = ERANGE;
664     return kint32max;
665   }
666   if (errno == 0)
667     errno = saved_errno;
668   return static_cast<int32>(result);
669 }
670 
strtou32_adaptor(const char * nptr,char ** endptr,int base)671 uint32 strtou32_adaptor(const char *nptr, char **endptr, int base) {
672   const int saved_errno = errno;
673   errno = 0;
674   const unsigned long result = strtoul(nptr, endptr, base);
675   if (errno == ERANGE && result == ULONG_MAX) {
676     return kuint32max;
677   } else if (errno == 0 && result > kuint32max) {
678     errno = ERANGE;
679     return kuint32max;
680   }
681   if (errno == 0)
682     errno = saved_errno;
683   return static_cast<uint32>(result);
684 }
685 
safe_parse_sign(string * text,bool * negative_ptr)686 inline bool safe_parse_sign(string* text  /*inout*/,
687                             bool* negative_ptr  /*output*/) {
688   const char* start = text->data();
689   const char* end = start + text->size();
690 
691   // Consume whitespace.
692   while (start < end && (start[0] == ' ')) {
693     ++start;
694   }
695   while (start < end && (end[-1] == ' ')) {
696     --end;
697   }
698   if (start >= end) {
699     return false;
700   }
701 
702   // Consume sign.
703   *negative_ptr = (start[0] == '-');
704   if (*negative_ptr || start[0] == '+') {
705     ++start;
706     if (start >= end) {
707       return false;
708     }
709   }
710   *text = text->substr(start - text->data(), end - start);
711   return true;
712 }
713 
714 template<typename IntType>
safe_parse_positive_int(string text,IntType * value_p)715 bool safe_parse_positive_int(
716     string text, IntType* value_p) {
717   int base = 10;
718   IntType value = 0;
719   const IntType vmax = std::numeric_limits<IntType>::max();
720   assert(vmax > 0);
721   assert(vmax >= base);
722   const IntType vmax_over_base = vmax / base;
723   const char* start = text.data();
724   const char* end = start + text.size();
725   // loop over digits
726   for (; start < end; ++start) {
727     unsigned char c = static_cast<unsigned char>(start[0]);
728     int digit = c - '0';
729     if (digit >= base || digit < 0) {
730       *value_p = value;
731       return false;
732     }
733     if (value > vmax_over_base) {
734       *value_p = vmax;
735       return false;
736     }
737     value *= base;
738     if (value > vmax - digit) {
739       *value_p = vmax;
740       return false;
741     }
742     value += digit;
743   }
744   *value_p = value;
745   return true;
746 }
747 
748 template<typename IntType>
safe_parse_negative_int(const string & text,IntType * value_p)749 bool safe_parse_negative_int(
750     const string& text, IntType* value_p) {
751   int base = 10;
752   IntType value = 0;
753   const IntType vmin = std::numeric_limits<IntType>::min();
754   assert(vmin < 0);
755   assert(vmin <= 0 - base);
756   IntType vmin_over_base = vmin / base;
757   // 2003 c++ standard [expr.mul]
758   // "... the sign of the remainder is implementation-defined."
759   // Although (vmin/base)*base + vmin%base is always vmin.
760   // 2011 c++ standard tightens the spec but we cannot rely on it.
761   if (vmin % base > 0) {
762     vmin_over_base += 1;
763   }
764   const char* start = text.data();
765   const char* end = start + text.size();
766   // loop over digits
767   for (; start < end; ++start) {
768     unsigned char c = static_cast<unsigned char>(start[0]);
769     int digit = c - '0';
770     if (digit >= base || digit < 0) {
771       *value_p = value;
772       return false;
773     }
774     if (value < vmin_over_base) {
775       *value_p = vmin;
776       return false;
777     }
778     value *= base;
779     if (value < vmin + digit) {
780       *value_p = vmin;
781       return false;
782     }
783     value -= digit;
784   }
785   *value_p = value;
786   return true;
787 }
788 
789 template<typename IntType>
safe_int_internal(string text,IntType * value_p)790 bool safe_int_internal(string text, IntType* value_p) {
791   *value_p = 0;
792   bool negative;
793   if (!safe_parse_sign(&text, &negative)) {
794     return false;
795   }
796   if (!negative) {
797     return safe_parse_positive_int(text, value_p);
798   } else {
799     return safe_parse_negative_int(text, value_p);
800   }
801 }
802 
803 template<typename IntType>
safe_uint_internal(string text,IntType * value_p)804 bool safe_uint_internal(string text, IntType* value_p) {
805   *value_p = 0;
806   bool negative;
807   if (!safe_parse_sign(&text, &negative) || negative) {
808     return false;
809   }
810   return safe_parse_positive_int(text, value_p);
811 }
812 
813 // ----------------------------------------------------------------------
814 // FastIntToBuffer()
815 // FastInt64ToBuffer()
816 // FastHexToBuffer()
817 // FastHex64ToBuffer()
818 // FastHex32ToBuffer()
819 // ----------------------------------------------------------------------
820 
821 // Offset into buffer where FastInt64ToBuffer places the end of string
822 // null character.  Also used by FastInt64ToBufferLeft.
823 static const int kFastInt64ToBufferOffset = 21;
824 
FastInt64ToBuffer(int64 i,char * buffer)825 char *FastInt64ToBuffer(int64 i, char* buffer) {
826   // We could collapse the positive and negative sections, but that
827   // would be slightly slower for positive numbers...
828   // 22 bytes is enough to store -2**64, -18446744073709551616.
829   char* p = buffer + kFastInt64ToBufferOffset;
830   *p-- = '\0';
831   if (i >= 0) {
832     do {
833       *p-- = '0' + i % 10;
834       i /= 10;
835     } while (i > 0);
836     return p + 1;
837   } else {
838     // On different platforms, % and / have different behaviors for
839     // negative numbers, so we need to jump through hoops to make sure
840     // we don't divide negative numbers.
841     if (i > -10) {
842       i = -i;
843       *p-- = '0' + i;
844       *p = '-';
845       return p;
846     } else {
847       // Make sure we aren't at MIN_INT, in which case we can't say i = -i
848       i = i + 10;
849       i = -i;
850       *p-- = '0' + i % 10;
851       // Undo what we did a moment ago
852       i = i / 10 + 1;
853       do {
854         *p-- = '0' + i % 10;
855         i /= 10;
856       } while (i > 0);
857       *p = '-';
858       return p;
859     }
860   }
861 }
862 
863 // Offset into buffer where FastInt32ToBuffer places the end of string
864 // null character.  Also used by FastInt32ToBufferLeft
865 static const int kFastInt32ToBufferOffset = 11;
866 
867 // Yes, this is a duplicate of FastInt64ToBuffer.  But, we need this for the
868 // compiler to generate 32 bit arithmetic instructions.  It's much faster, at
869 // least with 32 bit binaries.
FastInt32ToBuffer(int32 i,char * buffer)870 char *FastInt32ToBuffer(int32 i, char* buffer) {
871   // We could collapse the positive and negative sections, but that
872   // would be slightly slower for positive numbers...
873   // 12 bytes is enough to store -2**32, -4294967296.
874   char* p = buffer + kFastInt32ToBufferOffset;
875   *p-- = '\0';
876   if (i >= 0) {
877     do {
878       *p-- = '0' + i % 10;
879       i /= 10;
880     } while (i > 0);
881     return p + 1;
882   } else {
883     // On different platforms, % and / have different behaviors for
884     // negative numbers, so we need to jump through hoops to make sure
885     // we don't divide negative numbers.
886     if (i > -10) {
887       i = -i;
888       *p-- = '0' + i;
889       *p = '-';
890       return p;
891     } else {
892       // Make sure we aren't at MIN_INT, in which case we can't say i = -i
893       i = i + 10;
894       i = -i;
895       *p-- = '0' + i % 10;
896       // Undo what we did a moment ago
897       i = i / 10 + 1;
898       do {
899         *p-- = '0' + i % 10;
900         i /= 10;
901       } while (i > 0);
902       *p = '-';
903       return p;
904     }
905   }
906 }
907 
FastHexToBuffer(int i,char * buffer)908 char *FastHexToBuffer(int i, char* buffer) {
909   GOOGLE_CHECK(i >= 0) << "FastHexToBuffer() wants non-negative integers, not " << i;
910 
911   static const char *hexdigits = "0123456789abcdef";
912   char *p = buffer + 21;
913   *p-- = '\0';
914   do {
915     *p-- = hexdigits[i & 15];   // mod by 16
916     i >>= 4;                    // divide by 16
917   } while (i > 0);
918   return p + 1;
919 }
920 
InternalFastHexToBuffer(uint64 value,char * buffer,int num_byte)921 char *InternalFastHexToBuffer(uint64 value, char* buffer, int num_byte) {
922   static const char *hexdigits = "0123456789abcdef";
923   buffer[num_byte] = '\0';
924   for (int i = num_byte - 1; i >= 0; i--) {
925 #ifdef _M_X64
926     // MSVC x64 platform has a bug optimizing the uint32(value) in the #else
927     // block. Given that the uint32 cast was to improve performance on 32-bit
928     // platforms, we use 64-bit '&' directly.
929     buffer[i] = hexdigits[value & 0xf];
930 #else
931     buffer[i] = hexdigits[uint32(value) & 0xf];
932 #endif
933     value >>= 4;
934   }
935   return buffer;
936 }
937 
FastHex64ToBuffer(uint64 value,char * buffer)938 char *FastHex64ToBuffer(uint64 value, char* buffer) {
939   return InternalFastHexToBuffer(value, buffer, 16);
940 }
941 
FastHex32ToBuffer(uint32 value,char * buffer)942 char *FastHex32ToBuffer(uint32 value, char* buffer) {
943   return InternalFastHexToBuffer(value, buffer, 8);
944 }
945 
946 // ----------------------------------------------------------------------
947 // FastInt32ToBufferLeft()
948 // FastUInt32ToBufferLeft()
949 // FastInt64ToBufferLeft()
950 // FastUInt64ToBufferLeft()
951 //
952 // Like the Fast*ToBuffer() functions above, these are intended for speed.
953 // Unlike the Fast*ToBuffer() functions, however, these functions write
954 // their output to the beginning of the buffer (hence the name, as the
955 // output is left-aligned).  The caller is responsible for ensuring that
956 // the buffer has enough space to hold the output.
957 //
958 // Returns a pointer to the end of the string (i.e. the null character
959 // terminating the string).
960 // ----------------------------------------------------------------------
961 
962 static const char two_ASCII_digits[100][2] = {
963   {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'},
964   {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
965   {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'},
966   {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
967   {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'},
968   {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
969   {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'},
970   {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
971   {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'},
972   {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
973   {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'},
974   {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
975   {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'},
976   {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
977   {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'},
978   {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
979   {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'},
980   {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
981   {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'},
982   {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'}
983 };
984 
FastUInt32ToBufferLeft(uint32 u,char * buffer)985 char* FastUInt32ToBufferLeft(uint32 u, char* buffer) {
986   uint32 digits;
987   const char *ASCII_digits = nullptr;
988   // The idea of this implementation is to trim the number of divides to as few
989   // as possible by using multiplication and subtraction rather than mod (%),
990   // and by outputting two digits at a time rather than one.
991   // The huge-number case is first, in the hopes that the compiler will output
992   // that case in one branch-free block of code, and only output conditional
993   // branches into it from below.
994   if (u >= 1000000000) {  // >= 1,000,000,000
995     digits = u / 100000000;  // 100,000,000
996     ASCII_digits = two_ASCII_digits[digits];
997     buffer[0] = ASCII_digits[0];
998     buffer[1] = ASCII_digits[1];
999     buffer += 2;
1000 sublt100_000_000:
1001     u -= digits * 100000000;  // 100,000,000
1002 lt100_000_000:
1003     digits = u / 1000000;  // 1,000,000
1004     ASCII_digits = two_ASCII_digits[digits];
1005     buffer[0] = ASCII_digits[0];
1006     buffer[1] = ASCII_digits[1];
1007     buffer += 2;
1008 sublt1_000_000:
1009     u -= digits * 1000000;  // 1,000,000
1010 lt1_000_000:
1011     digits = u / 10000;  // 10,000
1012     ASCII_digits = two_ASCII_digits[digits];
1013     buffer[0] = ASCII_digits[0];
1014     buffer[1] = ASCII_digits[1];
1015     buffer += 2;
1016 sublt10_000:
1017     u -= digits * 10000;  // 10,000
1018 lt10_000:
1019     digits = u / 100;
1020     ASCII_digits = two_ASCII_digits[digits];
1021     buffer[0] = ASCII_digits[0];
1022     buffer[1] = ASCII_digits[1];
1023     buffer += 2;
1024 sublt100:
1025     u -= digits * 100;
1026 lt100:
1027     digits = u;
1028     ASCII_digits = two_ASCII_digits[digits];
1029     buffer[0] = ASCII_digits[0];
1030     buffer[1] = ASCII_digits[1];
1031     buffer += 2;
1032 done:
1033     *buffer = 0;
1034     return buffer;
1035   }
1036 
1037   if (u < 100) {
1038     digits = u;
1039     if (u >= 10) goto lt100;
1040     *buffer++ = '0' + digits;
1041     goto done;
1042   }
1043   if (u  <  10000) {   // 10,000
1044     if (u >= 1000) goto lt10_000;
1045     digits = u / 100;
1046     *buffer++ = '0' + digits;
1047     goto sublt100;
1048   }
1049   if (u  <  1000000) {   // 1,000,000
1050     if (u >= 100000) goto lt1_000_000;
1051     digits = u / 10000;  //    10,000
1052     *buffer++ = '0' + digits;
1053     goto sublt10_000;
1054   }
1055   if (u  <  100000000) {   // 100,000,000
1056     if (u >= 10000000) goto lt100_000_000;
1057     digits = u / 1000000;  //   1,000,000
1058     *buffer++ = '0' + digits;
1059     goto sublt1_000_000;
1060   }
1061   // we already know that u < 1,000,000,000
1062   digits = u / 100000000;   // 100,000,000
1063   *buffer++ = '0' + digits;
1064   goto sublt100_000_000;
1065 }
1066 
FastInt32ToBufferLeft(int32 i,char * buffer)1067 char* FastInt32ToBufferLeft(int32 i, char* buffer) {
1068   uint32 u = i;
1069   if (i < 0) {
1070     *buffer++ = '-';
1071     u = -i;
1072   }
1073   return FastUInt32ToBufferLeft(u, buffer);
1074 }
1075 
FastUInt64ToBufferLeft(uint64 u64,char * buffer)1076 char* FastUInt64ToBufferLeft(uint64 u64, char* buffer) {
1077   int digits;
1078   const char *ASCII_digits = nullptr;
1079 
1080   uint32 u = static_cast<uint32>(u64);
1081   if (u == u64) return FastUInt32ToBufferLeft(u, buffer);
1082 
1083   uint64 top_11_digits = u64 / 1000000000;
1084   buffer = FastUInt64ToBufferLeft(top_11_digits, buffer);
1085   u = u64 - (top_11_digits * 1000000000);
1086 
1087   digits = u / 10000000;  // 10,000,000
1088   GOOGLE_DCHECK_LT(digits, 100);
1089   ASCII_digits = two_ASCII_digits[digits];
1090   buffer[0] = ASCII_digits[0];
1091   buffer[1] = ASCII_digits[1];
1092   buffer += 2;
1093   u -= digits * 10000000;  // 10,000,000
1094   digits = u / 100000;  // 100,000
1095   ASCII_digits = two_ASCII_digits[digits];
1096   buffer[0] = ASCII_digits[0];
1097   buffer[1] = ASCII_digits[1];
1098   buffer += 2;
1099   u -= digits * 100000;  // 100,000
1100   digits = u / 1000;  // 1,000
1101   ASCII_digits = two_ASCII_digits[digits];
1102   buffer[0] = ASCII_digits[0];
1103   buffer[1] = ASCII_digits[1];
1104   buffer += 2;
1105   u -= digits * 1000;  // 1,000
1106   digits = u / 10;
1107   ASCII_digits = two_ASCII_digits[digits];
1108   buffer[0] = ASCII_digits[0];
1109   buffer[1] = ASCII_digits[1];
1110   buffer += 2;
1111   u -= digits * 10;
1112   digits = u;
1113   *buffer++ = '0' + digits;
1114   *buffer = 0;
1115   return buffer;
1116 }
1117 
FastInt64ToBufferLeft(int64 i,char * buffer)1118 char* FastInt64ToBufferLeft(int64 i, char* buffer) {
1119   uint64 u = 0;
1120   if (i < 0) {
1121     *buffer++ = '-';
1122     u -= i;
1123   } else {
1124     u = i;
1125   }
1126   return FastUInt64ToBufferLeft(u, buffer);
1127 }
1128 
1129 // ----------------------------------------------------------------------
1130 // SimpleItoa()
1131 //    Description: converts an integer to a string.
1132 //
1133 //    Return value: string
1134 // ----------------------------------------------------------------------
1135 
SimpleItoa(int i)1136 string SimpleItoa(int i) {
1137   char buffer[kFastToBufferSize];
1138   return (sizeof(i) == 4) ?
1139     FastInt32ToBuffer(i, buffer) :
1140     FastInt64ToBuffer(i, buffer);
1141 }
1142 
SimpleItoa(unsigned int i)1143 string SimpleItoa(unsigned int i) {
1144   char buffer[kFastToBufferSize];
1145   return string(buffer, (sizeof(i) == 4) ?
1146     FastUInt32ToBufferLeft(i, buffer) :
1147     FastUInt64ToBufferLeft(i, buffer));
1148 }
1149 
SimpleItoa(long i)1150 string SimpleItoa(long i) {
1151   char buffer[kFastToBufferSize];
1152   return (sizeof(i) == 4) ?
1153     FastInt32ToBuffer(i, buffer) :
1154     FastInt64ToBuffer(i, buffer);
1155 }
1156 
SimpleItoa(unsigned long i)1157 string SimpleItoa(unsigned long i) {
1158   char buffer[kFastToBufferSize];
1159   return string(buffer, (sizeof(i) == 4) ?
1160     FastUInt32ToBufferLeft(i, buffer) :
1161     FastUInt64ToBufferLeft(i, buffer));
1162 }
1163 
SimpleItoa(long long i)1164 string SimpleItoa(long long i) {
1165   char buffer[kFastToBufferSize];
1166   return (sizeof(i) == 4) ?
1167     FastInt32ToBuffer(i, buffer) :
1168     FastInt64ToBuffer(i, buffer);
1169 }
1170 
SimpleItoa(unsigned long long i)1171 string SimpleItoa(unsigned long long i) {
1172   char buffer[kFastToBufferSize];
1173   return string(buffer, (sizeof(i) == 4) ?
1174     FastUInt32ToBufferLeft(i, buffer) :
1175     FastUInt64ToBufferLeft(i, buffer));
1176 }
1177 
1178 // ----------------------------------------------------------------------
1179 // SimpleDtoa()
1180 // SimpleFtoa()
1181 // DoubleToBuffer()
1182 // FloatToBuffer()
1183 //    We want to print the value without losing precision, but we also do
1184 //    not want to print more digits than necessary.  This turns out to be
1185 //    trickier than it sounds.  Numbers like 0.2 cannot be represented
1186 //    exactly in binary.  If we print 0.2 with a very large precision,
1187 //    e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167".
1188 //    On the other hand, if we set the precision too low, we lose
1189 //    significant digits when printing numbers that actually need them.
1190 //    It turns out there is no precision value that does the right thing
1191 //    for all numbers.
1192 //
1193 //    Our strategy is to first try printing with a precision that is never
1194 //    over-precise, then parse the result with strtod() to see if it
1195 //    matches.  If not, we print again with a precision that will always
1196 //    give a precise result, but may use more digits than necessary.
1197 //
1198 //    An arguably better strategy would be to use the algorithm described
1199 //    in "How to Print Floating-Point Numbers Accurately" by Steele &
1200 //    White, e.g. as implemented by David M. Gay's dtoa().  It turns out,
1201 //    however, that the following implementation is about as fast as
1202 //    DMG's code.  Furthermore, DMG's code locks mutexes, which means it
1203 //    will not scale well on multi-core machines.  DMG's code is slightly
1204 //    more accurate (in that it will never use more digits than
1205 //    necessary), but this is probably irrelevant for most users.
1206 //
1207 //    Rob Pike and Ken Thompson also have an implementation of dtoa() in
1208 //    third_party/fmt/fltfmt.cc.  Their implementation is similar to this
1209 //    one in that it makes guesses and then uses strtod() to check them.
1210 //    Their implementation is faster because they use their own code to
1211 //    generate the digits in the first place rather than use snprintf(),
1212 //    thus avoiding format string parsing overhead.  However, this makes
1213 //    it considerably more complicated than the following implementation,
1214 //    and it is embedded in a larger library.  If speed turns out to be
1215 //    an issue, we could re-implement this in terms of their
1216 //    implementation.
1217 // ----------------------------------------------------------------------
1218 
SimpleDtoa(double value)1219 string SimpleDtoa(double value) {
1220   char buffer[kDoubleToBufferSize];
1221   return DoubleToBuffer(value, buffer);
1222 }
1223 
SimpleFtoa(float value)1224 string SimpleFtoa(float value) {
1225   char buffer[kFloatToBufferSize];
1226   return FloatToBuffer(value, buffer);
1227 }
1228 
IsValidFloatChar(char c)1229 static inline bool IsValidFloatChar(char c) {
1230   return ('0' <= c && c <= '9') ||
1231          c == 'e' || c == 'E' ||
1232          c == '+' || c == '-';
1233 }
1234 
DelocalizeRadix(char * buffer)1235 void DelocalizeRadix(char* buffer) {
1236   // Fast check:  if the buffer has a normal decimal point, assume no
1237   // translation is needed.
1238   if (strchr(buffer, '.') != nullptr) return;
1239 
1240   // Find the first unknown character.
1241   while (IsValidFloatChar(*buffer)) ++buffer;
1242 
1243   if (*buffer == '\0') {
1244     // No radix character found.
1245     return;
1246   }
1247 
1248   // We are now pointing at the locale-specific radix character.  Replace it
1249   // with '.'.
1250   *buffer = '.';
1251   ++buffer;
1252 
1253   if (!IsValidFloatChar(*buffer) && *buffer != '\0') {
1254     // It appears the radix was a multi-byte character.  We need to remove the
1255     // extra bytes.
1256     char* target = buffer;
1257     do { ++buffer; } while (!IsValidFloatChar(*buffer) && *buffer != '\0');
1258     memmove(target, buffer, strlen(buffer) + 1);
1259   }
1260 }
1261 
DoubleToBuffer(double value,char * buffer)1262 char* DoubleToBuffer(double value, char* buffer) {
1263   // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all
1264   // platforms these days.  Just in case some system exists where DBL_DIG
1265   // is significantly larger -- and risks overflowing our buffer -- we have
1266   // this assert.
1267   GOOGLE_COMPILE_ASSERT(DBL_DIG < 20, DBL_DIG_is_too_big);
1268 
1269   if (value == std::numeric_limits<double>::infinity()) {
1270     strcpy(buffer, "inf");
1271     return buffer;
1272   } else if (value == -std::numeric_limits<double>::infinity()) {
1273     strcpy(buffer, "-inf");
1274     return buffer;
1275   } else if (std::isnan(value)) {
1276     strcpy(buffer, "nan");
1277     return buffer;
1278   }
1279 
1280   int snprintf_result =
1281     snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value);
1282 
1283   // The snprintf should never overflow because the buffer is significantly
1284   // larger than the precision we asked for.
1285   GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1286 
1287   // We need to make parsed_value volatile in order to force the compiler to
1288   // write it out to the stack.  Otherwise, it may keep the value in a
1289   // register, and if it does that, it may keep it as a long double instead
1290   // of a double.  This long double may have extra bits that make it compare
1291   // unequal to "value" even though it would be exactly equal if it were
1292   // truncated to a double.
1293   volatile double parsed_value = io::NoLocaleStrtod(buffer, nullptr);
1294   if (parsed_value != value) {
1295     int snprintf_result =
1296       snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+2, value);
1297 
1298     // Should never overflow; see above.
1299     GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1300   }
1301 
1302   DelocalizeRadix(buffer);
1303   return buffer;
1304 }
1305 
memcasecmp(const char * s1,const char * s2,size_t len)1306 static int memcasecmp(const char *s1, const char *s2, size_t len) {
1307   const unsigned char *us1 = reinterpret_cast<const unsigned char *>(s1);
1308   const unsigned char *us2 = reinterpret_cast<const unsigned char *>(s2);
1309 
1310   for ( int i = 0; i < len; i++ ) {
1311     const int diff =
1312       static_cast<int>(static_cast<unsigned char>(ascii_tolower(us1[i]))) -
1313       static_cast<int>(static_cast<unsigned char>(ascii_tolower(us2[i])));
1314     if (diff != 0) return diff;
1315   }
1316   return 0;
1317 }
1318 
CaseEqual(StringPiece s1,StringPiece s2)1319 inline bool CaseEqual(StringPiece s1, StringPiece s2) {
1320   if (s1.size() != s2.size()) return false;
1321   return memcasecmp(s1.data(), s2.data(), s1.size()) == 0;
1322 }
1323 
safe_strtob(StringPiece str,bool * value)1324 bool safe_strtob(StringPiece str, bool* value) {
1325   GOOGLE_CHECK(value != nullptr) << "nullptr output boolean given.";
1326   if (CaseEqual(str, "true") || CaseEqual(str, "t") ||
1327       CaseEqual(str, "yes") || CaseEqual(str, "y") ||
1328       CaseEqual(str, "1")) {
1329     *value = true;
1330     return true;
1331   }
1332   if (CaseEqual(str, "false") || CaseEqual(str, "f") ||
1333       CaseEqual(str, "no") || CaseEqual(str, "n") ||
1334       CaseEqual(str, "0")) {
1335     *value = false;
1336     return true;
1337   }
1338   return false;
1339 }
1340 
safe_strtof(const char * str,float * value)1341 bool safe_strtof(const char* str, float* value) {
1342   char* endptr;
1343   errno = 0;  // errno only gets set on errors
1344 #if defined(_WIN32) || defined (__hpux)  // has no strtof()
1345   *value = io::NoLocaleStrtod(str, &endptr);
1346 #else
1347   *value = strtof(str, &endptr);
1348 #endif
1349   return *str != 0 && *endptr == 0 && errno == 0;
1350 }
1351 
safe_strtod(const char * str,double * value)1352 bool safe_strtod(const char* str, double* value) {
1353   char* endptr;
1354   *value = io::NoLocaleStrtod(str, &endptr);
1355   if (endptr != str) {
1356     while (ascii_isspace(*endptr)) ++endptr;
1357   }
1358   // Ignore range errors from strtod.  The values it
1359   // returns on underflow and overflow are the right
1360   // fallback in a robust setting.
1361   return *str != '\0' && *endptr == '\0';
1362 }
1363 
safe_strto32(const string & str,int32 * value)1364 bool safe_strto32(const string& str, int32* value) {
1365   return safe_int_internal(str, value);
1366 }
1367 
safe_strtou32(const string & str,uint32 * value)1368 bool safe_strtou32(const string& str, uint32* value) {
1369   return safe_uint_internal(str, value);
1370 }
1371 
safe_strto64(const string & str,int64 * value)1372 bool safe_strto64(const string& str, int64* value) {
1373   return safe_int_internal(str, value);
1374 }
1375 
safe_strtou64(const string & str,uint64 * value)1376 bool safe_strtou64(const string& str, uint64* value) {
1377   return safe_uint_internal(str, value);
1378 }
1379 
FloatToBuffer(float value,char * buffer)1380 char* FloatToBuffer(float value, char* buffer) {
1381   // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
1382   // platforms these days.  Just in case some system exists where FLT_DIG
1383   // is significantly larger -- and risks overflowing our buffer -- we have
1384   // this assert.
1385   GOOGLE_COMPILE_ASSERT(FLT_DIG < 10, FLT_DIG_is_too_big);
1386 
1387   if (value == std::numeric_limits<double>::infinity()) {
1388     strcpy(buffer, "inf");
1389     return buffer;
1390   } else if (value == -std::numeric_limits<double>::infinity()) {
1391     strcpy(buffer, "-inf");
1392     return buffer;
1393   } else if (std::isnan(value)) {
1394     strcpy(buffer, "nan");
1395     return buffer;
1396   }
1397 
1398   int snprintf_result =
1399     snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value);
1400 
1401   // The snprintf should never overflow because the buffer is significantly
1402   // larger than the precision we asked for.
1403   GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1404 
1405   float parsed_value;
1406   if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) {
1407     int snprintf_result =
1408       snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+3, value);
1409 
1410     // Should never overflow; see above.
1411     GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1412   }
1413 
1414   DelocalizeRadix(buffer);
1415   return buffer;
1416 }
1417 
1418 namespace strings {
1419 
AlphaNum(strings::Hex hex)1420 AlphaNum::AlphaNum(strings::Hex hex) {
1421   char *const end = &digits[kFastToBufferSize];
1422   char *writer = end;
1423   uint64 value = hex.value;
1424   uint64 width = hex.spec;
1425   // We accomplish minimum width by OR'ing in 0x10000 to the user's value,
1426   // where 0x10000 is the smallest hex number that is as wide as the user
1427   // asked for.
1428   uint64 mask = ((static_cast<uint64>(1) << (width - 1) * 4)) | value;
1429   static const char hexdigits[] = "0123456789abcdef";
1430   do {
1431     *--writer = hexdigits[value & 0xF];
1432     value >>= 4;
1433     mask >>= 4;
1434   } while (mask != 0);
1435   piece_data_ = writer;
1436   piece_size_ = end - writer;
1437 }
1438 
1439 }  // namespace strings
1440 
1441 // ----------------------------------------------------------------------
1442 // StrCat()
1443 //    This merges the given strings or integers, with no delimiter.  This
1444 //    is designed to be the fastest possible way to construct a string out
1445 //    of a mix of raw C strings, C++ strings, and integer values.
1446 // ----------------------------------------------------------------------
1447 
1448 // Append is merely a version of memcpy that returns the address of the byte
1449 // after the area just overwritten.  It comes in multiple flavors to minimize
1450 // call overhead.
Append1(char * out,const AlphaNum & x)1451 static char *Append1(char *out, const AlphaNum &x) {
1452   memcpy(out, x.data(), x.size());
1453   return out + x.size();
1454 }
1455 
Append2(char * out,const AlphaNum & x1,const AlphaNum & x2)1456 static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) {
1457   memcpy(out, x1.data(), x1.size());
1458   out += x1.size();
1459 
1460   memcpy(out, x2.data(), x2.size());
1461   return out + x2.size();
1462 }
1463 
Append4(char * out,const AlphaNum & x1,const AlphaNum & x2,const AlphaNum & x3,const AlphaNum & x4)1464 static char *Append4(char *out,
1465                      const AlphaNum &x1, const AlphaNum &x2,
1466                      const AlphaNum &x3, const AlphaNum &x4) {
1467   memcpy(out, x1.data(), x1.size());
1468   out += x1.size();
1469 
1470   memcpy(out, x2.data(), x2.size());
1471   out += x2.size();
1472 
1473   memcpy(out, x3.data(), x3.size());
1474   out += x3.size();
1475 
1476   memcpy(out, x4.data(), x4.size());
1477   return out + x4.size();
1478 }
1479 
StrCat(const AlphaNum & a,const AlphaNum & b)1480 string StrCat(const AlphaNum &a, const AlphaNum &b) {
1481   string result;
1482   result.resize(a.size() + b.size());
1483   char *const begin = &*result.begin();
1484   char *out = Append2(begin, a, b);
1485   GOOGLE_DCHECK_EQ(out, begin + result.size());
1486   return result;
1487 }
1488 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1489 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1490   string result;
1491   result.resize(a.size() + b.size() + c.size());
1492   char *const begin = &*result.begin();
1493   char *out = Append2(begin, a, b);
1494   out = Append1(out, c);
1495   GOOGLE_DCHECK_EQ(out, begin + result.size());
1496   return result;
1497 }
1498 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1499 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1500               const AlphaNum &d) {
1501   string result;
1502   result.resize(a.size() + b.size() + c.size() + d.size());
1503   char *const begin = &*result.begin();
1504   char *out = Append4(begin, a, b, c, d);
1505   GOOGLE_DCHECK_EQ(out, begin + result.size());
1506   return result;
1507 }
1508 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e)1509 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1510               const AlphaNum &d, const AlphaNum &e) {
1511   string result;
1512   result.resize(a.size() + b.size() + c.size() + d.size() + e.size());
1513   char *const begin = &*result.begin();
1514   char *out = Append4(begin, a, b, c, d);
1515   out = Append1(out, e);
1516   GOOGLE_DCHECK_EQ(out, begin + result.size());
1517   return result;
1518 }
1519 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f)1520 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1521               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) {
1522   string result;
1523   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1524                 f.size());
1525   char *const begin = &*result.begin();
1526   char *out = Append4(begin, a, b, c, d);
1527   out = Append2(out, e, f);
1528   GOOGLE_DCHECK_EQ(out, begin + result.size());
1529   return result;
1530 }
1531 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g)1532 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1533               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1534               const AlphaNum &g) {
1535   string result;
1536   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1537                 f.size() + g.size());
1538   char *const begin = &*result.begin();
1539   char *out = Append4(begin, a, b, c, d);
1540   out = Append2(out, e, f);
1541   out = Append1(out, g);
1542   GOOGLE_DCHECK_EQ(out, begin + result.size());
1543   return result;
1544 }
1545 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h)1546 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1547               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1548               const AlphaNum &g, const AlphaNum &h) {
1549   string result;
1550   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1551                 f.size() + g.size() + h.size());
1552   char *const begin = &*result.begin();
1553   char *out = Append4(begin, a, b, c, d);
1554   out = Append4(out, e, f, g, h);
1555   GOOGLE_DCHECK_EQ(out, begin + result.size());
1556   return result;
1557 }
1558 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h,const AlphaNum & i)1559 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1560               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1561               const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) {
1562   string result;
1563   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1564                 f.size() + g.size() + h.size() + i.size());
1565   char *const begin = &*result.begin();
1566   char *out = Append4(begin, a, b, c, d);
1567   out = Append4(out, e, f, g, h);
1568   out = Append1(out, i);
1569   GOOGLE_DCHECK_EQ(out, begin + result.size());
1570   return result;
1571 }
1572 
1573 // It's possible to call StrAppend with a char * pointer that is partway into
1574 // the string we're appending to.  However the results of this are random.
1575 // Therefore, check for this in debug mode.  Use unsigned math so we only have
1576 // to do one comparison.
1577 #define GOOGLE_DCHECK_NO_OVERLAP(dest, src) \
1578     GOOGLE_DCHECK_GT(uintptr_t((src).data() - (dest).data()), \
1579                      uintptr_t((dest).size()))
1580 
StrAppend(string * result,const AlphaNum & a)1581 void StrAppend(string *result, const AlphaNum &a) {
1582   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1583   result->append(a.data(), a.size());
1584 }
1585 
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b)1586 void StrAppend(string *result, const AlphaNum &a, const AlphaNum &b) {
1587   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1588   GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1589   string::size_type old_size = result->size();
1590   result->resize(old_size + a.size() + b.size());
1591   char *const begin = &*result->begin();
1592   char *out = Append2(begin + old_size, a, b);
1593   GOOGLE_DCHECK_EQ(out, begin + result->size());
1594 }
1595 
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1596 void StrAppend(string *result,
1597                const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1598   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1599   GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1600   GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1601   string::size_type old_size = result->size();
1602   result->resize(old_size + a.size() + b.size() + c.size());
1603   char *const begin = &*result->begin();
1604   char *out = Append2(begin + old_size, a, b);
1605   out = Append1(out, c);
1606   GOOGLE_DCHECK_EQ(out, begin + result->size());
1607 }
1608 
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1609 void StrAppend(string *result,
1610                const AlphaNum &a, const AlphaNum &b,
1611                const AlphaNum &c, const AlphaNum &d) {
1612   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1613   GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1614   GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1615   GOOGLE_DCHECK_NO_OVERLAP(*result, d);
1616   string::size_type old_size = result->size();
1617   result->resize(old_size + a.size() + b.size() + c.size() + d.size());
1618   char *const begin = &*result->begin();
1619   char *out = Append4(begin + old_size, a, b, c, d);
1620   GOOGLE_DCHECK_EQ(out, begin + result->size());
1621 }
1622 
GlobalReplaceSubstring(const string & substring,const string & replacement,string * s)1623 int GlobalReplaceSubstring(const string& substring,
1624                            const string& replacement,
1625                            string* s) {
1626   GOOGLE_CHECK(s != nullptr);
1627   if (s->empty() || substring.empty())
1628     return 0;
1629   string tmp;
1630   int num_replacements = 0;
1631   int pos = 0;
1632   for (int match_pos = s->find(substring.data(), pos, substring.length());
1633        match_pos != string::npos;
1634        pos = match_pos + substring.length(),
1635            match_pos = s->find(substring.data(), pos, substring.length())) {
1636     ++num_replacements;
1637     // Append the original content before the match.
1638     tmp.append(*s, pos, match_pos - pos);
1639     // Append the replacement for the match.
1640     tmp.append(replacement.begin(), replacement.end());
1641   }
1642   // Append the content after the last match. If no replacements were made, the
1643   // original string is left untouched.
1644   if (num_replacements > 0) {
1645     tmp.append(*s, pos, s->length() - pos);
1646     s->swap(tmp);
1647   }
1648   return num_replacements;
1649 }
1650 
CalculateBase64EscapedLen(int input_len,bool do_padding)1651 int CalculateBase64EscapedLen(int input_len, bool do_padding) {
1652   // Base64 encodes three bytes of input at a time. If the input is not
1653   // divisible by three, we pad as appropriate.
1654   //
1655   // (from http://tools.ietf.org/html/rfc3548)
1656   // Special processing is performed if fewer than 24 bits are available
1657   // at the end of the data being encoded.  A full encoding quantum is
1658   // always completed at the end of a quantity.  When fewer than 24 input
1659   // bits are available in an input group, zero bits are added (on the
1660   // right) to form an integral number of 6-bit groups.  Padding at the
1661   // end of the data is performed using the '=' character.  Since all base
1662   // 64 input is an integral number of octets, only the following cases
1663   // can arise:
1664 
1665 
1666   // Base64 encodes each three bytes of input into four bytes of output.
1667   int len = (input_len / 3) * 4;
1668 
1669   if (input_len % 3 == 0) {
1670     // (from http://tools.ietf.org/html/rfc3548)
1671     // (1) the final quantum of encoding input is an integral multiple of 24
1672     // bits; here, the final unit of encoded output will be an integral
1673     // multiple of 4 characters with no "=" padding,
1674   } else if (input_len % 3 == 1) {
1675     // (from http://tools.ietf.org/html/rfc3548)
1676     // (2) the final quantum of encoding input is exactly 8 bits; here, the
1677     // final unit of encoded output will be two characters followed by two
1678     // "=" padding characters, or
1679     len += 2;
1680     if (do_padding) {
1681       len += 2;
1682     }
1683   } else {  // (input_len % 3 == 2)
1684     // (from http://tools.ietf.org/html/rfc3548)
1685     // (3) the final quantum of encoding input is exactly 16 bits; here, the
1686     // final unit of encoded output will be three characters followed by one
1687     // "=" padding character.
1688     len += 3;
1689     if (do_padding) {
1690       len += 1;
1691     }
1692   }
1693 
1694   assert(len >= input_len);  // make sure we didn't overflow
1695   return len;
1696 }
1697 
1698 // Base64Escape does padding, so this calculation includes padding.
CalculateBase64EscapedLen(int input_len)1699 int CalculateBase64EscapedLen(int input_len) {
1700   return CalculateBase64EscapedLen(input_len, true);
1701 }
1702 
1703 // ----------------------------------------------------------------------
1704 // int Base64Unescape() - base64 decoder
1705 // int Base64Escape() - base64 encoder
1706 // int WebSafeBase64Unescape() - Google's variation of base64 decoder
1707 // int WebSafeBase64Escape() - Google's variation of base64 encoder
1708 //
1709 // Check out
1710 // http://tools.ietf.org/html/rfc2045 for formal description, but what we
1711 // care about is that...
1712 //   Take the encoded stuff in groups of 4 characters and turn each
1713 //   character into a code 0 to 63 thus:
1714 //           A-Z map to 0 to 25
1715 //           a-z map to 26 to 51
1716 //           0-9 map to 52 to 61
1717 //           +(- for WebSafe) maps to 62
1718 //           /(_ for WebSafe) maps to 63
1719 //   There will be four numbers, all less than 64 which can be represented
1720 //   by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
1721 //   Arrange the 6 digit binary numbers into three bytes as such:
1722 //   aaaaaabb bbbbcccc ccdddddd
1723 //   Equals signs (one or two) are used at the end of the encoded block to
1724 //   indicate that the text was not an integer multiple of three bytes long.
1725 // ----------------------------------------------------------------------
1726 
Base64UnescapeInternal(const char * src_param,int szsrc,char * dest,int szdest,const signed char * unbase64)1727 int Base64UnescapeInternal(const char *src_param, int szsrc,
1728                            char *dest, int szdest,
1729                            const signed char* unbase64) {
1730   static const char kPad64Equals = '=';
1731   static const char kPad64Dot = '.';
1732 
1733   int decode = 0;
1734   int destidx = 0;
1735   int state = 0;
1736   unsigned int ch = 0;
1737   unsigned int temp = 0;
1738 
1739   // If "char" is signed by default, using *src as an array index results in
1740   // accessing negative array elements. Treat the input as a pointer to
1741   // unsigned char to avoid this.
1742   const unsigned char *src = reinterpret_cast<const unsigned char*>(src_param);
1743 
1744   // The GET_INPUT macro gets the next input character, skipping
1745   // over any whitespace, and stopping when we reach the end of the
1746   // string or when we read any non-data character.  The arguments are
1747   // an arbitrary identifier (used as a label for goto) and the number
1748   // of data bytes that must remain in the input to avoid aborting the
1749   // loop.
1750 #define GET_INPUT(label, remain)                 \
1751   label:                                         \
1752     --szsrc;                                     \
1753     ch = *src++;                                 \
1754     decode = unbase64[ch];                       \
1755     if (decode < 0) {                            \
1756       if (ascii_isspace(ch) && szsrc >= remain)  \
1757         goto label;                              \
1758       state = 4 - remain;                        \
1759       break;                                     \
1760     }
1761 
1762   // if dest is null, we're just checking to see if it's legal input
1763   // rather than producing output.  (I suspect this could just be done
1764   // with a regexp...).  We duplicate the loop so this test can be
1765   // outside it instead of in every iteration.
1766 
1767   if (dest) {
1768     // This loop consumes 4 input bytes and produces 3 output bytes
1769     // per iteration.  We can't know at the start that there is enough
1770     // data left in the string for a full iteration, so the loop may
1771     // break out in the middle; if so 'state' will be set to the
1772     // number of input bytes read.
1773 
1774     while (szsrc >= 4)  {
1775       // We'll start by optimistically assuming that the next four
1776       // bytes of the string (src[0..3]) are four good data bytes
1777       // (that is, no nulls, whitespace, padding chars, or illegal
1778       // chars).  We need to test src[0..2] for nulls individually
1779       // before constructing temp to preserve the property that we
1780       // never read past a null in the string (no matter how long
1781       // szsrc claims the string is).
1782 
1783       if (!src[0] || !src[1] || !src[2] ||
1784           (temp = ((unsigned(unbase64[src[0]]) << 18) |
1785                    (unsigned(unbase64[src[1]]) << 12) |
1786                    (unsigned(unbase64[src[2]]) << 6) |
1787                    (unsigned(unbase64[src[3]])))) & 0x80000000) {
1788         // Iff any of those four characters was bad (null, illegal,
1789         // whitespace, padding), then temp's high bit will be set
1790         // (because unbase64[] is -1 for all bad characters).
1791         //
1792         // We'll back up and resort to the slower decoder, which knows
1793         // how to handle those cases.
1794 
1795         GET_INPUT(first, 4);
1796         temp = decode;
1797         GET_INPUT(second, 3);
1798         temp = (temp << 6) | decode;
1799         GET_INPUT(third, 2);
1800         temp = (temp << 6) | decode;
1801         GET_INPUT(fourth, 1);
1802         temp = (temp << 6) | decode;
1803       } else {
1804         // We really did have four good data bytes, so advance four
1805         // characters in the string.
1806 
1807         szsrc -= 4;
1808         src += 4;
1809         decode = -1;
1810         ch = '\0';
1811       }
1812 
1813       // temp has 24 bits of input, so write that out as three bytes.
1814 
1815       if (destidx+3 > szdest) return -1;
1816       dest[destidx+2] = temp;
1817       temp >>= 8;
1818       dest[destidx+1] = temp;
1819       temp >>= 8;
1820       dest[destidx] = temp;
1821       destidx += 3;
1822     }
1823   } else {
1824     while (szsrc >= 4)  {
1825       if (!src[0] || !src[1] || !src[2] ||
1826           (temp = ((unsigned(unbase64[src[0]]) << 18) |
1827                    (unsigned(unbase64[src[1]]) << 12) |
1828                    (unsigned(unbase64[src[2]]) << 6) |
1829                    (unsigned(unbase64[src[3]])))) & 0x80000000) {
1830         GET_INPUT(first_no_dest, 4);
1831         GET_INPUT(second_no_dest, 3);
1832         GET_INPUT(third_no_dest, 2);
1833         GET_INPUT(fourth_no_dest, 1);
1834       } else {
1835         szsrc -= 4;
1836         src += 4;
1837         decode = -1;
1838         ch = '\0';
1839       }
1840       destidx += 3;
1841     }
1842   }
1843 
1844 #undef GET_INPUT
1845 
1846   // if the loop terminated because we read a bad character, return
1847   // now.
1848   if (decode < 0 && ch != '\0' &&
1849       ch != kPad64Equals && ch != kPad64Dot && !ascii_isspace(ch))
1850     return -1;
1851 
1852   if (ch == kPad64Equals || ch == kPad64Dot) {
1853     // if we stopped by hitting an '=' or '.', un-read that character -- we'll
1854     // look at it again when we count to check for the proper number of
1855     // equals signs at the end.
1856     ++szsrc;
1857     --src;
1858   } else {
1859     // This loop consumes 1 input byte per iteration.  It's used to
1860     // clean up the 0-3 input bytes remaining when the first, faster
1861     // loop finishes.  'temp' contains the data from 'state' input
1862     // characters read by the first loop.
1863     while (szsrc > 0)  {
1864       --szsrc;
1865       ch = *src++;
1866       decode = unbase64[ch];
1867       if (decode < 0) {
1868         if (ascii_isspace(ch)) {
1869           continue;
1870         } else if (ch == '\0') {
1871           break;
1872         } else if (ch == kPad64Equals || ch == kPad64Dot) {
1873           // back up one character; we'll read it again when we check
1874           // for the correct number of pad characters at the end.
1875           ++szsrc;
1876           --src;
1877           break;
1878         } else {
1879           return -1;
1880         }
1881       }
1882 
1883       // Each input character gives us six bits of output.
1884       temp = (temp << 6) | decode;
1885       ++state;
1886       if (state == 4) {
1887         // If we've accumulated 24 bits of output, write that out as
1888         // three bytes.
1889         if (dest) {
1890           if (destidx+3 > szdest) return -1;
1891           dest[destidx+2] = temp;
1892           temp >>= 8;
1893           dest[destidx+1] = temp;
1894           temp >>= 8;
1895           dest[destidx] = temp;
1896         }
1897         destidx += 3;
1898         state = 0;
1899         temp = 0;
1900       }
1901     }
1902   }
1903 
1904   // Process the leftover data contained in 'temp' at the end of the input.
1905   int expected_equals = 0;
1906   switch (state) {
1907     case 0:
1908       // Nothing left over; output is a multiple of 3 bytes.
1909       break;
1910 
1911     case 1:
1912       // Bad input; we have 6 bits left over.
1913       return -1;
1914 
1915     case 2:
1916       // Produce one more output byte from the 12 input bits we have left.
1917       if (dest) {
1918         if (destidx+1 > szdest) return -1;
1919         temp >>= 4;
1920         dest[destidx] = temp;
1921       }
1922       ++destidx;
1923       expected_equals = 2;
1924       break;
1925 
1926     case 3:
1927       // Produce two more output bytes from the 18 input bits we have left.
1928       if (dest) {
1929         if (destidx+2 > szdest) return -1;
1930         temp >>= 2;
1931         dest[destidx+1] = temp;
1932         temp >>= 8;
1933         dest[destidx] = temp;
1934       }
1935       destidx += 2;
1936       expected_equals = 1;
1937       break;
1938 
1939     default:
1940       // state should have no other values at this point.
1941       GOOGLE_LOG(FATAL) << "This can't happen; base64 decoder state = " << state;
1942   }
1943 
1944   // The remainder of the string should be all whitespace, mixed with
1945   // exactly 0 equals signs, or exactly 'expected_equals' equals
1946   // signs.  (Always accepting 0 equals signs is a google extension
1947   // not covered in the RFC, as is accepting dot as the pad character.)
1948 
1949   int equals = 0;
1950   while (szsrc > 0 && *src) {
1951     if (*src == kPad64Equals || *src == kPad64Dot)
1952       ++equals;
1953     else if (!ascii_isspace(*src))
1954       return -1;
1955     --szsrc;
1956     ++src;
1957   }
1958 
1959   return (equals == 0 || equals == expected_equals) ? destidx : -1;
1960 }
1961 
1962 // The arrays below were generated by the following code
1963 // #include <sys/time.h>
1964 // #include <stdlib.h>
1965 // #include <string.h>
1966 // #include <stdio.h>
1967 // main()
1968 // {
1969 //   static const char Base64[] =
1970 //     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
1971 //   const char *pos;
1972 //   int idx, i, j;
1973 //   printf("    ");
1974 //   for (i = 0; i < 255; i += 8) {
1975 //     for (j = i; j < i + 8; j++) {
1976 //       pos = strchr(Base64, j);
1977 //       if ((pos == nullptr) || (j == 0))
1978 //         idx = -1;
1979 //       else
1980 //         idx = pos - Base64;
1981 //       if (idx == -1)
1982 //         printf(" %2d,     ", idx);
1983 //       else
1984 //         printf(" %2d/""*%c*""/,", idx, j);
1985 //     }
1986 //     printf("\n    ");
1987 //   }
1988 // }
1989 //
1990 // where the value of "Base64[]" was replaced by one of the base-64 conversion
1991 // tables from the functions below.
1992 static const signed char kUnBase64[] = {
1993   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1994   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1995   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1996   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1997   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1998   -1,      -1,      -1,      62/*+*/, -1,      -1,      -1,      63/*/ */,
1999   52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
2000   60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
2001   -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
2002    7/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
2003   15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
2004   23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      -1,
2005   -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
2006   33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
2007   41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
2008   49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
2009   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2010   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2011   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2012   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2013   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2014   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2015   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2016   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2017   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2018   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2019   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2020   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2021   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2022   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2023   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2024   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
2025 };
2026 static const signed char kUnWebSafeBase64[] = {
2027   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2028   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2029   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2030   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2031   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2032   -1,      -1,      -1,      -1,      -1,      62/*-*/, -1,      -1,
2033   52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
2034   60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
2035   -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
2036    7/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
2037   15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
2038   23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      63/*_*/,
2039   -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
2040   33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
2041   41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
2042   49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
2043   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2044   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2045   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2046   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2047   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2048   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2049   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2050   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2051   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2052   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2053   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2054   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2055   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2056   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2057   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2058   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
2059 };
2060 
WebSafeBase64Unescape(const char * src,int szsrc,char * dest,int szdest)2061 int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) {
2062   return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64);
2063 }
2064 
Base64UnescapeInternal(const char * src,int slen,string * dest,const signed char * unbase64)2065 static bool Base64UnescapeInternal(const char* src, int slen, string* dest,
2066                                    const signed char* unbase64) {
2067   // Determine the size of the output string.  Base64 encodes every 3 bytes into
2068   // 4 characters.  any leftover chars are added directly for good measure.
2069   // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548
2070   const int dest_len = 3 * (slen / 4) + (slen % 4);
2071 
2072   dest->resize(dest_len);
2073 
2074   // We are getting the destination buffer by getting the beginning of the
2075   // string and converting it into a char *.
2076   const int len = Base64UnescapeInternal(src, slen, string_as_array(dest),
2077                                          dest_len, unbase64);
2078   if (len < 0) {
2079     dest->clear();
2080     return false;
2081   }
2082 
2083   // could be shorter if there was padding
2084   GOOGLE_DCHECK_LE(len, dest_len);
2085   dest->erase(len);
2086 
2087   return true;
2088 }
2089 
Base64Unescape(StringPiece src,string * dest)2090 bool Base64Unescape(StringPiece src, string* dest) {
2091   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
2092 }
2093 
WebSafeBase64Unescape(StringPiece src,string * dest)2094 bool WebSafeBase64Unescape(StringPiece src, string* dest) {
2095   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
2096 }
2097 
Base64EscapeInternal(const unsigned char * src,int szsrc,char * dest,int szdest,const char * base64,bool do_padding)2098 int Base64EscapeInternal(const unsigned char *src, int szsrc,
2099                          char *dest, int szdest, const char *base64,
2100                          bool do_padding) {
2101   static const char kPad64 = '=';
2102 
2103   if (szsrc <= 0) return 0;
2104 
2105   if (szsrc * 4 > szdest * 3) return 0;
2106 
2107   char *cur_dest = dest;
2108   const unsigned char *cur_src = src;
2109 
2110   char *limit_dest = dest + szdest;
2111   const unsigned char *limit_src = src + szsrc;
2112 
2113   // Three bytes of data encodes to four characters of cyphertext.
2114   // So we can pump through three-byte chunks atomically.
2115   while (cur_src < limit_src - 3) {  // keep going as long as we have >= 32 bits
2116     uint32 in = BigEndian::Load32(cur_src) >> 8;
2117 
2118     cur_dest[0] = base64[in >> 18];
2119     in &= 0x3FFFF;
2120     cur_dest[1] = base64[in >> 12];
2121     in &= 0xFFF;
2122     cur_dest[2] = base64[in >> 6];
2123     in &= 0x3F;
2124     cur_dest[3] = base64[in];
2125 
2126     cur_dest += 4;
2127     cur_src += 3;
2128   }
2129   // To save time, we didn't update szdest or szsrc in the loop.  So do it now.
2130   szdest = limit_dest - cur_dest;
2131   szsrc = limit_src - cur_src;
2132 
2133   /* now deal with the tail (<=3 bytes) */
2134   switch (szsrc) {
2135     case 0:
2136       // Nothing left; nothing more to do.
2137       break;
2138     case 1: {
2139       // One byte left: this encodes to two characters, and (optionally)
2140       // two pad characters to round out the four-character cypherblock.
2141       if ((szdest -= 2) < 0) return 0;
2142       uint32 in = cur_src[0];
2143       cur_dest[0] = base64[in >> 2];
2144       in &= 0x3;
2145       cur_dest[1] = base64[in << 4];
2146       cur_dest += 2;
2147       if (do_padding) {
2148         if ((szdest -= 2) < 0) return 0;
2149         cur_dest[0] = kPad64;
2150         cur_dest[1] = kPad64;
2151         cur_dest += 2;
2152       }
2153       break;
2154     }
2155     case 2: {
2156       // Two bytes left: this encodes to three characters, and (optionally)
2157       // one pad character to round out the four-character cypherblock.
2158       if ((szdest -= 3) < 0) return 0;
2159       uint32 in = BigEndian::Load16(cur_src);
2160       cur_dest[0] = base64[in >> 10];
2161       in &= 0x3FF;
2162       cur_dest[1] = base64[in >> 4];
2163       in &= 0x00F;
2164       cur_dest[2] = base64[in << 2];
2165       cur_dest += 3;
2166       if (do_padding) {
2167         if ((szdest -= 1) < 0) return 0;
2168         cur_dest[0] = kPad64;
2169         cur_dest += 1;
2170       }
2171       break;
2172     }
2173     case 3: {
2174       // Three bytes left: same as in the big loop above.  We can't do this in
2175       // the loop because the loop above always reads 4 bytes, and the fourth
2176       // byte is past the end of the input.
2177       if ((szdest -= 4) < 0) return 0;
2178       uint32 in = (cur_src[0] << 16) + BigEndian::Load16(cur_src + 1);
2179       cur_dest[0] = base64[in >> 18];
2180       in &= 0x3FFFF;
2181       cur_dest[1] = base64[in >> 12];
2182       in &= 0xFFF;
2183       cur_dest[2] = base64[in >> 6];
2184       in &= 0x3F;
2185       cur_dest[3] = base64[in];
2186       cur_dest += 4;
2187       break;
2188     }
2189     default:
2190       // Should not be reached: blocks of 4 bytes are handled
2191       // in the while loop before this switch statement.
2192       GOOGLE_LOG(FATAL) << "Logic problem? szsrc = " << szsrc;
2193       break;
2194   }
2195   return (cur_dest - dest);
2196 }
2197 
2198 static const char kBase64Chars[] =
2199 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
2200 
2201 static const char kWebSafeBase64Chars[] =
2202 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
2203 
Base64Escape(const unsigned char * src,int szsrc,char * dest,int szdest)2204 int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) {
2205   return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true);
2206 }
WebSafeBase64Escape(const unsigned char * src,int szsrc,char * dest,int szdest,bool do_padding)2207 int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest,
2208                         int szdest, bool do_padding) {
2209   return Base64EscapeInternal(src, szsrc, dest, szdest,
2210                               kWebSafeBase64Chars, do_padding);
2211 }
2212 
Base64EscapeInternal(const unsigned char * src,int szsrc,string * dest,bool do_padding,const char * base64_chars)2213 void Base64EscapeInternal(const unsigned char* src, int szsrc,
2214                           string* dest, bool do_padding,
2215                           const char* base64_chars) {
2216   const int calc_escaped_size =
2217     CalculateBase64EscapedLen(szsrc, do_padding);
2218   dest->resize(calc_escaped_size);
2219   const int escaped_len = Base64EscapeInternal(src, szsrc,
2220                                                string_as_array(dest),
2221                                                dest->size(),
2222                                                base64_chars,
2223                                                do_padding);
2224   GOOGLE_DCHECK_EQ(calc_escaped_size, escaped_len);
2225   dest->erase(escaped_len);
2226 }
2227 
Base64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2228 void Base64Escape(const unsigned char *src, int szsrc,
2229                   string* dest, bool do_padding) {
2230   Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars);
2231 }
2232 
WebSafeBase64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2233 void WebSafeBase64Escape(const unsigned char *src, int szsrc,
2234                          string *dest, bool do_padding) {
2235   Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars);
2236 }
2237 
Base64Escape(StringPiece src,string * dest)2238 void Base64Escape(StringPiece src, string* dest) {
2239   Base64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2240                src.size(), dest, true);
2241 }
2242 
WebSafeBase64Escape(StringPiece src,string * dest)2243 void WebSafeBase64Escape(StringPiece src, string* dest) {
2244   WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2245                       src.size(), dest, false);
2246 }
2247 
WebSafeBase64EscapeWithPadding(StringPiece src,string * dest)2248 void WebSafeBase64EscapeWithPadding(StringPiece src, string* dest) {
2249   WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2250                       src.size(), dest, true);
2251 }
2252 
2253 // Helper to append a Unicode code point to a string as UTF8, without bringing
2254 // in any external dependencies.
EncodeAsUTF8Char(uint32 code_point,char * output)2255 int EncodeAsUTF8Char(uint32 code_point, char* output) {
2256   uint32 tmp = 0;
2257   int len = 0;
2258   if (code_point <= 0x7f) {
2259     tmp = code_point;
2260     len = 1;
2261   } else if (code_point <= 0x07ff) {
2262     tmp = 0x0000c080 |
2263         ((code_point & 0x07c0) << 2) |
2264         (code_point & 0x003f);
2265     len = 2;
2266   } else if (code_point <= 0xffff) {
2267     tmp = 0x00e08080 |
2268         ((code_point & 0xf000) << 4) |
2269         ((code_point & 0x0fc0) << 2) |
2270         (code_point & 0x003f);
2271     len = 3;
2272   } else {
2273     // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
2274     // normally only defined up to there as well.
2275     tmp = 0xf0808080 |
2276         ((code_point & 0x1c0000) << 6) |
2277         ((code_point & 0x03f000) << 4) |
2278         ((code_point & 0x000fc0) << 2) |
2279         (code_point & 0x003f);
2280     len = 4;
2281   }
2282   tmp = ghtonl(tmp);
2283   memcpy(output, reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
2284   return len;
2285 }
2286 
2287 // Table of UTF-8 character lengths, based on first byte
2288 static const unsigned char kUTF8LenTbl[256] = {
2289   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2290   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2291   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2292   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2293 
2294   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2295   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2296   2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
2297   3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
2298 };
2299 
2300 // Return length of a single UTF-8 source character
UTF8FirstLetterNumBytes(const char * src,int len)2301 int UTF8FirstLetterNumBytes(const char* src, int len) {
2302   if (len == 0) {
2303     return 0;
2304   }
2305   return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
2306 }
2307 
2308 // ----------------------------------------------------------------------
2309 // CleanStringLineEndings()
2310 //   Clean up a multi-line string to conform to Unix line endings.
2311 //   Reads from src and appends to dst, so usually dst should be empty.
2312 //
2313 //   If there is no line ending at the end of a non-empty string, it can
2314 //   be added automatically.
2315 //
2316 //   Four different types of input are correctly handled:
2317 //
2318 //     - Unix/Linux files: line ending is LF: pass through unchanged
2319 //
2320 //     - DOS/Windows files: line ending is CRLF: convert to LF
2321 //
2322 //     - Legacy Mac files: line ending is CR: convert to LF
2323 //
2324 //     - Garbled files: random line endings: convert gracefully
2325 //                      lonely CR, lonely LF, CRLF: convert to LF
2326 //
2327 //   @param src The multi-line string to convert
2328 //   @param dst The converted string is appended to this string
2329 //   @param auto_end_last_line Automatically terminate the last line
2330 //
2331 //   Limitations:
2332 //
2333 //     This does not do the right thing for CRCRLF files created by
2334 //     broken programs that do another Unix->DOS conversion on files
2335 //     that are already in CRLF format.  For this, a two-pass approach
2336 //     brute-force would be needed that
2337 //
2338 //       (1) determines the presence of LF (first one is ok)
2339 //       (2) if yes, removes any CR, else convert every CR to LF
2340 
CleanStringLineEndings(const string & src,string * dst,bool auto_end_last_line)2341 void CleanStringLineEndings(const string &src, string *dst,
2342                             bool auto_end_last_line) {
2343   if (dst->empty()) {
2344     dst->append(src);
2345     CleanStringLineEndings(dst, auto_end_last_line);
2346   } else {
2347     string tmp = src;
2348     CleanStringLineEndings(&tmp, auto_end_last_line);
2349     dst->append(tmp);
2350   }
2351 }
2352 
CleanStringLineEndings(string * str,bool auto_end_last_line)2353 void CleanStringLineEndings(string *str, bool auto_end_last_line) {
2354   ptrdiff_t output_pos = 0;
2355   bool r_seen = false;
2356   ptrdiff_t len = str->size();
2357 
2358   char *p = &(*str)[0];
2359 
2360   for (ptrdiff_t input_pos = 0; input_pos < len;) {
2361     if (!r_seen && input_pos + 8 < len) {
2362       uint64_t v = GOOGLE_UNALIGNED_LOAD64(p + input_pos);
2363       // Loop over groups of 8 bytes at a time until we come across
2364       // a word that has a byte whose value is less than or equal to
2365       // '\r' (i.e. could contain a \n (0x0a) or a \r (0x0d) ).
2366       //
2367       // We use a has_less macro that quickly tests a whole 64-bit
2368       // word to see if any of the bytes has a value < N.
2369       //
2370       // For more details, see:
2371       //   http://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
2372 #define has_less(x, n) (((x) - ~0ULL / 255 * (n)) & ~(x) & ~0ULL / 255 * 128)
2373       if (!has_less(v, '\r' + 1)) {
2374 #undef has_less
2375         // No byte in this word has a value that could be a \r or a \n
2376         if (output_pos != input_pos) {
2377           GOOGLE_UNALIGNED_STORE64(p + output_pos, v);
2378         }
2379         input_pos += 8;
2380         output_pos += 8;
2381         continue;
2382       }
2383     }
2384     string::const_reference in = p[input_pos];
2385     if (in == '\r') {
2386       if (r_seen) p[output_pos++] = '\n';
2387       r_seen = true;
2388     } else if (in == '\n') {
2389       if (input_pos != output_pos)
2390         p[output_pos++] = '\n';
2391       else
2392         output_pos++;
2393       r_seen = false;
2394     } else {
2395       if (r_seen) p[output_pos++] = '\n';
2396       r_seen = false;
2397       if (input_pos != output_pos)
2398         p[output_pos++] = in;
2399       else
2400         output_pos++;
2401     }
2402     input_pos++;
2403   }
2404   if (r_seen ||
2405       (auto_end_last_line && output_pos > 0 && p[output_pos - 1] != '\n')) {
2406     str->resize(output_pos + 1);
2407     str->operator[](output_pos) = '\n';
2408   } else if (output_pos < len) {
2409     str->resize(output_pos);
2410   }
2411 }
2412 
2413 }  // namespace protobuf
2414 }  // namespace google
2415