1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 // from google3/strings/strutil.cc
32 
33 #include <google/protobuf/stubs/strutil.h>
34 #include <google/protobuf/stubs/mathlimits.h>
35 
36 #include <errno.h>
37 #include <float.h>    // FLT_DIG and DBL_DIG
38 #include <limits>
39 #include <limits.h>
40 #include <stdio.h>
41 #include <iterator>
42 
43 #include <google/protobuf/stubs/stl_util.h>
44 
45 #ifdef _WIN32
46 // MSVC has only _snprintf, not snprintf.
47 //
48 // MinGW has both snprintf and _snprintf, but they appear to be different
49 // functions.  The former is buggy.  When invoked like so:
50 //   char buffer[32];
51 //   snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f);
52 // it prints "1.23000e+10".  This is plainly wrong:  %g should never print
53 // trailing zeros after the decimal point.  For some reason this bug only
54 // occurs with some input values, not all.  In any case, _snprintf does the
55 // right thing, so we use it.
56 #define snprintf _snprintf
57 #endif
58 
59 namespace google {
60 namespace protobuf {
61 
62 // These are defined as macros on some platforms.  #undef them so that we can
63 // redefine them.
64 #undef isxdigit
65 #undef isprint
66 
67 // The definitions of these in ctype.h change based on locale.  Since our
68 // string manipulation is all in relation to the protocol buffer and C++
69 // languages, we always want to use the C locale.  So, we re-define these
70 // exactly as we want them.
isxdigit(char c)71 inline bool isxdigit(char c) {
72   return ('0' <= c && c <= '9') ||
73          ('a' <= c && c <= 'f') ||
74          ('A' <= c && c <= 'F');
75 }
76 
isprint(char c)77 inline bool isprint(char c) {
78   return c >= 0x20 && c <= 0x7E;
79 }
80 
81 // ----------------------------------------------------------------------
82 // StripString
83 //    Replaces any occurrence of the character 'remove' (or the characters
84 //    in 'remove') with the character 'replacewith'.
85 // ----------------------------------------------------------------------
StripString(string * s,const char * remove,char replacewith)86 void StripString(string* s, const char* remove, char replacewith) {
87   const char * str_start = s->c_str();
88   const char * str = str_start;
89   for (str = strpbrk(str, remove);
90        str != NULL;
91        str = strpbrk(str + 1, remove)) {
92     (*s)[str - str_start] = replacewith;
93   }
94 }
95 
96 // ----------------------------------------------------------------------
97 // ReplaceCharacters
98 //    Replaces any occurrence of the character 'remove' (or the characters
99 //    in 'remove') with the character 'replacewith'.
100 // ----------------------------------------------------------------------
ReplaceCharacters(string * s,const char * remove,char replacewith)101 void ReplaceCharacters(string *s, const char *remove, char replacewith) {
102   const char *str_start = s->c_str();
103   const char *str = str_start;
104   for (str = strpbrk(str, remove);
105        str != NULL;
106        str = strpbrk(str + 1, remove)) {
107     (*s)[str - str_start] = replacewith;
108   }
109 }
110 
StripWhitespace(string * str)111 void StripWhitespace(string* str) {
112   int str_length = str->length();
113 
114   // Strip off leading whitespace.
115   int first = 0;
116   while (first < str_length && ascii_isspace(str->at(first))) {
117     ++first;
118   }
119   // If entire string is white space.
120   if (first == str_length) {
121     str->clear();
122     return;
123   }
124   if (first > 0) {
125     str->erase(0, first);
126     str_length -= first;
127   }
128 
129   // Strip off trailing whitespace.
130   int last = str_length - 1;
131   while (last >= 0 && ascii_isspace(str->at(last))) {
132     --last;
133   }
134   if (last != (str_length - 1) && last >= 0) {
135     str->erase(last + 1, string::npos);
136   }
137 }
138 
139 // ----------------------------------------------------------------------
140 // StringReplace()
141 //    Replace the "old" pattern with the "new" pattern in a string,
142 //    and append the result to "res".  If replace_all is false,
143 //    it only replaces the first instance of "old."
144 // ----------------------------------------------------------------------
145 
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all,string * res)146 void StringReplace(const string& s, const string& oldsub,
147                    const string& newsub, bool replace_all,
148                    string* res) {
149   if (oldsub.empty()) {
150     res->append(s);  // if empty, append the given string.
151     return;
152   }
153 
154   string::size_type start_pos = 0;
155   string::size_type pos;
156   do {
157     pos = s.find(oldsub, start_pos);
158     if (pos == string::npos) {
159       break;
160     }
161     res->append(s, start_pos, pos - start_pos);
162     res->append(newsub);
163     start_pos = pos + oldsub.size();  // start searching again after the "old"
164   } while (replace_all);
165   res->append(s, start_pos, s.length() - start_pos);
166 }
167 
168 // ----------------------------------------------------------------------
169 // StringReplace()
170 //    Give me a string and two patterns "old" and "new", and I replace
171 //    the first instance of "old" in the string with "new", if it
172 //    exists.  If "global" is true; call this repeatedly until it
173 //    fails.  RETURN a new string, regardless of whether the replacement
174 //    happened or not.
175 // ----------------------------------------------------------------------
176 
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all)177 string StringReplace(const string& s, const string& oldsub,
178                      const string& newsub, bool replace_all) {
179   string ret;
180   StringReplace(s, oldsub, newsub, replace_all, &ret);
181   return ret;
182 }
183 
184 // ----------------------------------------------------------------------
185 // SplitStringUsing()
186 //    Split a string using a character delimiter. Append the components
187 //    to 'result'.
188 //
189 // Note: For multi-character delimiters, this routine will split on *ANY* of
190 // the characters in the string, not the entire string as a single delimiter.
191 // ----------------------------------------------------------------------
192 template <typename ITR>
193 static inline
SplitStringToIteratorUsing(const string & full,const char * delim,ITR & result)194 void SplitStringToIteratorUsing(const string& full,
195                                 const char* delim,
196                                 ITR& result) {
197   // Optimize the common case where delim is a single character.
198   if (delim[0] != '\0' && delim[1] == '\0') {
199     char c = delim[0];
200     const char* p = full.data();
201     const char* end = p + full.size();
202     while (p != end) {
203       if (*p == c) {
204         ++p;
205       } else {
206         const char* start = p;
207         while (++p != end && *p != c);
208         *result++ = string(start, p - start);
209       }
210     }
211     return;
212   }
213 
214   string::size_type begin_index, end_index;
215   begin_index = full.find_first_not_of(delim);
216   while (begin_index != string::npos) {
217     end_index = full.find_first_of(delim, begin_index);
218     if (end_index == string::npos) {
219       *result++ = full.substr(begin_index);
220       return;
221     }
222     *result++ = full.substr(begin_index, (end_index - begin_index));
223     begin_index = full.find_first_not_of(delim, end_index);
224   }
225 }
226 
SplitStringUsing(const string & full,const char * delim,vector<string> * result)227 void SplitStringUsing(const string& full,
228                       const char* delim,
229                       vector<string>* result) {
230   std::back_insert_iterator< vector<string> > it(*result);
231   SplitStringToIteratorUsing(full, delim, it);
232 }
233 
234 // Split a string using a character delimiter. Append the components
235 // to 'result'.  If there are consecutive delimiters, this function
236 // will return corresponding empty strings. The string is split into
237 // at most the specified number of pieces greedily. This means that the
238 // last piece may possibly be split further. To split into as many pieces
239 // as possible, specify 0 as the number of pieces.
240 //
241 // If "full" is the empty string, yields an empty string as the only value.
242 //
243 // If "pieces" is negative for some reason, it returns the whole string
244 // ----------------------------------------------------------------------
245 template <typename StringType, typename ITR>
246 static inline
SplitStringToIteratorAllowEmpty(const StringType & full,const char * delim,int pieces,ITR & result)247 void SplitStringToIteratorAllowEmpty(const StringType& full,
248                                      const char* delim,
249                                      int pieces,
250                                      ITR& result) {
251   string::size_type begin_index, end_index;
252   begin_index = 0;
253 
254   for (int i = 0; (i < pieces-1) || (pieces == 0); i++) {
255     end_index = full.find_first_of(delim, begin_index);
256     if (end_index == string::npos) {
257       *result++ = full.substr(begin_index);
258       return;
259     }
260     *result++ = full.substr(begin_index, (end_index - begin_index));
261     begin_index = end_index + 1;
262   }
263   *result++ = full.substr(begin_index);
264 }
265 
SplitStringAllowEmpty(const string & full,const char * delim,vector<string> * result)266 void SplitStringAllowEmpty(const string& full, const char* delim,
267                            vector<string>* result) {
268   std::back_insert_iterator<vector<string> > it(*result);
269   SplitStringToIteratorAllowEmpty(full, delim, 0, it);
270 }
271 
272 // ----------------------------------------------------------------------
273 // JoinStrings()
274 //    This merges a vector of string components with delim inserted
275 //    as separaters between components.
276 //
277 // ----------------------------------------------------------------------
278 template <class ITERATOR>
JoinStringsIterator(const ITERATOR & start,const ITERATOR & end,const char * delim,string * result)279 static void JoinStringsIterator(const ITERATOR& start,
280                                 const ITERATOR& end,
281                                 const char* delim,
282                                 string* result) {
283   GOOGLE_CHECK(result != NULL);
284   result->clear();
285   int delim_length = strlen(delim);
286 
287   // Precompute resulting length so we can reserve() memory in one shot.
288   int length = 0;
289   for (ITERATOR iter = start; iter != end; ++iter) {
290     if (iter != start) {
291       length += delim_length;
292     }
293     length += iter->size();
294   }
295   result->reserve(length);
296 
297   // Now combine everything.
298   for (ITERATOR iter = start; iter != end; ++iter) {
299     if (iter != start) {
300       result->append(delim, delim_length);
301     }
302     result->append(iter->data(), iter->size());
303   }
304 }
305 
JoinStrings(const vector<string> & components,const char * delim,string * result)306 void JoinStrings(const vector<string>& components,
307                  const char* delim,
308                  string * result) {
309   JoinStringsIterator(components.begin(), components.end(), delim, result);
310 }
311 
312 // ----------------------------------------------------------------------
313 // UnescapeCEscapeSequences()
314 //    This does all the unescaping that C does: \ooo, \r, \n, etc
315 //    Returns length of resulting string.
316 //    The implementation of \x parses any positive number of hex digits,
317 //    but it is an error if the value requires more than 8 bits, and the
318 //    result is truncated to 8 bits.
319 //
320 //    The second call stores its errors in a supplied string vector.
321 //    If the string vector pointer is NULL, it reports the errors with LOG().
322 // ----------------------------------------------------------------------
323 
324 #define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7'))
325 
326 // Protocol buffers doesn't ever care about errors, but I don't want to remove
327 // the code.
328 #define LOG_STRING(LEVEL, VECTOR) GOOGLE_LOG_IF(LEVEL, false)
329 
UnescapeCEscapeSequences(const char * source,char * dest)330 int UnescapeCEscapeSequences(const char* source, char* dest) {
331   return UnescapeCEscapeSequences(source, dest, NULL);
332 }
333 
UnescapeCEscapeSequences(const char * source,char * dest,vector<string> * errors)334 int UnescapeCEscapeSequences(const char* source, char* dest,
335                              vector<string> *errors) {
336   GOOGLE_DCHECK(errors == NULL) << "Error reporting not implemented.";
337 
338   char* d = dest;
339   const char* p = source;
340 
341   // Small optimization for case where source = dest and there's no escaping
342   while ( p == d && *p != '\0' && *p != '\\' )
343     p++, d++;
344 
345   while (*p != '\0') {
346     if (*p != '\\') {
347       *d++ = *p++;
348     } else {
349       switch ( *++p ) {                    // skip past the '\\'
350         case '\0':
351           LOG_STRING(ERROR, errors) << "String cannot end with \\";
352           *d = '\0';
353           return d - dest;   // we're done with p
354         case 'a':  *d++ = '\a';  break;
355         case 'b':  *d++ = '\b';  break;
356         case 'f':  *d++ = '\f';  break;
357         case 'n':  *d++ = '\n';  break;
358         case 'r':  *d++ = '\r';  break;
359         case 't':  *d++ = '\t';  break;
360         case 'v':  *d++ = '\v';  break;
361         case '\\': *d++ = '\\';  break;
362         case '?':  *d++ = '\?';  break;    // \?  Who knew?
363         case '\'': *d++ = '\'';  break;
364         case '"':  *d++ = '\"';  break;
365         case '0': case '1': case '2': case '3':  // octal digit: 1 to 3 digits
366         case '4': case '5': case '6': case '7': {
367           char ch = *p - '0';
368           if ( IS_OCTAL_DIGIT(p[1]) )
369             ch = ch * 8 + *++p - '0';
370           if ( IS_OCTAL_DIGIT(p[1]) )      // safe (and easy) to do this twice
371             ch = ch * 8 + *++p - '0';      // now points at last digit
372           *d++ = ch;
373           break;
374         }
375         case 'x': case 'X': {
376           if (!isxdigit(p[1])) {
377             if (p[1] == '\0') {
378               LOG_STRING(ERROR, errors) << "String cannot end with \\x";
379             } else {
380               LOG_STRING(ERROR, errors) <<
381                 "\\x cannot be followed by non-hex digit: \\" << *p << p[1];
382             }
383             break;
384           }
385           unsigned int ch = 0;
386           const char *hex_start = p;
387           while (isxdigit(p[1]))  // arbitrarily many hex digits
388             ch = (ch << 4) + hex_digit_to_int(*++p);
389           if (ch > 0xFF)
390             LOG_STRING(ERROR, errors) << "Value of " <<
391               "\\" << string(hex_start, p+1-hex_start) << " exceeds 8 bits";
392           *d++ = ch;
393           break;
394         }
395 #if 0  // TODO(kenton):  Support \u and \U?  Requires runetochar().
396         case 'u': {
397           // \uhhhh => convert 4 hex digits to UTF-8
398           char32 rune = 0;
399           const char *hex_start = p;
400           for (int i = 0; i < 4; ++i) {
401             if (isxdigit(p[1])) {  // Look one char ahead.
402               rune = (rune << 4) + hex_digit_to_int(*++p);  // Advance p.
403             } else {
404               LOG_STRING(ERROR, errors)
405                 << "\\u must be followed by 4 hex digits: \\"
406                 <<  string(hex_start, p+1-hex_start);
407               break;
408             }
409           }
410           d += runetochar(d, &rune);
411           break;
412         }
413         case 'U': {
414           // \Uhhhhhhhh => convert 8 hex digits to UTF-8
415           char32 rune = 0;
416           const char *hex_start = p;
417           for (int i = 0; i < 8; ++i) {
418             if (isxdigit(p[1])) {  // Look one char ahead.
419               // Don't change rune until we're sure this
420               // is within the Unicode limit, but do advance p.
421               char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
422               if (newrune > 0x10FFFF) {
423                 LOG_STRING(ERROR, errors)
424                   << "Value of \\"
425                   << string(hex_start, p + 1 - hex_start)
426                   << " exceeds Unicode limit (0x10FFFF)";
427                 break;
428               } else {
429                 rune = newrune;
430               }
431             } else {
432               LOG_STRING(ERROR, errors)
433                 << "\\U must be followed by 8 hex digits: \\"
434                 <<  string(hex_start, p+1-hex_start);
435               break;
436             }
437           }
438           d += runetochar(d, &rune);
439           break;
440         }
441 #endif
442         default:
443           LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p;
444       }
445       p++;                                 // read past letter we escaped
446     }
447   }
448   *d = '\0';
449   return d - dest;
450 }
451 
452 // ----------------------------------------------------------------------
453 // UnescapeCEscapeString()
454 //    This does the same thing as UnescapeCEscapeSequences, but creates
455 //    a new string. The caller does not need to worry about allocating
456 //    a dest buffer. This should be used for non performance critical
457 //    tasks such as printing debug messages. It is safe for src and dest
458 //    to be the same.
459 //
460 //    The second call stores its errors in a supplied string vector.
461 //    If the string vector pointer is NULL, it reports the errors with LOG().
462 //
463 //    In the first and second calls, the length of dest is returned. In the
464 //    the third call, the new string is returned.
465 // ----------------------------------------------------------------------
UnescapeCEscapeString(const string & src,string * dest)466 int UnescapeCEscapeString(const string& src, string* dest) {
467   return UnescapeCEscapeString(src, dest, NULL);
468 }
469 
UnescapeCEscapeString(const string & src,string * dest,vector<string> * errors)470 int UnescapeCEscapeString(const string& src, string* dest,
471                           vector<string> *errors) {
472   scoped_array<char> unescaped(new char[src.size() + 1]);
473   int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), errors);
474   GOOGLE_CHECK(dest);
475   dest->assign(unescaped.get(), len);
476   return len;
477 }
478 
UnescapeCEscapeString(const string & src)479 string UnescapeCEscapeString(const string& src) {
480   scoped_array<char> unescaped(new char[src.size() + 1]);
481   int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), NULL);
482   return string(unescaped.get(), len);
483 }
484 
485 // ----------------------------------------------------------------------
486 // CEscapeString()
487 // CHexEscapeString()
488 //    Copies 'src' to 'dest', escaping dangerous characters using
489 //    C-style escape sequences. This is very useful for preparing query
490 //    flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
491 //    hexadecimal rather than octal sequences.
492 //    Returns the number of bytes written to 'dest' (not including the \0)
493 //    or -1 if there was insufficient space.
494 //
495 //    Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped.
496 // ----------------------------------------------------------------------
CEscapeInternal(const char * src,int src_len,char * dest,int dest_len,bool use_hex,bool utf8_safe)497 int CEscapeInternal(const char* src, int src_len, char* dest,
498                     int dest_len, bool use_hex, bool utf8_safe) {
499   const char* src_end = src + src_len;
500   int used = 0;
501   bool last_hex_escape = false; // true if last output char was \xNN
502 
503   for (; src < src_end; src++) {
504     if (dest_len - used < 2)   // Need space for two letter escape
505       return -1;
506 
507     bool is_hex_escape = false;
508     switch (*src) {
509       case '\n': dest[used++] = '\\'; dest[used++] = 'n';  break;
510       case '\r': dest[used++] = '\\'; dest[used++] = 'r';  break;
511       case '\t': dest[used++] = '\\'; dest[used++] = 't';  break;
512       case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
513       case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
514       case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
515       default:
516         // Note that if we emit \xNN and the src character after that is a hex
517         // digit then that digit must be escaped too to prevent it being
518         // interpreted as part of the character code by C.
519         if ((!utf8_safe || static_cast<uint8>(*src) < 0x80) &&
520             (!isprint(*src) ||
521              (last_hex_escape && isxdigit(*src)))) {
522           if (dest_len - used < 4) // need space for 4 letter escape
523             return -1;
524           sprintf(dest + used, (use_hex ? "\\x%02x" : "\\%03o"),
525                   static_cast<uint8>(*src));
526           is_hex_escape = use_hex;
527           used += 4;
528         } else {
529           dest[used++] = *src; break;
530         }
531     }
532     last_hex_escape = is_hex_escape;
533   }
534 
535   if (dest_len - used < 1)   // make sure that there is room for \0
536     return -1;
537 
538   dest[used] = '\0';   // doesn't count towards return value though
539   return used;
540 }
541 
542 // Calculates the length of the C-style escaped version of 'src'.
543 // Assumes that non-printable characters are escaped using octal sequences, and
544 // that UTF-8 bytes are not handled specially.
CEscapedLength(StringPiece src)545 static inline size_t CEscapedLength(StringPiece src) {
546   static char c_escaped_len[256] = {
547     4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,  // \t, \n, \r
548     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
549     1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,  // ", '
550     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // '0'..'9'
551     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'A'..'O'
552     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,  // 'P'..'Z', '\'
553     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'a'..'o'
554     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,  // 'p'..'z', DEL
555     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
556     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
557     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
558     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
559     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
560     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
561     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
562     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
563   };
564 
565   size_t escaped_len = 0;
566   for (int i = 0; i < src.size(); ++i) {
567     unsigned char c = static_cast<unsigned char>(src[i]);
568     escaped_len += c_escaped_len[c];
569   }
570   return escaped_len;
571 }
572 
573 // ----------------------------------------------------------------------
574 // Escapes 'src' using C-style escape sequences, and appends the escaped string
575 // to 'dest'. This version is faster than calling CEscapeInternal as it computes
576 // the required space using a lookup table, and also does not do any special
577 // handling for Hex or UTF-8 characters.
578 // ----------------------------------------------------------------------
CEscapeAndAppend(StringPiece src,string * dest)579 void CEscapeAndAppend(StringPiece src, string* dest) {
580   size_t escaped_len = CEscapedLength(src);
581   if (escaped_len == src.size()) {
582     dest->append(src.data(), src.size());
583     return;
584   }
585 
586   size_t cur_dest_len = dest->size();
587   dest->resize(cur_dest_len + escaped_len);
588   char* append_ptr = &(*dest)[cur_dest_len];
589 
590   for (int i = 0; i < src.size(); ++i) {
591     unsigned char c = static_cast<unsigned char>(src[i]);
592     switch (c) {
593       case '\n': *append_ptr++ = '\\'; *append_ptr++ = 'n'; break;
594       case '\r': *append_ptr++ = '\\'; *append_ptr++ = 'r'; break;
595       case '\t': *append_ptr++ = '\\'; *append_ptr++ = 't'; break;
596       case '\"': *append_ptr++ = '\\'; *append_ptr++ = '\"'; break;
597       case '\'': *append_ptr++ = '\\'; *append_ptr++ = '\''; break;
598       case '\\': *append_ptr++ = '\\'; *append_ptr++ = '\\'; break;
599       default:
600         if (!isprint(c)) {
601           *append_ptr++ = '\\';
602           *append_ptr++ = '0' + c / 64;
603           *append_ptr++ = '0' + (c % 64) / 8;
604           *append_ptr++ = '0' + c % 8;
605         } else {
606           *append_ptr++ = c;
607         }
608         break;
609     }
610   }
611 }
612 
CEscape(const string & src)613 string CEscape(const string& src) {
614   string dest;
615   CEscapeAndAppend(src, &dest);
616   return dest;
617 }
618 
619 namespace strings {
620 
Utf8SafeCEscape(const string & src)621 string Utf8SafeCEscape(const string& src) {
622   const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
623   scoped_array<char> dest(new char[dest_length]);
624   const int len = CEscapeInternal(src.data(), src.size(),
625                                   dest.get(), dest_length, false, true);
626   GOOGLE_DCHECK_GE(len, 0);
627   return string(dest.get(), len);
628 }
629 
CHexEscape(const string & src)630 string CHexEscape(const string& src) {
631   const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
632   scoped_array<char> dest(new char[dest_length]);
633   const int len = CEscapeInternal(src.data(), src.size(),
634                                   dest.get(), dest_length, true, false);
635   GOOGLE_DCHECK_GE(len, 0);
636   return string(dest.get(), len);
637 }
638 
639 }  // namespace strings
640 
641 // ----------------------------------------------------------------------
642 // strto32_adaptor()
643 // strtou32_adaptor()
644 //    Implementation of strto[u]l replacements that have identical
645 //    overflow and underflow characteristics for both ILP-32 and LP-64
646 //    platforms, including errno preservation in error-free calls.
647 // ----------------------------------------------------------------------
648 
strto32_adaptor(const char * nptr,char ** endptr,int base)649 int32 strto32_adaptor(const char *nptr, char **endptr, int base) {
650   const int saved_errno = errno;
651   errno = 0;
652   const long result = strtol(nptr, endptr, base);
653   if (errno == ERANGE && result == LONG_MIN) {
654     return kint32min;
655   } else if (errno == ERANGE && result == LONG_MAX) {
656     return kint32max;
657   } else if (errno == 0 && result < kint32min) {
658     errno = ERANGE;
659     return kint32min;
660   } else if (errno == 0 && result > kint32max) {
661     errno = ERANGE;
662     return kint32max;
663   }
664   if (errno == 0)
665     errno = saved_errno;
666   return static_cast<int32>(result);
667 }
668 
strtou32_adaptor(const char * nptr,char ** endptr,int base)669 uint32 strtou32_adaptor(const char *nptr, char **endptr, int base) {
670   const int saved_errno = errno;
671   errno = 0;
672   const unsigned long result = strtoul(nptr, endptr, base);
673   if (errno == ERANGE && result == ULONG_MAX) {
674     return kuint32max;
675   } else if (errno == 0 && result > kuint32max) {
676     errno = ERANGE;
677     return kuint32max;
678   }
679   if (errno == 0)
680     errno = saved_errno;
681   return static_cast<uint32>(result);
682 }
683 
safe_parse_sign(string * text,bool * negative_ptr)684 inline bool safe_parse_sign(string* text  /*inout*/,
685                             bool* negative_ptr  /*output*/) {
686   const char* start = text->data();
687   const char* end = start + text->size();
688 
689   // Consume whitespace.
690   while (start < end && (start[0] == ' ')) {
691     ++start;
692   }
693   while (start < end && (end[-1] == ' ')) {
694     --end;
695   }
696   if (start >= end) {
697     return false;
698   }
699 
700   // Consume sign.
701   *negative_ptr = (start[0] == '-');
702   if (*negative_ptr || start[0] == '+') {
703     ++start;
704     if (start >= end) {
705       return false;
706     }
707   }
708   *text = text->substr(start - text->data(), end - start);
709   return true;
710 }
711 
712 template<typename IntType>
safe_parse_positive_int(string text,IntType * value_p)713 bool safe_parse_positive_int(
714     string text, IntType* value_p) {
715   int base = 10;
716   IntType value = 0;
717   const IntType vmax = std::numeric_limits<IntType>::max();
718   assert(vmax > 0);
719   assert(vmax >= base);
720   const IntType vmax_over_base = vmax / base;
721   const char* start = text.data();
722   const char* end = start + text.size();
723   // loop over digits
724   for (; start < end; ++start) {
725     unsigned char c = static_cast<unsigned char>(start[0]);
726     int digit = c - '0';
727     if (digit >= base || digit < 0) {
728       *value_p = value;
729       return false;
730     }
731     if (value > vmax_over_base) {
732       *value_p = vmax;
733       return false;
734     }
735     value *= base;
736     if (value > vmax - digit) {
737       *value_p = vmax;
738       return false;
739     }
740     value += digit;
741   }
742   *value_p = value;
743   return true;
744 }
745 
746 template<typename IntType>
safe_parse_negative_int(const string & text,IntType * value_p)747 bool safe_parse_negative_int(
748     const string& text, IntType* value_p) {
749   int base = 10;
750   IntType value = 0;
751   const IntType vmin = std::numeric_limits<IntType>::min();
752   assert(vmin < 0);
753   assert(vmin <= 0 - base);
754   IntType vmin_over_base = vmin / base;
755   // 2003 c++ standard [expr.mul]
756   // "... the sign of the remainder is implementation-defined."
757   // Although (vmin/base)*base + vmin%base is always vmin.
758   // 2011 c++ standard tightens the spec but we cannot rely on it.
759   if (vmin % base > 0) {
760     vmin_over_base += 1;
761   }
762   const char* start = text.data();
763   const char* end = start + text.size();
764   // loop over digits
765   for (; start < end; ++start) {
766     unsigned char c = static_cast<unsigned char>(start[0]);
767     int digit = c - '0';
768     if (digit >= base || digit < 0) {
769       *value_p = value;
770       return false;
771     }
772     if (value < vmin_over_base) {
773       *value_p = vmin;
774       return false;
775     }
776     value *= base;
777     if (value < vmin + digit) {
778       *value_p = vmin;
779       return false;
780     }
781     value -= digit;
782   }
783   *value_p = value;
784   return true;
785 }
786 
787 template<typename IntType>
safe_int_internal(string text,IntType * value_p)788 bool safe_int_internal(string text, IntType* value_p) {
789   *value_p = 0;
790   bool negative;
791   if (!safe_parse_sign(&text, &negative)) {
792     return false;
793   }
794   if (!negative) {
795     return safe_parse_positive_int(text, value_p);
796   } else {
797     return safe_parse_negative_int(text, value_p);
798   }
799 }
800 
801 template<typename IntType>
safe_uint_internal(string text,IntType * value_p)802 bool safe_uint_internal(string text, IntType* value_p) {
803   *value_p = 0;
804   bool negative;
805   if (!safe_parse_sign(&text, &negative) || negative) {
806     return false;
807   }
808   return safe_parse_positive_int(text, value_p);
809 }
810 
811 // ----------------------------------------------------------------------
812 // FastIntToBuffer()
813 // FastInt64ToBuffer()
814 // FastHexToBuffer()
815 // FastHex64ToBuffer()
816 // FastHex32ToBuffer()
817 // ----------------------------------------------------------------------
818 
819 // Offset into buffer where FastInt64ToBuffer places the end of string
820 // null character.  Also used by FastInt64ToBufferLeft.
821 static const int kFastInt64ToBufferOffset = 21;
822 
FastInt64ToBuffer(int64 i,char * buffer)823 char *FastInt64ToBuffer(int64 i, char* buffer) {
824   // We could collapse the positive and negative sections, but that
825   // would be slightly slower for positive numbers...
826   // 22 bytes is enough to store -2**64, -18446744073709551616.
827   char* p = buffer + kFastInt64ToBufferOffset;
828   *p-- = '\0';
829   if (i >= 0) {
830     do {
831       *p-- = '0' + i % 10;
832       i /= 10;
833     } while (i > 0);
834     return p + 1;
835   } else {
836     // On different platforms, % and / have different behaviors for
837     // negative numbers, so we need to jump through hoops to make sure
838     // we don't divide negative numbers.
839     if (i > -10) {
840       i = -i;
841       *p-- = '0' + i;
842       *p = '-';
843       return p;
844     } else {
845       // Make sure we aren't at MIN_INT, in which case we can't say i = -i
846       i = i + 10;
847       i = -i;
848       *p-- = '0' + i % 10;
849       // Undo what we did a moment ago
850       i = i / 10 + 1;
851       do {
852         *p-- = '0' + i % 10;
853         i /= 10;
854       } while (i > 0);
855       *p = '-';
856       return p;
857     }
858   }
859 }
860 
861 // Offset into buffer where FastInt32ToBuffer places the end of string
862 // null character.  Also used by FastInt32ToBufferLeft
863 static const int kFastInt32ToBufferOffset = 11;
864 
865 // Yes, this is a duplicate of FastInt64ToBuffer.  But, we need this for the
866 // compiler to generate 32 bit arithmetic instructions.  It's much faster, at
867 // least with 32 bit binaries.
FastInt32ToBuffer(int32 i,char * buffer)868 char *FastInt32ToBuffer(int32 i, char* buffer) {
869   // We could collapse the positive and negative sections, but that
870   // would be slightly slower for positive numbers...
871   // 12 bytes is enough to store -2**32, -4294967296.
872   char* p = buffer + kFastInt32ToBufferOffset;
873   *p-- = '\0';
874   if (i >= 0) {
875     do {
876       *p-- = '0' + i % 10;
877       i /= 10;
878     } while (i > 0);
879     return p + 1;
880   } else {
881     // On different platforms, % and / have different behaviors for
882     // negative numbers, so we need to jump through hoops to make sure
883     // we don't divide negative numbers.
884     if (i > -10) {
885       i = -i;
886       *p-- = '0' + i;
887       *p = '-';
888       return p;
889     } else {
890       // Make sure we aren't at MIN_INT, in which case we can't say i = -i
891       i = i + 10;
892       i = -i;
893       *p-- = '0' + i % 10;
894       // Undo what we did a moment ago
895       i = i / 10 + 1;
896       do {
897         *p-- = '0' + i % 10;
898         i /= 10;
899       } while (i > 0);
900       *p = '-';
901       return p;
902     }
903   }
904 }
905 
FastHexToBuffer(int i,char * buffer)906 char *FastHexToBuffer(int i, char* buffer) {
907   GOOGLE_CHECK(i >= 0) << "FastHexToBuffer() wants non-negative integers, not " << i;
908 
909   static const char *hexdigits = "0123456789abcdef";
910   char *p = buffer + 21;
911   *p-- = '\0';
912   do {
913     *p-- = hexdigits[i & 15];   // mod by 16
914     i >>= 4;                    // divide by 16
915   } while (i > 0);
916   return p + 1;
917 }
918 
InternalFastHexToBuffer(uint64 value,char * buffer,int num_byte)919 char *InternalFastHexToBuffer(uint64 value, char* buffer, int num_byte) {
920   static const char *hexdigits = "0123456789abcdef";
921   buffer[num_byte] = '\0';
922   for (int i = num_byte - 1; i >= 0; i--) {
923 #ifdef _M_X64
924     // MSVC x64 platform has a bug optimizing the uint32(value) in the #else
925     // block. Given that the uint32 cast was to improve performance on 32-bit
926     // platforms, we use 64-bit '&' directly.
927     buffer[i] = hexdigits[value & 0xf];
928 #else
929     buffer[i] = hexdigits[uint32(value) & 0xf];
930 #endif
931     value >>= 4;
932   }
933   return buffer;
934 }
935 
FastHex64ToBuffer(uint64 value,char * buffer)936 char *FastHex64ToBuffer(uint64 value, char* buffer) {
937   return InternalFastHexToBuffer(value, buffer, 16);
938 }
939 
FastHex32ToBuffer(uint32 value,char * buffer)940 char *FastHex32ToBuffer(uint32 value, char* buffer) {
941   return InternalFastHexToBuffer(value, buffer, 8);
942 }
943 
944 // ----------------------------------------------------------------------
945 // FastInt32ToBufferLeft()
946 // FastUInt32ToBufferLeft()
947 // FastInt64ToBufferLeft()
948 // FastUInt64ToBufferLeft()
949 //
950 // Like the Fast*ToBuffer() functions above, these are intended for speed.
951 // Unlike the Fast*ToBuffer() functions, however, these functions write
952 // their output to the beginning of the buffer (hence the name, as the
953 // output is left-aligned).  The caller is responsible for ensuring that
954 // the buffer has enough space to hold the output.
955 //
956 // Returns a pointer to the end of the string (i.e. the null character
957 // terminating the string).
958 // ----------------------------------------------------------------------
959 
960 static const char two_ASCII_digits[100][2] = {
961   {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'},
962   {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
963   {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'},
964   {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
965   {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'},
966   {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
967   {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'},
968   {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
969   {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'},
970   {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
971   {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'},
972   {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
973   {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'},
974   {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
975   {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'},
976   {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
977   {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'},
978   {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
979   {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'},
980   {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'}
981 };
982 
FastUInt32ToBufferLeft(uint32 u,char * buffer)983 char* FastUInt32ToBufferLeft(uint32 u, char* buffer) {
984   uint32 digits;
985   const char *ASCII_digits = NULL;
986   // The idea of this implementation is to trim the number of divides to as few
987   // as possible by using multiplication and subtraction rather than mod (%),
988   // and by outputting two digits at a time rather than one.
989   // The huge-number case is first, in the hopes that the compiler will output
990   // that case in one branch-free block of code, and only output conditional
991   // branches into it from below.
992   if (u >= 1000000000) {  // >= 1,000,000,000
993     digits = u / 100000000;  // 100,000,000
994     ASCII_digits = two_ASCII_digits[digits];
995     buffer[0] = ASCII_digits[0];
996     buffer[1] = ASCII_digits[1];
997     buffer += 2;
998 sublt100_000_000:
999     u -= digits * 100000000;  // 100,000,000
1000 lt100_000_000:
1001     digits = u / 1000000;  // 1,000,000
1002     ASCII_digits = two_ASCII_digits[digits];
1003     buffer[0] = ASCII_digits[0];
1004     buffer[1] = ASCII_digits[1];
1005     buffer += 2;
1006 sublt1_000_000:
1007     u -= digits * 1000000;  // 1,000,000
1008 lt1_000_000:
1009     digits = u / 10000;  // 10,000
1010     ASCII_digits = two_ASCII_digits[digits];
1011     buffer[0] = ASCII_digits[0];
1012     buffer[1] = ASCII_digits[1];
1013     buffer += 2;
1014 sublt10_000:
1015     u -= digits * 10000;  // 10,000
1016 lt10_000:
1017     digits = u / 100;
1018     ASCII_digits = two_ASCII_digits[digits];
1019     buffer[0] = ASCII_digits[0];
1020     buffer[1] = ASCII_digits[1];
1021     buffer += 2;
1022 sublt100:
1023     u -= digits * 100;
1024 lt100:
1025     digits = u;
1026     ASCII_digits = two_ASCII_digits[digits];
1027     buffer[0] = ASCII_digits[0];
1028     buffer[1] = ASCII_digits[1];
1029     buffer += 2;
1030 done:
1031     *buffer = 0;
1032     return buffer;
1033   }
1034 
1035   if (u < 100) {
1036     digits = u;
1037     if (u >= 10) goto lt100;
1038     *buffer++ = '0' + digits;
1039     goto done;
1040   }
1041   if (u  <  10000) {   // 10,000
1042     if (u >= 1000) goto lt10_000;
1043     digits = u / 100;
1044     *buffer++ = '0' + digits;
1045     goto sublt100;
1046   }
1047   if (u  <  1000000) {   // 1,000,000
1048     if (u >= 100000) goto lt1_000_000;
1049     digits = u / 10000;  //    10,000
1050     *buffer++ = '0' + digits;
1051     goto sublt10_000;
1052   }
1053   if (u  <  100000000) {   // 100,000,000
1054     if (u >= 10000000) goto lt100_000_000;
1055     digits = u / 1000000;  //   1,000,000
1056     *buffer++ = '0' + digits;
1057     goto sublt1_000_000;
1058   }
1059   // we already know that u < 1,000,000,000
1060   digits = u / 100000000;   // 100,000,000
1061   *buffer++ = '0' + digits;
1062   goto sublt100_000_000;
1063 }
1064 
FastInt32ToBufferLeft(int32 i,char * buffer)1065 char* FastInt32ToBufferLeft(int32 i, char* buffer) {
1066   uint32 u = i;
1067   if (i < 0) {
1068     *buffer++ = '-';
1069     u = -i;
1070   }
1071   return FastUInt32ToBufferLeft(u, buffer);
1072 }
1073 
FastUInt64ToBufferLeft(uint64 u64,char * buffer)1074 char* FastUInt64ToBufferLeft(uint64 u64, char* buffer) {
1075   int digits;
1076   const char *ASCII_digits = NULL;
1077 
1078   uint32 u = static_cast<uint32>(u64);
1079   if (u == u64) return FastUInt32ToBufferLeft(u, buffer);
1080 
1081   uint64 top_11_digits = u64 / 1000000000;
1082   buffer = FastUInt64ToBufferLeft(top_11_digits, buffer);
1083   u = u64 - (top_11_digits * 1000000000);
1084 
1085   digits = u / 10000000;  // 10,000,000
1086   GOOGLE_DCHECK_LT(digits, 100);
1087   ASCII_digits = two_ASCII_digits[digits];
1088   buffer[0] = ASCII_digits[0];
1089   buffer[1] = ASCII_digits[1];
1090   buffer += 2;
1091   u -= digits * 10000000;  // 10,000,000
1092   digits = u / 100000;  // 100,000
1093   ASCII_digits = two_ASCII_digits[digits];
1094   buffer[0] = ASCII_digits[0];
1095   buffer[1] = ASCII_digits[1];
1096   buffer += 2;
1097   u -= digits * 100000;  // 100,000
1098   digits = u / 1000;  // 1,000
1099   ASCII_digits = two_ASCII_digits[digits];
1100   buffer[0] = ASCII_digits[0];
1101   buffer[1] = ASCII_digits[1];
1102   buffer += 2;
1103   u -= digits * 1000;  // 1,000
1104   digits = u / 10;
1105   ASCII_digits = two_ASCII_digits[digits];
1106   buffer[0] = ASCII_digits[0];
1107   buffer[1] = ASCII_digits[1];
1108   buffer += 2;
1109   u -= digits * 10;
1110   digits = u;
1111   *buffer++ = '0' + digits;
1112   *buffer = 0;
1113   return buffer;
1114 }
1115 
FastInt64ToBufferLeft(int64 i,char * buffer)1116 char* FastInt64ToBufferLeft(int64 i, char* buffer) {
1117   uint64 u = i;
1118   if (i < 0) {
1119     *buffer++ = '-';
1120     u = -i;
1121   }
1122   return FastUInt64ToBufferLeft(u, buffer);
1123 }
1124 
1125 // ----------------------------------------------------------------------
1126 // SimpleItoa()
1127 //    Description: converts an integer to a string.
1128 //
1129 //    Return value: string
1130 // ----------------------------------------------------------------------
1131 
SimpleItoa(int i)1132 string SimpleItoa(int i) {
1133   char buffer[kFastToBufferSize];
1134   return (sizeof(i) == 4) ?
1135     FastInt32ToBuffer(i, buffer) :
1136     FastInt64ToBuffer(i, buffer);
1137 }
1138 
SimpleItoa(unsigned int i)1139 string SimpleItoa(unsigned int i) {
1140   char buffer[kFastToBufferSize];
1141   return string(buffer, (sizeof(i) == 4) ?
1142     FastUInt32ToBufferLeft(i, buffer) :
1143     FastUInt64ToBufferLeft(i, buffer));
1144 }
1145 
SimpleItoa(long i)1146 string SimpleItoa(long i) {
1147   char buffer[kFastToBufferSize];
1148   return (sizeof(i) == 4) ?
1149     FastInt32ToBuffer(i, buffer) :
1150     FastInt64ToBuffer(i, buffer);
1151 }
1152 
SimpleItoa(unsigned long i)1153 string SimpleItoa(unsigned long i) {
1154   char buffer[kFastToBufferSize];
1155   return string(buffer, (sizeof(i) == 4) ?
1156     FastUInt32ToBufferLeft(i, buffer) :
1157     FastUInt64ToBufferLeft(i, buffer));
1158 }
1159 
SimpleItoa(long long i)1160 string SimpleItoa(long long i) {
1161   char buffer[kFastToBufferSize];
1162   return (sizeof(i) == 4) ?
1163     FastInt32ToBuffer(i, buffer) :
1164     FastInt64ToBuffer(i, buffer);
1165 }
1166 
SimpleItoa(unsigned long long i)1167 string SimpleItoa(unsigned long long i) {
1168   char buffer[kFastToBufferSize];
1169   return string(buffer, (sizeof(i) == 4) ?
1170     FastUInt32ToBufferLeft(i, buffer) :
1171     FastUInt64ToBufferLeft(i, buffer));
1172 }
1173 
1174 // ----------------------------------------------------------------------
1175 // SimpleDtoa()
1176 // SimpleFtoa()
1177 // DoubleToBuffer()
1178 // FloatToBuffer()
1179 //    We want to print the value without losing precision, but we also do
1180 //    not want to print more digits than necessary.  This turns out to be
1181 //    trickier than it sounds.  Numbers like 0.2 cannot be represented
1182 //    exactly in binary.  If we print 0.2 with a very large precision,
1183 //    e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167".
1184 //    On the other hand, if we set the precision too low, we lose
1185 //    significant digits when printing numbers that actually need them.
1186 //    It turns out there is no precision value that does the right thing
1187 //    for all numbers.
1188 //
1189 //    Our strategy is to first try printing with a precision that is never
1190 //    over-precise, then parse the result with strtod() to see if it
1191 //    matches.  If not, we print again with a precision that will always
1192 //    give a precise result, but may use more digits than necessary.
1193 //
1194 //    An arguably better strategy would be to use the algorithm described
1195 //    in "How to Print Floating-Point Numbers Accurately" by Steele &
1196 //    White, e.g. as implemented by David M. Gay's dtoa().  It turns out,
1197 //    however, that the following implementation is about as fast as
1198 //    DMG's code.  Furthermore, DMG's code locks mutexes, which means it
1199 //    will not scale well on multi-core machines.  DMG's code is slightly
1200 //    more accurate (in that it will never use more digits than
1201 //    necessary), but this is probably irrelevant for most users.
1202 //
1203 //    Rob Pike and Ken Thompson also have an implementation of dtoa() in
1204 //    third_party/fmt/fltfmt.cc.  Their implementation is similar to this
1205 //    one in that it makes guesses and then uses strtod() to check them.
1206 //    Their implementation is faster because they use their own code to
1207 //    generate the digits in the first place rather than use snprintf(),
1208 //    thus avoiding format string parsing overhead.  However, this makes
1209 //    it considerably more complicated than the following implementation,
1210 //    and it is embedded in a larger library.  If speed turns out to be
1211 //    an issue, we could re-implement this in terms of their
1212 //    implementation.
1213 // ----------------------------------------------------------------------
1214 
SimpleDtoa(double value)1215 string SimpleDtoa(double value) {
1216   char buffer[kDoubleToBufferSize];
1217   return DoubleToBuffer(value, buffer);
1218 }
1219 
SimpleFtoa(float value)1220 string SimpleFtoa(float value) {
1221   char buffer[kFloatToBufferSize];
1222   return FloatToBuffer(value, buffer);
1223 }
1224 
IsValidFloatChar(char c)1225 static inline bool IsValidFloatChar(char c) {
1226   return ('0' <= c && c <= '9') ||
1227          c == 'e' || c == 'E' ||
1228          c == '+' || c == '-';
1229 }
1230 
DelocalizeRadix(char * buffer)1231 void DelocalizeRadix(char* buffer) {
1232   // Fast check:  if the buffer has a normal decimal point, assume no
1233   // translation is needed.
1234   if (strchr(buffer, '.') != NULL) return;
1235 
1236   // Find the first unknown character.
1237   while (IsValidFloatChar(*buffer)) ++buffer;
1238 
1239   if (*buffer == '\0') {
1240     // No radix character found.
1241     return;
1242   }
1243 
1244   // We are now pointing at the locale-specific radix character.  Replace it
1245   // with '.'.
1246   *buffer = '.';
1247   ++buffer;
1248 
1249   if (!IsValidFloatChar(*buffer) && *buffer != '\0') {
1250     // It appears the radix was a multi-byte character.  We need to remove the
1251     // extra bytes.
1252     char* target = buffer;
1253     do { ++buffer; } while (!IsValidFloatChar(*buffer) && *buffer != '\0');
1254     memmove(target, buffer, strlen(buffer) + 1);
1255   }
1256 }
1257 
DoubleToBuffer(double value,char * buffer)1258 char* DoubleToBuffer(double value, char* buffer) {
1259   // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all
1260   // platforms these days.  Just in case some system exists where DBL_DIG
1261   // is significantly larger -- and risks overflowing our buffer -- we have
1262   // this assert.
1263   GOOGLE_COMPILE_ASSERT(DBL_DIG < 20, DBL_DIG_is_too_big);
1264 
1265   if (value == std::numeric_limits<double>::infinity()) {
1266     strcpy(buffer, "inf");
1267     return buffer;
1268   } else if (value == -std::numeric_limits<double>::infinity()) {
1269     strcpy(buffer, "-inf");
1270     return buffer;
1271   } else if (MathLimits<double>::IsNaN(value)) {
1272     strcpy(buffer, "nan");
1273     return buffer;
1274   }
1275 
1276   int snprintf_result =
1277     snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value);
1278 
1279   // The snprintf should never overflow because the buffer is significantly
1280   // larger than the precision we asked for.
1281   GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1282 
1283   // We need to make parsed_value volatile in order to force the compiler to
1284   // write it out to the stack.  Otherwise, it may keep the value in a
1285   // register, and if it does that, it may keep it as a long double instead
1286   // of a double.  This long double may have extra bits that make it compare
1287   // unequal to "value" even though it would be exactly equal if it were
1288   // truncated to a double.
1289   volatile double parsed_value = strtod(buffer, NULL);
1290   if (parsed_value != value) {
1291     int snprintf_result =
1292       snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+2, value);
1293 
1294     // Should never overflow; see above.
1295     GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1296   }
1297 
1298   DelocalizeRadix(buffer);
1299   return buffer;
1300 }
1301 
memcasecmp(const char * s1,const char * s2,size_t len)1302 static int memcasecmp(const char *s1, const char *s2, size_t len) {
1303   const unsigned char *us1 = reinterpret_cast<const unsigned char *>(s1);
1304   const unsigned char *us2 = reinterpret_cast<const unsigned char *>(s2);
1305 
1306   for ( int i = 0; i < len; i++ ) {
1307     const int diff =
1308       static_cast<int>(static_cast<unsigned char>(ascii_tolower(us1[i]))) -
1309       static_cast<int>(static_cast<unsigned char>(ascii_tolower(us2[i])));
1310     if (diff != 0) return diff;
1311   }
1312   return 0;
1313 }
1314 
CaseEqual(StringPiece s1,StringPiece s2)1315 inline bool CaseEqual(StringPiece s1, StringPiece s2) {
1316   if (s1.size() != s2.size()) return false;
1317   return memcasecmp(s1.data(), s2.data(), s1.size()) == 0;
1318 }
1319 
safe_strtob(StringPiece str,bool * value)1320 bool safe_strtob(StringPiece str, bool* value) {
1321   GOOGLE_CHECK(value != NULL) << "NULL output boolean given.";
1322   if (CaseEqual(str, "true") || CaseEqual(str, "t") ||
1323       CaseEqual(str, "yes") || CaseEqual(str, "y") ||
1324       CaseEqual(str, "1")) {
1325     *value = true;
1326     return true;
1327   }
1328   if (CaseEqual(str, "false") || CaseEqual(str, "f") ||
1329       CaseEqual(str, "no") || CaseEqual(str, "n") ||
1330       CaseEqual(str, "0")) {
1331     *value = false;
1332     return true;
1333   }
1334   return false;
1335 }
1336 
safe_strtof(const char * str,float * value)1337 bool safe_strtof(const char* str, float* value) {
1338   char* endptr;
1339   errno = 0;  // errno only gets set on errors
1340 #if defined(_WIN32) || defined (__hpux)  // has no strtof()
1341   *value = strtod(str, &endptr);
1342 #else
1343   *value = strtof(str, &endptr);
1344 #endif
1345   return *str != 0 && *endptr == 0 && errno == 0;
1346 }
1347 
safe_strtod(const char * str,double * value)1348 bool safe_strtod(const char* str, double* value) {
1349   char* endptr;
1350   *value = strtod(str, &endptr);
1351   if (endptr != str) {
1352     while (ascii_isspace(*endptr)) ++endptr;
1353   }
1354   // Ignore range errors from strtod.  The values it
1355   // returns on underflow and overflow are the right
1356   // fallback in a robust setting.
1357   return *str != '\0' && *endptr == '\0';
1358 }
1359 
safe_strto32(const string & str,int32 * value)1360 bool safe_strto32(const string& str, int32* value) {
1361   return safe_int_internal(str, value);
1362 }
1363 
safe_strtou32(const string & str,uint32 * value)1364 bool safe_strtou32(const string& str, uint32* value) {
1365   return safe_uint_internal(str, value);
1366 }
1367 
safe_strto64(const string & str,int64 * value)1368 bool safe_strto64(const string& str, int64* value) {
1369   return safe_int_internal(str, value);
1370 }
1371 
safe_strtou64(const string & str,uint64 * value)1372 bool safe_strtou64(const string& str, uint64* value) {
1373   return safe_uint_internal(str, value);
1374 }
1375 
FloatToBuffer(float value,char * buffer)1376 char* FloatToBuffer(float value, char* buffer) {
1377   // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
1378   // platforms these days.  Just in case some system exists where FLT_DIG
1379   // is significantly larger -- and risks overflowing our buffer -- we have
1380   // this assert.
1381   GOOGLE_COMPILE_ASSERT(FLT_DIG < 10, FLT_DIG_is_too_big);
1382 
1383   if (value == std::numeric_limits<double>::infinity()) {
1384     strcpy(buffer, "inf");
1385     return buffer;
1386   } else if (value == -std::numeric_limits<double>::infinity()) {
1387     strcpy(buffer, "-inf");
1388     return buffer;
1389   } else if (MathLimits<float>::IsNaN(value)) {
1390     strcpy(buffer, "nan");
1391     return buffer;
1392   }
1393 
1394   int snprintf_result =
1395     snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value);
1396 
1397   // The snprintf should never overflow because the buffer is significantly
1398   // larger than the precision we asked for.
1399   GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1400 
1401   float parsed_value;
1402   if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) {
1403     int snprintf_result =
1404       snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+3, value);
1405 
1406     // Should never overflow; see above.
1407     GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1408   }
1409 
1410   DelocalizeRadix(buffer);
1411   return buffer;
1412 }
1413 
1414 namespace strings {
1415 
AlphaNum(strings::Hex hex)1416 AlphaNum::AlphaNum(strings::Hex hex) {
1417   char *const end = &digits[kFastToBufferSize];
1418   char *writer = end;
1419   uint64 value = hex.value;
1420   uint64 width = hex.spec;
1421   // We accomplish minimum width by OR'ing in 0x10000 to the user's value,
1422   // where 0x10000 is the smallest hex number that is as wide as the user
1423   // asked for.
1424   uint64 mask = ((static_cast<uint64>(1) << (width - 1) * 4)) | value;
1425   static const char hexdigits[] = "0123456789abcdef";
1426   do {
1427     *--writer = hexdigits[value & 0xF];
1428     value >>= 4;
1429     mask >>= 4;
1430   } while (mask != 0);
1431   piece_data_ = writer;
1432   piece_size_ = end - writer;
1433 }
1434 
1435 }  // namespace strings
1436 
1437 // ----------------------------------------------------------------------
1438 // StrCat()
1439 //    This merges the given strings or integers, with no delimiter.  This
1440 //    is designed to be the fastest possible way to construct a string out
1441 //    of a mix of raw C strings, C++ strings, and integer values.
1442 // ----------------------------------------------------------------------
1443 
1444 // Append is merely a version of memcpy that returns the address of the byte
1445 // after the area just overwritten.  It comes in multiple flavors to minimize
1446 // call overhead.
Append1(char * out,const AlphaNum & x)1447 static char *Append1(char *out, const AlphaNum &x) {
1448   memcpy(out, x.data(), x.size());
1449   return out + x.size();
1450 }
1451 
Append2(char * out,const AlphaNum & x1,const AlphaNum & x2)1452 static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) {
1453   memcpy(out, x1.data(), x1.size());
1454   out += x1.size();
1455 
1456   memcpy(out, x2.data(), x2.size());
1457   return out + x2.size();
1458 }
1459 
Append4(char * out,const AlphaNum & x1,const AlphaNum & x2,const AlphaNum & x3,const AlphaNum & x4)1460 static char *Append4(char *out,
1461                      const AlphaNum &x1, const AlphaNum &x2,
1462                      const AlphaNum &x3, const AlphaNum &x4) {
1463   memcpy(out, x1.data(), x1.size());
1464   out += x1.size();
1465 
1466   memcpy(out, x2.data(), x2.size());
1467   out += x2.size();
1468 
1469   memcpy(out, x3.data(), x3.size());
1470   out += x3.size();
1471 
1472   memcpy(out, x4.data(), x4.size());
1473   return out + x4.size();
1474 }
1475 
StrCat(const AlphaNum & a,const AlphaNum & b)1476 string StrCat(const AlphaNum &a, const AlphaNum &b) {
1477   string result;
1478   result.resize(a.size() + b.size());
1479   char *const begin = &*result.begin();
1480   char *out = Append2(begin, a, b);
1481   GOOGLE_DCHECK_EQ(out, begin + result.size());
1482   return result;
1483 }
1484 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1485 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1486   string result;
1487   result.resize(a.size() + b.size() + c.size());
1488   char *const begin = &*result.begin();
1489   char *out = Append2(begin, a, b);
1490   out = Append1(out, c);
1491   GOOGLE_DCHECK_EQ(out, begin + result.size());
1492   return result;
1493 }
1494 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1495 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1496               const AlphaNum &d) {
1497   string result;
1498   result.resize(a.size() + b.size() + c.size() + d.size());
1499   char *const begin = &*result.begin();
1500   char *out = Append4(begin, a, b, c, d);
1501   GOOGLE_DCHECK_EQ(out, begin + result.size());
1502   return result;
1503 }
1504 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e)1505 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1506               const AlphaNum &d, const AlphaNum &e) {
1507   string result;
1508   result.resize(a.size() + b.size() + c.size() + d.size() + e.size());
1509   char *const begin = &*result.begin();
1510   char *out = Append4(begin, a, b, c, d);
1511   out = Append1(out, e);
1512   GOOGLE_DCHECK_EQ(out, begin + result.size());
1513   return result;
1514 }
1515 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f)1516 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1517               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) {
1518   string result;
1519   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1520                 f.size());
1521   char *const begin = &*result.begin();
1522   char *out = Append4(begin, a, b, c, d);
1523   out = Append2(out, e, f);
1524   GOOGLE_DCHECK_EQ(out, begin + result.size());
1525   return result;
1526 }
1527 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g)1528 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1529               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1530               const AlphaNum &g) {
1531   string result;
1532   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1533                 f.size() + g.size());
1534   char *const begin = &*result.begin();
1535   char *out = Append4(begin, a, b, c, d);
1536   out = Append2(out, e, f);
1537   out = Append1(out, g);
1538   GOOGLE_DCHECK_EQ(out, begin + result.size());
1539   return result;
1540 }
1541 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h)1542 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1543               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1544               const AlphaNum &g, const AlphaNum &h) {
1545   string result;
1546   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1547                 f.size() + g.size() + h.size());
1548   char *const begin = &*result.begin();
1549   char *out = Append4(begin, a, b, c, d);
1550   out = Append4(out, e, f, g, h);
1551   GOOGLE_DCHECK_EQ(out, begin + result.size());
1552   return result;
1553 }
1554 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h,const AlphaNum & i)1555 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1556               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1557               const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) {
1558   string result;
1559   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1560                 f.size() + g.size() + h.size() + i.size());
1561   char *const begin = &*result.begin();
1562   char *out = Append4(begin, a, b, c, d);
1563   out = Append4(out, e, f, g, h);
1564   out = Append1(out, i);
1565   GOOGLE_DCHECK_EQ(out, begin + result.size());
1566   return result;
1567 }
1568 
1569 // It's possible to call StrAppend with a char * pointer that is partway into
1570 // the string we're appending to.  However the results of this are random.
1571 // Therefore, check for this in debug mode.  Use unsigned math so we only have
1572 // to do one comparison.
1573 #define GOOGLE_DCHECK_NO_OVERLAP(dest, src) \
1574     GOOGLE_DCHECK_GT(uintptr_t((src).data() - (dest).data()), \
1575                      uintptr_t((dest).size()))
1576 
StrAppend(string * result,const AlphaNum & a)1577 void StrAppend(string *result, const AlphaNum &a) {
1578   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1579   result->append(a.data(), a.size());
1580 }
1581 
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b)1582 void StrAppend(string *result, const AlphaNum &a, const AlphaNum &b) {
1583   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1584   GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1585   string::size_type old_size = result->size();
1586   result->resize(old_size + a.size() + b.size());
1587   char *const begin = &*result->begin();
1588   char *out = Append2(begin + old_size, a, b);
1589   GOOGLE_DCHECK_EQ(out, begin + result->size());
1590 }
1591 
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1592 void StrAppend(string *result,
1593                const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1594   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1595   GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1596   GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1597   string::size_type old_size = result->size();
1598   result->resize(old_size + a.size() + b.size() + c.size());
1599   char *const begin = &*result->begin();
1600   char *out = Append2(begin + old_size, a, b);
1601   out = Append1(out, c);
1602   GOOGLE_DCHECK_EQ(out, begin + result->size());
1603 }
1604 
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1605 void StrAppend(string *result,
1606                const AlphaNum &a, const AlphaNum &b,
1607                const AlphaNum &c, const AlphaNum &d) {
1608   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1609   GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1610   GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1611   GOOGLE_DCHECK_NO_OVERLAP(*result, d);
1612   string::size_type old_size = result->size();
1613   result->resize(old_size + a.size() + b.size() + c.size() + d.size());
1614   char *const begin = &*result->begin();
1615   char *out = Append4(begin + old_size, a, b, c, d);
1616   GOOGLE_DCHECK_EQ(out, begin + result->size());
1617 }
1618 
GlobalReplaceSubstring(const string & substring,const string & replacement,string * s)1619 int GlobalReplaceSubstring(const string& substring,
1620                            const string& replacement,
1621                            string* s) {
1622   GOOGLE_CHECK(s != NULL);
1623   if (s->empty() || substring.empty())
1624     return 0;
1625   string tmp;
1626   int num_replacements = 0;
1627   int pos = 0;
1628   for (int match_pos = s->find(substring.data(), pos, substring.length());
1629        match_pos != string::npos;
1630        pos = match_pos + substring.length(),
1631            match_pos = s->find(substring.data(), pos, substring.length())) {
1632     ++num_replacements;
1633     // Append the original content before the match.
1634     tmp.append(*s, pos, match_pos - pos);
1635     // Append the replacement for the match.
1636     tmp.append(replacement.begin(), replacement.end());
1637   }
1638   // Append the content after the last match. If no replacements were made, the
1639   // original string is left untouched.
1640   if (num_replacements > 0) {
1641     tmp.append(*s, pos, s->length() - pos);
1642     s->swap(tmp);
1643   }
1644   return num_replacements;
1645 }
1646 
CalculateBase64EscapedLen(int input_len,bool do_padding)1647 int CalculateBase64EscapedLen(int input_len, bool do_padding) {
1648   // Base64 encodes three bytes of input at a time. If the input is not
1649   // divisible by three, we pad as appropriate.
1650   //
1651   // (from http://tools.ietf.org/html/rfc3548)
1652   // Special processing is performed if fewer than 24 bits are available
1653   // at the end of the data being encoded.  A full encoding quantum is
1654   // always completed at the end of a quantity.  When fewer than 24 input
1655   // bits are available in an input group, zero bits are added (on the
1656   // right) to form an integral number of 6-bit groups.  Padding at the
1657   // end of the data is performed using the '=' character.  Since all base
1658   // 64 input is an integral number of octets, only the following cases
1659   // can arise:
1660 
1661 
1662   // Base64 encodes each three bytes of input into four bytes of output.
1663   int len = (input_len / 3) * 4;
1664 
1665   if (input_len % 3 == 0) {
1666     // (from http://tools.ietf.org/html/rfc3548)
1667     // (1) the final quantum of encoding input is an integral multiple of 24
1668     // bits; here, the final unit of encoded output will be an integral
1669     // multiple of 4 characters with no "=" padding,
1670   } else if (input_len % 3 == 1) {
1671     // (from http://tools.ietf.org/html/rfc3548)
1672     // (2) the final quantum of encoding input is exactly 8 bits; here, the
1673     // final unit of encoded output will be two characters followed by two
1674     // "=" padding characters, or
1675     len += 2;
1676     if (do_padding) {
1677       len += 2;
1678     }
1679   } else {  // (input_len % 3 == 2)
1680     // (from http://tools.ietf.org/html/rfc3548)
1681     // (3) the final quantum of encoding input is exactly 16 bits; here, the
1682     // final unit of encoded output will be three characters followed by one
1683     // "=" padding character.
1684     len += 3;
1685     if (do_padding) {
1686       len += 1;
1687     }
1688   }
1689 
1690   assert(len >= input_len);  // make sure we didn't overflow
1691   return len;
1692 }
1693 
1694 // Base64Escape does padding, so this calculation includes padding.
CalculateBase64EscapedLen(int input_len)1695 int CalculateBase64EscapedLen(int input_len) {
1696   return CalculateBase64EscapedLen(input_len, true);
1697 }
1698 
1699 // ----------------------------------------------------------------------
1700 // int Base64Unescape() - base64 decoder
1701 // int Base64Escape() - base64 encoder
1702 // int WebSafeBase64Unescape() - Google's variation of base64 decoder
1703 // int WebSafeBase64Escape() - Google's variation of base64 encoder
1704 //
1705 // Check out
1706 // http://tools.ietf.org/html/rfc2045 for formal description, but what we
1707 // care about is that...
1708 //   Take the encoded stuff in groups of 4 characters and turn each
1709 //   character into a code 0 to 63 thus:
1710 //           A-Z map to 0 to 25
1711 //           a-z map to 26 to 51
1712 //           0-9 map to 52 to 61
1713 //           +(- for WebSafe) maps to 62
1714 //           /(_ for WebSafe) maps to 63
1715 //   There will be four numbers, all less than 64 which can be represented
1716 //   by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
1717 //   Arrange the 6 digit binary numbers into three bytes as such:
1718 //   aaaaaabb bbbbcccc ccdddddd
1719 //   Equals signs (one or two) are used at the end of the encoded block to
1720 //   indicate that the text was not an integer multiple of three bytes long.
1721 // ----------------------------------------------------------------------
1722 
Base64UnescapeInternal(const char * src_param,int szsrc,char * dest,int szdest,const signed char * unbase64)1723 int Base64UnescapeInternal(const char *src_param, int szsrc,
1724                            char *dest, int szdest,
1725                            const signed char* unbase64) {
1726   static const char kPad64Equals = '=';
1727   static const char kPad64Dot = '.';
1728 
1729   int decode = 0;
1730   int destidx = 0;
1731   int state = 0;
1732   unsigned int ch = 0;
1733   unsigned int temp = 0;
1734 
1735   // If "char" is signed by default, using *src as an array index results in
1736   // accessing negative array elements. Treat the input as a pointer to
1737   // unsigned char to avoid this.
1738   const unsigned char *src = reinterpret_cast<const unsigned char*>(src_param);
1739 
1740   // The GET_INPUT macro gets the next input character, skipping
1741   // over any whitespace, and stopping when we reach the end of the
1742   // string or when we read any non-data character.  The arguments are
1743   // an arbitrary identifier (used as a label for goto) and the number
1744   // of data bytes that must remain in the input to avoid aborting the
1745   // loop.
1746 #define GET_INPUT(label, remain)                 \
1747   label:                                         \
1748     --szsrc;                                     \
1749     ch = *src++;                                 \
1750     decode = unbase64[ch];                       \
1751     if (decode < 0) {                            \
1752       if (ascii_isspace(ch) && szsrc >= remain)  \
1753         goto label;                              \
1754       state = 4 - remain;                        \
1755       break;                                     \
1756     }
1757 
1758   // if dest is null, we're just checking to see if it's legal input
1759   // rather than producing output.  (I suspect this could just be done
1760   // with a regexp...).  We duplicate the loop so this test can be
1761   // outside it instead of in every iteration.
1762 
1763   if (dest) {
1764     // This loop consumes 4 input bytes and produces 3 output bytes
1765     // per iteration.  We can't know at the start that there is enough
1766     // data left in the string for a full iteration, so the loop may
1767     // break out in the middle; if so 'state' will be set to the
1768     // number of input bytes read.
1769 
1770     while (szsrc >= 4)  {
1771       // We'll start by optimistically assuming that the next four
1772       // bytes of the string (src[0..3]) are four good data bytes
1773       // (that is, no nulls, whitespace, padding chars, or illegal
1774       // chars).  We need to test src[0..2] for nulls individually
1775       // before constructing temp to preserve the property that we
1776       // never read past a null in the string (no matter how long
1777       // szsrc claims the string is).
1778 
1779       if (!src[0] || !src[1] || !src[2] ||
1780           (temp = ((unsigned(unbase64[src[0]]) << 18) |
1781                    (unsigned(unbase64[src[1]]) << 12) |
1782                    (unsigned(unbase64[src[2]]) << 6) |
1783                    (unsigned(unbase64[src[3]])))) & 0x80000000) {
1784         // Iff any of those four characters was bad (null, illegal,
1785         // whitespace, padding), then temp's high bit will be set
1786         // (because unbase64[] is -1 for all bad characters).
1787         //
1788         // We'll back up and resort to the slower decoder, which knows
1789         // how to handle those cases.
1790 
1791         GET_INPUT(first, 4);
1792         temp = decode;
1793         GET_INPUT(second, 3);
1794         temp = (temp << 6) | decode;
1795         GET_INPUT(third, 2);
1796         temp = (temp << 6) | decode;
1797         GET_INPUT(fourth, 1);
1798         temp = (temp << 6) | decode;
1799       } else {
1800         // We really did have four good data bytes, so advance four
1801         // characters in the string.
1802 
1803         szsrc -= 4;
1804         src += 4;
1805         decode = -1;
1806         ch = '\0';
1807       }
1808 
1809       // temp has 24 bits of input, so write that out as three bytes.
1810 
1811       if (destidx+3 > szdest) return -1;
1812       dest[destidx+2] = temp;
1813       temp >>= 8;
1814       dest[destidx+1] = temp;
1815       temp >>= 8;
1816       dest[destidx] = temp;
1817       destidx += 3;
1818     }
1819   } else {
1820     while (szsrc >= 4)  {
1821       if (!src[0] || !src[1] || !src[2] ||
1822           (temp = ((unsigned(unbase64[src[0]]) << 18) |
1823                    (unsigned(unbase64[src[1]]) << 12) |
1824                    (unsigned(unbase64[src[2]]) << 6) |
1825                    (unsigned(unbase64[src[3]])))) & 0x80000000) {
1826         GET_INPUT(first_no_dest, 4);
1827         GET_INPUT(second_no_dest, 3);
1828         GET_INPUT(third_no_dest, 2);
1829         GET_INPUT(fourth_no_dest, 1);
1830       } else {
1831         szsrc -= 4;
1832         src += 4;
1833         decode = -1;
1834         ch = '\0';
1835       }
1836       destidx += 3;
1837     }
1838   }
1839 
1840 #undef GET_INPUT
1841 
1842   // if the loop terminated because we read a bad character, return
1843   // now.
1844   if (decode < 0 && ch != '\0' &&
1845       ch != kPad64Equals && ch != kPad64Dot && !ascii_isspace(ch))
1846     return -1;
1847 
1848   if (ch == kPad64Equals || ch == kPad64Dot) {
1849     // if we stopped by hitting an '=' or '.', un-read that character -- we'll
1850     // look at it again when we count to check for the proper number of
1851     // equals signs at the end.
1852     ++szsrc;
1853     --src;
1854   } else {
1855     // This loop consumes 1 input byte per iteration.  It's used to
1856     // clean up the 0-3 input bytes remaining when the first, faster
1857     // loop finishes.  'temp' contains the data from 'state' input
1858     // characters read by the first loop.
1859     while (szsrc > 0)  {
1860       --szsrc;
1861       ch = *src++;
1862       decode = unbase64[ch];
1863       if (decode < 0) {
1864         if (ascii_isspace(ch)) {
1865           continue;
1866         } else if (ch == '\0') {
1867           break;
1868         } else if (ch == kPad64Equals || ch == kPad64Dot) {
1869           // back up one character; we'll read it again when we check
1870           // for the correct number of pad characters at the end.
1871           ++szsrc;
1872           --src;
1873           break;
1874         } else {
1875           return -1;
1876         }
1877       }
1878 
1879       // Each input character gives us six bits of output.
1880       temp = (temp << 6) | decode;
1881       ++state;
1882       if (state == 4) {
1883         // If we've accumulated 24 bits of output, write that out as
1884         // three bytes.
1885         if (dest) {
1886           if (destidx+3 > szdest) return -1;
1887           dest[destidx+2] = temp;
1888           temp >>= 8;
1889           dest[destidx+1] = temp;
1890           temp >>= 8;
1891           dest[destidx] = temp;
1892         }
1893         destidx += 3;
1894         state = 0;
1895         temp = 0;
1896       }
1897     }
1898   }
1899 
1900   // Process the leftover data contained in 'temp' at the end of the input.
1901   int expected_equals = 0;
1902   switch (state) {
1903     case 0:
1904       // Nothing left over; output is a multiple of 3 bytes.
1905       break;
1906 
1907     case 1:
1908       // Bad input; we have 6 bits left over.
1909       return -1;
1910 
1911     case 2:
1912       // Produce one more output byte from the 12 input bits we have left.
1913       if (dest) {
1914         if (destidx+1 > szdest) return -1;
1915         temp >>= 4;
1916         dest[destidx] = temp;
1917       }
1918       ++destidx;
1919       expected_equals = 2;
1920       break;
1921 
1922     case 3:
1923       // Produce two more output bytes from the 18 input bits we have left.
1924       if (dest) {
1925         if (destidx+2 > szdest) return -1;
1926         temp >>= 2;
1927         dest[destidx+1] = temp;
1928         temp >>= 8;
1929         dest[destidx] = temp;
1930       }
1931       destidx += 2;
1932       expected_equals = 1;
1933       break;
1934 
1935     default:
1936       // state should have no other values at this point.
1937       GOOGLE_LOG(FATAL) << "This can't happen; base64 decoder state = " << state;
1938   }
1939 
1940   // The remainder of the string should be all whitespace, mixed with
1941   // exactly 0 equals signs, or exactly 'expected_equals' equals
1942   // signs.  (Always accepting 0 equals signs is a google extension
1943   // not covered in the RFC, as is accepting dot as the pad character.)
1944 
1945   int equals = 0;
1946   while (szsrc > 0 && *src) {
1947     if (*src == kPad64Equals || *src == kPad64Dot)
1948       ++equals;
1949     else if (!ascii_isspace(*src))
1950       return -1;
1951     --szsrc;
1952     ++src;
1953   }
1954 
1955   return (equals == 0 || equals == expected_equals) ? destidx : -1;
1956 }
1957 
1958 // The arrays below were generated by the following code
1959 // #include <sys/time.h>
1960 // #include <stdlib.h>
1961 // #include <string.h>
1962 // main()
1963 // {
1964 //   static const char Base64[] =
1965 //     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
1966 //   char *pos;
1967 //   int idx, i, j;
1968 //   printf("    ");
1969 //   for (i = 0; i < 255; i += 8) {
1970 //     for (j = i; j < i + 8; j++) {
1971 //       pos = strchr(Base64, j);
1972 //       if ((pos == NULL) || (j == 0))
1973 //         idx = -1;
1974 //       else
1975 //         idx = pos - Base64;
1976 //       if (idx == -1)
1977 //         printf(" %2d,     ", idx);
1978 //       else
1979 //         printf(" %2d/*%c*/,", idx, j);
1980 //     }
1981 //     printf("\n    ");
1982 //   }
1983 // }
1984 //
1985 // where the value of "Base64[]" was replaced by one of the base-64 conversion
1986 // tables from the functions below.
1987 static const signed char kUnBase64[] = {
1988   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1989   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1990   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1991   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1992   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1993   -1,      -1,      -1,      62/*+*/, -1,      -1,      -1,      63/*/ */,
1994   52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
1995   60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
1996   -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
1997   07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
1998   15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
1999   23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      -1,
2000   -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
2001   33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
2002   41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
2003   49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
2004   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2005   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2006   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2007   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2008   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2009   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2010   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2011   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2012   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2013   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2014   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2015   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2016   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2017   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2018   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2019   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
2020 };
2021 static const signed char kUnWebSafeBase64[] = {
2022   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2023   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2024   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2025   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2026   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2027   -1,      -1,      -1,      -1,      -1,      62/*-*/, -1,      -1,
2028   52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
2029   60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
2030   -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
2031   07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
2032   15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
2033   23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      63/*_*/,
2034   -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
2035   33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
2036   41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
2037   49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
2038   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2039   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2040   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2041   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2042   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2043   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2044   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2045   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2046   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2047   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2048   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2049   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2050   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2051   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2052   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2053   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
2054 };
2055 
WebSafeBase64Unescape(const char * src,int szsrc,char * dest,int szdest)2056 int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) {
2057   return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64);
2058 }
2059 
Base64UnescapeInternal(const char * src,int slen,string * dest,const signed char * unbase64)2060 static bool Base64UnescapeInternal(const char* src, int slen, string* dest,
2061                                    const signed char* unbase64) {
2062   // Determine the size of the output string.  Base64 encodes every 3 bytes into
2063   // 4 characters.  any leftover chars are added directly for good measure.
2064   // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548
2065   const int dest_len = 3 * (slen / 4) + (slen % 4);
2066 
2067   dest->resize(dest_len);
2068 
2069   // We are getting the destination buffer by getting the beginning of the
2070   // string and converting it into a char *.
2071   const int len = Base64UnescapeInternal(src, slen, string_as_array(dest),
2072                                          dest_len, unbase64);
2073   if (len < 0) {
2074     dest->clear();
2075     return false;
2076   }
2077 
2078   // could be shorter if there was padding
2079   GOOGLE_DCHECK_LE(len, dest_len);
2080   dest->erase(len);
2081 
2082   return true;
2083 }
2084 
Base64Unescape(StringPiece src,string * dest)2085 bool Base64Unescape(StringPiece src, string* dest) {
2086   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
2087 }
2088 
WebSafeBase64Unescape(StringPiece src,string * dest)2089 bool WebSafeBase64Unescape(StringPiece src, string* dest) {
2090   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
2091 }
2092 
Base64EscapeInternal(const unsigned char * src,int szsrc,char * dest,int szdest,const char * base64,bool do_padding)2093 int Base64EscapeInternal(const unsigned char *src, int szsrc,
2094                          char *dest, int szdest, const char *base64,
2095                          bool do_padding) {
2096   static const char kPad64 = '=';
2097 
2098   if (szsrc <= 0) return 0;
2099 
2100   if (szsrc * 4 > szdest * 3) return 0;
2101 
2102   char *cur_dest = dest;
2103   const unsigned char *cur_src = src;
2104 
2105   char *limit_dest = dest + szdest;
2106   const unsigned char *limit_src = src + szsrc;
2107 
2108   // Three bytes of data encodes to four characters of cyphertext.
2109   // So we can pump through three-byte chunks atomically.
2110   while (cur_src < limit_src - 3) {  // keep going as long as we have >= 32 bits
2111     uint32 in = BigEndian::Load32(cur_src) >> 8;
2112 
2113     cur_dest[0] = base64[in >> 18];
2114     in &= 0x3FFFF;
2115     cur_dest[1] = base64[in >> 12];
2116     in &= 0xFFF;
2117     cur_dest[2] = base64[in >> 6];
2118     in &= 0x3F;
2119     cur_dest[3] = base64[in];
2120 
2121     cur_dest += 4;
2122     cur_src += 3;
2123   }
2124   // To save time, we didn't update szdest or szsrc in the loop.  So do it now.
2125   szdest = limit_dest - cur_dest;
2126   szsrc = limit_src - cur_src;
2127 
2128   /* now deal with the tail (<=3 bytes) */
2129   switch (szsrc) {
2130     case 0:
2131       // Nothing left; nothing more to do.
2132       break;
2133     case 1: {
2134       // One byte left: this encodes to two characters, and (optionally)
2135       // two pad characters to round out the four-character cypherblock.
2136       if ((szdest -= 2) < 0) return 0;
2137       uint32 in = cur_src[0];
2138       cur_dest[0] = base64[in >> 2];
2139       in &= 0x3;
2140       cur_dest[1] = base64[in << 4];
2141       cur_dest += 2;
2142       if (do_padding) {
2143         if ((szdest -= 2) < 0) return 0;
2144         cur_dest[0] = kPad64;
2145         cur_dest[1] = kPad64;
2146         cur_dest += 2;
2147       }
2148       break;
2149     }
2150     case 2: {
2151       // Two bytes left: this encodes to three characters, and (optionally)
2152       // one pad character to round out the four-character cypherblock.
2153       if ((szdest -= 3) < 0) return 0;
2154       uint32 in = BigEndian::Load16(cur_src);
2155       cur_dest[0] = base64[in >> 10];
2156       in &= 0x3FF;
2157       cur_dest[1] = base64[in >> 4];
2158       in &= 0x00F;
2159       cur_dest[2] = base64[in << 2];
2160       cur_dest += 3;
2161       if (do_padding) {
2162         if ((szdest -= 1) < 0) return 0;
2163         cur_dest[0] = kPad64;
2164         cur_dest += 1;
2165       }
2166       break;
2167     }
2168     case 3: {
2169       // Three bytes left: same as in the big loop above.  We can't do this in
2170       // the loop because the loop above always reads 4 bytes, and the fourth
2171       // byte is past the end of the input.
2172       if ((szdest -= 4) < 0) return 0;
2173       uint32 in = (cur_src[0] << 16) + BigEndian::Load16(cur_src + 1);
2174       cur_dest[0] = base64[in >> 18];
2175       in &= 0x3FFFF;
2176       cur_dest[1] = base64[in >> 12];
2177       in &= 0xFFF;
2178       cur_dest[2] = base64[in >> 6];
2179       in &= 0x3F;
2180       cur_dest[3] = base64[in];
2181       cur_dest += 4;
2182       break;
2183     }
2184     default:
2185       // Should not be reached: blocks of 4 bytes are handled
2186       // in the while loop before this switch statement.
2187       GOOGLE_LOG(FATAL) << "Logic problem? szsrc = " << szsrc;
2188       break;
2189   }
2190   return (cur_dest - dest);
2191 }
2192 
2193 static const char kBase64Chars[] =
2194 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
2195 
2196 static const char kWebSafeBase64Chars[] =
2197 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
2198 
Base64Escape(const unsigned char * src,int szsrc,char * dest,int szdest)2199 int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) {
2200   return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true);
2201 }
WebSafeBase64Escape(const unsigned char * src,int szsrc,char * dest,int szdest,bool do_padding)2202 int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest,
2203                         int szdest, bool do_padding) {
2204   return Base64EscapeInternal(src, szsrc, dest, szdest,
2205                               kWebSafeBase64Chars, do_padding);
2206 }
2207 
Base64EscapeInternal(const unsigned char * src,int szsrc,string * dest,bool do_padding,const char * base64_chars)2208 void Base64EscapeInternal(const unsigned char* src, int szsrc,
2209                           string* dest, bool do_padding,
2210                           const char* base64_chars) {
2211   const int calc_escaped_size =
2212     CalculateBase64EscapedLen(szsrc, do_padding);
2213   dest->resize(calc_escaped_size);
2214   const int escaped_len = Base64EscapeInternal(src, szsrc,
2215                                                string_as_array(dest),
2216                                                dest->size(),
2217                                                base64_chars,
2218                                                do_padding);
2219   GOOGLE_DCHECK_EQ(calc_escaped_size, escaped_len);
2220   dest->erase(escaped_len);
2221 }
2222 
Base64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2223 void Base64Escape(const unsigned char *src, int szsrc,
2224                   string* dest, bool do_padding) {
2225   Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars);
2226 }
2227 
WebSafeBase64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2228 void WebSafeBase64Escape(const unsigned char *src, int szsrc,
2229                          string *dest, bool do_padding) {
2230   Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars);
2231 }
2232 
Base64Escape(StringPiece src,string * dest)2233 void Base64Escape(StringPiece src, string* dest) {
2234   Base64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2235                src.size(), dest, true);
2236 }
2237 
WebSafeBase64Escape(StringPiece src,string * dest)2238 void WebSafeBase64Escape(StringPiece src, string* dest) {
2239   WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2240                       src.size(), dest, false);
2241 }
2242 
WebSafeBase64EscapeWithPadding(StringPiece src,string * dest)2243 void WebSafeBase64EscapeWithPadding(StringPiece src, string* dest) {
2244   WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2245                       src.size(), dest, true);
2246 }
2247 
2248 // Helper to append a Unicode code point to a string as UTF8, without bringing
2249 // in any external dependencies.
EncodeAsUTF8Char(uint32 code_point,char * output)2250 int EncodeAsUTF8Char(uint32 code_point, char* output) {
2251   uint32 tmp = 0;
2252   int len = 0;
2253   if (code_point <= 0x7f) {
2254     tmp = code_point;
2255     len = 1;
2256   } else if (code_point <= 0x07ff) {
2257     tmp = 0x0000c080 |
2258         ((code_point & 0x07c0) << 2) |
2259         (code_point & 0x003f);
2260     len = 2;
2261   } else if (code_point <= 0xffff) {
2262     tmp = 0x00e08080 |
2263         ((code_point & 0xf000) << 4) |
2264         ((code_point & 0x0fc0) << 2) |
2265         (code_point & 0x003f);
2266     len = 3;
2267   } else {
2268     // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
2269     // normally only defined up to there as well.
2270     tmp = 0xf0808080 |
2271         ((code_point & 0x1c0000) << 6) |
2272         ((code_point & 0x03f000) << 4) |
2273         ((code_point & 0x000fc0) << 2) |
2274         (code_point & 0x003f);
2275     len = 4;
2276   }
2277   tmp = ghtonl(tmp);
2278   memcpy(output, reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
2279   return len;
2280 }
2281 
2282 // Table of UTF-8 character lengths, based on first byte
2283 static const unsigned char kUTF8LenTbl[256] = {
2284   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2285   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2286   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2287   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2288 
2289   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2290   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2291   2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
2292   3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
2293 };
2294 
2295 // Return length of a single UTF-8 source character
UTF8FirstLetterNumBytes(const char * src,int len)2296 int UTF8FirstLetterNumBytes(const char* src, int len) {
2297   if (len == 0) {
2298     return 0;
2299   }
2300   return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
2301 }
2302 
2303 }  // namespace protobuf
2304 }  // namespace google
2305