1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 // from google3/strings/strutil.cc
32 
33 #include <google/protobuf/stubs/strutil.h>
34 
35 #include <errno.h>
36 #include <float.h>    // FLT_DIG and DBL_DIG
37 #include <limits.h>
38 #include <stdio.h>
39 #include <cmath>
40 #include <iterator>
41 #include <limits>
42 
43 #include <google/protobuf/stubs/logging.h>
44 #include <google/protobuf/stubs/stl_util.h>
45 
46 #ifdef _WIN32
47 // MSVC has only _snprintf, not snprintf.
48 //
49 // MinGW has both snprintf and _snprintf, but they appear to be different
50 // functions.  The former is buggy.  When invoked like so:
51 //   char buffer[32];
52 //   snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f);
53 // it prints "1.23000e+10".  This is plainly wrong:  %g should never print
54 // trailing zeros after the decimal point.  For some reason this bug only
55 // occurs with some input values, not all.  In any case, _snprintf does the
56 // right thing, so we use it.
57 #define snprintf _snprintf
58 #endif
59 
60 namespace google {
61 namespace protobuf {
62 
63 // These are defined as macros on some platforms.  #undef them so that we can
64 // redefine them.
65 #undef isxdigit
66 #undef isprint
67 
68 // The definitions of these in ctype.h change based on locale.  Since our
69 // string manipulation is all in relation to the protocol buffer and C++
70 // languages, we always want to use the C locale.  So, we re-define these
71 // exactly as we want them.
isxdigit(char c)72 inline bool isxdigit(char c) {
73   return ('0' <= c && c <= '9') ||
74          ('a' <= c && c <= 'f') ||
75          ('A' <= c && c <= 'F');
76 }
77 
isprint(char c)78 inline bool isprint(char c) {
79   return c >= 0x20 && c <= 0x7E;
80 }
81 
82 // ----------------------------------------------------------------------
83 // ReplaceCharacters
84 //    Replaces any occurrence of the character 'remove' (or the characters
85 //    in 'remove') with the character 'replacewith'.
86 // ----------------------------------------------------------------------
ReplaceCharacters(string * s,const char * remove,char replacewith)87 void ReplaceCharacters(string *s, const char *remove, char replacewith) {
88   const char *str_start = s->c_str();
89   const char *str = str_start;
90   for (str = strpbrk(str, remove);
91        str != nullptr;
92        str = strpbrk(str + 1, remove)) {
93     (*s)[str - str_start] = replacewith;
94   }
95 }
96 
StripWhitespace(string * str)97 void StripWhitespace(string* str) {
98   int str_length = str->length();
99 
100   // Strip off leading whitespace.
101   int first = 0;
102   while (first < str_length && ascii_isspace(str->at(first))) {
103     ++first;
104   }
105   // If entire string is white space.
106   if (first == str_length) {
107     str->clear();
108     return;
109   }
110   if (first > 0) {
111     str->erase(0, first);
112     str_length -= first;
113   }
114 
115   // Strip off trailing whitespace.
116   int last = str_length - 1;
117   while (last >= 0 && ascii_isspace(str->at(last))) {
118     --last;
119   }
120   if (last != (str_length - 1) && last >= 0) {
121     str->erase(last + 1, string::npos);
122   }
123 }
124 
125 // ----------------------------------------------------------------------
126 // StringReplace()
127 //    Replace the "old" pattern with the "new" pattern in a string,
128 //    and append the result to "res".  If replace_all is false,
129 //    it only replaces the first instance of "old."
130 // ----------------------------------------------------------------------
131 
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all,string * res)132 void StringReplace(const string& s, const string& oldsub,
133                    const string& newsub, bool replace_all,
134                    string* res) {
135   if (oldsub.empty()) {
136     res->append(s);  // if empty, append the given string.
137     return;
138   }
139 
140   string::size_type start_pos = 0;
141   string::size_type pos;
142   do {
143     pos = s.find(oldsub, start_pos);
144     if (pos == string::npos) {
145       break;
146     }
147     res->append(s, start_pos, pos - start_pos);
148     res->append(newsub);
149     start_pos = pos + oldsub.size();  // start searching again after the "old"
150   } while (replace_all);
151   res->append(s, start_pos, s.length() - start_pos);
152 }
153 
154 // ----------------------------------------------------------------------
155 // StringReplace()
156 //    Give me a string and two patterns "old" and "new", and I replace
157 //    the first instance of "old" in the string with "new", if it
158 //    exists.  If "global" is true; call this repeatedly until it
159 //    fails.  RETURN a new string, regardless of whether the replacement
160 //    happened or not.
161 // ----------------------------------------------------------------------
162 
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all)163 string StringReplace(const string& s, const string& oldsub,
164                      const string& newsub, bool replace_all) {
165   string ret;
166   StringReplace(s, oldsub, newsub, replace_all, &ret);
167   return ret;
168 }
169 
170 // ----------------------------------------------------------------------
171 // SplitStringUsing()
172 //    Split a string using a character delimiter. Append the components
173 //    to 'result'.
174 //
175 // Note: For multi-character delimiters, this routine will split on *ANY* of
176 // the characters in the string, not the entire string as a single delimiter.
177 // ----------------------------------------------------------------------
178 template <typename ITR>
179 static inline
SplitStringToIteratorUsing(const string & full,const char * delim,ITR & result)180 void SplitStringToIteratorUsing(const string& full,
181                                 const char* delim,
182                                 ITR& result) {
183   // Optimize the common case where delim is a single character.
184   if (delim[0] != '\0' && delim[1] == '\0') {
185     char c = delim[0];
186     const char* p = full.data();
187     const char* end = p + full.size();
188     while (p != end) {
189       if (*p == c) {
190         ++p;
191       } else {
192         const char* start = p;
193         while (++p != end && *p != c);
194         *result++ = string(start, p - start);
195       }
196     }
197     return;
198   }
199 
200   string::size_type begin_index, end_index;
201   begin_index = full.find_first_not_of(delim);
202   while (begin_index != string::npos) {
203     end_index = full.find_first_of(delim, begin_index);
204     if (end_index == string::npos) {
205       *result++ = full.substr(begin_index);
206       return;
207     }
208     *result++ = full.substr(begin_index, (end_index - begin_index));
209     begin_index = full.find_first_not_of(delim, end_index);
210   }
211 }
212 
SplitStringUsing(const string & full,const char * delim,std::vector<string> * result)213 void SplitStringUsing(const string& full,
214                       const char* delim,
215                       std::vector<string>* result) {
216   std::back_insert_iterator< std::vector<string> > it(*result);
217   SplitStringToIteratorUsing(full, delim, it);
218 }
219 
220 // Split a string using a character delimiter. Append the components
221 // to 'result'.  If there are consecutive delimiters, this function
222 // will return corresponding empty strings. The string is split into
223 // at most the specified number of pieces greedily. This means that the
224 // last piece may possibly be split further. To split into as many pieces
225 // as possible, specify 0 as the number of pieces.
226 //
227 // If "full" is the empty string, yields an empty string as the only value.
228 //
229 // If "pieces" is negative for some reason, it returns the whole string
230 // ----------------------------------------------------------------------
231 template <typename StringType, typename ITR>
232 static inline
SplitStringToIteratorAllowEmpty(const StringType & full,const char * delim,int pieces,ITR & result)233 void SplitStringToIteratorAllowEmpty(const StringType& full,
234                                      const char* delim,
235                                      int pieces,
236                                      ITR& result) {
237   string::size_type begin_index, end_index;
238   begin_index = 0;
239 
240   for (int i = 0; (i < pieces-1) || (pieces == 0); i++) {
241     end_index = full.find_first_of(delim, begin_index);
242     if (end_index == string::npos) {
243       *result++ = full.substr(begin_index);
244       return;
245     }
246     *result++ = full.substr(begin_index, (end_index - begin_index));
247     begin_index = end_index + 1;
248   }
249   *result++ = full.substr(begin_index);
250 }
251 
SplitStringAllowEmpty(const string & full,const char * delim,std::vector<string> * result)252 void SplitStringAllowEmpty(const string& full, const char* delim,
253                            std::vector<string>* result) {
254   std::back_insert_iterator<std::vector<string> > it(*result);
255   SplitStringToIteratorAllowEmpty(full, delim, 0, it);
256 }
257 
258 // ----------------------------------------------------------------------
259 // JoinStrings()
260 //    This merges a vector of string components with delim inserted
261 //    as separaters between components.
262 //
263 // ----------------------------------------------------------------------
264 template <class ITERATOR>
JoinStringsIterator(const ITERATOR & start,const ITERATOR & end,const char * delim,string * result)265 static void JoinStringsIterator(const ITERATOR& start,
266                                 const ITERATOR& end,
267                                 const char* delim,
268                                 string* result) {
269   GOOGLE_CHECK(result != nullptr);
270   result->clear();
271   int delim_length = strlen(delim);
272 
273   // Precompute resulting length so we can reserve() memory in one shot.
274   int length = 0;
275   for (ITERATOR iter = start; iter != end; ++iter) {
276     if (iter != start) {
277       length += delim_length;
278     }
279     length += iter->size();
280   }
281   result->reserve(length);
282 
283   // Now combine everything.
284   for (ITERATOR iter = start; iter != end; ++iter) {
285     if (iter != start) {
286       result->append(delim, delim_length);
287     }
288     result->append(iter->data(), iter->size());
289   }
290 }
291 
JoinStrings(const std::vector<string> & components,const char * delim,string * result)292 void JoinStrings(const std::vector<string>& components,
293                  const char* delim,
294                  string * result) {
295   JoinStringsIterator(components.begin(), components.end(), delim, result);
296 }
297 
298 // ----------------------------------------------------------------------
299 // UnescapeCEscapeSequences()
300 //    This does all the unescaping that C does: \ooo, \r, \n, etc
301 //    Returns length of resulting string.
302 //    The implementation of \x parses any positive number of hex digits,
303 //    but it is an error if the value requires more than 8 bits, and the
304 //    result is truncated to 8 bits.
305 //
306 //    The second call stores its errors in a supplied string vector.
307 //    If the string vector pointer is nullptr, it reports the errors with LOG().
308 // ----------------------------------------------------------------------
309 
310 #define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7'))
311 
312 // Protocol buffers doesn't ever care about errors, but I don't want to remove
313 // the code.
314 #define LOG_STRING(LEVEL, VECTOR) GOOGLE_LOG_IF(LEVEL, false)
315 
UnescapeCEscapeSequences(const char * source,char * dest)316 int UnescapeCEscapeSequences(const char* source, char* dest) {
317   return UnescapeCEscapeSequences(source, dest, nullptr);
318 }
319 
UnescapeCEscapeSequences(const char * source,char * dest,std::vector<string> * errors)320 int UnescapeCEscapeSequences(const char* source, char* dest,
321                              std::vector<string> *errors) {
322   GOOGLE_DCHECK(errors == nullptr) << "Error reporting not implemented.";
323 
324   char* d = dest;
325   const char* p = source;
326 
327   // Small optimization for case where source = dest and there's no escaping
328   while ( p == d && *p != '\0' && *p != '\\' )
329     p++, d++;
330 
331   while (*p != '\0') {
332     if (*p != '\\') {
333       *d++ = *p++;
334     } else {
335       switch ( *++p ) {                    // skip past the '\\'
336         case '\0':
337           LOG_STRING(ERROR, errors) << "String cannot end with \\";
338           *d = '\0';
339           return d - dest;   // we're done with p
340         case 'a':  *d++ = '\a';  break;
341         case 'b':  *d++ = '\b';  break;
342         case 'f':  *d++ = '\f';  break;
343         case 'n':  *d++ = '\n';  break;
344         case 'r':  *d++ = '\r';  break;
345         case 't':  *d++ = '\t';  break;
346         case 'v':  *d++ = '\v';  break;
347         case '\\': *d++ = '\\';  break;
348         case '?':  *d++ = '\?';  break;    // \?  Who knew?
349         case '\'': *d++ = '\'';  break;
350         case '"':  *d++ = '\"';  break;
351         case '0': case '1': case '2': case '3':  // octal digit: 1 to 3 digits
352         case '4': case '5': case '6': case '7': {
353           char ch = *p - '0';
354           if ( IS_OCTAL_DIGIT(p[1]) )
355             ch = ch * 8 + *++p - '0';
356           if ( IS_OCTAL_DIGIT(p[1]) )      // safe (and easy) to do this twice
357             ch = ch * 8 + *++p - '0';      // now points at last digit
358           *d++ = ch;
359           break;
360         }
361         case 'x': case 'X': {
362           if (!isxdigit(p[1])) {
363             if (p[1] == '\0') {
364               LOG_STRING(ERROR, errors) << "String cannot end with \\x";
365             } else {
366               LOG_STRING(ERROR, errors) <<
367                 "\\x cannot be followed by non-hex digit: \\" << *p << p[1];
368             }
369             break;
370           }
371           unsigned int ch = 0;
372           const char *hex_start = p;
373           while (isxdigit(p[1]))  // arbitrarily many hex digits
374             ch = (ch << 4) + hex_digit_to_int(*++p);
375           if (ch > 0xFF)
376             LOG_STRING(ERROR, errors) << "Value of " <<
377               "\\" << string(hex_start, p+1-hex_start) << " exceeds 8 bits";
378           *d++ = ch;
379           break;
380         }
381 #if 0  // TODO(kenton):  Support \u and \U?  Requires runetochar().
382         case 'u': {
383           // \uhhhh => convert 4 hex digits to UTF-8
384           char32 rune = 0;
385           const char *hex_start = p;
386           for (int i = 0; i < 4; ++i) {
387             if (isxdigit(p[1])) {  // Look one char ahead.
388               rune = (rune << 4) + hex_digit_to_int(*++p);  // Advance p.
389             } else {
390               LOG_STRING(ERROR, errors)
391                 << "\\u must be followed by 4 hex digits: \\"
392                 <<  string(hex_start, p+1-hex_start);
393               break;
394             }
395           }
396           d += runetochar(d, &rune);
397           break;
398         }
399         case 'U': {
400           // \Uhhhhhhhh => convert 8 hex digits to UTF-8
401           char32 rune = 0;
402           const char *hex_start = p;
403           for (int i = 0; i < 8; ++i) {
404             if (isxdigit(p[1])) {  // Look one char ahead.
405               // Don't change rune until we're sure this
406               // is within the Unicode limit, but do advance p.
407               char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
408               if (newrune > 0x10FFFF) {
409                 LOG_STRING(ERROR, errors)
410                   << "Value of \\"
411                   << string(hex_start, p + 1 - hex_start)
412                   << " exceeds Unicode limit (0x10FFFF)";
413                 break;
414               } else {
415                 rune = newrune;
416               }
417             } else {
418               LOG_STRING(ERROR, errors)
419                 << "\\U must be followed by 8 hex digits: \\"
420                 <<  string(hex_start, p+1-hex_start);
421               break;
422             }
423           }
424           d += runetochar(d, &rune);
425           break;
426         }
427 #endif
428         default:
429           LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p;
430       }
431       p++;                                 // read past letter we escaped
432     }
433   }
434   *d = '\0';
435   return d - dest;
436 }
437 
438 // ----------------------------------------------------------------------
439 // UnescapeCEscapeString()
440 //    This does the same thing as UnescapeCEscapeSequences, but creates
441 //    a new string. The caller does not need to worry about allocating
442 //    a dest buffer. This should be used for non performance critical
443 //    tasks such as printing debug messages. It is safe for src and dest
444 //    to be the same.
445 //
446 //    The second call stores its errors in a supplied string vector.
447 //    If the string vector pointer is nullptr, it reports the errors with LOG().
448 //
449 //    In the first and second calls, the length of dest is returned. In the
450 //    the third call, the new string is returned.
451 // ----------------------------------------------------------------------
UnescapeCEscapeString(const string & src,string * dest)452 int UnescapeCEscapeString(const string& src, string* dest) {
453   return UnescapeCEscapeString(src, dest, nullptr);
454 }
455 
UnescapeCEscapeString(const string & src,string * dest,std::vector<string> * errors)456 int UnescapeCEscapeString(const string& src, string* dest,
457                           std::vector<string> *errors) {
458   std::unique_ptr<char[]> unescaped(new char[src.size() + 1]);
459   int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), errors);
460   GOOGLE_CHECK(dest);
461   dest->assign(unescaped.get(), len);
462   return len;
463 }
464 
UnescapeCEscapeString(const string & src)465 string UnescapeCEscapeString(const string& src) {
466   std::unique_ptr<char[]> unescaped(new char[src.size() + 1]);
467   int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), nullptr);
468   return string(unescaped.get(), len);
469 }
470 
471 // ----------------------------------------------------------------------
472 // CEscapeString()
473 // CHexEscapeString()
474 //    Copies 'src' to 'dest', escaping dangerous characters using
475 //    C-style escape sequences. This is very useful for preparing query
476 //    flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
477 //    hexadecimal rather than octal sequences.
478 //    Returns the number of bytes written to 'dest' (not including the \0)
479 //    or -1 if there was insufficient space.
480 //
481 //    Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped.
482 // ----------------------------------------------------------------------
CEscapeInternal(const char * src,int src_len,char * dest,int dest_len,bool use_hex,bool utf8_safe)483 int CEscapeInternal(const char* src, int src_len, char* dest,
484                     int dest_len, bool use_hex, bool utf8_safe) {
485   const char* src_end = src + src_len;
486   int used = 0;
487   bool last_hex_escape = false; // true if last output char was \xNN
488 
489   for (; src < src_end; src++) {
490     if (dest_len - used < 2)   // Need space for two letter escape
491       return -1;
492 
493     bool is_hex_escape = false;
494     switch (*src) {
495       case '\n': dest[used++] = '\\'; dest[used++] = 'n';  break;
496       case '\r': dest[used++] = '\\'; dest[used++] = 'r';  break;
497       case '\t': dest[used++] = '\\'; dest[used++] = 't';  break;
498       case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
499       case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
500       case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
501       default:
502         // Note that if we emit \xNN and the src character after that is a hex
503         // digit then that digit must be escaped too to prevent it being
504         // interpreted as part of the character code by C.
505         if ((!utf8_safe || static_cast<uint8>(*src) < 0x80) &&
506             (!isprint(*src) ||
507              (last_hex_escape && isxdigit(*src)))) {
508           if (dest_len - used < 4) // need space for 4 letter escape
509             return -1;
510           sprintf(dest + used, (use_hex ? "\\x%02x" : "\\%03o"),
511                   static_cast<uint8>(*src));
512           is_hex_escape = use_hex;
513           used += 4;
514         } else {
515           dest[used++] = *src; break;
516         }
517     }
518     last_hex_escape = is_hex_escape;
519   }
520 
521   if (dest_len - used < 1)   // make sure that there is room for \0
522     return -1;
523 
524   dest[used] = '\0';   // doesn't count towards return value though
525   return used;
526 }
527 
528 // Calculates the length of the C-style escaped version of 'src'.
529 // Assumes that non-printable characters are escaped using octal sequences, and
530 // that UTF-8 bytes are not handled specially.
CEscapedLength(StringPiece src)531 static inline size_t CEscapedLength(StringPiece src) {
532   static char c_escaped_len[256] = {
533     4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,  // \t, \n, \r
534     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
535     1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,  // ", '
536     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // '0'..'9'
537     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'A'..'O'
538     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,  // 'P'..'Z', '\'
539     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'a'..'o'
540     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,  // 'p'..'z', DEL
541     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
542     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
543     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
544     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
545     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
546     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
547     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
548     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
549   };
550 
551   size_t escaped_len = 0;
552   for (int i = 0; i < src.size(); ++i) {
553     unsigned char c = static_cast<unsigned char>(src[i]);
554     escaped_len += c_escaped_len[c];
555   }
556   return escaped_len;
557 }
558 
559 // ----------------------------------------------------------------------
560 // Escapes 'src' using C-style escape sequences, and appends the escaped string
561 // to 'dest'. This version is faster than calling CEscapeInternal as it computes
562 // the required space using a lookup table, and also does not do any special
563 // handling for Hex or UTF-8 characters.
564 // ----------------------------------------------------------------------
CEscapeAndAppend(StringPiece src,string * dest)565 void CEscapeAndAppend(StringPiece src, string* dest) {
566   size_t escaped_len = CEscapedLength(src);
567   if (escaped_len == src.size()) {
568     dest->append(src.data(), src.size());
569     return;
570   }
571 
572   size_t cur_dest_len = dest->size();
573   dest->resize(cur_dest_len + escaped_len);
574   char* append_ptr = &(*dest)[cur_dest_len];
575 
576   for (int i = 0; i < src.size(); ++i) {
577     unsigned char c = static_cast<unsigned char>(src[i]);
578     switch (c) {
579       case '\n': *append_ptr++ = '\\'; *append_ptr++ = 'n'; break;
580       case '\r': *append_ptr++ = '\\'; *append_ptr++ = 'r'; break;
581       case '\t': *append_ptr++ = '\\'; *append_ptr++ = 't'; break;
582       case '\"': *append_ptr++ = '\\'; *append_ptr++ = '\"'; break;
583       case '\'': *append_ptr++ = '\\'; *append_ptr++ = '\''; break;
584       case '\\': *append_ptr++ = '\\'; *append_ptr++ = '\\'; break;
585       default:
586         if (!isprint(c)) {
587           *append_ptr++ = '\\';
588           *append_ptr++ = '0' + c / 64;
589           *append_ptr++ = '0' + (c % 64) / 8;
590           *append_ptr++ = '0' + c % 8;
591         } else {
592           *append_ptr++ = c;
593         }
594         break;
595     }
596   }
597 }
598 
CEscape(const string & src)599 string CEscape(const string& src) {
600   string dest;
601   CEscapeAndAppend(src, &dest);
602   return dest;
603 }
604 
605 namespace strings {
606 
Utf8SafeCEscape(const string & src)607 string Utf8SafeCEscape(const string& src) {
608   const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
609   std::unique_ptr<char[]> dest(new char[dest_length]);
610   const int len = CEscapeInternal(src.data(), src.size(),
611                                   dest.get(), dest_length, false, true);
612   GOOGLE_DCHECK_GE(len, 0);
613   return string(dest.get(), len);
614 }
615 
CHexEscape(const string & src)616 string CHexEscape(const string& src) {
617   const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
618   std::unique_ptr<char[]> dest(new char[dest_length]);
619   const int len = CEscapeInternal(src.data(), src.size(),
620                                   dest.get(), dest_length, true, false);
621   GOOGLE_DCHECK_GE(len, 0);
622   return string(dest.get(), len);
623 }
624 
625 }  // namespace strings
626 
627 // ----------------------------------------------------------------------
628 // strto32_adaptor()
629 // strtou32_adaptor()
630 //    Implementation of strto[u]l replacements that have identical
631 //    overflow and underflow characteristics for both ILP-32 and LP-64
632 //    platforms, including errno preservation in error-free calls.
633 // ----------------------------------------------------------------------
634 
strto32_adaptor(const char * nptr,char ** endptr,int base)635 int32 strto32_adaptor(const char *nptr, char **endptr, int base) {
636   const int saved_errno = errno;
637   errno = 0;
638   const long result = strtol(nptr, endptr, base);
639   if (errno == ERANGE && result == LONG_MIN) {
640     return kint32min;
641   } else if (errno == ERANGE && result == LONG_MAX) {
642     return kint32max;
643   } else if (errno == 0 && result < kint32min) {
644     errno = ERANGE;
645     return kint32min;
646   } else if (errno == 0 && result > kint32max) {
647     errno = ERANGE;
648     return kint32max;
649   }
650   if (errno == 0)
651     errno = saved_errno;
652   return static_cast<int32>(result);
653 }
654 
strtou32_adaptor(const char * nptr,char ** endptr,int base)655 uint32 strtou32_adaptor(const char *nptr, char **endptr, int base) {
656   const int saved_errno = errno;
657   errno = 0;
658   const unsigned long result = strtoul(nptr, endptr, base);
659   if (errno == ERANGE && result == ULONG_MAX) {
660     return kuint32max;
661   } else if (errno == 0 && result > kuint32max) {
662     errno = ERANGE;
663     return kuint32max;
664   }
665   if (errno == 0)
666     errno = saved_errno;
667   return static_cast<uint32>(result);
668 }
669 
safe_parse_sign(string * text,bool * negative_ptr)670 inline bool safe_parse_sign(string* text  /*inout*/,
671                             bool* negative_ptr  /*output*/) {
672   const char* start = text->data();
673   const char* end = start + text->size();
674 
675   // Consume whitespace.
676   while (start < end && (start[0] == ' ')) {
677     ++start;
678   }
679   while (start < end && (end[-1] == ' ')) {
680     --end;
681   }
682   if (start >= end) {
683     return false;
684   }
685 
686   // Consume sign.
687   *negative_ptr = (start[0] == '-');
688   if (*negative_ptr || start[0] == '+') {
689     ++start;
690     if (start >= end) {
691       return false;
692     }
693   }
694   *text = text->substr(start - text->data(), end - start);
695   return true;
696 }
697 
698 template<typename IntType>
safe_parse_positive_int(string text,IntType * value_p)699 bool safe_parse_positive_int(
700     string text, IntType* value_p) {
701   int base = 10;
702   IntType value = 0;
703   const IntType vmax = std::numeric_limits<IntType>::max();
704   assert(vmax > 0);
705   assert(vmax >= base);
706   const IntType vmax_over_base = vmax / base;
707   const char* start = text.data();
708   const char* end = start + text.size();
709   // loop over digits
710   for (; start < end; ++start) {
711     unsigned char c = static_cast<unsigned char>(start[0]);
712     int digit = c - '0';
713     if (digit >= base || digit < 0) {
714       *value_p = value;
715       return false;
716     }
717     if (value > vmax_over_base) {
718       *value_p = vmax;
719       return false;
720     }
721     value *= base;
722     if (value > vmax - digit) {
723       *value_p = vmax;
724       return false;
725     }
726     value += digit;
727   }
728   *value_p = value;
729   return true;
730 }
731 
732 template<typename IntType>
safe_parse_negative_int(const string & text,IntType * value_p)733 bool safe_parse_negative_int(
734     const string& text, IntType* value_p) {
735   int base = 10;
736   IntType value = 0;
737   const IntType vmin = std::numeric_limits<IntType>::min();
738   assert(vmin < 0);
739   assert(vmin <= 0 - base);
740   IntType vmin_over_base = vmin / base;
741   // 2003 c++ standard [expr.mul]
742   // "... the sign of the remainder is implementation-defined."
743   // Although (vmin/base)*base + vmin%base is always vmin.
744   // 2011 c++ standard tightens the spec but we cannot rely on it.
745   if (vmin % base > 0) {
746     vmin_over_base += 1;
747   }
748   const char* start = text.data();
749   const char* end = start + text.size();
750   // loop over digits
751   for (; start < end; ++start) {
752     unsigned char c = static_cast<unsigned char>(start[0]);
753     int digit = c - '0';
754     if (digit >= base || digit < 0) {
755       *value_p = value;
756       return false;
757     }
758     if (value < vmin_over_base) {
759       *value_p = vmin;
760       return false;
761     }
762     value *= base;
763     if (value < vmin + digit) {
764       *value_p = vmin;
765       return false;
766     }
767     value -= digit;
768   }
769   *value_p = value;
770   return true;
771 }
772 
773 template<typename IntType>
safe_int_internal(string text,IntType * value_p)774 bool safe_int_internal(string text, IntType* value_p) {
775   *value_p = 0;
776   bool negative;
777   if (!safe_parse_sign(&text, &negative)) {
778     return false;
779   }
780   if (!negative) {
781     return safe_parse_positive_int(text, value_p);
782   } else {
783     return safe_parse_negative_int(text, value_p);
784   }
785 }
786 
787 template<typename IntType>
safe_uint_internal(string text,IntType * value_p)788 bool safe_uint_internal(string text, IntType* value_p) {
789   *value_p = 0;
790   bool negative;
791   if (!safe_parse_sign(&text, &negative) || negative) {
792     return false;
793   }
794   return safe_parse_positive_int(text, value_p);
795 }
796 
797 // ----------------------------------------------------------------------
798 // FastIntToBuffer()
799 // FastInt64ToBuffer()
800 // FastHexToBuffer()
801 // FastHex64ToBuffer()
802 // FastHex32ToBuffer()
803 // ----------------------------------------------------------------------
804 
805 // Offset into buffer where FastInt64ToBuffer places the end of string
806 // null character.  Also used by FastInt64ToBufferLeft.
807 static const int kFastInt64ToBufferOffset = 21;
808 
FastInt64ToBuffer(int64 i,char * buffer)809 char *FastInt64ToBuffer(int64 i, char* buffer) {
810   // We could collapse the positive and negative sections, but that
811   // would be slightly slower for positive numbers...
812   // 22 bytes is enough to store -2**64, -18446744073709551616.
813   char* p = buffer + kFastInt64ToBufferOffset;
814   *p-- = '\0';
815   if (i >= 0) {
816     do {
817       *p-- = '0' + i % 10;
818       i /= 10;
819     } while (i > 0);
820     return p + 1;
821   } else {
822     // On different platforms, % and / have different behaviors for
823     // negative numbers, so we need to jump through hoops to make sure
824     // we don't divide negative numbers.
825     if (i > -10) {
826       i = -i;
827       *p-- = '0' + i;
828       *p = '-';
829       return p;
830     } else {
831       // Make sure we aren't at MIN_INT, in which case we can't say i = -i
832       i = i + 10;
833       i = -i;
834       *p-- = '0' + i % 10;
835       // Undo what we did a moment ago
836       i = i / 10 + 1;
837       do {
838         *p-- = '0' + i % 10;
839         i /= 10;
840       } while (i > 0);
841       *p = '-';
842       return p;
843     }
844   }
845 }
846 
847 // Offset into buffer where FastInt32ToBuffer places the end of string
848 // null character.  Also used by FastInt32ToBufferLeft
849 static const int kFastInt32ToBufferOffset = 11;
850 
851 // Yes, this is a duplicate of FastInt64ToBuffer.  But, we need this for the
852 // compiler to generate 32 bit arithmetic instructions.  It's much faster, at
853 // least with 32 bit binaries.
FastInt32ToBuffer(int32 i,char * buffer)854 char *FastInt32ToBuffer(int32 i, char* buffer) {
855   // We could collapse the positive and negative sections, but that
856   // would be slightly slower for positive numbers...
857   // 12 bytes is enough to store -2**32, -4294967296.
858   char* p = buffer + kFastInt32ToBufferOffset;
859   *p-- = '\0';
860   if (i >= 0) {
861     do {
862       *p-- = '0' + i % 10;
863       i /= 10;
864     } while (i > 0);
865     return p + 1;
866   } else {
867     // On different platforms, % and / have different behaviors for
868     // negative numbers, so we need to jump through hoops to make sure
869     // we don't divide negative numbers.
870     if (i > -10) {
871       i = -i;
872       *p-- = '0' + i;
873       *p = '-';
874       return p;
875     } else {
876       // Make sure we aren't at MIN_INT, in which case we can't say i = -i
877       i = i + 10;
878       i = -i;
879       *p-- = '0' + i % 10;
880       // Undo what we did a moment ago
881       i = i / 10 + 1;
882       do {
883         *p-- = '0' + i % 10;
884         i /= 10;
885       } while (i > 0);
886       *p = '-';
887       return p;
888     }
889   }
890 }
891 
FastHexToBuffer(int i,char * buffer)892 char *FastHexToBuffer(int i, char* buffer) {
893   GOOGLE_CHECK(i >= 0) << "FastHexToBuffer() wants non-negative integers, not " << i;
894 
895   static const char *hexdigits = "0123456789abcdef";
896   char *p = buffer + 21;
897   *p-- = '\0';
898   do {
899     *p-- = hexdigits[i & 15];   // mod by 16
900     i >>= 4;                    // divide by 16
901   } while (i > 0);
902   return p + 1;
903 }
904 
InternalFastHexToBuffer(uint64 value,char * buffer,int num_byte)905 char *InternalFastHexToBuffer(uint64 value, char* buffer, int num_byte) {
906   static const char *hexdigits = "0123456789abcdef";
907   buffer[num_byte] = '\0';
908   for (int i = num_byte - 1; i >= 0; i--) {
909 #ifdef _M_X64
910     // MSVC x64 platform has a bug optimizing the uint32(value) in the #else
911     // block. Given that the uint32 cast was to improve performance on 32-bit
912     // platforms, we use 64-bit '&' directly.
913     buffer[i] = hexdigits[value & 0xf];
914 #else
915     buffer[i] = hexdigits[uint32(value) & 0xf];
916 #endif
917     value >>= 4;
918   }
919   return buffer;
920 }
921 
FastHex64ToBuffer(uint64 value,char * buffer)922 char *FastHex64ToBuffer(uint64 value, char* buffer) {
923   return InternalFastHexToBuffer(value, buffer, 16);
924 }
925 
FastHex32ToBuffer(uint32 value,char * buffer)926 char *FastHex32ToBuffer(uint32 value, char* buffer) {
927   return InternalFastHexToBuffer(value, buffer, 8);
928 }
929 
930 // ----------------------------------------------------------------------
931 // FastInt32ToBufferLeft()
932 // FastUInt32ToBufferLeft()
933 // FastInt64ToBufferLeft()
934 // FastUInt64ToBufferLeft()
935 //
936 // Like the Fast*ToBuffer() functions above, these are intended for speed.
937 // Unlike the Fast*ToBuffer() functions, however, these functions write
938 // their output to the beginning of the buffer (hence the name, as the
939 // output is left-aligned).  The caller is responsible for ensuring that
940 // the buffer has enough space to hold the output.
941 //
942 // Returns a pointer to the end of the string (i.e. the null character
943 // terminating the string).
944 // ----------------------------------------------------------------------
945 
946 static const char two_ASCII_digits[100][2] = {
947   {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'},
948   {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
949   {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'},
950   {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
951   {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'},
952   {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
953   {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'},
954   {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
955   {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'},
956   {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
957   {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'},
958   {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
959   {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'},
960   {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
961   {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'},
962   {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
963   {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'},
964   {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
965   {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'},
966   {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'}
967 };
968 
FastUInt32ToBufferLeft(uint32 u,char * buffer)969 char* FastUInt32ToBufferLeft(uint32 u, char* buffer) {
970   uint32 digits;
971   const char *ASCII_digits = nullptr;
972   // The idea of this implementation is to trim the number of divides to as few
973   // as possible by using multiplication and subtraction rather than mod (%),
974   // and by outputting two digits at a time rather than one.
975   // The huge-number case is first, in the hopes that the compiler will output
976   // that case in one branch-free block of code, and only output conditional
977   // branches into it from below.
978   if (u >= 1000000000) {  // >= 1,000,000,000
979     digits = u / 100000000;  // 100,000,000
980     ASCII_digits = two_ASCII_digits[digits];
981     buffer[0] = ASCII_digits[0];
982     buffer[1] = ASCII_digits[1];
983     buffer += 2;
984 sublt100_000_000:
985     u -= digits * 100000000;  // 100,000,000
986 lt100_000_000:
987     digits = u / 1000000;  // 1,000,000
988     ASCII_digits = two_ASCII_digits[digits];
989     buffer[0] = ASCII_digits[0];
990     buffer[1] = ASCII_digits[1];
991     buffer += 2;
992 sublt1_000_000:
993     u -= digits * 1000000;  // 1,000,000
994 lt1_000_000:
995     digits = u / 10000;  // 10,000
996     ASCII_digits = two_ASCII_digits[digits];
997     buffer[0] = ASCII_digits[0];
998     buffer[1] = ASCII_digits[1];
999     buffer += 2;
1000 sublt10_000:
1001     u -= digits * 10000;  // 10,000
1002 lt10_000:
1003     digits = u / 100;
1004     ASCII_digits = two_ASCII_digits[digits];
1005     buffer[0] = ASCII_digits[0];
1006     buffer[1] = ASCII_digits[1];
1007     buffer += 2;
1008 sublt100:
1009     u -= digits * 100;
1010 lt100:
1011     digits = u;
1012     ASCII_digits = two_ASCII_digits[digits];
1013     buffer[0] = ASCII_digits[0];
1014     buffer[1] = ASCII_digits[1];
1015     buffer += 2;
1016 done:
1017     *buffer = 0;
1018     return buffer;
1019   }
1020 
1021   if (u < 100) {
1022     digits = u;
1023     if (u >= 10) goto lt100;
1024     *buffer++ = '0' + digits;
1025     goto done;
1026   }
1027   if (u  <  10000) {   // 10,000
1028     if (u >= 1000) goto lt10_000;
1029     digits = u / 100;
1030     *buffer++ = '0' + digits;
1031     goto sublt100;
1032   }
1033   if (u  <  1000000) {   // 1,000,000
1034     if (u >= 100000) goto lt1_000_000;
1035     digits = u / 10000;  //    10,000
1036     *buffer++ = '0' + digits;
1037     goto sublt10_000;
1038   }
1039   if (u  <  100000000) {   // 100,000,000
1040     if (u >= 10000000) goto lt100_000_000;
1041     digits = u / 1000000;  //   1,000,000
1042     *buffer++ = '0' + digits;
1043     goto sublt1_000_000;
1044   }
1045   // we already know that u < 1,000,000,000
1046   digits = u / 100000000;   // 100,000,000
1047   *buffer++ = '0' + digits;
1048   goto sublt100_000_000;
1049 }
1050 
FastInt32ToBufferLeft(int32 i,char * buffer)1051 char* FastInt32ToBufferLeft(int32 i, char* buffer) {
1052   uint32 u = 0;
1053   if (i < 0) {
1054     *buffer++ = '-';
1055     u -= i;
1056   } else {
1057     u = i;
1058   }
1059   return FastUInt32ToBufferLeft(u, buffer);
1060 }
1061 
FastUInt64ToBufferLeft(uint64 u64,char * buffer)1062 char* FastUInt64ToBufferLeft(uint64 u64, char* buffer) {
1063   int digits;
1064   const char *ASCII_digits = nullptr;
1065 
1066   uint32 u = static_cast<uint32>(u64);
1067   if (u == u64) return FastUInt32ToBufferLeft(u, buffer);
1068 
1069   uint64 top_11_digits = u64 / 1000000000;
1070   buffer = FastUInt64ToBufferLeft(top_11_digits, buffer);
1071   u = u64 - (top_11_digits * 1000000000);
1072 
1073   digits = u / 10000000;  // 10,000,000
1074   GOOGLE_DCHECK_LT(digits, 100);
1075   ASCII_digits = two_ASCII_digits[digits];
1076   buffer[0] = ASCII_digits[0];
1077   buffer[1] = ASCII_digits[1];
1078   buffer += 2;
1079   u -= digits * 10000000;  // 10,000,000
1080   digits = u / 100000;  // 100,000
1081   ASCII_digits = two_ASCII_digits[digits];
1082   buffer[0] = ASCII_digits[0];
1083   buffer[1] = ASCII_digits[1];
1084   buffer += 2;
1085   u -= digits * 100000;  // 100,000
1086   digits = u / 1000;  // 1,000
1087   ASCII_digits = two_ASCII_digits[digits];
1088   buffer[0] = ASCII_digits[0];
1089   buffer[1] = ASCII_digits[1];
1090   buffer += 2;
1091   u -= digits * 1000;  // 1,000
1092   digits = u / 10;
1093   ASCII_digits = two_ASCII_digits[digits];
1094   buffer[0] = ASCII_digits[0];
1095   buffer[1] = ASCII_digits[1];
1096   buffer += 2;
1097   u -= digits * 10;
1098   digits = u;
1099   *buffer++ = '0' + digits;
1100   *buffer = 0;
1101   return buffer;
1102 }
1103 
FastInt64ToBufferLeft(int64 i,char * buffer)1104 char* FastInt64ToBufferLeft(int64 i, char* buffer) {
1105   uint64 u = 0;
1106   if (i < 0) {
1107     *buffer++ = '-';
1108     u -= i;
1109   } else {
1110     u = i;
1111   }
1112   return FastUInt64ToBufferLeft(u, buffer);
1113 }
1114 
1115 // ----------------------------------------------------------------------
1116 // SimpleItoa()
1117 //    Description: converts an integer to a string.
1118 //
1119 //    Return value: string
1120 // ----------------------------------------------------------------------
1121 
SimpleItoa(int i)1122 string SimpleItoa(int i) {
1123   char buffer[kFastToBufferSize];
1124   return (sizeof(i) == 4) ?
1125     FastInt32ToBuffer(i, buffer) :
1126     FastInt64ToBuffer(i, buffer);
1127 }
1128 
SimpleItoa(unsigned int i)1129 string SimpleItoa(unsigned int i) {
1130   char buffer[kFastToBufferSize];
1131   return string(buffer, (sizeof(i) == 4) ?
1132     FastUInt32ToBufferLeft(i, buffer) :
1133     FastUInt64ToBufferLeft(i, buffer));
1134 }
1135 
SimpleItoa(long i)1136 string SimpleItoa(long i) {
1137   char buffer[kFastToBufferSize];
1138   return (sizeof(i) == 4) ?
1139     FastInt32ToBuffer(i, buffer) :
1140     FastInt64ToBuffer(i, buffer);
1141 }
1142 
SimpleItoa(unsigned long i)1143 string SimpleItoa(unsigned long i) {
1144   char buffer[kFastToBufferSize];
1145   return string(buffer, (sizeof(i) == 4) ?
1146     FastUInt32ToBufferLeft(i, buffer) :
1147     FastUInt64ToBufferLeft(i, buffer));
1148 }
1149 
SimpleItoa(long long i)1150 string SimpleItoa(long long i) {
1151   char buffer[kFastToBufferSize];
1152   return (sizeof(i) == 4) ?
1153     FastInt32ToBuffer(i, buffer) :
1154     FastInt64ToBuffer(i, buffer);
1155 }
1156 
SimpleItoa(unsigned long long i)1157 string SimpleItoa(unsigned long long i) {
1158   char buffer[kFastToBufferSize];
1159   return string(buffer, (sizeof(i) == 4) ?
1160     FastUInt32ToBufferLeft(i, buffer) :
1161     FastUInt64ToBufferLeft(i, buffer));
1162 }
1163 
1164 // ----------------------------------------------------------------------
1165 // SimpleDtoa()
1166 // SimpleFtoa()
1167 // DoubleToBuffer()
1168 // FloatToBuffer()
1169 //    We want to print the value without losing precision, but we also do
1170 //    not want to print more digits than necessary.  This turns out to be
1171 //    trickier than it sounds.  Numbers like 0.2 cannot be represented
1172 //    exactly in binary.  If we print 0.2 with a very large precision,
1173 //    e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167".
1174 //    On the other hand, if we set the precision too low, we lose
1175 //    significant digits when printing numbers that actually need them.
1176 //    It turns out there is no precision value that does the right thing
1177 //    for all numbers.
1178 //
1179 //    Our strategy is to first try printing with a precision that is never
1180 //    over-precise, then parse the result with strtod() to see if it
1181 //    matches.  If not, we print again with a precision that will always
1182 //    give a precise result, but may use more digits than necessary.
1183 //
1184 //    An arguably better strategy would be to use the algorithm described
1185 //    in "How to Print Floating-Point Numbers Accurately" by Steele &
1186 //    White, e.g. as implemented by David M. Gay's dtoa().  It turns out,
1187 //    however, that the following implementation is about as fast as
1188 //    DMG's code.  Furthermore, DMG's code locks mutexes, which means it
1189 //    will not scale well on multi-core machines.  DMG's code is slightly
1190 //    more accurate (in that it will never use more digits than
1191 //    necessary), but this is probably irrelevant for most users.
1192 //
1193 //    Rob Pike and Ken Thompson also have an implementation of dtoa() in
1194 //    third_party/fmt/fltfmt.cc.  Their implementation is similar to this
1195 //    one in that it makes guesses and then uses strtod() to check them.
1196 //    Their implementation is faster because they use their own code to
1197 //    generate the digits in the first place rather than use snprintf(),
1198 //    thus avoiding format string parsing overhead.  However, this makes
1199 //    it considerably more complicated than the following implementation,
1200 //    and it is embedded in a larger library.  If speed turns out to be
1201 //    an issue, we could re-implement this in terms of their
1202 //    implementation.
1203 // ----------------------------------------------------------------------
1204 
SimpleDtoa(double value)1205 string SimpleDtoa(double value) {
1206   char buffer[kDoubleToBufferSize];
1207   return DoubleToBuffer(value, buffer);
1208 }
1209 
SimpleFtoa(float value)1210 string SimpleFtoa(float value) {
1211   char buffer[kFloatToBufferSize];
1212   return FloatToBuffer(value, buffer);
1213 }
1214 
IsValidFloatChar(char c)1215 static inline bool IsValidFloatChar(char c) {
1216   return ('0' <= c && c <= '9') ||
1217          c == 'e' || c == 'E' ||
1218          c == '+' || c == '-';
1219 }
1220 
DelocalizeRadix(char * buffer)1221 void DelocalizeRadix(char* buffer) {
1222   // Fast check:  if the buffer has a normal decimal point, assume no
1223   // translation is needed.
1224   if (strchr(buffer, '.') != nullptr) return;
1225 
1226   // Find the first unknown character.
1227   while (IsValidFloatChar(*buffer)) ++buffer;
1228 
1229   if (*buffer == '\0') {
1230     // No radix character found.
1231     return;
1232   }
1233 
1234   // We are now pointing at the locale-specific radix character.  Replace it
1235   // with '.'.
1236   *buffer = '.';
1237   ++buffer;
1238 
1239   if (!IsValidFloatChar(*buffer) && *buffer != '\0') {
1240     // It appears the radix was a multi-byte character.  We need to remove the
1241     // extra bytes.
1242     char* target = buffer;
1243     do { ++buffer; } while (!IsValidFloatChar(*buffer) && *buffer != '\0');
1244     memmove(target, buffer, strlen(buffer) + 1);
1245   }
1246 }
1247 
DoubleToBuffer(double value,char * buffer)1248 char* DoubleToBuffer(double value, char* buffer) {
1249   // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all
1250   // platforms these days.  Just in case some system exists where DBL_DIG
1251   // is significantly larger -- and risks overflowing our buffer -- we have
1252   // this assert.
1253   GOOGLE_COMPILE_ASSERT(DBL_DIG < 20, DBL_DIG_is_too_big);
1254 
1255   if (value == std::numeric_limits<double>::infinity()) {
1256     strcpy(buffer, "inf");
1257     return buffer;
1258   } else if (value == -std::numeric_limits<double>::infinity()) {
1259     strcpy(buffer, "-inf");
1260     return buffer;
1261   } else if (std::isnan(value)) {
1262     strcpy(buffer, "nan");
1263     return buffer;
1264   }
1265 
1266   int snprintf_result =
1267     snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value);
1268 
1269   // The snprintf should never overflow because the buffer is significantly
1270   // larger than the precision we asked for.
1271   GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1272 
1273   // We need to make parsed_value volatile in order to force the compiler to
1274   // write it out to the stack.  Otherwise, it may keep the value in a
1275   // register, and if it does that, it may keep it as a long double instead
1276   // of a double.  This long double may have extra bits that make it compare
1277   // unequal to "value" even though it would be exactly equal if it were
1278   // truncated to a double.
1279   volatile double parsed_value = internal::NoLocaleStrtod(buffer, nullptr);
1280   if (parsed_value != value) {
1281     int snprintf_result =
1282       snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+2, value);
1283 
1284     // Should never overflow; see above.
1285     GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1286   }
1287 
1288   DelocalizeRadix(buffer);
1289   return buffer;
1290 }
1291 
memcasecmp(const char * s1,const char * s2,size_t len)1292 static int memcasecmp(const char *s1, const char *s2, size_t len) {
1293   const unsigned char *us1 = reinterpret_cast<const unsigned char *>(s1);
1294   const unsigned char *us2 = reinterpret_cast<const unsigned char *>(s2);
1295 
1296   for ( int i = 0; i < len; i++ ) {
1297     const int diff =
1298       static_cast<int>(static_cast<unsigned char>(ascii_tolower(us1[i]))) -
1299       static_cast<int>(static_cast<unsigned char>(ascii_tolower(us2[i])));
1300     if (diff != 0) return diff;
1301   }
1302   return 0;
1303 }
1304 
CaseEqual(StringPiece s1,StringPiece s2)1305 inline bool CaseEqual(StringPiece s1, StringPiece s2) {
1306   if (s1.size() != s2.size()) return false;
1307   return memcasecmp(s1.data(), s2.data(), s1.size()) == 0;
1308 }
1309 
safe_strtob(StringPiece str,bool * value)1310 bool safe_strtob(StringPiece str, bool* value) {
1311   GOOGLE_CHECK(value != nullptr) << "nullptr output boolean given.";
1312   if (CaseEqual(str, "true") || CaseEqual(str, "t") ||
1313       CaseEqual(str, "yes") || CaseEqual(str, "y") ||
1314       CaseEqual(str, "1")) {
1315     *value = true;
1316     return true;
1317   }
1318   if (CaseEqual(str, "false") || CaseEqual(str, "f") ||
1319       CaseEqual(str, "no") || CaseEqual(str, "n") ||
1320       CaseEqual(str, "0")) {
1321     *value = false;
1322     return true;
1323   }
1324   return false;
1325 }
1326 
safe_strtof(const char * str,float * value)1327 bool safe_strtof(const char* str, float* value) {
1328   char* endptr;
1329   errno = 0;  // errno only gets set on errors
1330 #if defined(_WIN32) || defined (__hpux)  // has no strtof()
1331   *value = internal::NoLocaleStrtod(str, &endptr);
1332 #else
1333   *value = strtof(str, &endptr);
1334 #endif
1335   return *str != 0 && *endptr == 0 && errno == 0;
1336 }
1337 
safe_strtod(const char * str,double * value)1338 bool safe_strtod(const char* str, double* value) {
1339   char* endptr;
1340   *value = internal::NoLocaleStrtod(str, &endptr);
1341   if (endptr != str) {
1342     while (ascii_isspace(*endptr)) ++endptr;
1343   }
1344   // Ignore range errors from strtod.  The values it
1345   // returns on underflow and overflow are the right
1346   // fallback in a robust setting.
1347   return *str != '\0' && *endptr == '\0';
1348 }
1349 
safe_strto32(const string & str,int32 * value)1350 bool safe_strto32(const string& str, int32* value) {
1351   return safe_int_internal(str, value);
1352 }
1353 
safe_strtou32(const string & str,uint32 * value)1354 bool safe_strtou32(const string& str, uint32* value) {
1355   return safe_uint_internal(str, value);
1356 }
1357 
safe_strto64(const string & str,int64 * value)1358 bool safe_strto64(const string& str, int64* value) {
1359   return safe_int_internal(str, value);
1360 }
1361 
safe_strtou64(const string & str,uint64 * value)1362 bool safe_strtou64(const string& str, uint64* value) {
1363   return safe_uint_internal(str, value);
1364 }
1365 
FloatToBuffer(float value,char * buffer)1366 char* FloatToBuffer(float value, char* buffer) {
1367   // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
1368   // platforms these days.  Just in case some system exists where FLT_DIG
1369   // is significantly larger -- and risks overflowing our buffer -- we have
1370   // this assert.
1371   GOOGLE_COMPILE_ASSERT(FLT_DIG < 10, FLT_DIG_is_too_big);
1372 
1373   if (value == std::numeric_limits<double>::infinity()) {
1374     strcpy(buffer, "inf");
1375     return buffer;
1376   } else if (value == -std::numeric_limits<double>::infinity()) {
1377     strcpy(buffer, "-inf");
1378     return buffer;
1379   } else if (std::isnan(value)) {
1380     strcpy(buffer, "nan");
1381     return buffer;
1382   }
1383 
1384   int snprintf_result =
1385     snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value);
1386 
1387   // The snprintf should never overflow because the buffer is significantly
1388   // larger than the precision we asked for.
1389   GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1390 
1391   float parsed_value;
1392   if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) {
1393     int snprintf_result =
1394       snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+3, value);
1395 
1396     // Should never overflow; see above.
1397     GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1398   }
1399 
1400   DelocalizeRadix(buffer);
1401   return buffer;
1402 }
1403 
1404 namespace strings {
1405 
AlphaNum(strings::Hex hex)1406 AlphaNum::AlphaNum(strings::Hex hex) {
1407   char *const end = &digits[kFastToBufferSize];
1408   char *writer = end;
1409   uint64 value = hex.value;
1410   uint64 width = hex.spec;
1411   // We accomplish minimum width by OR'ing in 0x10000 to the user's value,
1412   // where 0x10000 is the smallest hex number that is as wide as the user
1413   // asked for.
1414   uint64 mask = ((static_cast<uint64>(1) << (width - 1) * 4)) | value;
1415   static const char hexdigits[] = "0123456789abcdef";
1416   do {
1417     *--writer = hexdigits[value & 0xF];
1418     value >>= 4;
1419     mask >>= 4;
1420   } while (mask != 0);
1421   piece_data_ = writer;
1422   piece_size_ = end - writer;
1423 }
1424 
1425 }  // namespace strings
1426 
1427 // ----------------------------------------------------------------------
1428 // StrCat()
1429 //    This merges the given strings or integers, with no delimiter.  This
1430 //    is designed to be the fastest possible way to construct a string out
1431 //    of a mix of raw C strings, C++ strings, and integer values.
1432 // ----------------------------------------------------------------------
1433 
1434 // Append is merely a version of memcpy that returns the address of the byte
1435 // after the area just overwritten.  It comes in multiple flavors to minimize
1436 // call overhead.
Append1(char * out,const AlphaNum & x)1437 static char *Append1(char *out, const AlphaNum &x) {
1438   memcpy(out, x.data(), x.size());
1439   return out + x.size();
1440 }
1441 
Append2(char * out,const AlphaNum & x1,const AlphaNum & x2)1442 static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) {
1443   memcpy(out, x1.data(), x1.size());
1444   out += x1.size();
1445 
1446   memcpy(out, x2.data(), x2.size());
1447   return out + x2.size();
1448 }
1449 
Append4(char * out,const AlphaNum & x1,const AlphaNum & x2,const AlphaNum & x3,const AlphaNum & x4)1450 static char *Append4(char *out,
1451                      const AlphaNum &x1, const AlphaNum &x2,
1452                      const AlphaNum &x3, const AlphaNum &x4) {
1453   memcpy(out, x1.data(), x1.size());
1454   out += x1.size();
1455 
1456   memcpy(out, x2.data(), x2.size());
1457   out += x2.size();
1458 
1459   memcpy(out, x3.data(), x3.size());
1460   out += x3.size();
1461 
1462   memcpy(out, x4.data(), x4.size());
1463   return out + x4.size();
1464 }
1465 
StrCat(const AlphaNum & a,const AlphaNum & b)1466 string StrCat(const AlphaNum &a, const AlphaNum &b) {
1467   string result;
1468   result.resize(a.size() + b.size());
1469   char *const begin = &*result.begin();
1470   char *out = Append2(begin, a, b);
1471   GOOGLE_DCHECK_EQ(out, begin + result.size());
1472   return result;
1473 }
1474 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1475 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1476   string result;
1477   result.resize(a.size() + b.size() + c.size());
1478   char *const begin = &*result.begin();
1479   char *out = Append2(begin, a, b);
1480   out = Append1(out, c);
1481   GOOGLE_DCHECK_EQ(out, begin + result.size());
1482   return result;
1483 }
1484 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1485 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1486               const AlphaNum &d) {
1487   string result;
1488   result.resize(a.size() + b.size() + c.size() + d.size());
1489   char *const begin = &*result.begin();
1490   char *out = Append4(begin, a, b, c, d);
1491   GOOGLE_DCHECK_EQ(out, begin + result.size());
1492   return result;
1493 }
1494 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e)1495 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1496               const AlphaNum &d, const AlphaNum &e) {
1497   string result;
1498   result.resize(a.size() + b.size() + c.size() + d.size() + e.size());
1499   char *const begin = &*result.begin();
1500   char *out = Append4(begin, a, b, c, d);
1501   out = Append1(out, e);
1502   GOOGLE_DCHECK_EQ(out, begin + result.size());
1503   return result;
1504 }
1505 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f)1506 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1507               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) {
1508   string result;
1509   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1510                 f.size());
1511   char *const begin = &*result.begin();
1512   char *out = Append4(begin, a, b, c, d);
1513   out = Append2(out, e, f);
1514   GOOGLE_DCHECK_EQ(out, begin + result.size());
1515   return result;
1516 }
1517 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g)1518 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1519               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1520               const AlphaNum &g) {
1521   string result;
1522   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1523                 f.size() + g.size());
1524   char *const begin = &*result.begin();
1525   char *out = Append4(begin, a, b, c, d);
1526   out = Append2(out, e, f);
1527   out = Append1(out, g);
1528   GOOGLE_DCHECK_EQ(out, begin + result.size());
1529   return result;
1530 }
1531 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h)1532 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1533               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1534               const AlphaNum &g, const AlphaNum &h) {
1535   string result;
1536   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1537                 f.size() + g.size() + h.size());
1538   char *const begin = &*result.begin();
1539   char *out = Append4(begin, a, b, c, d);
1540   out = Append4(out, e, f, g, h);
1541   GOOGLE_DCHECK_EQ(out, begin + result.size());
1542   return result;
1543 }
1544 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h,const AlphaNum & i)1545 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1546               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1547               const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) {
1548   string result;
1549   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1550                 f.size() + g.size() + h.size() + i.size());
1551   char *const begin = &*result.begin();
1552   char *out = Append4(begin, a, b, c, d);
1553   out = Append4(out, e, f, g, h);
1554   out = Append1(out, i);
1555   GOOGLE_DCHECK_EQ(out, begin + result.size());
1556   return result;
1557 }
1558 
1559 // It's possible to call StrAppend with a char * pointer that is partway into
1560 // the string we're appending to.  However the results of this are random.
1561 // Therefore, check for this in debug mode.  Use unsigned math so we only have
1562 // to do one comparison.
1563 #define GOOGLE_DCHECK_NO_OVERLAP(dest, src) \
1564     GOOGLE_DCHECK_GT(uintptr_t((src).data() - (dest).data()), \
1565                      uintptr_t((dest).size()))
1566 
StrAppend(string * result,const AlphaNum & a)1567 void StrAppend(string *result, const AlphaNum &a) {
1568   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1569   result->append(a.data(), a.size());
1570 }
1571 
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b)1572 void StrAppend(string *result, const AlphaNum &a, const AlphaNum &b) {
1573   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1574   GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1575   string::size_type old_size = result->size();
1576   result->resize(old_size + a.size() + b.size());
1577   char *const begin = &*result->begin();
1578   char *out = Append2(begin + old_size, a, b);
1579   GOOGLE_DCHECK_EQ(out, begin + result->size());
1580 }
1581 
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1582 void StrAppend(string *result,
1583                const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1584   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1585   GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1586   GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1587   string::size_type old_size = result->size();
1588   result->resize(old_size + a.size() + b.size() + c.size());
1589   char *const begin = &*result->begin();
1590   char *out = Append2(begin + old_size, a, b);
1591   out = Append1(out, c);
1592   GOOGLE_DCHECK_EQ(out, begin + result->size());
1593 }
1594 
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1595 void StrAppend(string *result,
1596                const AlphaNum &a, const AlphaNum &b,
1597                const AlphaNum &c, const AlphaNum &d) {
1598   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1599   GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1600   GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1601   GOOGLE_DCHECK_NO_OVERLAP(*result, d);
1602   string::size_type old_size = result->size();
1603   result->resize(old_size + a.size() + b.size() + c.size() + d.size());
1604   char *const begin = &*result->begin();
1605   char *out = Append4(begin + old_size, a, b, c, d);
1606   GOOGLE_DCHECK_EQ(out, begin + result->size());
1607 }
1608 
GlobalReplaceSubstring(const string & substring,const string & replacement,string * s)1609 int GlobalReplaceSubstring(const string& substring,
1610                            const string& replacement,
1611                            string* s) {
1612   GOOGLE_CHECK(s != nullptr);
1613   if (s->empty() || substring.empty())
1614     return 0;
1615   string tmp;
1616   int num_replacements = 0;
1617   int pos = 0;
1618   for (int match_pos = s->find(substring.data(), pos, substring.length());
1619        match_pos != string::npos;
1620        pos = match_pos + substring.length(),
1621            match_pos = s->find(substring.data(), pos, substring.length())) {
1622     ++num_replacements;
1623     // Append the original content before the match.
1624     tmp.append(*s, pos, match_pos - pos);
1625     // Append the replacement for the match.
1626     tmp.append(replacement.begin(), replacement.end());
1627   }
1628   // Append the content after the last match. If no replacements were made, the
1629   // original string is left untouched.
1630   if (num_replacements > 0) {
1631     tmp.append(*s, pos, s->length() - pos);
1632     s->swap(tmp);
1633   }
1634   return num_replacements;
1635 }
1636 
CalculateBase64EscapedLen(int input_len,bool do_padding)1637 int CalculateBase64EscapedLen(int input_len, bool do_padding) {
1638   // Base64 encodes three bytes of input at a time. If the input is not
1639   // divisible by three, we pad as appropriate.
1640   //
1641   // (from http://tools.ietf.org/html/rfc3548)
1642   // Special processing is performed if fewer than 24 bits are available
1643   // at the end of the data being encoded.  A full encoding quantum is
1644   // always completed at the end of a quantity.  When fewer than 24 input
1645   // bits are available in an input group, zero bits are added (on the
1646   // right) to form an integral number of 6-bit groups.  Padding at the
1647   // end of the data is performed using the '=' character.  Since all base
1648   // 64 input is an integral number of octets, only the following cases
1649   // can arise:
1650 
1651 
1652   // Base64 encodes each three bytes of input into four bytes of output.
1653   int len = (input_len / 3) * 4;
1654 
1655   if (input_len % 3 == 0) {
1656     // (from http://tools.ietf.org/html/rfc3548)
1657     // (1) the final quantum of encoding input is an integral multiple of 24
1658     // bits; here, the final unit of encoded output will be an integral
1659     // multiple of 4 characters with no "=" padding,
1660   } else if (input_len % 3 == 1) {
1661     // (from http://tools.ietf.org/html/rfc3548)
1662     // (2) the final quantum of encoding input is exactly 8 bits; here, the
1663     // final unit of encoded output will be two characters followed by two
1664     // "=" padding characters, or
1665     len += 2;
1666     if (do_padding) {
1667       len += 2;
1668     }
1669   } else {  // (input_len % 3 == 2)
1670     // (from http://tools.ietf.org/html/rfc3548)
1671     // (3) the final quantum of encoding input is exactly 16 bits; here, the
1672     // final unit of encoded output will be three characters followed by one
1673     // "=" padding character.
1674     len += 3;
1675     if (do_padding) {
1676       len += 1;
1677     }
1678   }
1679 
1680   assert(len >= input_len);  // make sure we didn't overflow
1681   return len;
1682 }
1683 
1684 // Base64Escape does padding, so this calculation includes padding.
CalculateBase64EscapedLen(int input_len)1685 int CalculateBase64EscapedLen(int input_len) {
1686   return CalculateBase64EscapedLen(input_len, true);
1687 }
1688 
1689 // ----------------------------------------------------------------------
1690 // int Base64Unescape() - base64 decoder
1691 // int Base64Escape() - base64 encoder
1692 // int WebSafeBase64Unescape() - Google's variation of base64 decoder
1693 // int WebSafeBase64Escape() - Google's variation of base64 encoder
1694 //
1695 // Check out
1696 // http://tools.ietf.org/html/rfc2045 for formal description, but what we
1697 // care about is that...
1698 //   Take the encoded stuff in groups of 4 characters and turn each
1699 //   character into a code 0 to 63 thus:
1700 //           A-Z map to 0 to 25
1701 //           a-z map to 26 to 51
1702 //           0-9 map to 52 to 61
1703 //           +(- for WebSafe) maps to 62
1704 //           /(_ for WebSafe) maps to 63
1705 //   There will be four numbers, all less than 64 which can be represented
1706 //   by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
1707 //   Arrange the 6 digit binary numbers into three bytes as such:
1708 //   aaaaaabb bbbbcccc ccdddddd
1709 //   Equals signs (one or two) are used at the end of the encoded block to
1710 //   indicate that the text was not an integer multiple of three bytes long.
1711 // ----------------------------------------------------------------------
1712 
Base64UnescapeInternal(const char * src_param,int szsrc,char * dest,int szdest,const signed char * unbase64)1713 int Base64UnescapeInternal(const char *src_param, int szsrc,
1714                            char *dest, int szdest,
1715                            const signed char* unbase64) {
1716   static const char kPad64Equals = '=';
1717   static const char kPad64Dot = '.';
1718 
1719   int decode = 0;
1720   int destidx = 0;
1721   int state = 0;
1722   unsigned int ch = 0;
1723   unsigned int temp = 0;
1724 
1725   // If "char" is signed by default, using *src as an array index results in
1726   // accessing negative array elements. Treat the input as a pointer to
1727   // unsigned char to avoid this.
1728   const unsigned char *src = reinterpret_cast<const unsigned char*>(src_param);
1729 
1730   // The GET_INPUT macro gets the next input character, skipping
1731   // over any whitespace, and stopping when we reach the end of the
1732   // string or when we read any non-data character.  The arguments are
1733   // an arbitrary identifier (used as a label for goto) and the number
1734   // of data bytes that must remain in the input to avoid aborting the
1735   // loop.
1736 #define GET_INPUT(label, remain)                 \
1737   label:                                         \
1738     --szsrc;                                     \
1739     ch = *src++;                                 \
1740     decode = unbase64[ch];                       \
1741     if (decode < 0) {                            \
1742       if (ascii_isspace(ch) && szsrc >= remain)  \
1743         goto label;                              \
1744       state = 4 - remain;                        \
1745       break;                                     \
1746     }
1747 
1748   // if dest is null, we're just checking to see if it's legal input
1749   // rather than producing output.  (I suspect this could just be done
1750   // with a regexp...).  We duplicate the loop so this test can be
1751   // outside it instead of in every iteration.
1752 
1753   if (dest) {
1754     // This loop consumes 4 input bytes and produces 3 output bytes
1755     // per iteration.  We can't know at the start that there is enough
1756     // data left in the string for a full iteration, so the loop may
1757     // break out in the middle; if so 'state' will be set to the
1758     // number of input bytes read.
1759 
1760     while (szsrc >= 4)  {
1761       // We'll start by optimistically assuming that the next four
1762       // bytes of the string (src[0..3]) are four good data bytes
1763       // (that is, no nulls, whitespace, padding chars, or illegal
1764       // chars).  We need to test src[0..2] for nulls individually
1765       // before constructing temp to preserve the property that we
1766       // never read past a null in the string (no matter how long
1767       // szsrc claims the string is).
1768 
1769       if (!src[0] || !src[1] || !src[2] ||
1770           (temp = ((unsigned(unbase64[src[0]]) << 18) |
1771                    (unsigned(unbase64[src[1]]) << 12) |
1772                    (unsigned(unbase64[src[2]]) << 6) |
1773                    (unsigned(unbase64[src[3]])))) & 0x80000000) {
1774         // Iff any of those four characters was bad (null, illegal,
1775         // whitespace, padding), then temp's high bit will be set
1776         // (because unbase64[] is -1 for all bad characters).
1777         //
1778         // We'll back up and resort to the slower decoder, which knows
1779         // how to handle those cases.
1780 
1781         GET_INPUT(first, 4);
1782         temp = decode;
1783         GET_INPUT(second, 3);
1784         temp = (temp << 6) | decode;
1785         GET_INPUT(third, 2);
1786         temp = (temp << 6) | decode;
1787         GET_INPUT(fourth, 1);
1788         temp = (temp << 6) | decode;
1789       } else {
1790         // We really did have four good data bytes, so advance four
1791         // characters in the string.
1792 
1793         szsrc -= 4;
1794         src += 4;
1795         decode = -1;
1796         ch = '\0';
1797       }
1798 
1799       // temp has 24 bits of input, so write that out as three bytes.
1800 
1801       if (destidx+3 > szdest) return -1;
1802       dest[destidx+2] = temp;
1803       temp >>= 8;
1804       dest[destidx+1] = temp;
1805       temp >>= 8;
1806       dest[destidx] = temp;
1807       destidx += 3;
1808     }
1809   } else {
1810     while (szsrc >= 4)  {
1811       if (!src[0] || !src[1] || !src[2] ||
1812           (temp = ((unsigned(unbase64[src[0]]) << 18) |
1813                    (unsigned(unbase64[src[1]]) << 12) |
1814                    (unsigned(unbase64[src[2]]) << 6) |
1815                    (unsigned(unbase64[src[3]])))) & 0x80000000) {
1816         GET_INPUT(first_no_dest, 4);
1817         GET_INPUT(second_no_dest, 3);
1818         GET_INPUT(third_no_dest, 2);
1819         GET_INPUT(fourth_no_dest, 1);
1820       } else {
1821         szsrc -= 4;
1822         src += 4;
1823         decode = -1;
1824         ch = '\0';
1825       }
1826       destidx += 3;
1827     }
1828   }
1829 
1830 #undef GET_INPUT
1831 
1832   // if the loop terminated because we read a bad character, return
1833   // now.
1834   if (decode < 0 && ch != '\0' &&
1835       ch != kPad64Equals && ch != kPad64Dot && !ascii_isspace(ch))
1836     return -1;
1837 
1838   if (ch == kPad64Equals || ch == kPad64Dot) {
1839     // if we stopped by hitting an '=' or '.', un-read that character -- we'll
1840     // look at it again when we count to check for the proper number of
1841     // equals signs at the end.
1842     ++szsrc;
1843     --src;
1844   } else {
1845     // This loop consumes 1 input byte per iteration.  It's used to
1846     // clean up the 0-3 input bytes remaining when the first, faster
1847     // loop finishes.  'temp' contains the data from 'state' input
1848     // characters read by the first loop.
1849     while (szsrc > 0)  {
1850       --szsrc;
1851       ch = *src++;
1852       decode = unbase64[ch];
1853       if (decode < 0) {
1854         if (ascii_isspace(ch)) {
1855           continue;
1856         } else if (ch == '\0') {
1857           break;
1858         } else if (ch == kPad64Equals || ch == kPad64Dot) {
1859           // back up one character; we'll read it again when we check
1860           // for the correct number of pad characters at the end.
1861           ++szsrc;
1862           --src;
1863           break;
1864         } else {
1865           return -1;
1866         }
1867       }
1868 
1869       // Each input character gives us six bits of output.
1870       temp = (temp << 6) | decode;
1871       ++state;
1872       if (state == 4) {
1873         // If we've accumulated 24 bits of output, write that out as
1874         // three bytes.
1875         if (dest) {
1876           if (destidx+3 > szdest) return -1;
1877           dest[destidx+2] = temp;
1878           temp >>= 8;
1879           dest[destidx+1] = temp;
1880           temp >>= 8;
1881           dest[destidx] = temp;
1882         }
1883         destidx += 3;
1884         state = 0;
1885         temp = 0;
1886       }
1887     }
1888   }
1889 
1890   // Process the leftover data contained in 'temp' at the end of the input.
1891   int expected_equals = 0;
1892   switch (state) {
1893     case 0:
1894       // Nothing left over; output is a multiple of 3 bytes.
1895       break;
1896 
1897     case 1:
1898       // Bad input; we have 6 bits left over.
1899       return -1;
1900 
1901     case 2:
1902       // Produce one more output byte from the 12 input bits we have left.
1903       if (dest) {
1904         if (destidx+1 > szdest) return -1;
1905         temp >>= 4;
1906         dest[destidx] = temp;
1907       }
1908       ++destidx;
1909       expected_equals = 2;
1910       break;
1911 
1912     case 3:
1913       // Produce two more output bytes from the 18 input bits we have left.
1914       if (dest) {
1915         if (destidx+2 > szdest) return -1;
1916         temp >>= 2;
1917         dest[destidx+1] = temp;
1918         temp >>= 8;
1919         dest[destidx] = temp;
1920       }
1921       destidx += 2;
1922       expected_equals = 1;
1923       break;
1924 
1925     default:
1926       // state should have no other values at this point.
1927       GOOGLE_LOG(FATAL) << "This can't happen; base64 decoder state = " << state;
1928   }
1929 
1930   // The remainder of the string should be all whitespace, mixed with
1931   // exactly 0 equals signs, or exactly 'expected_equals' equals
1932   // signs.  (Always accepting 0 equals signs is a google extension
1933   // not covered in the RFC, as is accepting dot as the pad character.)
1934 
1935   int equals = 0;
1936   while (szsrc > 0 && *src) {
1937     if (*src == kPad64Equals || *src == kPad64Dot)
1938       ++equals;
1939     else if (!ascii_isspace(*src))
1940       return -1;
1941     --szsrc;
1942     ++src;
1943   }
1944 
1945   return (equals == 0 || equals == expected_equals) ? destidx : -1;
1946 }
1947 
1948 // The arrays below were generated by the following code
1949 // #include <sys/time.h>
1950 // #include <stdlib.h>
1951 // #include <string.h>
1952 // #include <stdio.h>
1953 // main()
1954 // {
1955 //   static const char Base64[] =
1956 //     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
1957 //   const char *pos;
1958 //   int idx, i, j;
1959 //   printf("    ");
1960 //   for (i = 0; i < 255; i += 8) {
1961 //     for (j = i; j < i + 8; j++) {
1962 //       pos = strchr(Base64, j);
1963 //       if ((pos == nullptr) || (j == 0))
1964 //         idx = -1;
1965 //       else
1966 //         idx = pos - Base64;
1967 //       if (idx == -1)
1968 //         printf(" %2d,     ", idx);
1969 //       else
1970 //         printf(" %2d/""*%c*""/,", idx, j);
1971 //     }
1972 //     printf("\n    ");
1973 //   }
1974 // }
1975 //
1976 // where the value of "Base64[]" was replaced by one of the base-64 conversion
1977 // tables from the functions below.
1978 static const signed char kUnBase64[] = {
1979   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1980   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1981   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1982   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1983   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1984   -1,      -1,      -1,      62/*+*/, -1,      -1,      -1,      63/*/ */,
1985   52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
1986   60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
1987   -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
1988    7/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
1989   15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
1990   23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      -1,
1991   -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
1992   33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
1993   41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
1994   49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
1995   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1996   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1997   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1998   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1999   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2000   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2001   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2002   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2003   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2004   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2005   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2006   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2007   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2008   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2009   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2010   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
2011 };
2012 static const signed char kUnWebSafeBase64[] = {
2013   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2014   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2015   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2016   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2017   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2018   -1,      -1,      -1,      -1,      -1,      62/*-*/, -1,      -1,
2019   52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
2020   60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
2021   -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
2022    7/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
2023   15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
2024   23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      63/*_*/,
2025   -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
2026   33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
2027   41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
2028   49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
2029   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2030   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2031   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2032   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2033   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2034   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2035   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2036   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2037   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2038   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2039   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2040   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2041   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2042   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2043   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2044   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
2045 };
2046 
WebSafeBase64Unescape(const char * src,int szsrc,char * dest,int szdest)2047 int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) {
2048   return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64);
2049 }
2050 
Base64UnescapeInternal(const char * src,int slen,string * dest,const signed char * unbase64)2051 static bool Base64UnescapeInternal(const char* src, int slen, string* dest,
2052                                    const signed char* unbase64) {
2053   // Determine the size of the output string.  Base64 encodes every 3 bytes into
2054   // 4 characters.  any leftover chars are added directly for good measure.
2055   // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548
2056   const int dest_len = 3 * (slen / 4) + (slen % 4);
2057 
2058   dest->resize(dest_len);
2059 
2060   // We are getting the destination buffer by getting the beginning of the
2061   // string and converting it into a char *.
2062   const int len = Base64UnescapeInternal(src, slen, string_as_array(dest),
2063                                          dest_len, unbase64);
2064   if (len < 0) {
2065     dest->clear();
2066     return false;
2067   }
2068 
2069   // could be shorter if there was padding
2070   GOOGLE_DCHECK_LE(len, dest_len);
2071   dest->erase(len);
2072 
2073   return true;
2074 }
2075 
Base64Unescape(StringPiece src,string * dest)2076 bool Base64Unescape(StringPiece src, string* dest) {
2077   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
2078 }
2079 
WebSafeBase64Unescape(StringPiece src,string * dest)2080 bool WebSafeBase64Unescape(StringPiece src, string* dest) {
2081   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
2082 }
2083 
Base64EscapeInternal(const unsigned char * src,int szsrc,char * dest,int szdest,const char * base64,bool do_padding)2084 int Base64EscapeInternal(const unsigned char *src, int szsrc,
2085                          char *dest, int szdest, const char *base64,
2086                          bool do_padding) {
2087   static const char kPad64 = '=';
2088 
2089   if (szsrc <= 0) return 0;
2090 
2091   if (szsrc * 4 > szdest * 3) return 0;
2092 
2093   char *cur_dest = dest;
2094   const unsigned char *cur_src = src;
2095 
2096   char *limit_dest = dest + szdest;
2097   const unsigned char *limit_src = src + szsrc;
2098 
2099   // Three bytes of data encodes to four characters of cyphertext.
2100   // So we can pump through three-byte chunks atomically.
2101   while (cur_src < limit_src - 3) {  // keep going as long as we have >= 32 bits
2102     uint32 in = BigEndian::Load32(cur_src) >> 8;
2103 
2104     cur_dest[0] = base64[in >> 18];
2105     in &= 0x3FFFF;
2106     cur_dest[1] = base64[in >> 12];
2107     in &= 0xFFF;
2108     cur_dest[2] = base64[in >> 6];
2109     in &= 0x3F;
2110     cur_dest[3] = base64[in];
2111 
2112     cur_dest += 4;
2113     cur_src += 3;
2114   }
2115   // To save time, we didn't update szdest or szsrc in the loop.  So do it now.
2116   szdest = limit_dest - cur_dest;
2117   szsrc = limit_src - cur_src;
2118 
2119   /* now deal with the tail (<=3 bytes) */
2120   switch (szsrc) {
2121     case 0:
2122       // Nothing left; nothing more to do.
2123       break;
2124     case 1: {
2125       // One byte left: this encodes to two characters, and (optionally)
2126       // two pad characters to round out the four-character cypherblock.
2127       if ((szdest -= 2) < 0) return 0;
2128       uint32 in = cur_src[0];
2129       cur_dest[0] = base64[in >> 2];
2130       in &= 0x3;
2131       cur_dest[1] = base64[in << 4];
2132       cur_dest += 2;
2133       if (do_padding) {
2134         if ((szdest -= 2) < 0) return 0;
2135         cur_dest[0] = kPad64;
2136         cur_dest[1] = kPad64;
2137         cur_dest += 2;
2138       }
2139       break;
2140     }
2141     case 2: {
2142       // Two bytes left: this encodes to three characters, and (optionally)
2143       // one pad character to round out the four-character cypherblock.
2144       if ((szdest -= 3) < 0) return 0;
2145       uint32 in = BigEndian::Load16(cur_src);
2146       cur_dest[0] = base64[in >> 10];
2147       in &= 0x3FF;
2148       cur_dest[1] = base64[in >> 4];
2149       in &= 0x00F;
2150       cur_dest[2] = base64[in << 2];
2151       cur_dest += 3;
2152       if (do_padding) {
2153         if ((szdest -= 1) < 0) return 0;
2154         cur_dest[0] = kPad64;
2155         cur_dest += 1;
2156       }
2157       break;
2158     }
2159     case 3: {
2160       // Three bytes left: same as in the big loop above.  We can't do this in
2161       // the loop because the loop above always reads 4 bytes, and the fourth
2162       // byte is past the end of the input.
2163       if ((szdest -= 4) < 0) return 0;
2164       uint32 in = (cur_src[0] << 16) + BigEndian::Load16(cur_src + 1);
2165       cur_dest[0] = base64[in >> 18];
2166       in &= 0x3FFFF;
2167       cur_dest[1] = base64[in >> 12];
2168       in &= 0xFFF;
2169       cur_dest[2] = base64[in >> 6];
2170       in &= 0x3F;
2171       cur_dest[3] = base64[in];
2172       cur_dest += 4;
2173       break;
2174     }
2175     default:
2176       // Should not be reached: blocks of 4 bytes are handled
2177       // in the while loop before this switch statement.
2178       GOOGLE_LOG(FATAL) << "Logic problem? szsrc = " << szsrc;
2179       break;
2180   }
2181   return (cur_dest - dest);
2182 }
2183 
2184 static const char kBase64Chars[] =
2185 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
2186 
2187 static const char kWebSafeBase64Chars[] =
2188 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
2189 
Base64Escape(const unsigned char * src,int szsrc,char * dest,int szdest)2190 int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) {
2191   return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true);
2192 }
WebSafeBase64Escape(const unsigned char * src,int szsrc,char * dest,int szdest,bool do_padding)2193 int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest,
2194                         int szdest, bool do_padding) {
2195   return Base64EscapeInternal(src, szsrc, dest, szdest,
2196                               kWebSafeBase64Chars, do_padding);
2197 }
2198 
Base64EscapeInternal(const unsigned char * src,int szsrc,string * dest,bool do_padding,const char * base64_chars)2199 void Base64EscapeInternal(const unsigned char* src, int szsrc,
2200                           string* dest, bool do_padding,
2201                           const char* base64_chars) {
2202   const int calc_escaped_size =
2203     CalculateBase64EscapedLen(szsrc, do_padding);
2204   dest->resize(calc_escaped_size);
2205   const int escaped_len = Base64EscapeInternal(src, szsrc,
2206                                                string_as_array(dest),
2207                                                dest->size(),
2208                                                base64_chars,
2209                                                do_padding);
2210   GOOGLE_DCHECK_EQ(calc_escaped_size, escaped_len);
2211   dest->erase(escaped_len);
2212 }
2213 
Base64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2214 void Base64Escape(const unsigned char *src, int szsrc,
2215                   string* dest, bool do_padding) {
2216   Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars);
2217 }
2218 
WebSafeBase64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2219 void WebSafeBase64Escape(const unsigned char *src, int szsrc,
2220                          string *dest, bool do_padding) {
2221   Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars);
2222 }
2223 
Base64Escape(StringPiece src,string * dest)2224 void Base64Escape(StringPiece src, string* dest) {
2225   Base64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2226                src.size(), dest, true);
2227 }
2228 
WebSafeBase64Escape(StringPiece src,string * dest)2229 void WebSafeBase64Escape(StringPiece src, string* dest) {
2230   WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2231                       src.size(), dest, false);
2232 }
2233 
WebSafeBase64EscapeWithPadding(StringPiece src,string * dest)2234 void WebSafeBase64EscapeWithPadding(StringPiece src, string* dest) {
2235   WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2236                       src.size(), dest, true);
2237 }
2238 
2239 // Helper to append a Unicode code point to a string as UTF8, without bringing
2240 // in any external dependencies.
EncodeAsUTF8Char(uint32 code_point,char * output)2241 int EncodeAsUTF8Char(uint32 code_point, char* output) {
2242   uint32 tmp = 0;
2243   int len = 0;
2244   if (code_point <= 0x7f) {
2245     tmp = code_point;
2246     len = 1;
2247   } else if (code_point <= 0x07ff) {
2248     tmp = 0x0000c080 |
2249         ((code_point & 0x07c0) << 2) |
2250         (code_point & 0x003f);
2251     len = 2;
2252   } else if (code_point <= 0xffff) {
2253     tmp = 0x00e08080 |
2254         ((code_point & 0xf000) << 4) |
2255         ((code_point & 0x0fc0) << 2) |
2256         (code_point & 0x003f);
2257     len = 3;
2258   } else {
2259     // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
2260     // normally only defined up to there as well.
2261     tmp = 0xf0808080 |
2262         ((code_point & 0x1c0000) << 6) |
2263         ((code_point & 0x03f000) << 4) |
2264         ((code_point & 0x000fc0) << 2) |
2265         (code_point & 0x003f);
2266     len = 4;
2267   }
2268   tmp = ghtonl(tmp);
2269   memcpy(output, reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
2270   return len;
2271 }
2272 
2273 // Table of UTF-8 character lengths, based on first byte
2274 static const unsigned char kUTF8LenTbl[256] = {
2275   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2276   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2277   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2278   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2279 
2280   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2281   1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2282   2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
2283   3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
2284 };
2285 
2286 // Return length of a single UTF-8 source character
UTF8FirstLetterNumBytes(const char * src,int len)2287 int UTF8FirstLetterNumBytes(const char* src, int len) {
2288   if (len == 0) {
2289     return 0;
2290   }
2291   return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
2292 }
2293 
2294 // ----------------------------------------------------------------------
2295 // CleanStringLineEndings()
2296 //   Clean up a multi-line string to conform to Unix line endings.
2297 //   Reads from src and appends to dst, so usually dst should be empty.
2298 //
2299 //   If there is no line ending at the end of a non-empty string, it can
2300 //   be added automatically.
2301 //
2302 //   Four different types of input are correctly handled:
2303 //
2304 //     - Unix/Linux files: line ending is LF: pass through unchanged
2305 //
2306 //     - DOS/Windows files: line ending is CRLF: convert to LF
2307 //
2308 //     - Legacy Mac files: line ending is CR: convert to LF
2309 //
2310 //     - Garbled files: random line endings: convert gracefully
2311 //                      lonely CR, lonely LF, CRLF: convert to LF
2312 //
2313 //   @param src The multi-line string to convert
2314 //   @param dst The converted string is appended to this string
2315 //   @param auto_end_last_line Automatically terminate the last line
2316 //
2317 //   Limitations:
2318 //
2319 //     This does not do the right thing for CRCRLF files created by
2320 //     broken programs that do another Unix->DOS conversion on files
2321 //     that are already in CRLF format.  For this, a two-pass approach
2322 //     brute-force would be needed that
2323 //
2324 //       (1) determines the presence of LF (first one is ok)
2325 //       (2) if yes, removes any CR, else convert every CR to LF
2326 
CleanStringLineEndings(const string & src,string * dst,bool auto_end_last_line)2327 void CleanStringLineEndings(const string &src, string *dst,
2328                             bool auto_end_last_line) {
2329   if (dst->empty()) {
2330     dst->append(src);
2331     CleanStringLineEndings(dst, auto_end_last_line);
2332   } else {
2333     string tmp = src;
2334     CleanStringLineEndings(&tmp, auto_end_last_line);
2335     dst->append(tmp);
2336   }
2337 }
2338 
CleanStringLineEndings(string * str,bool auto_end_last_line)2339 void CleanStringLineEndings(string *str, bool auto_end_last_line) {
2340   ptrdiff_t output_pos = 0;
2341   bool r_seen = false;
2342   ptrdiff_t len = str->size();
2343 
2344   char *p = &(*str)[0];
2345 
2346   for (ptrdiff_t input_pos = 0; input_pos < len;) {
2347     if (!r_seen && input_pos + 8 < len) {
2348       uint64_t v = GOOGLE_UNALIGNED_LOAD64(p + input_pos);
2349       // Loop over groups of 8 bytes at a time until we come across
2350       // a word that has a byte whose value is less than or equal to
2351       // '\r' (i.e. could contain a \n (0x0a) or a \r (0x0d) ).
2352       //
2353       // We use a has_less macro that quickly tests a whole 64-bit
2354       // word to see if any of the bytes has a value < N.
2355       //
2356       // For more details, see:
2357       //   http://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
2358 #define has_less(x, n) (((x) - ~0ULL / 255 * (n)) & ~(x) & ~0ULL / 255 * 128)
2359       if (!has_less(v, '\r' + 1)) {
2360 #undef has_less
2361         // No byte in this word has a value that could be a \r or a \n
2362         if (output_pos != input_pos) {
2363           GOOGLE_UNALIGNED_STORE64(p + output_pos, v);
2364         }
2365         input_pos += 8;
2366         output_pos += 8;
2367         continue;
2368       }
2369     }
2370     string::const_reference in = p[input_pos];
2371     if (in == '\r') {
2372       if (r_seen) p[output_pos++] = '\n';
2373       r_seen = true;
2374     } else if (in == '\n') {
2375       if (input_pos != output_pos)
2376         p[output_pos++] = '\n';
2377       else
2378         output_pos++;
2379       r_seen = false;
2380     } else {
2381       if (r_seen) p[output_pos++] = '\n';
2382       r_seen = false;
2383       if (input_pos != output_pos)
2384         p[output_pos++] = in;
2385       else
2386         output_pos++;
2387     }
2388     input_pos++;
2389   }
2390   if (r_seen ||
2391       (auto_end_last_line && output_pos > 0 && p[output_pos - 1] != '\n')) {
2392     str->resize(output_pos + 1);
2393     str->operator[](output_pos) = '\n';
2394   } else if (output_pos < len) {
2395     str->resize(output_pos);
2396   }
2397 }
2398 
2399 namespace internal {
2400 
2401 // ----------------------------------------------------------------------
2402 // NoLocaleStrtod()
2403 //   This code will make you cry.
2404 // ----------------------------------------------------------------------
2405 
2406 namespace {
2407 
2408 // Returns a string identical to *input except that the character pointed to
2409 // by radix_pos (which should be '.') is replaced with the locale-specific
2410 // radix character.
LocalizeRadix(const char * input,const char * radix_pos)2411 std::string LocalizeRadix(const char *input, const char *radix_pos) {
2412   // Determine the locale-specific radix character by calling sprintf() to
2413   // print the number 1.5, then stripping off the digits.  As far as I can
2414   // tell, this is the only portable, thread-safe way to get the C library
2415   // to divuldge the locale's radix character.  No, localeconv() is NOT
2416   // thread-safe.
2417   char temp[16];
2418   int size = snprintf(temp, sizeof(temp), "%.1f", 1.5);
2419   GOOGLE_CHECK_EQ(temp[0], '1');
2420   GOOGLE_CHECK_EQ(temp[size - 1], '5');
2421   GOOGLE_CHECK_LE(size, 6);
2422 
2423   // Now replace the '.' in the input with it.
2424   std::string result;
2425   result.reserve(strlen(input) + size - 3);
2426   result.append(input, radix_pos);
2427   result.append(temp + 1, size - 2);
2428   result.append(radix_pos + 1);
2429   return result;
2430 }
2431 
2432 }  // namespace
2433 
NoLocaleStrtod(const char * str,char ** endptr)2434 double NoLocaleStrtod(const char *str, char **endptr) {
2435   // We cannot simply set the locale to "C" temporarily with setlocale()
2436   // as this is not thread-safe.  Instead, we try to parse in the current
2437   // locale first.  If parsing stops at a '.' character, then this is a
2438   // pretty good hint that we're actually in some other locale in which
2439   // '.' is not the radix character.
2440 
2441   char *temp_endptr;
2442   double result = strtod(str, &temp_endptr);
2443   if (endptr != NULL) *endptr = temp_endptr;
2444   if (*temp_endptr != '.') return result;
2445 
2446   // Parsing halted on a '.'.  Perhaps we're in a different locale?  Let's
2447   // try to replace the '.' with a locale-specific radix character and
2448   // try again.
2449   std::string localized = LocalizeRadix(str, temp_endptr);
2450   const char *localized_cstr = localized.c_str();
2451   char *localized_endptr;
2452   result = strtod(localized_cstr, &localized_endptr);
2453   if ((localized_endptr - localized_cstr) > (temp_endptr - str)) {
2454     // This attempt got further, so replacing the decimal must have helped.
2455     // Update endptr to point at the right location.
2456     if (endptr != NULL) {
2457       // size_diff is non-zero if the localized radix has multiple bytes.
2458       int size_diff = localized.size() - strlen(str);
2459       // const_cast is necessary to match the strtod() interface.
2460       *endptr = const_cast<char *>(
2461           str + (localized_endptr - localized_cstr - size_diff));
2462     }
2463   }
2464 
2465   return result;
2466 }
2467 
2468 }  // namespace internal
2469 
2470 }  // namespace protobuf
2471 }  // namespace google
2472