1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31 // from google3/strings/strutil.cc
32
33 #include <google/protobuf/stubs/strutil.h>
34 #include <google/protobuf/stubs/mathlimits.h>
35
36 #include <errno.h>
37 #include <float.h> // FLT_DIG and DBL_DIG
38 #include <limits>
39 #include <limits.h>
40 #include <stdio.h>
41 #include <iterator>
42
43 #include <google/protobuf/stubs/stl_util.h>
44
45 #ifdef _WIN32
46 // MSVC has only _snprintf, not snprintf.
47 //
48 // MinGW has both snprintf and _snprintf, but they appear to be different
49 // functions. The former is buggy. When invoked like so:
50 // char buffer[32];
51 // snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f);
52 // it prints "1.23000e+10". This is plainly wrong: %g should never print
53 // trailing zeros after the decimal point. For some reason this bug only
54 // occurs with some input values, not all. In any case, _snprintf does the
55 // right thing, so we use it.
56 #define snprintf _snprintf
57 #endif
58
59 namespace google {
60 namespace protobuf {
61
62 // These are defined as macros on some platforms. #undef them so that we can
63 // redefine them.
64 #undef isxdigit
65 #undef isprint
66
67 // The definitions of these in ctype.h change based on locale. Since our
68 // string manipulation is all in relation to the protocol buffer and C++
69 // languages, we always want to use the C locale. So, we re-define these
70 // exactly as we want them.
isxdigit(char c)71 inline bool isxdigit(char c) {
72 return ('0' <= c && c <= '9') ||
73 ('a' <= c && c <= 'f') ||
74 ('A' <= c && c <= 'F');
75 }
76
isprint(char c)77 inline bool isprint(char c) {
78 return c >= 0x20 && c <= 0x7E;
79 }
80
81 // ----------------------------------------------------------------------
82 // StripString
83 // Replaces any occurrence of the character 'remove' (or the characters
84 // in 'remove') with the character 'replacewith'.
85 // ----------------------------------------------------------------------
StripString(string * s,const char * remove,char replacewith)86 void StripString(string* s, const char* remove, char replacewith) {
87 const char * str_start = s->c_str();
88 const char * str = str_start;
89 for (str = strpbrk(str, remove);
90 str != NULL;
91 str = strpbrk(str + 1, remove)) {
92 (*s)[str - str_start] = replacewith;
93 }
94 }
95
96 // ----------------------------------------------------------------------
97 // ReplaceCharacters
98 // Replaces any occurrence of the character 'remove' (or the characters
99 // in 'remove') with the character 'replacewith'.
100 // ----------------------------------------------------------------------
ReplaceCharacters(string * s,const char * remove,char replacewith)101 void ReplaceCharacters(string *s, const char *remove, char replacewith) {
102 const char *str_start = s->c_str();
103 const char *str = str_start;
104 for (str = strpbrk(str, remove);
105 str != NULL;
106 str = strpbrk(str + 1, remove)) {
107 (*s)[str - str_start] = replacewith;
108 }
109 }
110
StripWhitespace(string * str)111 void StripWhitespace(string* str) {
112 int str_length = str->length();
113
114 // Strip off leading whitespace.
115 int first = 0;
116 while (first < str_length && ascii_isspace(str->at(first))) {
117 ++first;
118 }
119 // If entire string is white space.
120 if (first == str_length) {
121 str->clear();
122 return;
123 }
124 if (first > 0) {
125 str->erase(0, first);
126 str_length -= first;
127 }
128
129 // Strip off trailing whitespace.
130 int last = str_length - 1;
131 while (last >= 0 && ascii_isspace(str->at(last))) {
132 --last;
133 }
134 if (last != (str_length - 1) && last >= 0) {
135 str->erase(last + 1, string::npos);
136 }
137 }
138
139 // ----------------------------------------------------------------------
140 // StringReplace()
141 // Replace the "old" pattern with the "new" pattern in a string,
142 // and append the result to "res". If replace_all is false,
143 // it only replaces the first instance of "old."
144 // ----------------------------------------------------------------------
145
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all,string * res)146 void StringReplace(const string& s, const string& oldsub,
147 const string& newsub, bool replace_all,
148 string* res) {
149 if (oldsub.empty()) {
150 res->append(s); // if empty, append the given string.
151 return;
152 }
153
154 string::size_type start_pos = 0;
155 string::size_type pos;
156 do {
157 pos = s.find(oldsub, start_pos);
158 if (pos == string::npos) {
159 break;
160 }
161 res->append(s, start_pos, pos - start_pos);
162 res->append(newsub);
163 start_pos = pos + oldsub.size(); // start searching again after the "old"
164 } while (replace_all);
165 res->append(s, start_pos, s.length() - start_pos);
166 }
167
168 // ----------------------------------------------------------------------
169 // StringReplace()
170 // Give me a string and two patterns "old" and "new", and I replace
171 // the first instance of "old" in the string with "new", if it
172 // exists. If "global" is true; call this repeatedly until it
173 // fails. RETURN a new string, regardless of whether the replacement
174 // happened or not.
175 // ----------------------------------------------------------------------
176
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all)177 string StringReplace(const string& s, const string& oldsub,
178 const string& newsub, bool replace_all) {
179 string ret;
180 StringReplace(s, oldsub, newsub, replace_all, &ret);
181 return ret;
182 }
183
184 // ----------------------------------------------------------------------
185 // SplitStringUsing()
186 // Split a string using a character delimiter. Append the components
187 // to 'result'.
188 //
189 // Note: For multi-character delimiters, this routine will split on *ANY* of
190 // the characters in the string, not the entire string as a single delimiter.
191 // ----------------------------------------------------------------------
192 template <typename ITR>
193 static inline
SplitStringToIteratorUsing(const string & full,const char * delim,ITR & result)194 void SplitStringToIteratorUsing(const string& full,
195 const char* delim,
196 ITR& result) {
197 // Optimize the common case where delim is a single character.
198 if (delim[0] != '\0' && delim[1] == '\0') {
199 char c = delim[0];
200 const char* p = full.data();
201 const char* end = p + full.size();
202 while (p != end) {
203 if (*p == c) {
204 ++p;
205 } else {
206 const char* start = p;
207 while (++p != end && *p != c);
208 *result++ = string(start, p - start);
209 }
210 }
211 return;
212 }
213
214 string::size_type begin_index, end_index;
215 begin_index = full.find_first_not_of(delim);
216 while (begin_index != string::npos) {
217 end_index = full.find_first_of(delim, begin_index);
218 if (end_index == string::npos) {
219 *result++ = full.substr(begin_index);
220 return;
221 }
222 *result++ = full.substr(begin_index, (end_index - begin_index));
223 begin_index = full.find_first_not_of(delim, end_index);
224 }
225 }
226
SplitStringUsing(const string & full,const char * delim,vector<string> * result)227 void SplitStringUsing(const string& full,
228 const char* delim,
229 vector<string>* result) {
230 std::back_insert_iterator< vector<string> > it(*result);
231 SplitStringToIteratorUsing(full, delim, it);
232 }
233
234 // Split a string using a character delimiter. Append the components
235 // to 'result'. If there are consecutive delimiters, this function
236 // will return corresponding empty strings. The string is split into
237 // at most the specified number of pieces greedily. This means that the
238 // last piece may possibly be split further. To split into as many pieces
239 // as possible, specify 0 as the number of pieces.
240 //
241 // If "full" is the empty string, yields an empty string as the only value.
242 //
243 // If "pieces" is negative for some reason, it returns the whole string
244 // ----------------------------------------------------------------------
245 template <typename StringType, typename ITR>
246 static inline
SplitStringToIteratorAllowEmpty(const StringType & full,const char * delim,int pieces,ITR & result)247 void SplitStringToIteratorAllowEmpty(const StringType& full,
248 const char* delim,
249 int pieces,
250 ITR& result) {
251 string::size_type begin_index, end_index;
252 begin_index = 0;
253
254 for (int i = 0; (i < pieces-1) || (pieces == 0); i++) {
255 end_index = full.find_first_of(delim, begin_index);
256 if (end_index == string::npos) {
257 *result++ = full.substr(begin_index);
258 return;
259 }
260 *result++ = full.substr(begin_index, (end_index - begin_index));
261 begin_index = end_index + 1;
262 }
263 *result++ = full.substr(begin_index);
264 }
265
SplitStringAllowEmpty(const string & full,const char * delim,vector<string> * result)266 void SplitStringAllowEmpty(const string& full, const char* delim,
267 vector<string>* result) {
268 std::back_insert_iterator<vector<string> > it(*result);
269 SplitStringToIteratorAllowEmpty(full, delim, 0, it);
270 }
271
272 // ----------------------------------------------------------------------
273 // JoinStrings()
274 // This merges a vector of string components with delim inserted
275 // as separaters between components.
276 //
277 // ----------------------------------------------------------------------
278 template <class ITERATOR>
JoinStringsIterator(const ITERATOR & start,const ITERATOR & end,const char * delim,string * result)279 static void JoinStringsIterator(const ITERATOR& start,
280 const ITERATOR& end,
281 const char* delim,
282 string* result) {
283 GOOGLE_CHECK(result != NULL);
284 result->clear();
285 int delim_length = strlen(delim);
286
287 // Precompute resulting length so we can reserve() memory in one shot.
288 int length = 0;
289 for (ITERATOR iter = start; iter != end; ++iter) {
290 if (iter != start) {
291 length += delim_length;
292 }
293 length += iter->size();
294 }
295 result->reserve(length);
296
297 // Now combine everything.
298 for (ITERATOR iter = start; iter != end; ++iter) {
299 if (iter != start) {
300 result->append(delim, delim_length);
301 }
302 result->append(iter->data(), iter->size());
303 }
304 }
305
JoinStrings(const vector<string> & components,const char * delim,string * result)306 void JoinStrings(const vector<string>& components,
307 const char* delim,
308 string * result) {
309 JoinStringsIterator(components.begin(), components.end(), delim, result);
310 }
311
312 // ----------------------------------------------------------------------
313 // UnescapeCEscapeSequences()
314 // This does all the unescaping that C does: \ooo, \r, \n, etc
315 // Returns length of resulting string.
316 // The implementation of \x parses any positive number of hex digits,
317 // but it is an error if the value requires more than 8 bits, and the
318 // result is truncated to 8 bits.
319 //
320 // The second call stores its errors in a supplied string vector.
321 // If the string vector pointer is NULL, it reports the errors with LOG().
322 // ----------------------------------------------------------------------
323
324 #define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7'))
325
326 // Protocol buffers doesn't ever care about errors, but I don't want to remove
327 // the code.
328 #define LOG_STRING(LEVEL, VECTOR) GOOGLE_LOG_IF(LEVEL, false)
329
UnescapeCEscapeSequences(const char * source,char * dest)330 int UnescapeCEscapeSequences(const char* source, char* dest) {
331 return UnescapeCEscapeSequences(source, dest, NULL);
332 }
333
UnescapeCEscapeSequences(const char * source,char * dest,vector<string> * errors)334 int UnescapeCEscapeSequences(const char* source, char* dest,
335 vector<string> *errors) {
336 GOOGLE_DCHECK(errors == NULL) << "Error reporting not implemented.";
337
338 char* d = dest;
339 const char* p = source;
340
341 // Small optimization for case where source = dest and there's no escaping
342 while ( p == d && *p != '\0' && *p != '\\' )
343 p++, d++;
344
345 while (*p != '\0') {
346 if (*p != '\\') {
347 *d++ = *p++;
348 } else {
349 switch ( *++p ) { // skip past the '\\'
350 case '\0':
351 LOG_STRING(ERROR, errors) << "String cannot end with \\";
352 *d = '\0';
353 return d - dest; // we're done with p
354 case 'a': *d++ = '\a'; break;
355 case 'b': *d++ = '\b'; break;
356 case 'f': *d++ = '\f'; break;
357 case 'n': *d++ = '\n'; break;
358 case 'r': *d++ = '\r'; break;
359 case 't': *d++ = '\t'; break;
360 case 'v': *d++ = '\v'; break;
361 case '\\': *d++ = '\\'; break;
362 case '?': *d++ = '\?'; break; // \? Who knew?
363 case '\'': *d++ = '\''; break;
364 case '"': *d++ = '\"'; break;
365 case '0': case '1': case '2': case '3': // octal digit: 1 to 3 digits
366 case '4': case '5': case '6': case '7': {
367 char ch = *p - '0';
368 if ( IS_OCTAL_DIGIT(p[1]) )
369 ch = ch * 8 + *++p - '0';
370 if ( IS_OCTAL_DIGIT(p[1]) ) // safe (and easy) to do this twice
371 ch = ch * 8 + *++p - '0'; // now points at last digit
372 *d++ = ch;
373 break;
374 }
375 case 'x': case 'X': {
376 if (!isxdigit(p[1])) {
377 if (p[1] == '\0') {
378 LOG_STRING(ERROR, errors) << "String cannot end with \\x";
379 } else {
380 LOG_STRING(ERROR, errors) <<
381 "\\x cannot be followed by non-hex digit: \\" << *p << p[1];
382 }
383 break;
384 }
385 unsigned int ch = 0;
386 const char *hex_start = p;
387 while (isxdigit(p[1])) // arbitrarily many hex digits
388 ch = (ch << 4) + hex_digit_to_int(*++p);
389 if (ch > 0xFF)
390 LOG_STRING(ERROR, errors) << "Value of " <<
391 "\\" << string(hex_start, p+1-hex_start) << " exceeds 8 bits";
392 *d++ = ch;
393 break;
394 }
395 #if 0 // TODO(kenton): Support \u and \U? Requires runetochar().
396 case 'u': {
397 // \uhhhh => convert 4 hex digits to UTF-8
398 char32 rune = 0;
399 const char *hex_start = p;
400 for (int i = 0; i < 4; ++i) {
401 if (isxdigit(p[1])) { // Look one char ahead.
402 rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
403 } else {
404 LOG_STRING(ERROR, errors)
405 << "\\u must be followed by 4 hex digits: \\"
406 << string(hex_start, p+1-hex_start);
407 break;
408 }
409 }
410 d += runetochar(d, &rune);
411 break;
412 }
413 case 'U': {
414 // \Uhhhhhhhh => convert 8 hex digits to UTF-8
415 char32 rune = 0;
416 const char *hex_start = p;
417 for (int i = 0; i < 8; ++i) {
418 if (isxdigit(p[1])) { // Look one char ahead.
419 // Don't change rune until we're sure this
420 // is within the Unicode limit, but do advance p.
421 char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
422 if (newrune > 0x10FFFF) {
423 LOG_STRING(ERROR, errors)
424 << "Value of \\"
425 << string(hex_start, p + 1 - hex_start)
426 << " exceeds Unicode limit (0x10FFFF)";
427 break;
428 } else {
429 rune = newrune;
430 }
431 } else {
432 LOG_STRING(ERROR, errors)
433 << "\\U must be followed by 8 hex digits: \\"
434 << string(hex_start, p+1-hex_start);
435 break;
436 }
437 }
438 d += runetochar(d, &rune);
439 break;
440 }
441 #endif
442 default:
443 LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p;
444 }
445 p++; // read past letter we escaped
446 }
447 }
448 *d = '\0';
449 return d - dest;
450 }
451
452 // ----------------------------------------------------------------------
453 // UnescapeCEscapeString()
454 // This does the same thing as UnescapeCEscapeSequences, but creates
455 // a new string. The caller does not need to worry about allocating
456 // a dest buffer. This should be used for non performance critical
457 // tasks such as printing debug messages. It is safe for src and dest
458 // to be the same.
459 //
460 // The second call stores its errors in a supplied string vector.
461 // If the string vector pointer is NULL, it reports the errors with LOG().
462 //
463 // In the first and second calls, the length of dest is returned. In the
464 // the third call, the new string is returned.
465 // ----------------------------------------------------------------------
UnescapeCEscapeString(const string & src,string * dest)466 int UnescapeCEscapeString(const string& src, string* dest) {
467 return UnescapeCEscapeString(src, dest, NULL);
468 }
469
UnescapeCEscapeString(const string & src,string * dest,vector<string> * errors)470 int UnescapeCEscapeString(const string& src, string* dest,
471 vector<string> *errors) {
472 scoped_array<char> unescaped(new char[src.size() + 1]);
473 int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), errors);
474 GOOGLE_CHECK(dest);
475 dest->assign(unescaped.get(), len);
476 return len;
477 }
478
UnescapeCEscapeString(const string & src)479 string UnescapeCEscapeString(const string& src) {
480 scoped_array<char> unescaped(new char[src.size() + 1]);
481 int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), NULL);
482 return string(unescaped.get(), len);
483 }
484
485 // ----------------------------------------------------------------------
486 // CEscapeString()
487 // CHexEscapeString()
488 // Copies 'src' to 'dest', escaping dangerous characters using
489 // C-style escape sequences. This is very useful for preparing query
490 // flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
491 // hexadecimal rather than octal sequences.
492 // Returns the number of bytes written to 'dest' (not including the \0)
493 // or -1 if there was insufficient space.
494 //
495 // Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped.
496 // ----------------------------------------------------------------------
CEscapeInternal(const char * src,int src_len,char * dest,int dest_len,bool use_hex,bool utf8_safe)497 int CEscapeInternal(const char* src, int src_len, char* dest,
498 int dest_len, bool use_hex, bool utf8_safe) {
499 const char* src_end = src + src_len;
500 int used = 0;
501 bool last_hex_escape = false; // true if last output char was \xNN
502
503 for (; src < src_end; src++) {
504 if (dest_len - used < 2) // Need space for two letter escape
505 return -1;
506
507 bool is_hex_escape = false;
508 switch (*src) {
509 case '\n': dest[used++] = '\\'; dest[used++] = 'n'; break;
510 case '\r': dest[used++] = '\\'; dest[used++] = 'r'; break;
511 case '\t': dest[used++] = '\\'; dest[used++] = 't'; break;
512 case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
513 case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
514 case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
515 default:
516 // Note that if we emit \xNN and the src character after that is a hex
517 // digit then that digit must be escaped too to prevent it being
518 // interpreted as part of the character code by C.
519 if ((!utf8_safe || static_cast<uint8>(*src) < 0x80) &&
520 (!isprint(*src) ||
521 (last_hex_escape && isxdigit(*src)))) {
522 if (dest_len - used < 4) // need space for 4 letter escape
523 return -1;
524 sprintf(dest + used, (use_hex ? "\\x%02x" : "\\%03o"),
525 static_cast<uint8>(*src));
526 is_hex_escape = use_hex;
527 used += 4;
528 } else {
529 dest[used++] = *src; break;
530 }
531 }
532 last_hex_escape = is_hex_escape;
533 }
534
535 if (dest_len - used < 1) // make sure that there is room for \0
536 return -1;
537
538 dest[used] = '\0'; // doesn't count towards return value though
539 return used;
540 }
541
542 // Calculates the length of the C-style escaped version of 'src'.
543 // Assumes that non-printable characters are escaped using octal sequences, and
544 // that UTF-8 bytes are not handled specially.
CEscapedLength(StringPiece src)545 static inline size_t CEscapedLength(StringPiece src) {
546 static char c_escaped_len[256] = {
547 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4, // \t, \n, \r
548 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
549 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // ", '
550 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // '0'..'9'
551 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'A'..'O'
552 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, // 'P'..'Z', '\'
553 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'a'..'o'
554 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, // 'p'..'z', DEL
555 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
556 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
557 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
558 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
559 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
560 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
561 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
562 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
563 };
564
565 size_t escaped_len = 0;
566 for (int i = 0; i < src.size(); ++i) {
567 unsigned char c = static_cast<unsigned char>(src[i]);
568 escaped_len += c_escaped_len[c];
569 }
570 return escaped_len;
571 }
572
573 // ----------------------------------------------------------------------
574 // Escapes 'src' using C-style escape sequences, and appends the escaped string
575 // to 'dest'. This version is faster than calling CEscapeInternal as it computes
576 // the required space using a lookup table, and also does not do any special
577 // handling for Hex or UTF-8 characters.
578 // ----------------------------------------------------------------------
CEscapeAndAppend(StringPiece src,string * dest)579 void CEscapeAndAppend(StringPiece src, string* dest) {
580 size_t escaped_len = CEscapedLength(src);
581 if (escaped_len == src.size()) {
582 dest->append(src.data(), src.size());
583 return;
584 }
585
586 size_t cur_dest_len = dest->size();
587 dest->resize(cur_dest_len + escaped_len);
588 char* append_ptr = &(*dest)[cur_dest_len];
589
590 for (int i = 0; i < src.size(); ++i) {
591 unsigned char c = static_cast<unsigned char>(src[i]);
592 switch (c) {
593 case '\n': *append_ptr++ = '\\'; *append_ptr++ = 'n'; break;
594 case '\r': *append_ptr++ = '\\'; *append_ptr++ = 'r'; break;
595 case '\t': *append_ptr++ = '\\'; *append_ptr++ = 't'; break;
596 case '\"': *append_ptr++ = '\\'; *append_ptr++ = '\"'; break;
597 case '\'': *append_ptr++ = '\\'; *append_ptr++ = '\''; break;
598 case '\\': *append_ptr++ = '\\'; *append_ptr++ = '\\'; break;
599 default:
600 if (!isprint(c)) {
601 *append_ptr++ = '\\';
602 *append_ptr++ = '0' + c / 64;
603 *append_ptr++ = '0' + (c % 64) / 8;
604 *append_ptr++ = '0' + c % 8;
605 } else {
606 *append_ptr++ = c;
607 }
608 break;
609 }
610 }
611 }
612
CEscape(const string & src)613 string CEscape(const string& src) {
614 string dest;
615 CEscapeAndAppend(src, &dest);
616 return dest;
617 }
618
619 namespace strings {
620
Utf8SafeCEscape(const string & src)621 string Utf8SafeCEscape(const string& src) {
622 const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
623 scoped_array<char> dest(new char[dest_length]);
624 const int len = CEscapeInternal(src.data(), src.size(),
625 dest.get(), dest_length, false, true);
626 GOOGLE_DCHECK_GE(len, 0);
627 return string(dest.get(), len);
628 }
629
CHexEscape(const string & src)630 string CHexEscape(const string& src) {
631 const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
632 scoped_array<char> dest(new char[dest_length]);
633 const int len = CEscapeInternal(src.data(), src.size(),
634 dest.get(), dest_length, true, false);
635 GOOGLE_DCHECK_GE(len, 0);
636 return string(dest.get(), len);
637 }
638
639 } // namespace strings
640
641 // ----------------------------------------------------------------------
642 // strto32_adaptor()
643 // strtou32_adaptor()
644 // Implementation of strto[u]l replacements that have identical
645 // overflow and underflow characteristics for both ILP-32 and LP-64
646 // platforms, including errno preservation in error-free calls.
647 // ----------------------------------------------------------------------
648
strto32_adaptor(const char * nptr,char ** endptr,int base)649 int32 strto32_adaptor(const char *nptr, char **endptr, int base) {
650 const int saved_errno = errno;
651 errno = 0;
652 const long result = strtol(nptr, endptr, base);
653 if (errno == ERANGE && result == LONG_MIN) {
654 return kint32min;
655 } else if (errno == ERANGE && result == LONG_MAX) {
656 return kint32max;
657 } else if (errno == 0 && result < kint32min) {
658 errno = ERANGE;
659 return kint32min;
660 } else if (errno == 0 && result > kint32max) {
661 errno = ERANGE;
662 return kint32max;
663 }
664 if (errno == 0)
665 errno = saved_errno;
666 return static_cast<int32>(result);
667 }
668
strtou32_adaptor(const char * nptr,char ** endptr,int base)669 uint32 strtou32_adaptor(const char *nptr, char **endptr, int base) {
670 const int saved_errno = errno;
671 errno = 0;
672 const unsigned long result = strtoul(nptr, endptr, base);
673 if (errno == ERANGE && result == ULONG_MAX) {
674 return kuint32max;
675 } else if (errno == 0 && result > kuint32max) {
676 errno = ERANGE;
677 return kuint32max;
678 }
679 if (errno == 0)
680 errno = saved_errno;
681 return static_cast<uint32>(result);
682 }
683
safe_parse_sign(string * text,bool * negative_ptr)684 inline bool safe_parse_sign(string* text /*inout*/,
685 bool* negative_ptr /*output*/) {
686 const char* start = text->data();
687 const char* end = start + text->size();
688
689 // Consume whitespace.
690 while (start < end && (start[0] == ' ')) {
691 ++start;
692 }
693 while (start < end && (end[-1] == ' ')) {
694 --end;
695 }
696 if (start >= end) {
697 return false;
698 }
699
700 // Consume sign.
701 *negative_ptr = (start[0] == '-');
702 if (*negative_ptr || start[0] == '+') {
703 ++start;
704 if (start >= end) {
705 return false;
706 }
707 }
708 *text = text->substr(start - text->data(), end - start);
709 return true;
710 }
711
712 template<typename IntType>
safe_parse_positive_int(string text,IntType * value_p)713 bool safe_parse_positive_int(
714 string text, IntType* value_p) {
715 int base = 10;
716 IntType value = 0;
717 const IntType vmax = std::numeric_limits<IntType>::max();
718 assert(vmax > 0);
719 assert(vmax >= base);
720 const IntType vmax_over_base = vmax / base;
721 const char* start = text.data();
722 const char* end = start + text.size();
723 // loop over digits
724 for (; start < end; ++start) {
725 unsigned char c = static_cast<unsigned char>(start[0]);
726 int digit = c - '0';
727 if (digit >= base || digit < 0) {
728 *value_p = value;
729 return false;
730 }
731 if (value > vmax_over_base) {
732 *value_p = vmax;
733 return false;
734 }
735 value *= base;
736 if (value > vmax - digit) {
737 *value_p = vmax;
738 return false;
739 }
740 value += digit;
741 }
742 *value_p = value;
743 return true;
744 }
745
746 template<typename IntType>
safe_parse_negative_int(const string & text,IntType * value_p)747 bool safe_parse_negative_int(
748 const string& text, IntType* value_p) {
749 int base = 10;
750 IntType value = 0;
751 const IntType vmin = std::numeric_limits<IntType>::min();
752 assert(vmin < 0);
753 assert(vmin <= 0 - base);
754 IntType vmin_over_base = vmin / base;
755 // 2003 c++ standard [expr.mul]
756 // "... the sign of the remainder is implementation-defined."
757 // Although (vmin/base)*base + vmin%base is always vmin.
758 // 2011 c++ standard tightens the spec but we cannot rely on it.
759 if (vmin % base > 0) {
760 vmin_over_base += 1;
761 }
762 const char* start = text.data();
763 const char* end = start + text.size();
764 // loop over digits
765 for (; start < end; ++start) {
766 unsigned char c = static_cast<unsigned char>(start[0]);
767 int digit = c - '0';
768 if (digit >= base || digit < 0) {
769 *value_p = value;
770 return false;
771 }
772 if (value < vmin_over_base) {
773 *value_p = vmin;
774 return false;
775 }
776 value *= base;
777 if (value < vmin + digit) {
778 *value_p = vmin;
779 return false;
780 }
781 value -= digit;
782 }
783 *value_p = value;
784 return true;
785 }
786
787 template<typename IntType>
safe_int_internal(string text,IntType * value_p)788 bool safe_int_internal(string text, IntType* value_p) {
789 *value_p = 0;
790 bool negative;
791 if (!safe_parse_sign(&text, &negative)) {
792 return false;
793 }
794 if (!negative) {
795 return safe_parse_positive_int(text, value_p);
796 } else {
797 return safe_parse_negative_int(text, value_p);
798 }
799 }
800
801 template<typename IntType>
safe_uint_internal(string text,IntType * value_p)802 bool safe_uint_internal(string text, IntType* value_p) {
803 *value_p = 0;
804 bool negative;
805 if (!safe_parse_sign(&text, &negative) || negative) {
806 return false;
807 }
808 return safe_parse_positive_int(text, value_p);
809 }
810
811 // ----------------------------------------------------------------------
812 // FastIntToBuffer()
813 // FastInt64ToBuffer()
814 // FastHexToBuffer()
815 // FastHex64ToBuffer()
816 // FastHex32ToBuffer()
817 // ----------------------------------------------------------------------
818
819 // Offset into buffer where FastInt64ToBuffer places the end of string
820 // null character. Also used by FastInt64ToBufferLeft.
821 static const int kFastInt64ToBufferOffset = 21;
822
FastInt64ToBuffer(int64 i,char * buffer)823 char *FastInt64ToBuffer(int64 i, char* buffer) {
824 // We could collapse the positive and negative sections, but that
825 // would be slightly slower for positive numbers...
826 // 22 bytes is enough to store -2**64, -18446744073709551616.
827 char* p = buffer + kFastInt64ToBufferOffset;
828 *p-- = '\0';
829 if (i >= 0) {
830 do {
831 *p-- = '0' + i % 10;
832 i /= 10;
833 } while (i > 0);
834 return p + 1;
835 } else {
836 // On different platforms, % and / have different behaviors for
837 // negative numbers, so we need to jump through hoops to make sure
838 // we don't divide negative numbers.
839 if (i > -10) {
840 i = -i;
841 *p-- = '0' + i;
842 *p = '-';
843 return p;
844 } else {
845 // Make sure we aren't at MIN_INT, in which case we can't say i = -i
846 i = i + 10;
847 i = -i;
848 *p-- = '0' + i % 10;
849 // Undo what we did a moment ago
850 i = i / 10 + 1;
851 do {
852 *p-- = '0' + i % 10;
853 i /= 10;
854 } while (i > 0);
855 *p = '-';
856 return p;
857 }
858 }
859 }
860
861 // Offset into buffer where FastInt32ToBuffer places the end of string
862 // null character. Also used by FastInt32ToBufferLeft
863 static const int kFastInt32ToBufferOffset = 11;
864
865 // Yes, this is a duplicate of FastInt64ToBuffer. But, we need this for the
866 // compiler to generate 32 bit arithmetic instructions. It's much faster, at
867 // least with 32 bit binaries.
FastInt32ToBuffer(int32 i,char * buffer)868 char *FastInt32ToBuffer(int32 i, char* buffer) {
869 // We could collapse the positive and negative sections, but that
870 // would be slightly slower for positive numbers...
871 // 12 bytes is enough to store -2**32, -4294967296.
872 char* p = buffer + kFastInt32ToBufferOffset;
873 *p-- = '\0';
874 if (i >= 0) {
875 do {
876 *p-- = '0' + i % 10;
877 i /= 10;
878 } while (i > 0);
879 return p + 1;
880 } else {
881 // On different platforms, % and / have different behaviors for
882 // negative numbers, so we need to jump through hoops to make sure
883 // we don't divide negative numbers.
884 if (i > -10) {
885 i = -i;
886 *p-- = '0' + i;
887 *p = '-';
888 return p;
889 } else {
890 // Make sure we aren't at MIN_INT, in which case we can't say i = -i
891 i = i + 10;
892 i = -i;
893 *p-- = '0' + i % 10;
894 // Undo what we did a moment ago
895 i = i / 10 + 1;
896 do {
897 *p-- = '0' + i % 10;
898 i /= 10;
899 } while (i > 0);
900 *p = '-';
901 return p;
902 }
903 }
904 }
905
FastHexToBuffer(int i,char * buffer)906 char *FastHexToBuffer(int i, char* buffer) {
907 GOOGLE_CHECK(i >= 0) << "FastHexToBuffer() wants non-negative integers, not " << i;
908
909 static const char *hexdigits = "0123456789abcdef";
910 char *p = buffer + 21;
911 *p-- = '\0';
912 do {
913 *p-- = hexdigits[i & 15]; // mod by 16
914 i >>= 4; // divide by 16
915 } while (i > 0);
916 return p + 1;
917 }
918
InternalFastHexToBuffer(uint64 value,char * buffer,int num_byte)919 char *InternalFastHexToBuffer(uint64 value, char* buffer, int num_byte) {
920 static const char *hexdigits = "0123456789abcdef";
921 buffer[num_byte] = '\0';
922 for (int i = num_byte - 1; i >= 0; i--) {
923 #ifdef _M_X64
924 // MSVC x64 platform has a bug optimizing the uint32(value) in the #else
925 // block. Given that the uint32 cast was to improve performance on 32-bit
926 // platforms, we use 64-bit '&' directly.
927 buffer[i] = hexdigits[value & 0xf];
928 #else
929 buffer[i] = hexdigits[uint32(value) & 0xf];
930 #endif
931 value >>= 4;
932 }
933 return buffer;
934 }
935
FastHex64ToBuffer(uint64 value,char * buffer)936 char *FastHex64ToBuffer(uint64 value, char* buffer) {
937 return InternalFastHexToBuffer(value, buffer, 16);
938 }
939
FastHex32ToBuffer(uint32 value,char * buffer)940 char *FastHex32ToBuffer(uint32 value, char* buffer) {
941 return InternalFastHexToBuffer(value, buffer, 8);
942 }
943
944 // ----------------------------------------------------------------------
945 // FastInt32ToBufferLeft()
946 // FastUInt32ToBufferLeft()
947 // FastInt64ToBufferLeft()
948 // FastUInt64ToBufferLeft()
949 //
950 // Like the Fast*ToBuffer() functions above, these are intended for speed.
951 // Unlike the Fast*ToBuffer() functions, however, these functions write
952 // their output to the beginning of the buffer (hence the name, as the
953 // output is left-aligned). The caller is responsible for ensuring that
954 // the buffer has enough space to hold the output.
955 //
956 // Returns a pointer to the end of the string (i.e. the null character
957 // terminating the string).
958 // ----------------------------------------------------------------------
959
960 static const char two_ASCII_digits[100][2] = {
961 {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'},
962 {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
963 {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'},
964 {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
965 {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'},
966 {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
967 {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'},
968 {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
969 {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'},
970 {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
971 {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'},
972 {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
973 {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'},
974 {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
975 {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'},
976 {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
977 {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'},
978 {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
979 {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'},
980 {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'}
981 };
982
FastUInt32ToBufferLeft(uint32 u,char * buffer)983 char* FastUInt32ToBufferLeft(uint32 u, char* buffer) {
984 uint32 digits;
985 const char *ASCII_digits = NULL;
986 // The idea of this implementation is to trim the number of divides to as few
987 // as possible by using multiplication and subtraction rather than mod (%),
988 // and by outputting two digits at a time rather than one.
989 // The huge-number case is first, in the hopes that the compiler will output
990 // that case in one branch-free block of code, and only output conditional
991 // branches into it from below.
992 if (u >= 1000000000) { // >= 1,000,000,000
993 digits = u / 100000000; // 100,000,000
994 ASCII_digits = two_ASCII_digits[digits];
995 buffer[0] = ASCII_digits[0];
996 buffer[1] = ASCII_digits[1];
997 buffer += 2;
998 sublt100_000_000:
999 u -= digits * 100000000; // 100,000,000
1000 lt100_000_000:
1001 digits = u / 1000000; // 1,000,000
1002 ASCII_digits = two_ASCII_digits[digits];
1003 buffer[0] = ASCII_digits[0];
1004 buffer[1] = ASCII_digits[1];
1005 buffer += 2;
1006 sublt1_000_000:
1007 u -= digits * 1000000; // 1,000,000
1008 lt1_000_000:
1009 digits = u / 10000; // 10,000
1010 ASCII_digits = two_ASCII_digits[digits];
1011 buffer[0] = ASCII_digits[0];
1012 buffer[1] = ASCII_digits[1];
1013 buffer += 2;
1014 sublt10_000:
1015 u -= digits * 10000; // 10,000
1016 lt10_000:
1017 digits = u / 100;
1018 ASCII_digits = two_ASCII_digits[digits];
1019 buffer[0] = ASCII_digits[0];
1020 buffer[1] = ASCII_digits[1];
1021 buffer += 2;
1022 sublt100:
1023 u -= digits * 100;
1024 lt100:
1025 digits = u;
1026 ASCII_digits = two_ASCII_digits[digits];
1027 buffer[0] = ASCII_digits[0];
1028 buffer[1] = ASCII_digits[1];
1029 buffer += 2;
1030 done:
1031 *buffer = 0;
1032 return buffer;
1033 }
1034
1035 if (u < 100) {
1036 digits = u;
1037 if (u >= 10) goto lt100;
1038 *buffer++ = '0' + digits;
1039 goto done;
1040 }
1041 if (u < 10000) { // 10,000
1042 if (u >= 1000) goto lt10_000;
1043 digits = u / 100;
1044 *buffer++ = '0' + digits;
1045 goto sublt100;
1046 }
1047 if (u < 1000000) { // 1,000,000
1048 if (u >= 100000) goto lt1_000_000;
1049 digits = u / 10000; // 10,000
1050 *buffer++ = '0' + digits;
1051 goto sublt10_000;
1052 }
1053 if (u < 100000000) { // 100,000,000
1054 if (u >= 10000000) goto lt100_000_000;
1055 digits = u / 1000000; // 1,000,000
1056 *buffer++ = '0' + digits;
1057 goto sublt1_000_000;
1058 }
1059 // we already know that u < 1,000,000,000
1060 digits = u / 100000000; // 100,000,000
1061 *buffer++ = '0' + digits;
1062 goto sublt100_000_000;
1063 }
1064
FastInt32ToBufferLeft(int32 i,char * buffer)1065 char* FastInt32ToBufferLeft(int32 i, char* buffer) {
1066 uint32 u = i;
1067 if (i < 0) {
1068 *buffer++ = '-';
1069 u = -i;
1070 }
1071 return FastUInt32ToBufferLeft(u, buffer);
1072 }
1073
FastUInt64ToBufferLeft(uint64 u64,char * buffer)1074 char* FastUInt64ToBufferLeft(uint64 u64, char* buffer) {
1075 int digits;
1076 const char *ASCII_digits = NULL;
1077
1078 uint32 u = static_cast<uint32>(u64);
1079 if (u == u64) return FastUInt32ToBufferLeft(u, buffer);
1080
1081 uint64 top_11_digits = u64 / 1000000000;
1082 buffer = FastUInt64ToBufferLeft(top_11_digits, buffer);
1083 u = u64 - (top_11_digits * 1000000000);
1084
1085 digits = u / 10000000; // 10,000,000
1086 GOOGLE_DCHECK_LT(digits, 100);
1087 ASCII_digits = two_ASCII_digits[digits];
1088 buffer[0] = ASCII_digits[0];
1089 buffer[1] = ASCII_digits[1];
1090 buffer += 2;
1091 u -= digits * 10000000; // 10,000,000
1092 digits = u / 100000; // 100,000
1093 ASCII_digits = two_ASCII_digits[digits];
1094 buffer[0] = ASCII_digits[0];
1095 buffer[1] = ASCII_digits[1];
1096 buffer += 2;
1097 u -= digits * 100000; // 100,000
1098 digits = u / 1000; // 1,000
1099 ASCII_digits = two_ASCII_digits[digits];
1100 buffer[0] = ASCII_digits[0];
1101 buffer[1] = ASCII_digits[1];
1102 buffer += 2;
1103 u -= digits * 1000; // 1,000
1104 digits = u / 10;
1105 ASCII_digits = two_ASCII_digits[digits];
1106 buffer[0] = ASCII_digits[0];
1107 buffer[1] = ASCII_digits[1];
1108 buffer += 2;
1109 u -= digits * 10;
1110 digits = u;
1111 *buffer++ = '0' + digits;
1112 *buffer = 0;
1113 return buffer;
1114 }
1115
FastInt64ToBufferLeft(int64 i,char * buffer)1116 char* FastInt64ToBufferLeft(int64 i, char* buffer) {
1117 uint64 u = i;
1118 if (i < 0) {
1119 *buffer++ = '-';
1120 u = -i;
1121 }
1122 return FastUInt64ToBufferLeft(u, buffer);
1123 }
1124
1125 // ----------------------------------------------------------------------
1126 // SimpleItoa()
1127 // Description: converts an integer to a string.
1128 //
1129 // Return value: string
1130 // ----------------------------------------------------------------------
1131
SimpleItoa(int i)1132 string SimpleItoa(int i) {
1133 char buffer[kFastToBufferSize];
1134 return (sizeof(i) == 4) ?
1135 FastInt32ToBuffer(i, buffer) :
1136 FastInt64ToBuffer(i, buffer);
1137 }
1138
SimpleItoa(unsigned int i)1139 string SimpleItoa(unsigned int i) {
1140 char buffer[kFastToBufferSize];
1141 return string(buffer, (sizeof(i) == 4) ?
1142 FastUInt32ToBufferLeft(i, buffer) :
1143 FastUInt64ToBufferLeft(i, buffer));
1144 }
1145
SimpleItoa(long i)1146 string SimpleItoa(long i) {
1147 char buffer[kFastToBufferSize];
1148 return (sizeof(i) == 4) ?
1149 FastInt32ToBuffer(i, buffer) :
1150 FastInt64ToBuffer(i, buffer);
1151 }
1152
SimpleItoa(unsigned long i)1153 string SimpleItoa(unsigned long i) {
1154 char buffer[kFastToBufferSize];
1155 return string(buffer, (sizeof(i) == 4) ?
1156 FastUInt32ToBufferLeft(i, buffer) :
1157 FastUInt64ToBufferLeft(i, buffer));
1158 }
1159
SimpleItoa(long long i)1160 string SimpleItoa(long long i) {
1161 char buffer[kFastToBufferSize];
1162 return (sizeof(i) == 4) ?
1163 FastInt32ToBuffer(i, buffer) :
1164 FastInt64ToBuffer(i, buffer);
1165 }
1166
SimpleItoa(unsigned long long i)1167 string SimpleItoa(unsigned long long i) {
1168 char buffer[kFastToBufferSize];
1169 return string(buffer, (sizeof(i) == 4) ?
1170 FastUInt32ToBufferLeft(i, buffer) :
1171 FastUInt64ToBufferLeft(i, buffer));
1172 }
1173
1174 // ----------------------------------------------------------------------
1175 // SimpleDtoa()
1176 // SimpleFtoa()
1177 // DoubleToBuffer()
1178 // FloatToBuffer()
1179 // We want to print the value without losing precision, but we also do
1180 // not want to print more digits than necessary. This turns out to be
1181 // trickier than it sounds. Numbers like 0.2 cannot be represented
1182 // exactly in binary. If we print 0.2 with a very large precision,
1183 // e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167".
1184 // On the other hand, if we set the precision too low, we lose
1185 // significant digits when printing numbers that actually need them.
1186 // It turns out there is no precision value that does the right thing
1187 // for all numbers.
1188 //
1189 // Our strategy is to first try printing with a precision that is never
1190 // over-precise, then parse the result with strtod() to see if it
1191 // matches. If not, we print again with a precision that will always
1192 // give a precise result, but may use more digits than necessary.
1193 //
1194 // An arguably better strategy would be to use the algorithm described
1195 // in "How to Print Floating-Point Numbers Accurately" by Steele &
1196 // White, e.g. as implemented by David M. Gay's dtoa(). It turns out,
1197 // however, that the following implementation is about as fast as
1198 // DMG's code. Furthermore, DMG's code locks mutexes, which means it
1199 // will not scale well on multi-core machines. DMG's code is slightly
1200 // more accurate (in that it will never use more digits than
1201 // necessary), but this is probably irrelevant for most users.
1202 //
1203 // Rob Pike and Ken Thompson also have an implementation of dtoa() in
1204 // third_party/fmt/fltfmt.cc. Their implementation is similar to this
1205 // one in that it makes guesses and then uses strtod() to check them.
1206 // Their implementation is faster because they use their own code to
1207 // generate the digits in the first place rather than use snprintf(),
1208 // thus avoiding format string parsing overhead. However, this makes
1209 // it considerably more complicated than the following implementation,
1210 // and it is embedded in a larger library. If speed turns out to be
1211 // an issue, we could re-implement this in terms of their
1212 // implementation.
1213 // ----------------------------------------------------------------------
1214
SimpleDtoa(double value)1215 string SimpleDtoa(double value) {
1216 char buffer[kDoubleToBufferSize];
1217 return DoubleToBuffer(value, buffer);
1218 }
1219
SimpleFtoa(float value)1220 string SimpleFtoa(float value) {
1221 char buffer[kFloatToBufferSize];
1222 return FloatToBuffer(value, buffer);
1223 }
1224
IsValidFloatChar(char c)1225 static inline bool IsValidFloatChar(char c) {
1226 return ('0' <= c && c <= '9') ||
1227 c == 'e' || c == 'E' ||
1228 c == '+' || c == '-';
1229 }
1230
DelocalizeRadix(char * buffer)1231 void DelocalizeRadix(char* buffer) {
1232 // Fast check: if the buffer has a normal decimal point, assume no
1233 // translation is needed.
1234 if (strchr(buffer, '.') != NULL) return;
1235
1236 // Find the first unknown character.
1237 while (IsValidFloatChar(*buffer)) ++buffer;
1238
1239 if (*buffer == '\0') {
1240 // No radix character found.
1241 return;
1242 }
1243
1244 // We are now pointing at the locale-specific radix character. Replace it
1245 // with '.'.
1246 *buffer = '.';
1247 ++buffer;
1248
1249 if (!IsValidFloatChar(*buffer) && *buffer != '\0') {
1250 // It appears the radix was a multi-byte character. We need to remove the
1251 // extra bytes.
1252 char* target = buffer;
1253 do { ++buffer; } while (!IsValidFloatChar(*buffer) && *buffer != '\0');
1254 memmove(target, buffer, strlen(buffer) + 1);
1255 }
1256 }
1257
DoubleToBuffer(double value,char * buffer)1258 char* DoubleToBuffer(double value, char* buffer) {
1259 // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all
1260 // platforms these days. Just in case some system exists where DBL_DIG
1261 // is significantly larger -- and risks overflowing our buffer -- we have
1262 // this assert.
1263 GOOGLE_COMPILE_ASSERT(DBL_DIG < 20, DBL_DIG_is_too_big);
1264
1265 if (value == std::numeric_limits<double>::infinity()) {
1266 strcpy(buffer, "inf");
1267 return buffer;
1268 } else if (value == -std::numeric_limits<double>::infinity()) {
1269 strcpy(buffer, "-inf");
1270 return buffer;
1271 } else if (MathLimits<double>::IsNaN(value)) {
1272 strcpy(buffer, "nan");
1273 return buffer;
1274 }
1275
1276 int snprintf_result =
1277 snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value);
1278
1279 // The snprintf should never overflow because the buffer is significantly
1280 // larger than the precision we asked for.
1281 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1282
1283 // We need to make parsed_value volatile in order to force the compiler to
1284 // write it out to the stack. Otherwise, it may keep the value in a
1285 // register, and if it does that, it may keep it as a long double instead
1286 // of a double. This long double may have extra bits that make it compare
1287 // unequal to "value" even though it would be exactly equal if it were
1288 // truncated to a double.
1289 volatile double parsed_value = strtod(buffer, NULL);
1290 if (parsed_value != value) {
1291 int snprintf_result =
1292 snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+2, value);
1293
1294 // Should never overflow; see above.
1295 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1296 }
1297
1298 DelocalizeRadix(buffer);
1299 return buffer;
1300 }
1301
memcasecmp(const char * s1,const char * s2,size_t len)1302 static int memcasecmp(const char *s1, const char *s2, size_t len) {
1303 const unsigned char *us1 = reinterpret_cast<const unsigned char *>(s1);
1304 const unsigned char *us2 = reinterpret_cast<const unsigned char *>(s2);
1305
1306 for ( int i = 0; i < len; i++ ) {
1307 const int diff =
1308 static_cast<int>(static_cast<unsigned char>(ascii_tolower(us1[i]))) -
1309 static_cast<int>(static_cast<unsigned char>(ascii_tolower(us2[i])));
1310 if (diff != 0) return diff;
1311 }
1312 return 0;
1313 }
1314
CaseEqual(StringPiece s1,StringPiece s2)1315 inline bool CaseEqual(StringPiece s1, StringPiece s2) {
1316 if (s1.size() != s2.size()) return false;
1317 return memcasecmp(s1.data(), s2.data(), s1.size()) == 0;
1318 }
1319
safe_strtob(StringPiece str,bool * value)1320 bool safe_strtob(StringPiece str, bool* value) {
1321 GOOGLE_CHECK(value != NULL) << "NULL output boolean given.";
1322 if (CaseEqual(str, "true") || CaseEqual(str, "t") ||
1323 CaseEqual(str, "yes") || CaseEqual(str, "y") ||
1324 CaseEqual(str, "1")) {
1325 *value = true;
1326 return true;
1327 }
1328 if (CaseEqual(str, "false") || CaseEqual(str, "f") ||
1329 CaseEqual(str, "no") || CaseEqual(str, "n") ||
1330 CaseEqual(str, "0")) {
1331 *value = false;
1332 return true;
1333 }
1334 return false;
1335 }
1336
safe_strtof(const char * str,float * value)1337 bool safe_strtof(const char* str, float* value) {
1338 char* endptr;
1339 errno = 0; // errno only gets set on errors
1340 #if defined(_WIN32) || defined (__hpux) // has no strtof()
1341 *value = strtod(str, &endptr);
1342 #else
1343 *value = strtof(str, &endptr);
1344 #endif
1345 return *str != 0 && *endptr == 0 && errno == 0;
1346 }
1347
safe_strtod(const char * str,double * value)1348 bool safe_strtod(const char* str, double* value) {
1349 char* endptr;
1350 *value = strtod(str, &endptr);
1351 if (endptr != str) {
1352 while (ascii_isspace(*endptr)) ++endptr;
1353 }
1354 // Ignore range errors from strtod. The values it
1355 // returns on underflow and overflow are the right
1356 // fallback in a robust setting.
1357 return *str != '\0' && *endptr == '\0';
1358 }
1359
safe_strto32(const string & str,int32 * value)1360 bool safe_strto32(const string& str, int32* value) {
1361 return safe_int_internal(str, value);
1362 }
1363
safe_strtou32(const string & str,uint32 * value)1364 bool safe_strtou32(const string& str, uint32* value) {
1365 return safe_uint_internal(str, value);
1366 }
1367
safe_strto64(const string & str,int64 * value)1368 bool safe_strto64(const string& str, int64* value) {
1369 return safe_int_internal(str, value);
1370 }
1371
safe_strtou64(const string & str,uint64 * value)1372 bool safe_strtou64(const string& str, uint64* value) {
1373 return safe_uint_internal(str, value);
1374 }
1375
FloatToBuffer(float value,char * buffer)1376 char* FloatToBuffer(float value, char* buffer) {
1377 // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
1378 // platforms these days. Just in case some system exists where FLT_DIG
1379 // is significantly larger -- and risks overflowing our buffer -- we have
1380 // this assert.
1381 GOOGLE_COMPILE_ASSERT(FLT_DIG < 10, FLT_DIG_is_too_big);
1382
1383 if (value == std::numeric_limits<double>::infinity()) {
1384 strcpy(buffer, "inf");
1385 return buffer;
1386 } else if (value == -std::numeric_limits<double>::infinity()) {
1387 strcpy(buffer, "-inf");
1388 return buffer;
1389 } else if (MathLimits<float>::IsNaN(value)) {
1390 strcpy(buffer, "nan");
1391 return buffer;
1392 }
1393
1394 int snprintf_result =
1395 snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value);
1396
1397 // The snprintf should never overflow because the buffer is significantly
1398 // larger than the precision we asked for.
1399 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1400
1401 float parsed_value;
1402 if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) {
1403 int snprintf_result =
1404 snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+3, value);
1405
1406 // Should never overflow; see above.
1407 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1408 }
1409
1410 DelocalizeRadix(buffer);
1411 return buffer;
1412 }
1413
1414 namespace strings {
1415
AlphaNum(strings::Hex hex)1416 AlphaNum::AlphaNum(strings::Hex hex) {
1417 char *const end = &digits[kFastToBufferSize];
1418 char *writer = end;
1419 uint64 value = hex.value;
1420 uint64 width = hex.spec;
1421 // We accomplish minimum width by OR'ing in 0x10000 to the user's value,
1422 // where 0x10000 is the smallest hex number that is as wide as the user
1423 // asked for.
1424 uint64 mask = ((static_cast<uint64>(1) << (width - 1) * 4)) | value;
1425 static const char hexdigits[] = "0123456789abcdef";
1426 do {
1427 *--writer = hexdigits[value & 0xF];
1428 value >>= 4;
1429 mask >>= 4;
1430 } while (mask != 0);
1431 piece_data_ = writer;
1432 piece_size_ = end - writer;
1433 }
1434
1435 } // namespace strings
1436
1437 // ----------------------------------------------------------------------
1438 // StrCat()
1439 // This merges the given strings or integers, with no delimiter. This
1440 // is designed to be the fastest possible way to construct a string out
1441 // of a mix of raw C strings, C++ strings, and integer values.
1442 // ----------------------------------------------------------------------
1443
1444 // Append is merely a version of memcpy that returns the address of the byte
1445 // after the area just overwritten. It comes in multiple flavors to minimize
1446 // call overhead.
Append1(char * out,const AlphaNum & x)1447 static char *Append1(char *out, const AlphaNum &x) {
1448 memcpy(out, x.data(), x.size());
1449 return out + x.size();
1450 }
1451
Append2(char * out,const AlphaNum & x1,const AlphaNum & x2)1452 static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) {
1453 memcpy(out, x1.data(), x1.size());
1454 out += x1.size();
1455
1456 memcpy(out, x2.data(), x2.size());
1457 return out + x2.size();
1458 }
1459
Append4(char * out,const AlphaNum & x1,const AlphaNum & x2,const AlphaNum & x3,const AlphaNum & x4)1460 static char *Append4(char *out,
1461 const AlphaNum &x1, const AlphaNum &x2,
1462 const AlphaNum &x3, const AlphaNum &x4) {
1463 memcpy(out, x1.data(), x1.size());
1464 out += x1.size();
1465
1466 memcpy(out, x2.data(), x2.size());
1467 out += x2.size();
1468
1469 memcpy(out, x3.data(), x3.size());
1470 out += x3.size();
1471
1472 memcpy(out, x4.data(), x4.size());
1473 return out + x4.size();
1474 }
1475
StrCat(const AlphaNum & a,const AlphaNum & b)1476 string StrCat(const AlphaNum &a, const AlphaNum &b) {
1477 string result;
1478 result.resize(a.size() + b.size());
1479 char *const begin = &*result.begin();
1480 char *out = Append2(begin, a, b);
1481 GOOGLE_DCHECK_EQ(out, begin + result.size());
1482 return result;
1483 }
1484
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1485 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1486 string result;
1487 result.resize(a.size() + b.size() + c.size());
1488 char *const begin = &*result.begin();
1489 char *out = Append2(begin, a, b);
1490 out = Append1(out, c);
1491 GOOGLE_DCHECK_EQ(out, begin + result.size());
1492 return result;
1493 }
1494
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1495 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1496 const AlphaNum &d) {
1497 string result;
1498 result.resize(a.size() + b.size() + c.size() + d.size());
1499 char *const begin = &*result.begin();
1500 char *out = Append4(begin, a, b, c, d);
1501 GOOGLE_DCHECK_EQ(out, begin + result.size());
1502 return result;
1503 }
1504
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e)1505 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1506 const AlphaNum &d, const AlphaNum &e) {
1507 string result;
1508 result.resize(a.size() + b.size() + c.size() + d.size() + e.size());
1509 char *const begin = &*result.begin();
1510 char *out = Append4(begin, a, b, c, d);
1511 out = Append1(out, e);
1512 GOOGLE_DCHECK_EQ(out, begin + result.size());
1513 return result;
1514 }
1515
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f)1516 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1517 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) {
1518 string result;
1519 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1520 f.size());
1521 char *const begin = &*result.begin();
1522 char *out = Append4(begin, a, b, c, d);
1523 out = Append2(out, e, f);
1524 GOOGLE_DCHECK_EQ(out, begin + result.size());
1525 return result;
1526 }
1527
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g)1528 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1529 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1530 const AlphaNum &g) {
1531 string result;
1532 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1533 f.size() + g.size());
1534 char *const begin = &*result.begin();
1535 char *out = Append4(begin, a, b, c, d);
1536 out = Append2(out, e, f);
1537 out = Append1(out, g);
1538 GOOGLE_DCHECK_EQ(out, begin + result.size());
1539 return result;
1540 }
1541
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h)1542 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1543 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1544 const AlphaNum &g, const AlphaNum &h) {
1545 string result;
1546 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1547 f.size() + g.size() + h.size());
1548 char *const begin = &*result.begin();
1549 char *out = Append4(begin, a, b, c, d);
1550 out = Append4(out, e, f, g, h);
1551 GOOGLE_DCHECK_EQ(out, begin + result.size());
1552 return result;
1553 }
1554
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h,const AlphaNum & i)1555 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1556 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1557 const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) {
1558 string result;
1559 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1560 f.size() + g.size() + h.size() + i.size());
1561 char *const begin = &*result.begin();
1562 char *out = Append4(begin, a, b, c, d);
1563 out = Append4(out, e, f, g, h);
1564 out = Append1(out, i);
1565 GOOGLE_DCHECK_EQ(out, begin + result.size());
1566 return result;
1567 }
1568
1569 // It's possible to call StrAppend with a char * pointer that is partway into
1570 // the string we're appending to. However the results of this are random.
1571 // Therefore, check for this in debug mode. Use unsigned math so we only have
1572 // to do one comparison.
1573 #define GOOGLE_DCHECK_NO_OVERLAP(dest, src) \
1574 GOOGLE_DCHECK_GT(uintptr_t((src).data() - (dest).data()), \
1575 uintptr_t((dest).size()))
1576
StrAppend(string * result,const AlphaNum & a)1577 void StrAppend(string *result, const AlphaNum &a) {
1578 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1579 result->append(a.data(), a.size());
1580 }
1581
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b)1582 void StrAppend(string *result, const AlphaNum &a, const AlphaNum &b) {
1583 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1584 GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1585 string::size_type old_size = result->size();
1586 result->resize(old_size + a.size() + b.size());
1587 char *const begin = &*result->begin();
1588 char *out = Append2(begin + old_size, a, b);
1589 GOOGLE_DCHECK_EQ(out, begin + result->size());
1590 }
1591
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1592 void StrAppend(string *result,
1593 const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1594 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1595 GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1596 GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1597 string::size_type old_size = result->size();
1598 result->resize(old_size + a.size() + b.size() + c.size());
1599 char *const begin = &*result->begin();
1600 char *out = Append2(begin + old_size, a, b);
1601 out = Append1(out, c);
1602 GOOGLE_DCHECK_EQ(out, begin + result->size());
1603 }
1604
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1605 void StrAppend(string *result,
1606 const AlphaNum &a, const AlphaNum &b,
1607 const AlphaNum &c, const AlphaNum &d) {
1608 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1609 GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1610 GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1611 GOOGLE_DCHECK_NO_OVERLAP(*result, d);
1612 string::size_type old_size = result->size();
1613 result->resize(old_size + a.size() + b.size() + c.size() + d.size());
1614 char *const begin = &*result->begin();
1615 char *out = Append4(begin + old_size, a, b, c, d);
1616 GOOGLE_DCHECK_EQ(out, begin + result->size());
1617 }
1618
GlobalReplaceSubstring(const string & substring,const string & replacement,string * s)1619 int GlobalReplaceSubstring(const string& substring,
1620 const string& replacement,
1621 string* s) {
1622 GOOGLE_CHECK(s != NULL);
1623 if (s->empty() || substring.empty())
1624 return 0;
1625 string tmp;
1626 int num_replacements = 0;
1627 int pos = 0;
1628 for (int match_pos = s->find(substring.data(), pos, substring.length());
1629 match_pos != string::npos;
1630 pos = match_pos + substring.length(),
1631 match_pos = s->find(substring.data(), pos, substring.length())) {
1632 ++num_replacements;
1633 // Append the original content before the match.
1634 tmp.append(*s, pos, match_pos - pos);
1635 // Append the replacement for the match.
1636 tmp.append(replacement.begin(), replacement.end());
1637 }
1638 // Append the content after the last match. If no replacements were made, the
1639 // original string is left untouched.
1640 if (num_replacements > 0) {
1641 tmp.append(*s, pos, s->length() - pos);
1642 s->swap(tmp);
1643 }
1644 return num_replacements;
1645 }
1646
CalculateBase64EscapedLen(int input_len,bool do_padding)1647 int CalculateBase64EscapedLen(int input_len, bool do_padding) {
1648 // Base64 encodes three bytes of input at a time. If the input is not
1649 // divisible by three, we pad as appropriate.
1650 //
1651 // (from http://tools.ietf.org/html/rfc3548)
1652 // Special processing is performed if fewer than 24 bits are available
1653 // at the end of the data being encoded. A full encoding quantum is
1654 // always completed at the end of a quantity. When fewer than 24 input
1655 // bits are available in an input group, zero bits are added (on the
1656 // right) to form an integral number of 6-bit groups. Padding at the
1657 // end of the data is performed using the '=' character. Since all base
1658 // 64 input is an integral number of octets, only the following cases
1659 // can arise:
1660
1661
1662 // Base64 encodes each three bytes of input into four bytes of output.
1663 int len = (input_len / 3) * 4;
1664
1665 if (input_len % 3 == 0) {
1666 // (from http://tools.ietf.org/html/rfc3548)
1667 // (1) the final quantum of encoding input is an integral multiple of 24
1668 // bits; here, the final unit of encoded output will be an integral
1669 // multiple of 4 characters with no "=" padding,
1670 } else if (input_len % 3 == 1) {
1671 // (from http://tools.ietf.org/html/rfc3548)
1672 // (2) the final quantum of encoding input is exactly 8 bits; here, the
1673 // final unit of encoded output will be two characters followed by two
1674 // "=" padding characters, or
1675 len += 2;
1676 if (do_padding) {
1677 len += 2;
1678 }
1679 } else { // (input_len % 3 == 2)
1680 // (from http://tools.ietf.org/html/rfc3548)
1681 // (3) the final quantum of encoding input is exactly 16 bits; here, the
1682 // final unit of encoded output will be three characters followed by one
1683 // "=" padding character.
1684 len += 3;
1685 if (do_padding) {
1686 len += 1;
1687 }
1688 }
1689
1690 assert(len >= input_len); // make sure we didn't overflow
1691 return len;
1692 }
1693
1694 // Base64Escape does padding, so this calculation includes padding.
CalculateBase64EscapedLen(int input_len)1695 int CalculateBase64EscapedLen(int input_len) {
1696 return CalculateBase64EscapedLen(input_len, true);
1697 }
1698
1699 // ----------------------------------------------------------------------
1700 // int Base64Unescape() - base64 decoder
1701 // int Base64Escape() - base64 encoder
1702 // int WebSafeBase64Unescape() - Google's variation of base64 decoder
1703 // int WebSafeBase64Escape() - Google's variation of base64 encoder
1704 //
1705 // Check out
1706 // http://tools.ietf.org/html/rfc2045 for formal description, but what we
1707 // care about is that...
1708 // Take the encoded stuff in groups of 4 characters and turn each
1709 // character into a code 0 to 63 thus:
1710 // A-Z map to 0 to 25
1711 // a-z map to 26 to 51
1712 // 0-9 map to 52 to 61
1713 // +(- for WebSafe) maps to 62
1714 // /(_ for WebSafe) maps to 63
1715 // There will be four numbers, all less than 64 which can be represented
1716 // by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
1717 // Arrange the 6 digit binary numbers into three bytes as such:
1718 // aaaaaabb bbbbcccc ccdddddd
1719 // Equals signs (one or two) are used at the end of the encoded block to
1720 // indicate that the text was not an integer multiple of three bytes long.
1721 // ----------------------------------------------------------------------
1722
Base64UnescapeInternal(const char * src_param,int szsrc,char * dest,int szdest,const signed char * unbase64)1723 int Base64UnescapeInternal(const char *src_param, int szsrc,
1724 char *dest, int szdest,
1725 const signed char* unbase64) {
1726 static const char kPad64Equals = '=';
1727 static const char kPad64Dot = '.';
1728
1729 int decode = 0;
1730 int destidx = 0;
1731 int state = 0;
1732 unsigned int ch = 0;
1733 unsigned int temp = 0;
1734
1735 // If "char" is signed by default, using *src as an array index results in
1736 // accessing negative array elements. Treat the input as a pointer to
1737 // unsigned char to avoid this.
1738 const unsigned char *src = reinterpret_cast<const unsigned char*>(src_param);
1739
1740 // The GET_INPUT macro gets the next input character, skipping
1741 // over any whitespace, and stopping when we reach the end of the
1742 // string or when we read any non-data character. The arguments are
1743 // an arbitrary identifier (used as a label for goto) and the number
1744 // of data bytes that must remain in the input to avoid aborting the
1745 // loop.
1746 #define GET_INPUT(label, remain) \
1747 label: \
1748 --szsrc; \
1749 ch = *src++; \
1750 decode = unbase64[ch]; \
1751 if (decode < 0) { \
1752 if (ascii_isspace(ch) && szsrc >= remain) \
1753 goto label; \
1754 state = 4 - remain; \
1755 break; \
1756 }
1757
1758 // if dest is null, we're just checking to see if it's legal input
1759 // rather than producing output. (I suspect this could just be done
1760 // with a regexp...). We duplicate the loop so this test can be
1761 // outside it instead of in every iteration.
1762
1763 if (dest) {
1764 // This loop consumes 4 input bytes and produces 3 output bytes
1765 // per iteration. We can't know at the start that there is enough
1766 // data left in the string for a full iteration, so the loop may
1767 // break out in the middle; if so 'state' will be set to the
1768 // number of input bytes read.
1769
1770 while (szsrc >= 4) {
1771 // We'll start by optimistically assuming that the next four
1772 // bytes of the string (src[0..3]) are four good data bytes
1773 // (that is, no nulls, whitespace, padding chars, or illegal
1774 // chars). We need to test src[0..2] for nulls individually
1775 // before constructing temp to preserve the property that we
1776 // never read past a null in the string (no matter how long
1777 // szsrc claims the string is).
1778
1779 if (!src[0] || !src[1] || !src[2] ||
1780 (temp = ((unsigned(unbase64[src[0]]) << 18) |
1781 (unsigned(unbase64[src[1]]) << 12) |
1782 (unsigned(unbase64[src[2]]) << 6) |
1783 (unsigned(unbase64[src[3]])))) & 0x80000000) {
1784 // Iff any of those four characters was bad (null, illegal,
1785 // whitespace, padding), then temp's high bit will be set
1786 // (because unbase64[] is -1 for all bad characters).
1787 //
1788 // We'll back up and resort to the slower decoder, which knows
1789 // how to handle those cases.
1790
1791 GET_INPUT(first, 4);
1792 temp = decode;
1793 GET_INPUT(second, 3);
1794 temp = (temp << 6) | decode;
1795 GET_INPUT(third, 2);
1796 temp = (temp << 6) | decode;
1797 GET_INPUT(fourth, 1);
1798 temp = (temp << 6) | decode;
1799 } else {
1800 // We really did have four good data bytes, so advance four
1801 // characters in the string.
1802
1803 szsrc -= 4;
1804 src += 4;
1805 decode = -1;
1806 ch = '\0';
1807 }
1808
1809 // temp has 24 bits of input, so write that out as three bytes.
1810
1811 if (destidx+3 > szdest) return -1;
1812 dest[destidx+2] = temp;
1813 temp >>= 8;
1814 dest[destidx+1] = temp;
1815 temp >>= 8;
1816 dest[destidx] = temp;
1817 destidx += 3;
1818 }
1819 } else {
1820 while (szsrc >= 4) {
1821 if (!src[0] || !src[1] || !src[2] ||
1822 (temp = ((unsigned(unbase64[src[0]]) << 18) |
1823 (unsigned(unbase64[src[1]]) << 12) |
1824 (unsigned(unbase64[src[2]]) << 6) |
1825 (unsigned(unbase64[src[3]])))) & 0x80000000) {
1826 GET_INPUT(first_no_dest, 4);
1827 GET_INPUT(second_no_dest, 3);
1828 GET_INPUT(third_no_dest, 2);
1829 GET_INPUT(fourth_no_dest, 1);
1830 } else {
1831 szsrc -= 4;
1832 src += 4;
1833 decode = -1;
1834 ch = '\0';
1835 }
1836 destidx += 3;
1837 }
1838 }
1839
1840 #undef GET_INPUT
1841
1842 // if the loop terminated because we read a bad character, return
1843 // now.
1844 if (decode < 0 && ch != '\0' &&
1845 ch != kPad64Equals && ch != kPad64Dot && !ascii_isspace(ch))
1846 return -1;
1847
1848 if (ch == kPad64Equals || ch == kPad64Dot) {
1849 // if we stopped by hitting an '=' or '.', un-read that character -- we'll
1850 // look at it again when we count to check for the proper number of
1851 // equals signs at the end.
1852 ++szsrc;
1853 --src;
1854 } else {
1855 // This loop consumes 1 input byte per iteration. It's used to
1856 // clean up the 0-3 input bytes remaining when the first, faster
1857 // loop finishes. 'temp' contains the data from 'state' input
1858 // characters read by the first loop.
1859 while (szsrc > 0) {
1860 --szsrc;
1861 ch = *src++;
1862 decode = unbase64[ch];
1863 if (decode < 0) {
1864 if (ascii_isspace(ch)) {
1865 continue;
1866 } else if (ch == '\0') {
1867 break;
1868 } else if (ch == kPad64Equals || ch == kPad64Dot) {
1869 // back up one character; we'll read it again when we check
1870 // for the correct number of pad characters at the end.
1871 ++szsrc;
1872 --src;
1873 break;
1874 } else {
1875 return -1;
1876 }
1877 }
1878
1879 // Each input character gives us six bits of output.
1880 temp = (temp << 6) | decode;
1881 ++state;
1882 if (state == 4) {
1883 // If we've accumulated 24 bits of output, write that out as
1884 // three bytes.
1885 if (dest) {
1886 if (destidx+3 > szdest) return -1;
1887 dest[destidx+2] = temp;
1888 temp >>= 8;
1889 dest[destidx+1] = temp;
1890 temp >>= 8;
1891 dest[destidx] = temp;
1892 }
1893 destidx += 3;
1894 state = 0;
1895 temp = 0;
1896 }
1897 }
1898 }
1899
1900 // Process the leftover data contained in 'temp' at the end of the input.
1901 int expected_equals = 0;
1902 switch (state) {
1903 case 0:
1904 // Nothing left over; output is a multiple of 3 bytes.
1905 break;
1906
1907 case 1:
1908 // Bad input; we have 6 bits left over.
1909 return -1;
1910
1911 case 2:
1912 // Produce one more output byte from the 12 input bits we have left.
1913 if (dest) {
1914 if (destidx+1 > szdest) return -1;
1915 temp >>= 4;
1916 dest[destidx] = temp;
1917 }
1918 ++destidx;
1919 expected_equals = 2;
1920 break;
1921
1922 case 3:
1923 // Produce two more output bytes from the 18 input bits we have left.
1924 if (dest) {
1925 if (destidx+2 > szdest) return -1;
1926 temp >>= 2;
1927 dest[destidx+1] = temp;
1928 temp >>= 8;
1929 dest[destidx] = temp;
1930 }
1931 destidx += 2;
1932 expected_equals = 1;
1933 break;
1934
1935 default:
1936 // state should have no other values at this point.
1937 GOOGLE_LOG(FATAL) << "This can't happen; base64 decoder state = " << state;
1938 }
1939
1940 // The remainder of the string should be all whitespace, mixed with
1941 // exactly 0 equals signs, or exactly 'expected_equals' equals
1942 // signs. (Always accepting 0 equals signs is a google extension
1943 // not covered in the RFC, as is accepting dot as the pad character.)
1944
1945 int equals = 0;
1946 while (szsrc > 0 && *src) {
1947 if (*src == kPad64Equals || *src == kPad64Dot)
1948 ++equals;
1949 else if (!ascii_isspace(*src))
1950 return -1;
1951 --szsrc;
1952 ++src;
1953 }
1954
1955 return (equals == 0 || equals == expected_equals) ? destidx : -1;
1956 }
1957
1958 // The arrays below were generated by the following code
1959 // #include <sys/time.h>
1960 // #include <stdlib.h>
1961 // #include <string.h>
1962 // main()
1963 // {
1964 // static const char Base64[] =
1965 // "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
1966 // char *pos;
1967 // int idx, i, j;
1968 // printf(" ");
1969 // for (i = 0; i < 255; i += 8) {
1970 // for (j = i; j < i + 8; j++) {
1971 // pos = strchr(Base64, j);
1972 // if ((pos == NULL) || (j == 0))
1973 // idx = -1;
1974 // else
1975 // idx = pos - Base64;
1976 // if (idx == -1)
1977 // printf(" %2d, ", idx);
1978 // else
1979 // printf(" %2d/*%c*/,", idx, j);
1980 // }
1981 // printf("\n ");
1982 // }
1983 // }
1984 //
1985 // where the value of "Base64[]" was replaced by one of the base-64 conversion
1986 // tables from the functions below.
1987 static const signed char kUnBase64[] = {
1988 -1, -1, -1, -1, -1, -1, -1, -1,
1989 -1, -1, -1, -1, -1, -1, -1, -1,
1990 -1, -1, -1, -1, -1, -1, -1, -1,
1991 -1, -1, -1, -1, -1, -1, -1, -1,
1992 -1, -1, -1, -1, -1, -1, -1, -1,
1993 -1, -1, -1, 62/*+*/, -1, -1, -1, 63/*/ */,
1994 52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
1995 60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
1996 -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
1997 07/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
1998 15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
1999 23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, -1,
2000 -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
2001 33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
2002 41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
2003 49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
2004 -1, -1, -1, -1, -1, -1, -1, -1,
2005 -1, -1, -1, -1, -1, -1, -1, -1,
2006 -1, -1, -1, -1, -1, -1, -1, -1,
2007 -1, -1, -1, -1, -1, -1, -1, -1,
2008 -1, -1, -1, -1, -1, -1, -1, -1,
2009 -1, -1, -1, -1, -1, -1, -1, -1,
2010 -1, -1, -1, -1, -1, -1, -1, -1,
2011 -1, -1, -1, -1, -1, -1, -1, -1,
2012 -1, -1, -1, -1, -1, -1, -1, -1,
2013 -1, -1, -1, -1, -1, -1, -1, -1,
2014 -1, -1, -1, -1, -1, -1, -1, -1,
2015 -1, -1, -1, -1, -1, -1, -1, -1,
2016 -1, -1, -1, -1, -1, -1, -1, -1,
2017 -1, -1, -1, -1, -1, -1, -1, -1,
2018 -1, -1, -1, -1, -1, -1, -1, -1,
2019 -1, -1, -1, -1, -1, -1, -1, -1
2020 };
2021 static const signed char kUnWebSafeBase64[] = {
2022 -1, -1, -1, -1, -1, -1, -1, -1,
2023 -1, -1, -1, -1, -1, -1, -1, -1,
2024 -1, -1, -1, -1, -1, -1, -1, -1,
2025 -1, -1, -1, -1, -1, -1, -1, -1,
2026 -1, -1, -1, -1, -1, -1, -1, -1,
2027 -1, -1, -1, -1, -1, 62/*-*/, -1, -1,
2028 52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
2029 60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
2030 -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
2031 07/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
2032 15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
2033 23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, 63/*_*/,
2034 -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
2035 33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
2036 41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
2037 49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
2038 -1, -1, -1, -1, -1, -1, -1, -1,
2039 -1, -1, -1, -1, -1, -1, -1, -1,
2040 -1, -1, -1, -1, -1, -1, -1, -1,
2041 -1, -1, -1, -1, -1, -1, -1, -1,
2042 -1, -1, -1, -1, -1, -1, -1, -1,
2043 -1, -1, -1, -1, -1, -1, -1, -1,
2044 -1, -1, -1, -1, -1, -1, -1, -1,
2045 -1, -1, -1, -1, -1, -1, -1, -1,
2046 -1, -1, -1, -1, -1, -1, -1, -1,
2047 -1, -1, -1, -1, -1, -1, -1, -1,
2048 -1, -1, -1, -1, -1, -1, -1, -1,
2049 -1, -1, -1, -1, -1, -1, -1, -1,
2050 -1, -1, -1, -1, -1, -1, -1, -1,
2051 -1, -1, -1, -1, -1, -1, -1, -1,
2052 -1, -1, -1, -1, -1, -1, -1, -1,
2053 -1, -1, -1, -1, -1, -1, -1, -1
2054 };
2055
WebSafeBase64Unescape(const char * src,int szsrc,char * dest,int szdest)2056 int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) {
2057 return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64);
2058 }
2059
Base64UnescapeInternal(const char * src,int slen,string * dest,const signed char * unbase64)2060 static bool Base64UnescapeInternal(const char* src, int slen, string* dest,
2061 const signed char* unbase64) {
2062 // Determine the size of the output string. Base64 encodes every 3 bytes into
2063 // 4 characters. any leftover chars are added directly for good measure.
2064 // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548
2065 const int dest_len = 3 * (slen / 4) + (slen % 4);
2066
2067 dest->resize(dest_len);
2068
2069 // We are getting the destination buffer by getting the beginning of the
2070 // string and converting it into a char *.
2071 const int len = Base64UnescapeInternal(src, slen, string_as_array(dest),
2072 dest_len, unbase64);
2073 if (len < 0) {
2074 dest->clear();
2075 return false;
2076 }
2077
2078 // could be shorter if there was padding
2079 GOOGLE_DCHECK_LE(len, dest_len);
2080 dest->erase(len);
2081
2082 return true;
2083 }
2084
Base64Unescape(StringPiece src,string * dest)2085 bool Base64Unescape(StringPiece src, string* dest) {
2086 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
2087 }
2088
WebSafeBase64Unescape(StringPiece src,string * dest)2089 bool WebSafeBase64Unescape(StringPiece src, string* dest) {
2090 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
2091 }
2092
Base64EscapeInternal(const unsigned char * src,int szsrc,char * dest,int szdest,const char * base64,bool do_padding)2093 int Base64EscapeInternal(const unsigned char *src, int szsrc,
2094 char *dest, int szdest, const char *base64,
2095 bool do_padding) {
2096 static const char kPad64 = '=';
2097
2098 if (szsrc <= 0) return 0;
2099
2100 if (szsrc * 4 > szdest * 3) return 0;
2101
2102 char *cur_dest = dest;
2103 const unsigned char *cur_src = src;
2104
2105 char *limit_dest = dest + szdest;
2106 const unsigned char *limit_src = src + szsrc;
2107
2108 // Three bytes of data encodes to four characters of cyphertext.
2109 // So we can pump through three-byte chunks atomically.
2110 while (cur_src < limit_src - 3) { // keep going as long as we have >= 32 bits
2111 uint32 in = BigEndian::Load32(cur_src) >> 8;
2112
2113 cur_dest[0] = base64[in >> 18];
2114 in &= 0x3FFFF;
2115 cur_dest[1] = base64[in >> 12];
2116 in &= 0xFFF;
2117 cur_dest[2] = base64[in >> 6];
2118 in &= 0x3F;
2119 cur_dest[3] = base64[in];
2120
2121 cur_dest += 4;
2122 cur_src += 3;
2123 }
2124 // To save time, we didn't update szdest or szsrc in the loop. So do it now.
2125 szdest = limit_dest - cur_dest;
2126 szsrc = limit_src - cur_src;
2127
2128 /* now deal with the tail (<=3 bytes) */
2129 switch (szsrc) {
2130 case 0:
2131 // Nothing left; nothing more to do.
2132 break;
2133 case 1: {
2134 // One byte left: this encodes to two characters, and (optionally)
2135 // two pad characters to round out the four-character cypherblock.
2136 if ((szdest -= 2) < 0) return 0;
2137 uint32 in = cur_src[0];
2138 cur_dest[0] = base64[in >> 2];
2139 in &= 0x3;
2140 cur_dest[1] = base64[in << 4];
2141 cur_dest += 2;
2142 if (do_padding) {
2143 if ((szdest -= 2) < 0) return 0;
2144 cur_dest[0] = kPad64;
2145 cur_dest[1] = kPad64;
2146 cur_dest += 2;
2147 }
2148 break;
2149 }
2150 case 2: {
2151 // Two bytes left: this encodes to three characters, and (optionally)
2152 // one pad character to round out the four-character cypherblock.
2153 if ((szdest -= 3) < 0) return 0;
2154 uint32 in = BigEndian::Load16(cur_src);
2155 cur_dest[0] = base64[in >> 10];
2156 in &= 0x3FF;
2157 cur_dest[1] = base64[in >> 4];
2158 in &= 0x00F;
2159 cur_dest[2] = base64[in << 2];
2160 cur_dest += 3;
2161 if (do_padding) {
2162 if ((szdest -= 1) < 0) return 0;
2163 cur_dest[0] = kPad64;
2164 cur_dest += 1;
2165 }
2166 break;
2167 }
2168 case 3: {
2169 // Three bytes left: same as in the big loop above. We can't do this in
2170 // the loop because the loop above always reads 4 bytes, and the fourth
2171 // byte is past the end of the input.
2172 if ((szdest -= 4) < 0) return 0;
2173 uint32 in = (cur_src[0] << 16) + BigEndian::Load16(cur_src + 1);
2174 cur_dest[0] = base64[in >> 18];
2175 in &= 0x3FFFF;
2176 cur_dest[1] = base64[in >> 12];
2177 in &= 0xFFF;
2178 cur_dest[2] = base64[in >> 6];
2179 in &= 0x3F;
2180 cur_dest[3] = base64[in];
2181 cur_dest += 4;
2182 break;
2183 }
2184 default:
2185 // Should not be reached: blocks of 4 bytes are handled
2186 // in the while loop before this switch statement.
2187 GOOGLE_LOG(FATAL) << "Logic problem? szsrc = " << szsrc;
2188 break;
2189 }
2190 return (cur_dest - dest);
2191 }
2192
2193 static const char kBase64Chars[] =
2194 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
2195
2196 static const char kWebSafeBase64Chars[] =
2197 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
2198
Base64Escape(const unsigned char * src,int szsrc,char * dest,int szdest)2199 int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) {
2200 return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true);
2201 }
WebSafeBase64Escape(const unsigned char * src,int szsrc,char * dest,int szdest,bool do_padding)2202 int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest,
2203 int szdest, bool do_padding) {
2204 return Base64EscapeInternal(src, szsrc, dest, szdest,
2205 kWebSafeBase64Chars, do_padding);
2206 }
2207
Base64EscapeInternal(const unsigned char * src,int szsrc,string * dest,bool do_padding,const char * base64_chars)2208 void Base64EscapeInternal(const unsigned char* src, int szsrc,
2209 string* dest, bool do_padding,
2210 const char* base64_chars) {
2211 const int calc_escaped_size =
2212 CalculateBase64EscapedLen(szsrc, do_padding);
2213 dest->resize(calc_escaped_size);
2214 const int escaped_len = Base64EscapeInternal(src, szsrc,
2215 string_as_array(dest),
2216 dest->size(),
2217 base64_chars,
2218 do_padding);
2219 GOOGLE_DCHECK_EQ(calc_escaped_size, escaped_len);
2220 dest->erase(escaped_len);
2221 }
2222
Base64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2223 void Base64Escape(const unsigned char *src, int szsrc,
2224 string* dest, bool do_padding) {
2225 Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars);
2226 }
2227
WebSafeBase64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2228 void WebSafeBase64Escape(const unsigned char *src, int szsrc,
2229 string *dest, bool do_padding) {
2230 Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars);
2231 }
2232
Base64Escape(StringPiece src,string * dest)2233 void Base64Escape(StringPiece src, string* dest) {
2234 Base64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2235 src.size(), dest, true);
2236 }
2237
WebSafeBase64Escape(StringPiece src,string * dest)2238 void WebSafeBase64Escape(StringPiece src, string* dest) {
2239 WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2240 src.size(), dest, false);
2241 }
2242
WebSafeBase64EscapeWithPadding(StringPiece src,string * dest)2243 void WebSafeBase64EscapeWithPadding(StringPiece src, string* dest) {
2244 WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2245 src.size(), dest, true);
2246 }
2247
2248 // Helper to append a Unicode code point to a string as UTF8, without bringing
2249 // in any external dependencies.
EncodeAsUTF8Char(uint32 code_point,char * output)2250 int EncodeAsUTF8Char(uint32 code_point, char* output) {
2251 uint32 tmp = 0;
2252 int len = 0;
2253 if (code_point <= 0x7f) {
2254 tmp = code_point;
2255 len = 1;
2256 } else if (code_point <= 0x07ff) {
2257 tmp = 0x0000c080 |
2258 ((code_point & 0x07c0) << 2) |
2259 (code_point & 0x003f);
2260 len = 2;
2261 } else if (code_point <= 0xffff) {
2262 tmp = 0x00e08080 |
2263 ((code_point & 0xf000) << 4) |
2264 ((code_point & 0x0fc0) << 2) |
2265 (code_point & 0x003f);
2266 len = 3;
2267 } else {
2268 // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
2269 // normally only defined up to there as well.
2270 tmp = 0xf0808080 |
2271 ((code_point & 0x1c0000) << 6) |
2272 ((code_point & 0x03f000) << 4) |
2273 ((code_point & 0x000fc0) << 2) |
2274 (code_point & 0x003f);
2275 len = 4;
2276 }
2277 tmp = ghtonl(tmp);
2278 memcpy(output, reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
2279 return len;
2280 }
2281
2282 // Table of UTF-8 character lengths, based on first byte
2283 static const unsigned char kUTF8LenTbl[256] = {
2284 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2285 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2286 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2287 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2288
2289 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2290 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2291 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
2292 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
2293 };
2294
2295 // Return length of a single UTF-8 source character
UTF8FirstLetterNumBytes(const char * src,int len)2296 int UTF8FirstLetterNumBytes(const char* src, int len) {
2297 if (len == 0) {
2298 return 0;
2299 }
2300 return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
2301 }
2302
2303 } // namespace protobuf
2304 } // namespace google
2305