1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31 // from google3/strings/strutil.cc
32
33 #include <google/protobuf/stubs/strutil.h>
34
35 #include <errno.h>
36 #include <float.h> // FLT_DIG and DBL_DIG
37 #include <limits.h>
38 #include <stdio.h>
39 #include <cmath>
40 #include <iterator>
41 #include <limits>
42
43 #include <google/protobuf/stubs/logging.h>
44 #include <google/protobuf/stubs/stl_util.h>
45
46 #ifdef _WIN32
47 // MSVC has only _snprintf, not snprintf.
48 //
49 // MinGW has both snprintf and _snprintf, but they appear to be different
50 // functions. The former is buggy. When invoked like so:
51 // char buffer[32];
52 // snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f);
53 // it prints "1.23000e+10". This is plainly wrong: %g should never print
54 // trailing zeros after the decimal point. For some reason this bug only
55 // occurs with some input values, not all. In any case, _snprintf does the
56 // right thing, so we use it.
57 #define snprintf _snprintf
58 #endif
59
60 namespace google {
61 namespace protobuf {
62
63 // These are defined as macros on some platforms. #undef them so that we can
64 // redefine them.
65 #undef isxdigit
66 #undef isprint
67
68 // The definitions of these in ctype.h change based on locale. Since our
69 // string manipulation is all in relation to the protocol buffer and C++
70 // languages, we always want to use the C locale. So, we re-define these
71 // exactly as we want them.
isxdigit(char c)72 inline bool isxdigit(char c) {
73 return ('0' <= c && c <= '9') ||
74 ('a' <= c && c <= 'f') ||
75 ('A' <= c && c <= 'F');
76 }
77
isprint(char c)78 inline bool isprint(char c) {
79 return c >= 0x20 && c <= 0x7E;
80 }
81
82 // ----------------------------------------------------------------------
83 // ReplaceCharacters
84 // Replaces any occurrence of the character 'remove' (or the characters
85 // in 'remove') with the character 'replacewith'.
86 // ----------------------------------------------------------------------
ReplaceCharacters(string * s,const char * remove,char replacewith)87 void ReplaceCharacters(string *s, const char *remove, char replacewith) {
88 const char *str_start = s->c_str();
89 const char *str = str_start;
90 for (str = strpbrk(str, remove);
91 str != nullptr;
92 str = strpbrk(str + 1, remove)) {
93 (*s)[str - str_start] = replacewith;
94 }
95 }
96
StripWhitespace(string * str)97 void StripWhitespace(string* str) {
98 int str_length = str->length();
99
100 // Strip off leading whitespace.
101 int first = 0;
102 while (first < str_length && ascii_isspace(str->at(first))) {
103 ++first;
104 }
105 // If entire string is white space.
106 if (first == str_length) {
107 str->clear();
108 return;
109 }
110 if (first > 0) {
111 str->erase(0, first);
112 str_length -= first;
113 }
114
115 // Strip off trailing whitespace.
116 int last = str_length - 1;
117 while (last >= 0 && ascii_isspace(str->at(last))) {
118 --last;
119 }
120 if (last != (str_length - 1) && last >= 0) {
121 str->erase(last + 1, string::npos);
122 }
123 }
124
125 // ----------------------------------------------------------------------
126 // StringReplace()
127 // Replace the "old" pattern with the "new" pattern in a string,
128 // and append the result to "res". If replace_all is false,
129 // it only replaces the first instance of "old."
130 // ----------------------------------------------------------------------
131
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all,string * res)132 void StringReplace(const string& s, const string& oldsub,
133 const string& newsub, bool replace_all,
134 string* res) {
135 if (oldsub.empty()) {
136 res->append(s); // if empty, append the given string.
137 return;
138 }
139
140 string::size_type start_pos = 0;
141 string::size_type pos;
142 do {
143 pos = s.find(oldsub, start_pos);
144 if (pos == string::npos) {
145 break;
146 }
147 res->append(s, start_pos, pos - start_pos);
148 res->append(newsub);
149 start_pos = pos + oldsub.size(); // start searching again after the "old"
150 } while (replace_all);
151 res->append(s, start_pos, s.length() - start_pos);
152 }
153
154 // ----------------------------------------------------------------------
155 // StringReplace()
156 // Give me a string and two patterns "old" and "new", and I replace
157 // the first instance of "old" in the string with "new", if it
158 // exists. If "global" is true; call this repeatedly until it
159 // fails. RETURN a new string, regardless of whether the replacement
160 // happened or not.
161 // ----------------------------------------------------------------------
162
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all)163 string StringReplace(const string& s, const string& oldsub,
164 const string& newsub, bool replace_all) {
165 string ret;
166 StringReplace(s, oldsub, newsub, replace_all, &ret);
167 return ret;
168 }
169
170 // ----------------------------------------------------------------------
171 // SplitStringUsing()
172 // Split a string using a character delimiter. Append the components
173 // to 'result'.
174 //
175 // Note: For multi-character delimiters, this routine will split on *ANY* of
176 // the characters in the string, not the entire string as a single delimiter.
177 // ----------------------------------------------------------------------
178 template <typename ITR>
179 static inline
SplitStringToIteratorUsing(const string & full,const char * delim,ITR & result)180 void SplitStringToIteratorUsing(const string& full,
181 const char* delim,
182 ITR& result) {
183 // Optimize the common case where delim is a single character.
184 if (delim[0] != '\0' && delim[1] == '\0') {
185 char c = delim[0];
186 const char* p = full.data();
187 const char* end = p + full.size();
188 while (p != end) {
189 if (*p == c) {
190 ++p;
191 } else {
192 const char* start = p;
193 while (++p != end && *p != c);
194 *result++ = string(start, p - start);
195 }
196 }
197 return;
198 }
199
200 string::size_type begin_index, end_index;
201 begin_index = full.find_first_not_of(delim);
202 while (begin_index != string::npos) {
203 end_index = full.find_first_of(delim, begin_index);
204 if (end_index == string::npos) {
205 *result++ = full.substr(begin_index);
206 return;
207 }
208 *result++ = full.substr(begin_index, (end_index - begin_index));
209 begin_index = full.find_first_not_of(delim, end_index);
210 }
211 }
212
SplitStringUsing(const string & full,const char * delim,std::vector<string> * result)213 void SplitStringUsing(const string& full,
214 const char* delim,
215 std::vector<string>* result) {
216 std::back_insert_iterator< std::vector<string> > it(*result);
217 SplitStringToIteratorUsing(full, delim, it);
218 }
219
220 // Split a string using a character delimiter. Append the components
221 // to 'result'. If there are consecutive delimiters, this function
222 // will return corresponding empty strings. The string is split into
223 // at most the specified number of pieces greedily. This means that the
224 // last piece may possibly be split further. To split into as many pieces
225 // as possible, specify 0 as the number of pieces.
226 //
227 // If "full" is the empty string, yields an empty string as the only value.
228 //
229 // If "pieces" is negative for some reason, it returns the whole string
230 // ----------------------------------------------------------------------
231 template <typename StringType, typename ITR>
232 static inline
SplitStringToIteratorAllowEmpty(const StringType & full,const char * delim,int pieces,ITR & result)233 void SplitStringToIteratorAllowEmpty(const StringType& full,
234 const char* delim,
235 int pieces,
236 ITR& result) {
237 string::size_type begin_index, end_index;
238 begin_index = 0;
239
240 for (int i = 0; (i < pieces-1) || (pieces == 0); i++) {
241 end_index = full.find_first_of(delim, begin_index);
242 if (end_index == string::npos) {
243 *result++ = full.substr(begin_index);
244 return;
245 }
246 *result++ = full.substr(begin_index, (end_index - begin_index));
247 begin_index = end_index + 1;
248 }
249 *result++ = full.substr(begin_index);
250 }
251
SplitStringAllowEmpty(const string & full,const char * delim,std::vector<string> * result)252 void SplitStringAllowEmpty(const string& full, const char* delim,
253 std::vector<string>* result) {
254 std::back_insert_iterator<std::vector<string> > it(*result);
255 SplitStringToIteratorAllowEmpty(full, delim, 0, it);
256 }
257
258 // ----------------------------------------------------------------------
259 // JoinStrings()
260 // This merges a vector of string components with delim inserted
261 // as separaters between components.
262 //
263 // ----------------------------------------------------------------------
264 template <class ITERATOR>
JoinStringsIterator(const ITERATOR & start,const ITERATOR & end,const char * delim,string * result)265 static void JoinStringsIterator(const ITERATOR& start,
266 const ITERATOR& end,
267 const char* delim,
268 string* result) {
269 GOOGLE_CHECK(result != nullptr);
270 result->clear();
271 int delim_length = strlen(delim);
272
273 // Precompute resulting length so we can reserve() memory in one shot.
274 int length = 0;
275 for (ITERATOR iter = start; iter != end; ++iter) {
276 if (iter != start) {
277 length += delim_length;
278 }
279 length += iter->size();
280 }
281 result->reserve(length);
282
283 // Now combine everything.
284 for (ITERATOR iter = start; iter != end; ++iter) {
285 if (iter != start) {
286 result->append(delim, delim_length);
287 }
288 result->append(iter->data(), iter->size());
289 }
290 }
291
JoinStrings(const std::vector<string> & components,const char * delim,string * result)292 void JoinStrings(const std::vector<string>& components,
293 const char* delim,
294 string * result) {
295 JoinStringsIterator(components.begin(), components.end(), delim, result);
296 }
297
298 // ----------------------------------------------------------------------
299 // UnescapeCEscapeSequences()
300 // This does all the unescaping that C does: \ooo, \r, \n, etc
301 // Returns length of resulting string.
302 // The implementation of \x parses any positive number of hex digits,
303 // but it is an error if the value requires more than 8 bits, and the
304 // result is truncated to 8 bits.
305 //
306 // The second call stores its errors in a supplied string vector.
307 // If the string vector pointer is nullptr, it reports the errors with LOG().
308 // ----------------------------------------------------------------------
309
310 #define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7'))
311
312 // Protocol buffers doesn't ever care about errors, but I don't want to remove
313 // the code.
314 #define LOG_STRING(LEVEL, VECTOR) GOOGLE_LOG_IF(LEVEL, false)
315
UnescapeCEscapeSequences(const char * source,char * dest)316 int UnescapeCEscapeSequences(const char* source, char* dest) {
317 return UnescapeCEscapeSequences(source, dest, nullptr);
318 }
319
UnescapeCEscapeSequences(const char * source,char * dest,std::vector<string> * errors)320 int UnescapeCEscapeSequences(const char* source, char* dest,
321 std::vector<string> *errors) {
322 GOOGLE_DCHECK(errors == nullptr) << "Error reporting not implemented.";
323
324 char* d = dest;
325 const char* p = source;
326
327 // Small optimization for case where source = dest and there's no escaping
328 while ( p == d && *p != '\0' && *p != '\\' )
329 p++, d++;
330
331 while (*p != '\0') {
332 if (*p != '\\') {
333 *d++ = *p++;
334 } else {
335 switch ( *++p ) { // skip past the '\\'
336 case '\0':
337 LOG_STRING(ERROR, errors) << "String cannot end with \\";
338 *d = '\0';
339 return d - dest; // we're done with p
340 case 'a': *d++ = '\a'; break;
341 case 'b': *d++ = '\b'; break;
342 case 'f': *d++ = '\f'; break;
343 case 'n': *d++ = '\n'; break;
344 case 'r': *d++ = '\r'; break;
345 case 't': *d++ = '\t'; break;
346 case 'v': *d++ = '\v'; break;
347 case '\\': *d++ = '\\'; break;
348 case '?': *d++ = '\?'; break; // \? Who knew?
349 case '\'': *d++ = '\''; break;
350 case '"': *d++ = '\"'; break;
351 case '0': case '1': case '2': case '3': // octal digit: 1 to 3 digits
352 case '4': case '5': case '6': case '7': {
353 char ch = *p - '0';
354 if ( IS_OCTAL_DIGIT(p[1]) )
355 ch = ch * 8 + *++p - '0';
356 if ( IS_OCTAL_DIGIT(p[1]) ) // safe (and easy) to do this twice
357 ch = ch * 8 + *++p - '0'; // now points at last digit
358 *d++ = ch;
359 break;
360 }
361 case 'x': case 'X': {
362 if (!isxdigit(p[1])) {
363 if (p[1] == '\0') {
364 LOG_STRING(ERROR, errors) << "String cannot end with \\x";
365 } else {
366 LOG_STRING(ERROR, errors) <<
367 "\\x cannot be followed by non-hex digit: \\" << *p << p[1];
368 }
369 break;
370 }
371 unsigned int ch = 0;
372 const char *hex_start = p;
373 while (isxdigit(p[1])) // arbitrarily many hex digits
374 ch = (ch << 4) + hex_digit_to_int(*++p);
375 if (ch > 0xFF)
376 LOG_STRING(ERROR, errors) << "Value of " <<
377 "\\" << string(hex_start, p+1-hex_start) << " exceeds 8 bits";
378 *d++ = ch;
379 break;
380 }
381 #if 0 // TODO(kenton): Support \u and \U? Requires runetochar().
382 case 'u': {
383 // \uhhhh => convert 4 hex digits to UTF-8
384 char32 rune = 0;
385 const char *hex_start = p;
386 for (int i = 0; i < 4; ++i) {
387 if (isxdigit(p[1])) { // Look one char ahead.
388 rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
389 } else {
390 LOG_STRING(ERROR, errors)
391 << "\\u must be followed by 4 hex digits: \\"
392 << string(hex_start, p+1-hex_start);
393 break;
394 }
395 }
396 d += runetochar(d, &rune);
397 break;
398 }
399 case 'U': {
400 // \Uhhhhhhhh => convert 8 hex digits to UTF-8
401 char32 rune = 0;
402 const char *hex_start = p;
403 for (int i = 0; i < 8; ++i) {
404 if (isxdigit(p[1])) { // Look one char ahead.
405 // Don't change rune until we're sure this
406 // is within the Unicode limit, but do advance p.
407 char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
408 if (newrune > 0x10FFFF) {
409 LOG_STRING(ERROR, errors)
410 << "Value of \\"
411 << string(hex_start, p + 1 - hex_start)
412 << " exceeds Unicode limit (0x10FFFF)";
413 break;
414 } else {
415 rune = newrune;
416 }
417 } else {
418 LOG_STRING(ERROR, errors)
419 << "\\U must be followed by 8 hex digits: \\"
420 << string(hex_start, p+1-hex_start);
421 break;
422 }
423 }
424 d += runetochar(d, &rune);
425 break;
426 }
427 #endif
428 default:
429 LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p;
430 }
431 p++; // read past letter we escaped
432 }
433 }
434 *d = '\0';
435 return d - dest;
436 }
437
438 // ----------------------------------------------------------------------
439 // UnescapeCEscapeString()
440 // This does the same thing as UnescapeCEscapeSequences, but creates
441 // a new string. The caller does not need to worry about allocating
442 // a dest buffer. This should be used for non performance critical
443 // tasks such as printing debug messages. It is safe for src and dest
444 // to be the same.
445 //
446 // The second call stores its errors in a supplied string vector.
447 // If the string vector pointer is nullptr, it reports the errors with LOG().
448 //
449 // In the first and second calls, the length of dest is returned. In the
450 // the third call, the new string is returned.
451 // ----------------------------------------------------------------------
UnescapeCEscapeString(const string & src,string * dest)452 int UnescapeCEscapeString(const string& src, string* dest) {
453 return UnescapeCEscapeString(src, dest, nullptr);
454 }
455
UnescapeCEscapeString(const string & src,string * dest,std::vector<string> * errors)456 int UnescapeCEscapeString(const string& src, string* dest,
457 std::vector<string> *errors) {
458 std::unique_ptr<char[]> unescaped(new char[src.size() + 1]);
459 int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), errors);
460 GOOGLE_CHECK(dest);
461 dest->assign(unescaped.get(), len);
462 return len;
463 }
464
UnescapeCEscapeString(const string & src)465 string UnescapeCEscapeString(const string& src) {
466 std::unique_ptr<char[]> unescaped(new char[src.size() + 1]);
467 int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), nullptr);
468 return string(unescaped.get(), len);
469 }
470
471 // ----------------------------------------------------------------------
472 // CEscapeString()
473 // CHexEscapeString()
474 // Copies 'src' to 'dest', escaping dangerous characters using
475 // C-style escape sequences. This is very useful for preparing query
476 // flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
477 // hexadecimal rather than octal sequences.
478 // Returns the number of bytes written to 'dest' (not including the \0)
479 // or -1 if there was insufficient space.
480 //
481 // Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped.
482 // ----------------------------------------------------------------------
CEscapeInternal(const char * src,int src_len,char * dest,int dest_len,bool use_hex,bool utf8_safe)483 int CEscapeInternal(const char* src, int src_len, char* dest,
484 int dest_len, bool use_hex, bool utf8_safe) {
485 const char* src_end = src + src_len;
486 int used = 0;
487 bool last_hex_escape = false; // true if last output char was \xNN
488
489 for (; src < src_end; src++) {
490 if (dest_len - used < 2) // Need space for two letter escape
491 return -1;
492
493 bool is_hex_escape = false;
494 switch (*src) {
495 case '\n': dest[used++] = '\\'; dest[used++] = 'n'; break;
496 case '\r': dest[used++] = '\\'; dest[used++] = 'r'; break;
497 case '\t': dest[used++] = '\\'; dest[used++] = 't'; break;
498 case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
499 case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
500 case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
501 default:
502 // Note that if we emit \xNN and the src character after that is a hex
503 // digit then that digit must be escaped too to prevent it being
504 // interpreted as part of the character code by C.
505 if ((!utf8_safe || static_cast<uint8>(*src) < 0x80) &&
506 (!isprint(*src) ||
507 (last_hex_escape && isxdigit(*src)))) {
508 if (dest_len - used < 4) // need space for 4 letter escape
509 return -1;
510 sprintf(dest + used, (use_hex ? "\\x%02x" : "\\%03o"),
511 static_cast<uint8>(*src));
512 is_hex_escape = use_hex;
513 used += 4;
514 } else {
515 dest[used++] = *src; break;
516 }
517 }
518 last_hex_escape = is_hex_escape;
519 }
520
521 if (dest_len - used < 1) // make sure that there is room for \0
522 return -1;
523
524 dest[used] = '\0'; // doesn't count towards return value though
525 return used;
526 }
527
528 // Calculates the length of the C-style escaped version of 'src'.
529 // Assumes that non-printable characters are escaped using octal sequences, and
530 // that UTF-8 bytes are not handled specially.
CEscapedLength(StringPiece src)531 static inline size_t CEscapedLength(StringPiece src) {
532 static char c_escaped_len[256] = {
533 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4, // \t, \n, \r
534 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
535 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // ", '
536 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // '0'..'9'
537 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'A'..'O'
538 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, // 'P'..'Z', '\'
539 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'a'..'o'
540 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, // 'p'..'z', DEL
541 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
542 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
543 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
544 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
545 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
546 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
547 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
548 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
549 };
550
551 size_t escaped_len = 0;
552 for (int i = 0; i < src.size(); ++i) {
553 unsigned char c = static_cast<unsigned char>(src[i]);
554 escaped_len += c_escaped_len[c];
555 }
556 return escaped_len;
557 }
558
559 // ----------------------------------------------------------------------
560 // Escapes 'src' using C-style escape sequences, and appends the escaped string
561 // to 'dest'. This version is faster than calling CEscapeInternal as it computes
562 // the required space using a lookup table, and also does not do any special
563 // handling for Hex or UTF-8 characters.
564 // ----------------------------------------------------------------------
CEscapeAndAppend(StringPiece src,string * dest)565 void CEscapeAndAppend(StringPiece src, string* dest) {
566 size_t escaped_len = CEscapedLength(src);
567 if (escaped_len == src.size()) {
568 dest->append(src.data(), src.size());
569 return;
570 }
571
572 size_t cur_dest_len = dest->size();
573 dest->resize(cur_dest_len + escaped_len);
574 char* append_ptr = &(*dest)[cur_dest_len];
575
576 for (int i = 0; i < src.size(); ++i) {
577 unsigned char c = static_cast<unsigned char>(src[i]);
578 switch (c) {
579 case '\n': *append_ptr++ = '\\'; *append_ptr++ = 'n'; break;
580 case '\r': *append_ptr++ = '\\'; *append_ptr++ = 'r'; break;
581 case '\t': *append_ptr++ = '\\'; *append_ptr++ = 't'; break;
582 case '\"': *append_ptr++ = '\\'; *append_ptr++ = '\"'; break;
583 case '\'': *append_ptr++ = '\\'; *append_ptr++ = '\''; break;
584 case '\\': *append_ptr++ = '\\'; *append_ptr++ = '\\'; break;
585 default:
586 if (!isprint(c)) {
587 *append_ptr++ = '\\';
588 *append_ptr++ = '0' + c / 64;
589 *append_ptr++ = '0' + (c % 64) / 8;
590 *append_ptr++ = '0' + c % 8;
591 } else {
592 *append_ptr++ = c;
593 }
594 break;
595 }
596 }
597 }
598
CEscape(const string & src)599 string CEscape(const string& src) {
600 string dest;
601 CEscapeAndAppend(src, &dest);
602 return dest;
603 }
604
605 namespace strings {
606
Utf8SafeCEscape(const string & src)607 string Utf8SafeCEscape(const string& src) {
608 const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
609 std::unique_ptr<char[]> dest(new char[dest_length]);
610 const int len = CEscapeInternal(src.data(), src.size(),
611 dest.get(), dest_length, false, true);
612 GOOGLE_DCHECK_GE(len, 0);
613 return string(dest.get(), len);
614 }
615
CHexEscape(const string & src)616 string CHexEscape(const string& src) {
617 const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
618 std::unique_ptr<char[]> dest(new char[dest_length]);
619 const int len = CEscapeInternal(src.data(), src.size(),
620 dest.get(), dest_length, true, false);
621 GOOGLE_DCHECK_GE(len, 0);
622 return string(dest.get(), len);
623 }
624
625 } // namespace strings
626
627 // ----------------------------------------------------------------------
628 // strto32_adaptor()
629 // strtou32_adaptor()
630 // Implementation of strto[u]l replacements that have identical
631 // overflow and underflow characteristics for both ILP-32 and LP-64
632 // platforms, including errno preservation in error-free calls.
633 // ----------------------------------------------------------------------
634
strto32_adaptor(const char * nptr,char ** endptr,int base)635 int32 strto32_adaptor(const char *nptr, char **endptr, int base) {
636 const int saved_errno = errno;
637 errno = 0;
638 const long result = strtol(nptr, endptr, base);
639 if (errno == ERANGE && result == LONG_MIN) {
640 return kint32min;
641 } else if (errno == ERANGE && result == LONG_MAX) {
642 return kint32max;
643 } else if (errno == 0 && result < kint32min) {
644 errno = ERANGE;
645 return kint32min;
646 } else if (errno == 0 && result > kint32max) {
647 errno = ERANGE;
648 return kint32max;
649 }
650 if (errno == 0)
651 errno = saved_errno;
652 return static_cast<int32>(result);
653 }
654
strtou32_adaptor(const char * nptr,char ** endptr,int base)655 uint32 strtou32_adaptor(const char *nptr, char **endptr, int base) {
656 const int saved_errno = errno;
657 errno = 0;
658 const unsigned long result = strtoul(nptr, endptr, base);
659 if (errno == ERANGE && result == ULONG_MAX) {
660 return kuint32max;
661 } else if (errno == 0 && result > kuint32max) {
662 errno = ERANGE;
663 return kuint32max;
664 }
665 if (errno == 0)
666 errno = saved_errno;
667 return static_cast<uint32>(result);
668 }
669
safe_parse_sign(string * text,bool * negative_ptr)670 inline bool safe_parse_sign(string* text /*inout*/,
671 bool* negative_ptr /*output*/) {
672 const char* start = text->data();
673 const char* end = start + text->size();
674
675 // Consume whitespace.
676 while (start < end && (start[0] == ' ')) {
677 ++start;
678 }
679 while (start < end && (end[-1] == ' ')) {
680 --end;
681 }
682 if (start >= end) {
683 return false;
684 }
685
686 // Consume sign.
687 *negative_ptr = (start[0] == '-');
688 if (*negative_ptr || start[0] == '+') {
689 ++start;
690 if (start >= end) {
691 return false;
692 }
693 }
694 *text = text->substr(start - text->data(), end - start);
695 return true;
696 }
697
698 template<typename IntType>
safe_parse_positive_int(string text,IntType * value_p)699 bool safe_parse_positive_int(
700 string text, IntType* value_p) {
701 int base = 10;
702 IntType value = 0;
703 const IntType vmax = std::numeric_limits<IntType>::max();
704 assert(vmax > 0);
705 assert(vmax >= base);
706 const IntType vmax_over_base = vmax / base;
707 const char* start = text.data();
708 const char* end = start + text.size();
709 // loop over digits
710 for (; start < end; ++start) {
711 unsigned char c = static_cast<unsigned char>(start[0]);
712 int digit = c - '0';
713 if (digit >= base || digit < 0) {
714 *value_p = value;
715 return false;
716 }
717 if (value > vmax_over_base) {
718 *value_p = vmax;
719 return false;
720 }
721 value *= base;
722 if (value > vmax - digit) {
723 *value_p = vmax;
724 return false;
725 }
726 value += digit;
727 }
728 *value_p = value;
729 return true;
730 }
731
732 template<typename IntType>
safe_parse_negative_int(const string & text,IntType * value_p)733 bool safe_parse_negative_int(
734 const string& text, IntType* value_p) {
735 int base = 10;
736 IntType value = 0;
737 const IntType vmin = std::numeric_limits<IntType>::min();
738 assert(vmin < 0);
739 assert(vmin <= 0 - base);
740 IntType vmin_over_base = vmin / base;
741 // 2003 c++ standard [expr.mul]
742 // "... the sign of the remainder is implementation-defined."
743 // Although (vmin/base)*base + vmin%base is always vmin.
744 // 2011 c++ standard tightens the spec but we cannot rely on it.
745 if (vmin % base > 0) {
746 vmin_over_base += 1;
747 }
748 const char* start = text.data();
749 const char* end = start + text.size();
750 // loop over digits
751 for (; start < end; ++start) {
752 unsigned char c = static_cast<unsigned char>(start[0]);
753 int digit = c - '0';
754 if (digit >= base || digit < 0) {
755 *value_p = value;
756 return false;
757 }
758 if (value < vmin_over_base) {
759 *value_p = vmin;
760 return false;
761 }
762 value *= base;
763 if (value < vmin + digit) {
764 *value_p = vmin;
765 return false;
766 }
767 value -= digit;
768 }
769 *value_p = value;
770 return true;
771 }
772
773 template<typename IntType>
safe_int_internal(string text,IntType * value_p)774 bool safe_int_internal(string text, IntType* value_p) {
775 *value_p = 0;
776 bool negative;
777 if (!safe_parse_sign(&text, &negative)) {
778 return false;
779 }
780 if (!negative) {
781 return safe_parse_positive_int(text, value_p);
782 } else {
783 return safe_parse_negative_int(text, value_p);
784 }
785 }
786
787 template<typename IntType>
safe_uint_internal(string text,IntType * value_p)788 bool safe_uint_internal(string text, IntType* value_p) {
789 *value_p = 0;
790 bool negative;
791 if (!safe_parse_sign(&text, &negative) || negative) {
792 return false;
793 }
794 return safe_parse_positive_int(text, value_p);
795 }
796
797 // ----------------------------------------------------------------------
798 // FastIntToBuffer()
799 // FastInt64ToBuffer()
800 // FastHexToBuffer()
801 // FastHex64ToBuffer()
802 // FastHex32ToBuffer()
803 // ----------------------------------------------------------------------
804
805 // Offset into buffer where FastInt64ToBuffer places the end of string
806 // null character. Also used by FastInt64ToBufferLeft.
807 static const int kFastInt64ToBufferOffset = 21;
808
FastInt64ToBuffer(int64 i,char * buffer)809 char *FastInt64ToBuffer(int64 i, char* buffer) {
810 // We could collapse the positive and negative sections, but that
811 // would be slightly slower for positive numbers...
812 // 22 bytes is enough to store -2**64, -18446744073709551616.
813 char* p = buffer + kFastInt64ToBufferOffset;
814 *p-- = '\0';
815 if (i >= 0) {
816 do {
817 *p-- = '0' + i % 10;
818 i /= 10;
819 } while (i > 0);
820 return p + 1;
821 } else {
822 // On different platforms, % and / have different behaviors for
823 // negative numbers, so we need to jump through hoops to make sure
824 // we don't divide negative numbers.
825 if (i > -10) {
826 i = -i;
827 *p-- = '0' + i;
828 *p = '-';
829 return p;
830 } else {
831 // Make sure we aren't at MIN_INT, in which case we can't say i = -i
832 i = i + 10;
833 i = -i;
834 *p-- = '0' + i % 10;
835 // Undo what we did a moment ago
836 i = i / 10 + 1;
837 do {
838 *p-- = '0' + i % 10;
839 i /= 10;
840 } while (i > 0);
841 *p = '-';
842 return p;
843 }
844 }
845 }
846
847 // Offset into buffer where FastInt32ToBuffer places the end of string
848 // null character. Also used by FastInt32ToBufferLeft
849 static const int kFastInt32ToBufferOffset = 11;
850
851 // Yes, this is a duplicate of FastInt64ToBuffer. But, we need this for the
852 // compiler to generate 32 bit arithmetic instructions. It's much faster, at
853 // least with 32 bit binaries.
FastInt32ToBuffer(int32 i,char * buffer)854 char *FastInt32ToBuffer(int32 i, char* buffer) {
855 // We could collapse the positive and negative sections, but that
856 // would be slightly slower for positive numbers...
857 // 12 bytes is enough to store -2**32, -4294967296.
858 char* p = buffer + kFastInt32ToBufferOffset;
859 *p-- = '\0';
860 if (i >= 0) {
861 do {
862 *p-- = '0' + i % 10;
863 i /= 10;
864 } while (i > 0);
865 return p + 1;
866 } else {
867 // On different platforms, % and / have different behaviors for
868 // negative numbers, so we need to jump through hoops to make sure
869 // we don't divide negative numbers.
870 if (i > -10) {
871 i = -i;
872 *p-- = '0' + i;
873 *p = '-';
874 return p;
875 } else {
876 // Make sure we aren't at MIN_INT, in which case we can't say i = -i
877 i = i + 10;
878 i = -i;
879 *p-- = '0' + i % 10;
880 // Undo what we did a moment ago
881 i = i / 10 + 1;
882 do {
883 *p-- = '0' + i % 10;
884 i /= 10;
885 } while (i > 0);
886 *p = '-';
887 return p;
888 }
889 }
890 }
891
FastHexToBuffer(int i,char * buffer)892 char *FastHexToBuffer(int i, char* buffer) {
893 GOOGLE_CHECK(i >= 0) << "FastHexToBuffer() wants non-negative integers, not " << i;
894
895 static const char *hexdigits = "0123456789abcdef";
896 char *p = buffer + 21;
897 *p-- = '\0';
898 do {
899 *p-- = hexdigits[i & 15]; // mod by 16
900 i >>= 4; // divide by 16
901 } while (i > 0);
902 return p + 1;
903 }
904
InternalFastHexToBuffer(uint64 value,char * buffer,int num_byte)905 char *InternalFastHexToBuffer(uint64 value, char* buffer, int num_byte) {
906 static const char *hexdigits = "0123456789abcdef";
907 buffer[num_byte] = '\0';
908 for (int i = num_byte - 1; i >= 0; i--) {
909 #ifdef _M_X64
910 // MSVC x64 platform has a bug optimizing the uint32(value) in the #else
911 // block. Given that the uint32 cast was to improve performance on 32-bit
912 // platforms, we use 64-bit '&' directly.
913 buffer[i] = hexdigits[value & 0xf];
914 #else
915 buffer[i] = hexdigits[uint32(value) & 0xf];
916 #endif
917 value >>= 4;
918 }
919 return buffer;
920 }
921
FastHex64ToBuffer(uint64 value,char * buffer)922 char *FastHex64ToBuffer(uint64 value, char* buffer) {
923 return InternalFastHexToBuffer(value, buffer, 16);
924 }
925
FastHex32ToBuffer(uint32 value,char * buffer)926 char *FastHex32ToBuffer(uint32 value, char* buffer) {
927 return InternalFastHexToBuffer(value, buffer, 8);
928 }
929
930 // ----------------------------------------------------------------------
931 // FastInt32ToBufferLeft()
932 // FastUInt32ToBufferLeft()
933 // FastInt64ToBufferLeft()
934 // FastUInt64ToBufferLeft()
935 //
936 // Like the Fast*ToBuffer() functions above, these are intended for speed.
937 // Unlike the Fast*ToBuffer() functions, however, these functions write
938 // their output to the beginning of the buffer (hence the name, as the
939 // output is left-aligned). The caller is responsible for ensuring that
940 // the buffer has enough space to hold the output.
941 //
942 // Returns a pointer to the end of the string (i.e. the null character
943 // terminating the string).
944 // ----------------------------------------------------------------------
945
946 static const char two_ASCII_digits[100][2] = {
947 {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'},
948 {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
949 {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'},
950 {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
951 {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'},
952 {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
953 {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'},
954 {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
955 {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'},
956 {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
957 {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'},
958 {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
959 {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'},
960 {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
961 {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'},
962 {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
963 {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'},
964 {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
965 {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'},
966 {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'}
967 };
968
FastUInt32ToBufferLeft(uint32 u,char * buffer)969 char* FastUInt32ToBufferLeft(uint32 u, char* buffer) {
970 uint32 digits;
971 const char *ASCII_digits = nullptr;
972 // The idea of this implementation is to trim the number of divides to as few
973 // as possible by using multiplication and subtraction rather than mod (%),
974 // and by outputting two digits at a time rather than one.
975 // The huge-number case is first, in the hopes that the compiler will output
976 // that case in one branch-free block of code, and only output conditional
977 // branches into it from below.
978 if (u >= 1000000000) { // >= 1,000,000,000
979 digits = u / 100000000; // 100,000,000
980 ASCII_digits = two_ASCII_digits[digits];
981 buffer[0] = ASCII_digits[0];
982 buffer[1] = ASCII_digits[1];
983 buffer += 2;
984 sublt100_000_000:
985 u -= digits * 100000000; // 100,000,000
986 lt100_000_000:
987 digits = u / 1000000; // 1,000,000
988 ASCII_digits = two_ASCII_digits[digits];
989 buffer[0] = ASCII_digits[0];
990 buffer[1] = ASCII_digits[1];
991 buffer += 2;
992 sublt1_000_000:
993 u -= digits * 1000000; // 1,000,000
994 lt1_000_000:
995 digits = u / 10000; // 10,000
996 ASCII_digits = two_ASCII_digits[digits];
997 buffer[0] = ASCII_digits[0];
998 buffer[1] = ASCII_digits[1];
999 buffer += 2;
1000 sublt10_000:
1001 u -= digits * 10000; // 10,000
1002 lt10_000:
1003 digits = u / 100;
1004 ASCII_digits = two_ASCII_digits[digits];
1005 buffer[0] = ASCII_digits[0];
1006 buffer[1] = ASCII_digits[1];
1007 buffer += 2;
1008 sublt100:
1009 u -= digits * 100;
1010 lt100:
1011 digits = u;
1012 ASCII_digits = two_ASCII_digits[digits];
1013 buffer[0] = ASCII_digits[0];
1014 buffer[1] = ASCII_digits[1];
1015 buffer += 2;
1016 done:
1017 *buffer = 0;
1018 return buffer;
1019 }
1020
1021 if (u < 100) {
1022 digits = u;
1023 if (u >= 10) goto lt100;
1024 *buffer++ = '0' + digits;
1025 goto done;
1026 }
1027 if (u < 10000) { // 10,000
1028 if (u >= 1000) goto lt10_000;
1029 digits = u / 100;
1030 *buffer++ = '0' + digits;
1031 goto sublt100;
1032 }
1033 if (u < 1000000) { // 1,000,000
1034 if (u >= 100000) goto lt1_000_000;
1035 digits = u / 10000; // 10,000
1036 *buffer++ = '0' + digits;
1037 goto sublt10_000;
1038 }
1039 if (u < 100000000) { // 100,000,000
1040 if (u >= 10000000) goto lt100_000_000;
1041 digits = u / 1000000; // 1,000,000
1042 *buffer++ = '0' + digits;
1043 goto sublt1_000_000;
1044 }
1045 // we already know that u < 1,000,000,000
1046 digits = u / 100000000; // 100,000,000
1047 *buffer++ = '0' + digits;
1048 goto sublt100_000_000;
1049 }
1050
FastInt32ToBufferLeft(int32 i,char * buffer)1051 char* FastInt32ToBufferLeft(int32 i, char* buffer) {
1052 uint32 u = 0;
1053 if (i < 0) {
1054 *buffer++ = '-';
1055 u -= i;
1056 } else {
1057 u = i;
1058 }
1059 return FastUInt32ToBufferLeft(u, buffer);
1060 }
1061
FastUInt64ToBufferLeft(uint64 u64,char * buffer)1062 char* FastUInt64ToBufferLeft(uint64 u64, char* buffer) {
1063 int digits;
1064 const char *ASCII_digits = nullptr;
1065
1066 uint32 u = static_cast<uint32>(u64);
1067 if (u == u64) return FastUInt32ToBufferLeft(u, buffer);
1068
1069 uint64 top_11_digits = u64 / 1000000000;
1070 buffer = FastUInt64ToBufferLeft(top_11_digits, buffer);
1071 u = u64 - (top_11_digits * 1000000000);
1072
1073 digits = u / 10000000; // 10,000,000
1074 GOOGLE_DCHECK_LT(digits, 100);
1075 ASCII_digits = two_ASCII_digits[digits];
1076 buffer[0] = ASCII_digits[0];
1077 buffer[1] = ASCII_digits[1];
1078 buffer += 2;
1079 u -= digits * 10000000; // 10,000,000
1080 digits = u / 100000; // 100,000
1081 ASCII_digits = two_ASCII_digits[digits];
1082 buffer[0] = ASCII_digits[0];
1083 buffer[1] = ASCII_digits[1];
1084 buffer += 2;
1085 u -= digits * 100000; // 100,000
1086 digits = u / 1000; // 1,000
1087 ASCII_digits = two_ASCII_digits[digits];
1088 buffer[0] = ASCII_digits[0];
1089 buffer[1] = ASCII_digits[1];
1090 buffer += 2;
1091 u -= digits * 1000; // 1,000
1092 digits = u / 10;
1093 ASCII_digits = two_ASCII_digits[digits];
1094 buffer[0] = ASCII_digits[0];
1095 buffer[1] = ASCII_digits[1];
1096 buffer += 2;
1097 u -= digits * 10;
1098 digits = u;
1099 *buffer++ = '0' + digits;
1100 *buffer = 0;
1101 return buffer;
1102 }
1103
FastInt64ToBufferLeft(int64 i,char * buffer)1104 char* FastInt64ToBufferLeft(int64 i, char* buffer) {
1105 uint64 u = 0;
1106 if (i < 0) {
1107 *buffer++ = '-';
1108 u -= i;
1109 } else {
1110 u = i;
1111 }
1112 return FastUInt64ToBufferLeft(u, buffer);
1113 }
1114
1115 // ----------------------------------------------------------------------
1116 // SimpleItoa()
1117 // Description: converts an integer to a string.
1118 //
1119 // Return value: string
1120 // ----------------------------------------------------------------------
1121
SimpleItoa(int i)1122 string SimpleItoa(int i) {
1123 char buffer[kFastToBufferSize];
1124 return (sizeof(i) == 4) ?
1125 FastInt32ToBuffer(i, buffer) :
1126 FastInt64ToBuffer(i, buffer);
1127 }
1128
SimpleItoa(unsigned int i)1129 string SimpleItoa(unsigned int i) {
1130 char buffer[kFastToBufferSize];
1131 return string(buffer, (sizeof(i) == 4) ?
1132 FastUInt32ToBufferLeft(i, buffer) :
1133 FastUInt64ToBufferLeft(i, buffer));
1134 }
1135
SimpleItoa(long i)1136 string SimpleItoa(long i) {
1137 char buffer[kFastToBufferSize];
1138 return (sizeof(i) == 4) ?
1139 FastInt32ToBuffer(i, buffer) :
1140 FastInt64ToBuffer(i, buffer);
1141 }
1142
SimpleItoa(unsigned long i)1143 string SimpleItoa(unsigned long i) {
1144 char buffer[kFastToBufferSize];
1145 return string(buffer, (sizeof(i) == 4) ?
1146 FastUInt32ToBufferLeft(i, buffer) :
1147 FastUInt64ToBufferLeft(i, buffer));
1148 }
1149
SimpleItoa(long long i)1150 string SimpleItoa(long long i) {
1151 char buffer[kFastToBufferSize];
1152 return (sizeof(i) == 4) ?
1153 FastInt32ToBuffer(i, buffer) :
1154 FastInt64ToBuffer(i, buffer);
1155 }
1156
SimpleItoa(unsigned long long i)1157 string SimpleItoa(unsigned long long i) {
1158 char buffer[kFastToBufferSize];
1159 return string(buffer, (sizeof(i) == 4) ?
1160 FastUInt32ToBufferLeft(i, buffer) :
1161 FastUInt64ToBufferLeft(i, buffer));
1162 }
1163
1164 // ----------------------------------------------------------------------
1165 // SimpleDtoa()
1166 // SimpleFtoa()
1167 // DoubleToBuffer()
1168 // FloatToBuffer()
1169 // We want to print the value without losing precision, but we also do
1170 // not want to print more digits than necessary. This turns out to be
1171 // trickier than it sounds. Numbers like 0.2 cannot be represented
1172 // exactly in binary. If we print 0.2 with a very large precision,
1173 // e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167".
1174 // On the other hand, if we set the precision too low, we lose
1175 // significant digits when printing numbers that actually need them.
1176 // It turns out there is no precision value that does the right thing
1177 // for all numbers.
1178 //
1179 // Our strategy is to first try printing with a precision that is never
1180 // over-precise, then parse the result with strtod() to see if it
1181 // matches. If not, we print again with a precision that will always
1182 // give a precise result, but may use more digits than necessary.
1183 //
1184 // An arguably better strategy would be to use the algorithm described
1185 // in "How to Print Floating-Point Numbers Accurately" by Steele &
1186 // White, e.g. as implemented by David M. Gay's dtoa(). It turns out,
1187 // however, that the following implementation is about as fast as
1188 // DMG's code. Furthermore, DMG's code locks mutexes, which means it
1189 // will not scale well on multi-core machines. DMG's code is slightly
1190 // more accurate (in that it will never use more digits than
1191 // necessary), but this is probably irrelevant for most users.
1192 //
1193 // Rob Pike and Ken Thompson also have an implementation of dtoa() in
1194 // third_party/fmt/fltfmt.cc. Their implementation is similar to this
1195 // one in that it makes guesses and then uses strtod() to check them.
1196 // Their implementation is faster because they use their own code to
1197 // generate the digits in the first place rather than use snprintf(),
1198 // thus avoiding format string parsing overhead. However, this makes
1199 // it considerably more complicated than the following implementation,
1200 // and it is embedded in a larger library. If speed turns out to be
1201 // an issue, we could re-implement this in terms of their
1202 // implementation.
1203 // ----------------------------------------------------------------------
1204
SimpleDtoa(double value)1205 string SimpleDtoa(double value) {
1206 char buffer[kDoubleToBufferSize];
1207 return DoubleToBuffer(value, buffer);
1208 }
1209
SimpleFtoa(float value)1210 string SimpleFtoa(float value) {
1211 char buffer[kFloatToBufferSize];
1212 return FloatToBuffer(value, buffer);
1213 }
1214
IsValidFloatChar(char c)1215 static inline bool IsValidFloatChar(char c) {
1216 return ('0' <= c && c <= '9') ||
1217 c == 'e' || c == 'E' ||
1218 c == '+' || c == '-';
1219 }
1220
DelocalizeRadix(char * buffer)1221 void DelocalizeRadix(char* buffer) {
1222 // Fast check: if the buffer has a normal decimal point, assume no
1223 // translation is needed.
1224 if (strchr(buffer, '.') != nullptr) return;
1225
1226 // Find the first unknown character.
1227 while (IsValidFloatChar(*buffer)) ++buffer;
1228
1229 if (*buffer == '\0') {
1230 // No radix character found.
1231 return;
1232 }
1233
1234 // We are now pointing at the locale-specific radix character. Replace it
1235 // with '.'.
1236 *buffer = '.';
1237 ++buffer;
1238
1239 if (!IsValidFloatChar(*buffer) && *buffer != '\0') {
1240 // It appears the radix was a multi-byte character. We need to remove the
1241 // extra bytes.
1242 char* target = buffer;
1243 do { ++buffer; } while (!IsValidFloatChar(*buffer) && *buffer != '\0');
1244 memmove(target, buffer, strlen(buffer) + 1);
1245 }
1246 }
1247
DoubleToBuffer(double value,char * buffer)1248 char* DoubleToBuffer(double value, char* buffer) {
1249 // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all
1250 // platforms these days. Just in case some system exists where DBL_DIG
1251 // is significantly larger -- and risks overflowing our buffer -- we have
1252 // this assert.
1253 GOOGLE_COMPILE_ASSERT(DBL_DIG < 20, DBL_DIG_is_too_big);
1254
1255 if (value == std::numeric_limits<double>::infinity()) {
1256 strcpy(buffer, "inf");
1257 return buffer;
1258 } else if (value == -std::numeric_limits<double>::infinity()) {
1259 strcpy(buffer, "-inf");
1260 return buffer;
1261 } else if (std::isnan(value)) {
1262 strcpy(buffer, "nan");
1263 return buffer;
1264 }
1265
1266 int snprintf_result =
1267 snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value);
1268
1269 // The snprintf should never overflow because the buffer is significantly
1270 // larger than the precision we asked for.
1271 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1272
1273 // We need to make parsed_value volatile in order to force the compiler to
1274 // write it out to the stack. Otherwise, it may keep the value in a
1275 // register, and if it does that, it may keep it as a long double instead
1276 // of a double. This long double may have extra bits that make it compare
1277 // unequal to "value" even though it would be exactly equal if it were
1278 // truncated to a double.
1279 volatile double parsed_value = internal::NoLocaleStrtod(buffer, nullptr);
1280 if (parsed_value != value) {
1281 int snprintf_result =
1282 snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+2, value);
1283
1284 // Should never overflow; see above.
1285 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1286 }
1287
1288 DelocalizeRadix(buffer);
1289 return buffer;
1290 }
1291
memcasecmp(const char * s1,const char * s2,size_t len)1292 static int memcasecmp(const char *s1, const char *s2, size_t len) {
1293 const unsigned char *us1 = reinterpret_cast<const unsigned char *>(s1);
1294 const unsigned char *us2 = reinterpret_cast<const unsigned char *>(s2);
1295
1296 for ( int i = 0; i < len; i++ ) {
1297 const int diff =
1298 static_cast<int>(static_cast<unsigned char>(ascii_tolower(us1[i]))) -
1299 static_cast<int>(static_cast<unsigned char>(ascii_tolower(us2[i])));
1300 if (diff != 0) return diff;
1301 }
1302 return 0;
1303 }
1304
CaseEqual(StringPiece s1,StringPiece s2)1305 inline bool CaseEqual(StringPiece s1, StringPiece s2) {
1306 if (s1.size() != s2.size()) return false;
1307 return memcasecmp(s1.data(), s2.data(), s1.size()) == 0;
1308 }
1309
safe_strtob(StringPiece str,bool * value)1310 bool safe_strtob(StringPiece str, bool* value) {
1311 GOOGLE_CHECK(value != nullptr) << "nullptr output boolean given.";
1312 if (CaseEqual(str, "true") || CaseEqual(str, "t") ||
1313 CaseEqual(str, "yes") || CaseEqual(str, "y") ||
1314 CaseEqual(str, "1")) {
1315 *value = true;
1316 return true;
1317 }
1318 if (CaseEqual(str, "false") || CaseEqual(str, "f") ||
1319 CaseEqual(str, "no") || CaseEqual(str, "n") ||
1320 CaseEqual(str, "0")) {
1321 *value = false;
1322 return true;
1323 }
1324 return false;
1325 }
1326
safe_strtof(const char * str,float * value)1327 bool safe_strtof(const char* str, float* value) {
1328 char* endptr;
1329 errno = 0; // errno only gets set on errors
1330 #if defined(_WIN32) || defined (__hpux) // has no strtof()
1331 *value = internal::NoLocaleStrtod(str, &endptr);
1332 #else
1333 *value = strtof(str, &endptr);
1334 #endif
1335 return *str != 0 && *endptr == 0 && errno == 0;
1336 }
1337
safe_strtod(const char * str,double * value)1338 bool safe_strtod(const char* str, double* value) {
1339 char* endptr;
1340 *value = internal::NoLocaleStrtod(str, &endptr);
1341 if (endptr != str) {
1342 while (ascii_isspace(*endptr)) ++endptr;
1343 }
1344 // Ignore range errors from strtod. The values it
1345 // returns on underflow and overflow are the right
1346 // fallback in a robust setting.
1347 return *str != '\0' && *endptr == '\0';
1348 }
1349
safe_strto32(const string & str,int32 * value)1350 bool safe_strto32(const string& str, int32* value) {
1351 return safe_int_internal(str, value);
1352 }
1353
safe_strtou32(const string & str,uint32 * value)1354 bool safe_strtou32(const string& str, uint32* value) {
1355 return safe_uint_internal(str, value);
1356 }
1357
safe_strto64(const string & str,int64 * value)1358 bool safe_strto64(const string& str, int64* value) {
1359 return safe_int_internal(str, value);
1360 }
1361
safe_strtou64(const string & str,uint64 * value)1362 bool safe_strtou64(const string& str, uint64* value) {
1363 return safe_uint_internal(str, value);
1364 }
1365
FloatToBuffer(float value,char * buffer)1366 char* FloatToBuffer(float value, char* buffer) {
1367 // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
1368 // platforms these days. Just in case some system exists where FLT_DIG
1369 // is significantly larger -- and risks overflowing our buffer -- we have
1370 // this assert.
1371 GOOGLE_COMPILE_ASSERT(FLT_DIG < 10, FLT_DIG_is_too_big);
1372
1373 if (value == std::numeric_limits<double>::infinity()) {
1374 strcpy(buffer, "inf");
1375 return buffer;
1376 } else if (value == -std::numeric_limits<double>::infinity()) {
1377 strcpy(buffer, "-inf");
1378 return buffer;
1379 } else if (std::isnan(value)) {
1380 strcpy(buffer, "nan");
1381 return buffer;
1382 }
1383
1384 int snprintf_result =
1385 snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value);
1386
1387 // The snprintf should never overflow because the buffer is significantly
1388 // larger than the precision we asked for.
1389 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1390
1391 float parsed_value;
1392 if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) {
1393 int snprintf_result =
1394 snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+3, value);
1395
1396 // Should never overflow; see above.
1397 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1398 }
1399
1400 DelocalizeRadix(buffer);
1401 return buffer;
1402 }
1403
1404 namespace strings {
1405
AlphaNum(strings::Hex hex)1406 AlphaNum::AlphaNum(strings::Hex hex) {
1407 char *const end = &digits[kFastToBufferSize];
1408 char *writer = end;
1409 uint64 value = hex.value;
1410 uint64 width = hex.spec;
1411 // We accomplish minimum width by OR'ing in 0x10000 to the user's value,
1412 // where 0x10000 is the smallest hex number that is as wide as the user
1413 // asked for.
1414 uint64 mask = ((static_cast<uint64>(1) << (width - 1) * 4)) | value;
1415 static const char hexdigits[] = "0123456789abcdef";
1416 do {
1417 *--writer = hexdigits[value & 0xF];
1418 value >>= 4;
1419 mask >>= 4;
1420 } while (mask != 0);
1421 piece_data_ = writer;
1422 piece_size_ = end - writer;
1423 }
1424
1425 } // namespace strings
1426
1427 // ----------------------------------------------------------------------
1428 // StrCat()
1429 // This merges the given strings or integers, with no delimiter. This
1430 // is designed to be the fastest possible way to construct a string out
1431 // of a mix of raw C strings, C++ strings, and integer values.
1432 // ----------------------------------------------------------------------
1433
1434 // Append is merely a version of memcpy that returns the address of the byte
1435 // after the area just overwritten. It comes in multiple flavors to minimize
1436 // call overhead.
Append1(char * out,const AlphaNum & x)1437 static char *Append1(char *out, const AlphaNum &x) {
1438 memcpy(out, x.data(), x.size());
1439 return out + x.size();
1440 }
1441
Append2(char * out,const AlphaNum & x1,const AlphaNum & x2)1442 static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) {
1443 memcpy(out, x1.data(), x1.size());
1444 out += x1.size();
1445
1446 memcpy(out, x2.data(), x2.size());
1447 return out + x2.size();
1448 }
1449
Append4(char * out,const AlphaNum & x1,const AlphaNum & x2,const AlphaNum & x3,const AlphaNum & x4)1450 static char *Append4(char *out,
1451 const AlphaNum &x1, const AlphaNum &x2,
1452 const AlphaNum &x3, const AlphaNum &x4) {
1453 memcpy(out, x1.data(), x1.size());
1454 out += x1.size();
1455
1456 memcpy(out, x2.data(), x2.size());
1457 out += x2.size();
1458
1459 memcpy(out, x3.data(), x3.size());
1460 out += x3.size();
1461
1462 memcpy(out, x4.data(), x4.size());
1463 return out + x4.size();
1464 }
1465
StrCat(const AlphaNum & a,const AlphaNum & b)1466 string StrCat(const AlphaNum &a, const AlphaNum &b) {
1467 string result;
1468 result.resize(a.size() + b.size());
1469 char *const begin = &*result.begin();
1470 char *out = Append2(begin, a, b);
1471 GOOGLE_DCHECK_EQ(out, begin + result.size());
1472 return result;
1473 }
1474
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1475 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1476 string result;
1477 result.resize(a.size() + b.size() + c.size());
1478 char *const begin = &*result.begin();
1479 char *out = Append2(begin, a, b);
1480 out = Append1(out, c);
1481 GOOGLE_DCHECK_EQ(out, begin + result.size());
1482 return result;
1483 }
1484
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1485 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1486 const AlphaNum &d) {
1487 string result;
1488 result.resize(a.size() + b.size() + c.size() + d.size());
1489 char *const begin = &*result.begin();
1490 char *out = Append4(begin, a, b, c, d);
1491 GOOGLE_DCHECK_EQ(out, begin + result.size());
1492 return result;
1493 }
1494
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e)1495 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1496 const AlphaNum &d, const AlphaNum &e) {
1497 string result;
1498 result.resize(a.size() + b.size() + c.size() + d.size() + e.size());
1499 char *const begin = &*result.begin();
1500 char *out = Append4(begin, a, b, c, d);
1501 out = Append1(out, e);
1502 GOOGLE_DCHECK_EQ(out, begin + result.size());
1503 return result;
1504 }
1505
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f)1506 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1507 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) {
1508 string result;
1509 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1510 f.size());
1511 char *const begin = &*result.begin();
1512 char *out = Append4(begin, a, b, c, d);
1513 out = Append2(out, e, f);
1514 GOOGLE_DCHECK_EQ(out, begin + result.size());
1515 return result;
1516 }
1517
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g)1518 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1519 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1520 const AlphaNum &g) {
1521 string result;
1522 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1523 f.size() + g.size());
1524 char *const begin = &*result.begin();
1525 char *out = Append4(begin, a, b, c, d);
1526 out = Append2(out, e, f);
1527 out = Append1(out, g);
1528 GOOGLE_DCHECK_EQ(out, begin + result.size());
1529 return result;
1530 }
1531
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h)1532 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1533 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1534 const AlphaNum &g, const AlphaNum &h) {
1535 string result;
1536 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1537 f.size() + g.size() + h.size());
1538 char *const begin = &*result.begin();
1539 char *out = Append4(begin, a, b, c, d);
1540 out = Append4(out, e, f, g, h);
1541 GOOGLE_DCHECK_EQ(out, begin + result.size());
1542 return result;
1543 }
1544
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h,const AlphaNum & i)1545 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1546 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1547 const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) {
1548 string result;
1549 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1550 f.size() + g.size() + h.size() + i.size());
1551 char *const begin = &*result.begin();
1552 char *out = Append4(begin, a, b, c, d);
1553 out = Append4(out, e, f, g, h);
1554 out = Append1(out, i);
1555 GOOGLE_DCHECK_EQ(out, begin + result.size());
1556 return result;
1557 }
1558
1559 // It's possible to call StrAppend with a char * pointer that is partway into
1560 // the string we're appending to. However the results of this are random.
1561 // Therefore, check for this in debug mode. Use unsigned math so we only have
1562 // to do one comparison.
1563 #define GOOGLE_DCHECK_NO_OVERLAP(dest, src) \
1564 GOOGLE_DCHECK_GT(uintptr_t((src).data() - (dest).data()), \
1565 uintptr_t((dest).size()))
1566
StrAppend(string * result,const AlphaNum & a)1567 void StrAppend(string *result, const AlphaNum &a) {
1568 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1569 result->append(a.data(), a.size());
1570 }
1571
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b)1572 void StrAppend(string *result, const AlphaNum &a, const AlphaNum &b) {
1573 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1574 GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1575 string::size_type old_size = result->size();
1576 result->resize(old_size + a.size() + b.size());
1577 char *const begin = &*result->begin();
1578 char *out = Append2(begin + old_size, a, b);
1579 GOOGLE_DCHECK_EQ(out, begin + result->size());
1580 }
1581
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1582 void StrAppend(string *result,
1583 const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1584 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1585 GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1586 GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1587 string::size_type old_size = result->size();
1588 result->resize(old_size + a.size() + b.size() + c.size());
1589 char *const begin = &*result->begin();
1590 char *out = Append2(begin + old_size, a, b);
1591 out = Append1(out, c);
1592 GOOGLE_DCHECK_EQ(out, begin + result->size());
1593 }
1594
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1595 void StrAppend(string *result,
1596 const AlphaNum &a, const AlphaNum &b,
1597 const AlphaNum &c, const AlphaNum &d) {
1598 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1599 GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1600 GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1601 GOOGLE_DCHECK_NO_OVERLAP(*result, d);
1602 string::size_type old_size = result->size();
1603 result->resize(old_size + a.size() + b.size() + c.size() + d.size());
1604 char *const begin = &*result->begin();
1605 char *out = Append4(begin + old_size, a, b, c, d);
1606 GOOGLE_DCHECK_EQ(out, begin + result->size());
1607 }
1608
GlobalReplaceSubstring(const string & substring,const string & replacement,string * s)1609 int GlobalReplaceSubstring(const string& substring,
1610 const string& replacement,
1611 string* s) {
1612 GOOGLE_CHECK(s != nullptr);
1613 if (s->empty() || substring.empty())
1614 return 0;
1615 string tmp;
1616 int num_replacements = 0;
1617 int pos = 0;
1618 for (int match_pos = s->find(substring.data(), pos, substring.length());
1619 match_pos != string::npos;
1620 pos = match_pos + substring.length(),
1621 match_pos = s->find(substring.data(), pos, substring.length())) {
1622 ++num_replacements;
1623 // Append the original content before the match.
1624 tmp.append(*s, pos, match_pos - pos);
1625 // Append the replacement for the match.
1626 tmp.append(replacement.begin(), replacement.end());
1627 }
1628 // Append the content after the last match. If no replacements were made, the
1629 // original string is left untouched.
1630 if (num_replacements > 0) {
1631 tmp.append(*s, pos, s->length() - pos);
1632 s->swap(tmp);
1633 }
1634 return num_replacements;
1635 }
1636
CalculateBase64EscapedLen(int input_len,bool do_padding)1637 int CalculateBase64EscapedLen(int input_len, bool do_padding) {
1638 // Base64 encodes three bytes of input at a time. If the input is not
1639 // divisible by three, we pad as appropriate.
1640 //
1641 // (from http://tools.ietf.org/html/rfc3548)
1642 // Special processing is performed if fewer than 24 bits are available
1643 // at the end of the data being encoded. A full encoding quantum is
1644 // always completed at the end of a quantity. When fewer than 24 input
1645 // bits are available in an input group, zero bits are added (on the
1646 // right) to form an integral number of 6-bit groups. Padding at the
1647 // end of the data is performed using the '=' character. Since all base
1648 // 64 input is an integral number of octets, only the following cases
1649 // can arise:
1650
1651
1652 // Base64 encodes each three bytes of input into four bytes of output.
1653 int len = (input_len / 3) * 4;
1654
1655 if (input_len % 3 == 0) {
1656 // (from http://tools.ietf.org/html/rfc3548)
1657 // (1) the final quantum of encoding input is an integral multiple of 24
1658 // bits; here, the final unit of encoded output will be an integral
1659 // multiple of 4 characters with no "=" padding,
1660 } else if (input_len % 3 == 1) {
1661 // (from http://tools.ietf.org/html/rfc3548)
1662 // (2) the final quantum of encoding input is exactly 8 bits; here, the
1663 // final unit of encoded output will be two characters followed by two
1664 // "=" padding characters, or
1665 len += 2;
1666 if (do_padding) {
1667 len += 2;
1668 }
1669 } else { // (input_len % 3 == 2)
1670 // (from http://tools.ietf.org/html/rfc3548)
1671 // (3) the final quantum of encoding input is exactly 16 bits; here, the
1672 // final unit of encoded output will be three characters followed by one
1673 // "=" padding character.
1674 len += 3;
1675 if (do_padding) {
1676 len += 1;
1677 }
1678 }
1679
1680 assert(len >= input_len); // make sure we didn't overflow
1681 return len;
1682 }
1683
1684 // Base64Escape does padding, so this calculation includes padding.
CalculateBase64EscapedLen(int input_len)1685 int CalculateBase64EscapedLen(int input_len) {
1686 return CalculateBase64EscapedLen(input_len, true);
1687 }
1688
1689 // ----------------------------------------------------------------------
1690 // int Base64Unescape() - base64 decoder
1691 // int Base64Escape() - base64 encoder
1692 // int WebSafeBase64Unescape() - Google's variation of base64 decoder
1693 // int WebSafeBase64Escape() - Google's variation of base64 encoder
1694 //
1695 // Check out
1696 // http://tools.ietf.org/html/rfc2045 for formal description, but what we
1697 // care about is that...
1698 // Take the encoded stuff in groups of 4 characters and turn each
1699 // character into a code 0 to 63 thus:
1700 // A-Z map to 0 to 25
1701 // a-z map to 26 to 51
1702 // 0-9 map to 52 to 61
1703 // +(- for WebSafe) maps to 62
1704 // /(_ for WebSafe) maps to 63
1705 // There will be four numbers, all less than 64 which can be represented
1706 // by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
1707 // Arrange the 6 digit binary numbers into three bytes as such:
1708 // aaaaaabb bbbbcccc ccdddddd
1709 // Equals signs (one or two) are used at the end of the encoded block to
1710 // indicate that the text was not an integer multiple of three bytes long.
1711 // ----------------------------------------------------------------------
1712
Base64UnescapeInternal(const char * src_param,int szsrc,char * dest,int szdest,const signed char * unbase64)1713 int Base64UnescapeInternal(const char *src_param, int szsrc,
1714 char *dest, int szdest,
1715 const signed char* unbase64) {
1716 static const char kPad64Equals = '=';
1717 static const char kPad64Dot = '.';
1718
1719 int decode = 0;
1720 int destidx = 0;
1721 int state = 0;
1722 unsigned int ch = 0;
1723 unsigned int temp = 0;
1724
1725 // If "char" is signed by default, using *src as an array index results in
1726 // accessing negative array elements. Treat the input as a pointer to
1727 // unsigned char to avoid this.
1728 const unsigned char *src = reinterpret_cast<const unsigned char*>(src_param);
1729
1730 // The GET_INPUT macro gets the next input character, skipping
1731 // over any whitespace, and stopping when we reach the end of the
1732 // string or when we read any non-data character. The arguments are
1733 // an arbitrary identifier (used as a label for goto) and the number
1734 // of data bytes that must remain in the input to avoid aborting the
1735 // loop.
1736 #define GET_INPUT(label, remain) \
1737 label: \
1738 --szsrc; \
1739 ch = *src++; \
1740 decode = unbase64[ch]; \
1741 if (decode < 0) { \
1742 if (ascii_isspace(ch) && szsrc >= remain) \
1743 goto label; \
1744 state = 4 - remain; \
1745 break; \
1746 }
1747
1748 // if dest is null, we're just checking to see if it's legal input
1749 // rather than producing output. (I suspect this could just be done
1750 // with a regexp...). We duplicate the loop so this test can be
1751 // outside it instead of in every iteration.
1752
1753 if (dest) {
1754 // This loop consumes 4 input bytes and produces 3 output bytes
1755 // per iteration. We can't know at the start that there is enough
1756 // data left in the string for a full iteration, so the loop may
1757 // break out in the middle; if so 'state' will be set to the
1758 // number of input bytes read.
1759
1760 while (szsrc >= 4) {
1761 // We'll start by optimistically assuming that the next four
1762 // bytes of the string (src[0..3]) are four good data bytes
1763 // (that is, no nulls, whitespace, padding chars, or illegal
1764 // chars). We need to test src[0..2] for nulls individually
1765 // before constructing temp to preserve the property that we
1766 // never read past a null in the string (no matter how long
1767 // szsrc claims the string is).
1768
1769 if (!src[0] || !src[1] || !src[2] ||
1770 (temp = ((unsigned(unbase64[src[0]]) << 18) |
1771 (unsigned(unbase64[src[1]]) << 12) |
1772 (unsigned(unbase64[src[2]]) << 6) |
1773 (unsigned(unbase64[src[3]])))) & 0x80000000) {
1774 // Iff any of those four characters was bad (null, illegal,
1775 // whitespace, padding), then temp's high bit will be set
1776 // (because unbase64[] is -1 for all bad characters).
1777 //
1778 // We'll back up and resort to the slower decoder, which knows
1779 // how to handle those cases.
1780
1781 GET_INPUT(first, 4);
1782 temp = decode;
1783 GET_INPUT(second, 3);
1784 temp = (temp << 6) | decode;
1785 GET_INPUT(third, 2);
1786 temp = (temp << 6) | decode;
1787 GET_INPUT(fourth, 1);
1788 temp = (temp << 6) | decode;
1789 } else {
1790 // We really did have four good data bytes, so advance four
1791 // characters in the string.
1792
1793 szsrc -= 4;
1794 src += 4;
1795 decode = -1;
1796 ch = '\0';
1797 }
1798
1799 // temp has 24 bits of input, so write that out as three bytes.
1800
1801 if (destidx+3 > szdest) return -1;
1802 dest[destidx+2] = temp;
1803 temp >>= 8;
1804 dest[destidx+1] = temp;
1805 temp >>= 8;
1806 dest[destidx] = temp;
1807 destidx += 3;
1808 }
1809 } else {
1810 while (szsrc >= 4) {
1811 if (!src[0] || !src[1] || !src[2] ||
1812 (temp = ((unsigned(unbase64[src[0]]) << 18) |
1813 (unsigned(unbase64[src[1]]) << 12) |
1814 (unsigned(unbase64[src[2]]) << 6) |
1815 (unsigned(unbase64[src[3]])))) & 0x80000000) {
1816 GET_INPUT(first_no_dest, 4);
1817 GET_INPUT(second_no_dest, 3);
1818 GET_INPUT(third_no_dest, 2);
1819 GET_INPUT(fourth_no_dest, 1);
1820 } else {
1821 szsrc -= 4;
1822 src += 4;
1823 decode = -1;
1824 ch = '\0';
1825 }
1826 destidx += 3;
1827 }
1828 }
1829
1830 #undef GET_INPUT
1831
1832 // if the loop terminated because we read a bad character, return
1833 // now.
1834 if (decode < 0 && ch != '\0' &&
1835 ch != kPad64Equals && ch != kPad64Dot && !ascii_isspace(ch))
1836 return -1;
1837
1838 if (ch == kPad64Equals || ch == kPad64Dot) {
1839 // if we stopped by hitting an '=' or '.', un-read that character -- we'll
1840 // look at it again when we count to check for the proper number of
1841 // equals signs at the end.
1842 ++szsrc;
1843 --src;
1844 } else {
1845 // This loop consumes 1 input byte per iteration. It's used to
1846 // clean up the 0-3 input bytes remaining when the first, faster
1847 // loop finishes. 'temp' contains the data from 'state' input
1848 // characters read by the first loop.
1849 while (szsrc > 0) {
1850 --szsrc;
1851 ch = *src++;
1852 decode = unbase64[ch];
1853 if (decode < 0) {
1854 if (ascii_isspace(ch)) {
1855 continue;
1856 } else if (ch == '\0') {
1857 break;
1858 } else if (ch == kPad64Equals || ch == kPad64Dot) {
1859 // back up one character; we'll read it again when we check
1860 // for the correct number of pad characters at the end.
1861 ++szsrc;
1862 --src;
1863 break;
1864 } else {
1865 return -1;
1866 }
1867 }
1868
1869 // Each input character gives us six bits of output.
1870 temp = (temp << 6) | decode;
1871 ++state;
1872 if (state == 4) {
1873 // If we've accumulated 24 bits of output, write that out as
1874 // three bytes.
1875 if (dest) {
1876 if (destidx+3 > szdest) return -1;
1877 dest[destidx+2] = temp;
1878 temp >>= 8;
1879 dest[destidx+1] = temp;
1880 temp >>= 8;
1881 dest[destidx] = temp;
1882 }
1883 destidx += 3;
1884 state = 0;
1885 temp = 0;
1886 }
1887 }
1888 }
1889
1890 // Process the leftover data contained in 'temp' at the end of the input.
1891 int expected_equals = 0;
1892 switch (state) {
1893 case 0:
1894 // Nothing left over; output is a multiple of 3 bytes.
1895 break;
1896
1897 case 1:
1898 // Bad input; we have 6 bits left over.
1899 return -1;
1900
1901 case 2:
1902 // Produce one more output byte from the 12 input bits we have left.
1903 if (dest) {
1904 if (destidx+1 > szdest) return -1;
1905 temp >>= 4;
1906 dest[destidx] = temp;
1907 }
1908 ++destidx;
1909 expected_equals = 2;
1910 break;
1911
1912 case 3:
1913 // Produce two more output bytes from the 18 input bits we have left.
1914 if (dest) {
1915 if (destidx+2 > szdest) return -1;
1916 temp >>= 2;
1917 dest[destidx+1] = temp;
1918 temp >>= 8;
1919 dest[destidx] = temp;
1920 }
1921 destidx += 2;
1922 expected_equals = 1;
1923 break;
1924
1925 default:
1926 // state should have no other values at this point.
1927 GOOGLE_LOG(FATAL) << "This can't happen; base64 decoder state = " << state;
1928 }
1929
1930 // The remainder of the string should be all whitespace, mixed with
1931 // exactly 0 equals signs, or exactly 'expected_equals' equals
1932 // signs. (Always accepting 0 equals signs is a google extension
1933 // not covered in the RFC, as is accepting dot as the pad character.)
1934
1935 int equals = 0;
1936 while (szsrc > 0 && *src) {
1937 if (*src == kPad64Equals || *src == kPad64Dot)
1938 ++equals;
1939 else if (!ascii_isspace(*src))
1940 return -1;
1941 --szsrc;
1942 ++src;
1943 }
1944
1945 return (equals == 0 || equals == expected_equals) ? destidx : -1;
1946 }
1947
1948 // The arrays below were generated by the following code
1949 // #include <sys/time.h>
1950 // #include <stdlib.h>
1951 // #include <string.h>
1952 // #include <stdio.h>
1953 // main()
1954 // {
1955 // static const char Base64[] =
1956 // "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
1957 // const char *pos;
1958 // int idx, i, j;
1959 // printf(" ");
1960 // for (i = 0; i < 255; i += 8) {
1961 // for (j = i; j < i + 8; j++) {
1962 // pos = strchr(Base64, j);
1963 // if ((pos == nullptr) || (j == 0))
1964 // idx = -1;
1965 // else
1966 // idx = pos - Base64;
1967 // if (idx == -1)
1968 // printf(" %2d, ", idx);
1969 // else
1970 // printf(" %2d/""*%c*""/,", idx, j);
1971 // }
1972 // printf("\n ");
1973 // }
1974 // }
1975 //
1976 // where the value of "Base64[]" was replaced by one of the base-64 conversion
1977 // tables from the functions below.
1978 static const signed char kUnBase64[] = {
1979 -1, -1, -1, -1, -1, -1, -1, -1,
1980 -1, -1, -1, -1, -1, -1, -1, -1,
1981 -1, -1, -1, -1, -1, -1, -1, -1,
1982 -1, -1, -1, -1, -1, -1, -1, -1,
1983 -1, -1, -1, -1, -1, -1, -1, -1,
1984 -1, -1, -1, 62/*+*/, -1, -1, -1, 63/*/ */,
1985 52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
1986 60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
1987 -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
1988 7/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
1989 15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
1990 23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, -1,
1991 -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
1992 33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
1993 41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
1994 49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
1995 -1, -1, -1, -1, -1, -1, -1, -1,
1996 -1, -1, -1, -1, -1, -1, -1, -1,
1997 -1, -1, -1, -1, -1, -1, -1, -1,
1998 -1, -1, -1, -1, -1, -1, -1, -1,
1999 -1, -1, -1, -1, -1, -1, -1, -1,
2000 -1, -1, -1, -1, -1, -1, -1, -1,
2001 -1, -1, -1, -1, -1, -1, -1, -1,
2002 -1, -1, -1, -1, -1, -1, -1, -1,
2003 -1, -1, -1, -1, -1, -1, -1, -1,
2004 -1, -1, -1, -1, -1, -1, -1, -1,
2005 -1, -1, -1, -1, -1, -1, -1, -1,
2006 -1, -1, -1, -1, -1, -1, -1, -1,
2007 -1, -1, -1, -1, -1, -1, -1, -1,
2008 -1, -1, -1, -1, -1, -1, -1, -1,
2009 -1, -1, -1, -1, -1, -1, -1, -1,
2010 -1, -1, -1, -1, -1, -1, -1, -1
2011 };
2012 static const signed char kUnWebSafeBase64[] = {
2013 -1, -1, -1, -1, -1, -1, -1, -1,
2014 -1, -1, -1, -1, -1, -1, -1, -1,
2015 -1, -1, -1, -1, -1, -1, -1, -1,
2016 -1, -1, -1, -1, -1, -1, -1, -1,
2017 -1, -1, -1, -1, -1, -1, -1, -1,
2018 -1, -1, -1, -1, -1, 62/*-*/, -1, -1,
2019 52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
2020 60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
2021 -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
2022 7/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
2023 15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
2024 23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, 63/*_*/,
2025 -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
2026 33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
2027 41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
2028 49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
2029 -1, -1, -1, -1, -1, -1, -1, -1,
2030 -1, -1, -1, -1, -1, -1, -1, -1,
2031 -1, -1, -1, -1, -1, -1, -1, -1,
2032 -1, -1, -1, -1, -1, -1, -1, -1,
2033 -1, -1, -1, -1, -1, -1, -1, -1,
2034 -1, -1, -1, -1, -1, -1, -1, -1,
2035 -1, -1, -1, -1, -1, -1, -1, -1,
2036 -1, -1, -1, -1, -1, -1, -1, -1,
2037 -1, -1, -1, -1, -1, -1, -1, -1,
2038 -1, -1, -1, -1, -1, -1, -1, -1,
2039 -1, -1, -1, -1, -1, -1, -1, -1,
2040 -1, -1, -1, -1, -1, -1, -1, -1,
2041 -1, -1, -1, -1, -1, -1, -1, -1,
2042 -1, -1, -1, -1, -1, -1, -1, -1,
2043 -1, -1, -1, -1, -1, -1, -1, -1,
2044 -1, -1, -1, -1, -1, -1, -1, -1
2045 };
2046
WebSafeBase64Unescape(const char * src,int szsrc,char * dest,int szdest)2047 int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) {
2048 return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64);
2049 }
2050
Base64UnescapeInternal(const char * src,int slen,string * dest,const signed char * unbase64)2051 static bool Base64UnescapeInternal(const char* src, int slen, string* dest,
2052 const signed char* unbase64) {
2053 // Determine the size of the output string. Base64 encodes every 3 bytes into
2054 // 4 characters. any leftover chars are added directly for good measure.
2055 // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548
2056 const int dest_len = 3 * (slen / 4) + (slen % 4);
2057
2058 dest->resize(dest_len);
2059
2060 // We are getting the destination buffer by getting the beginning of the
2061 // string and converting it into a char *.
2062 const int len = Base64UnescapeInternal(src, slen, string_as_array(dest),
2063 dest_len, unbase64);
2064 if (len < 0) {
2065 dest->clear();
2066 return false;
2067 }
2068
2069 // could be shorter if there was padding
2070 GOOGLE_DCHECK_LE(len, dest_len);
2071 dest->erase(len);
2072
2073 return true;
2074 }
2075
Base64Unescape(StringPiece src,string * dest)2076 bool Base64Unescape(StringPiece src, string* dest) {
2077 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
2078 }
2079
WebSafeBase64Unescape(StringPiece src,string * dest)2080 bool WebSafeBase64Unescape(StringPiece src, string* dest) {
2081 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
2082 }
2083
Base64EscapeInternal(const unsigned char * src,int szsrc,char * dest,int szdest,const char * base64,bool do_padding)2084 int Base64EscapeInternal(const unsigned char *src, int szsrc,
2085 char *dest, int szdest, const char *base64,
2086 bool do_padding) {
2087 static const char kPad64 = '=';
2088
2089 if (szsrc <= 0) return 0;
2090
2091 if (szsrc * 4 > szdest * 3) return 0;
2092
2093 char *cur_dest = dest;
2094 const unsigned char *cur_src = src;
2095
2096 char *limit_dest = dest + szdest;
2097 const unsigned char *limit_src = src + szsrc;
2098
2099 // Three bytes of data encodes to four characters of cyphertext.
2100 // So we can pump through three-byte chunks atomically.
2101 while (cur_src < limit_src - 3) { // keep going as long as we have >= 32 bits
2102 uint32 in = BigEndian::Load32(cur_src) >> 8;
2103
2104 cur_dest[0] = base64[in >> 18];
2105 in &= 0x3FFFF;
2106 cur_dest[1] = base64[in >> 12];
2107 in &= 0xFFF;
2108 cur_dest[2] = base64[in >> 6];
2109 in &= 0x3F;
2110 cur_dest[3] = base64[in];
2111
2112 cur_dest += 4;
2113 cur_src += 3;
2114 }
2115 // To save time, we didn't update szdest or szsrc in the loop. So do it now.
2116 szdest = limit_dest - cur_dest;
2117 szsrc = limit_src - cur_src;
2118
2119 /* now deal with the tail (<=3 bytes) */
2120 switch (szsrc) {
2121 case 0:
2122 // Nothing left; nothing more to do.
2123 break;
2124 case 1: {
2125 // One byte left: this encodes to two characters, and (optionally)
2126 // two pad characters to round out the four-character cypherblock.
2127 if ((szdest -= 2) < 0) return 0;
2128 uint32 in = cur_src[0];
2129 cur_dest[0] = base64[in >> 2];
2130 in &= 0x3;
2131 cur_dest[1] = base64[in << 4];
2132 cur_dest += 2;
2133 if (do_padding) {
2134 if ((szdest -= 2) < 0) return 0;
2135 cur_dest[0] = kPad64;
2136 cur_dest[1] = kPad64;
2137 cur_dest += 2;
2138 }
2139 break;
2140 }
2141 case 2: {
2142 // Two bytes left: this encodes to three characters, and (optionally)
2143 // one pad character to round out the four-character cypherblock.
2144 if ((szdest -= 3) < 0) return 0;
2145 uint32 in = BigEndian::Load16(cur_src);
2146 cur_dest[0] = base64[in >> 10];
2147 in &= 0x3FF;
2148 cur_dest[1] = base64[in >> 4];
2149 in &= 0x00F;
2150 cur_dest[2] = base64[in << 2];
2151 cur_dest += 3;
2152 if (do_padding) {
2153 if ((szdest -= 1) < 0) return 0;
2154 cur_dest[0] = kPad64;
2155 cur_dest += 1;
2156 }
2157 break;
2158 }
2159 case 3: {
2160 // Three bytes left: same as in the big loop above. We can't do this in
2161 // the loop because the loop above always reads 4 bytes, and the fourth
2162 // byte is past the end of the input.
2163 if ((szdest -= 4) < 0) return 0;
2164 uint32 in = (cur_src[0] << 16) + BigEndian::Load16(cur_src + 1);
2165 cur_dest[0] = base64[in >> 18];
2166 in &= 0x3FFFF;
2167 cur_dest[1] = base64[in >> 12];
2168 in &= 0xFFF;
2169 cur_dest[2] = base64[in >> 6];
2170 in &= 0x3F;
2171 cur_dest[3] = base64[in];
2172 cur_dest += 4;
2173 break;
2174 }
2175 default:
2176 // Should not be reached: blocks of 4 bytes are handled
2177 // in the while loop before this switch statement.
2178 GOOGLE_LOG(FATAL) << "Logic problem? szsrc = " << szsrc;
2179 break;
2180 }
2181 return (cur_dest - dest);
2182 }
2183
2184 static const char kBase64Chars[] =
2185 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
2186
2187 static const char kWebSafeBase64Chars[] =
2188 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
2189
Base64Escape(const unsigned char * src,int szsrc,char * dest,int szdest)2190 int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) {
2191 return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true);
2192 }
WebSafeBase64Escape(const unsigned char * src,int szsrc,char * dest,int szdest,bool do_padding)2193 int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest,
2194 int szdest, bool do_padding) {
2195 return Base64EscapeInternal(src, szsrc, dest, szdest,
2196 kWebSafeBase64Chars, do_padding);
2197 }
2198
Base64EscapeInternal(const unsigned char * src,int szsrc,string * dest,bool do_padding,const char * base64_chars)2199 void Base64EscapeInternal(const unsigned char* src, int szsrc,
2200 string* dest, bool do_padding,
2201 const char* base64_chars) {
2202 const int calc_escaped_size =
2203 CalculateBase64EscapedLen(szsrc, do_padding);
2204 dest->resize(calc_escaped_size);
2205 const int escaped_len = Base64EscapeInternal(src, szsrc,
2206 string_as_array(dest),
2207 dest->size(),
2208 base64_chars,
2209 do_padding);
2210 GOOGLE_DCHECK_EQ(calc_escaped_size, escaped_len);
2211 dest->erase(escaped_len);
2212 }
2213
Base64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2214 void Base64Escape(const unsigned char *src, int szsrc,
2215 string* dest, bool do_padding) {
2216 Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars);
2217 }
2218
WebSafeBase64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2219 void WebSafeBase64Escape(const unsigned char *src, int szsrc,
2220 string *dest, bool do_padding) {
2221 Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars);
2222 }
2223
Base64Escape(StringPiece src,string * dest)2224 void Base64Escape(StringPiece src, string* dest) {
2225 Base64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2226 src.size(), dest, true);
2227 }
2228
WebSafeBase64Escape(StringPiece src,string * dest)2229 void WebSafeBase64Escape(StringPiece src, string* dest) {
2230 WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2231 src.size(), dest, false);
2232 }
2233
WebSafeBase64EscapeWithPadding(StringPiece src,string * dest)2234 void WebSafeBase64EscapeWithPadding(StringPiece src, string* dest) {
2235 WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2236 src.size(), dest, true);
2237 }
2238
2239 // Helper to append a Unicode code point to a string as UTF8, without bringing
2240 // in any external dependencies.
EncodeAsUTF8Char(uint32 code_point,char * output)2241 int EncodeAsUTF8Char(uint32 code_point, char* output) {
2242 uint32 tmp = 0;
2243 int len = 0;
2244 if (code_point <= 0x7f) {
2245 tmp = code_point;
2246 len = 1;
2247 } else if (code_point <= 0x07ff) {
2248 tmp = 0x0000c080 |
2249 ((code_point & 0x07c0) << 2) |
2250 (code_point & 0x003f);
2251 len = 2;
2252 } else if (code_point <= 0xffff) {
2253 tmp = 0x00e08080 |
2254 ((code_point & 0xf000) << 4) |
2255 ((code_point & 0x0fc0) << 2) |
2256 (code_point & 0x003f);
2257 len = 3;
2258 } else {
2259 // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
2260 // normally only defined up to there as well.
2261 tmp = 0xf0808080 |
2262 ((code_point & 0x1c0000) << 6) |
2263 ((code_point & 0x03f000) << 4) |
2264 ((code_point & 0x000fc0) << 2) |
2265 (code_point & 0x003f);
2266 len = 4;
2267 }
2268 tmp = ghtonl(tmp);
2269 memcpy(output, reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
2270 return len;
2271 }
2272
2273 // Table of UTF-8 character lengths, based on first byte
2274 static const unsigned char kUTF8LenTbl[256] = {
2275 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2276 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2277 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2278 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2279
2280 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2281 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2282 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
2283 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
2284 };
2285
2286 // Return length of a single UTF-8 source character
UTF8FirstLetterNumBytes(const char * src,int len)2287 int UTF8FirstLetterNumBytes(const char* src, int len) {
2288 if (len == 0) {
2289 return 0;
2290 }
2291 return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
2292 }
2293
2294 // ----------------------------------------------------------------------
2295 // CleanStringLineEndings()
2296 // Clean up a multi-line string to conform to Unix line endings.
2297 // Reads from src and appends to dst, so usually dst should be empty.
2298 //
2299 // If there is no line ending at the end of a non-empty string, it can
2300 // be added automatically.
2301 //
2302 // Four different types of input are correctly handled:
2303 //
2304 // - Unix/Linux files: line ending is LF: pass through unchanged
2305 //
2306 // - DOS/Windows files: line ending is CRLF: convert to LF
2307 //
2308 // - Legacy Mac files: line ending is CR: convert to LF
2309 //
2310 // - Garbled files: random line endings: convert gracefully
2311 // lonely CR, lonely LF, CRLF: convert to LF
2312 //
2313 // @param src The multi-line string to convert
2314 // @param dst The converted string is appended to this string
2315 // @param auto_end_last_line Automatically terminate the last line
2316 //
2317 // Limitations:
2318 //
2319 // This does not do the right thing for CRCRLF files created by
2320 // broken programs that do another Unix->DOS conversion on files
2321 // that are already in CRLF format. For this, a two-pass approach
2322 // brute-force would be needed that
2323 //
2324 // (1) determines the presence of LF (first one is ok)
2325 // (2) if yes, removes any CR, else convert every CR to LF
2326
CleanStringLineEndings(const string & src,string * dst,bool auto_end_last_line)2327 void CleanStringLineEndings(const string &src, string *dst,
2328 bool auto_end_last_line) {
2329 if (dst->empty()) {
2330 dst->append(src);
2331 CleanStringLineEndings(dst, auto_end_last_line);
2332 } else {
2333 string tmp = src;
2334 CleanStringLineEndings(&tmp, auto_end_last_line);
2335 dst->append(tmp);
2336 }
2337 }
2338
CleanStringLineEndings(string * str,bool auto_end_last_line)2339 void CleanStringLineEndings(string *str, bool auto_end_last_line) {
2340 ptrdiff_t output_pos = 0;
2341 bool r_seen = false;
2342 ptrdiff_t len = str->size();
2343
2344 char *p = &(*str)[0];
2345
2346 for (ptrdiff_t input_pos = 0; input_pos < len;) {
2347 if (!r_seen && input_pos + 8 < len) {
2348 uint64_t v = GOOGLE_UNALIGNED_LOAD64(p + input_pos);
2349 // Loop over groups of 8 bytes at a time until we come across
2350 // a word that has a byte whose value is less than or equal to
2351 // '\r' (i.e. could contain a \n (0x0a) or a \r (0x0d) ).
2352 //
2353 // We use a has_less macro that quickly tests a whole 64-bit
2354 // word to see if any of the bytes has a value < N.
2355 //
2356 // For more details, see:
2357 // http://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
2358 #define has_less(x, n) (((x) - ~0ULL / 255 * (n)) & ~(x) & ~0ULL / 255 * 128)
2359 if (!has_less(v, '\r' + 1)) {
2360 #undef has_less
2361 // No byte in this word has a value that could be a \r or a \n
2362 if (output_pos != input_pos) {
2363 GOOGLE_UNALIGNED_STORE64(p + output_pos, v);
2364 }
2365 input_pos += 8;
2366 output_pos += 8;
2367 continue;
2368 }
2369 }
2370 string::const_reference in = p[input_pos];
2371 if (in == '\r') {
2372 if (r_seen) p[output_pos++] = '\n';
2373 r_seen = true;
2374 } else if (in == '\n') {
2375 if (input_pos != output_pos)
2376 p[output_pos++] = '\n';
2377 else
2378 output_pos++;
2379 r_seen = false;
2380 } else {
2381 if (r_seen) p[output_pos++] = '\n';
2382 r_seen = false;
2383 if (input_pos != output_pos)
2384 p[output_pos++] = in;
2385 else
2386 output_pos++;
2387 }
2388 input_pos++;
2389 }
2390 if (r_seen ||
2391 (auto_end_last_line && output_pos > 0 && p[output_pos - 1] != '\n')) {
2392 str->resize(output_pos + 1);
2393 str->operator[](output_pos) = '\n';
2394 } else if (output_pos < len) {
2395 str->resize(output_pos);
2396 }
2397 }
2398
2399 namespace internal {
2400
2401 // ----------------------------------------------------------------------
2402 // NoLocaleStrtod()
2403 // This code will make you cry.
2404 // ----------------------------------------------------------------------
2405
2406 namespace {
2407
2408 // Returns a string identical to *input except that the character pointed to
2409 // by radix_pos (which should be '.') is replaced with the locale-specific
2410 // radix character.
LocalizeRadix(const char * input,const char * radix_pos)2411 std::string LocalizeRadix(const char *input, const char *radix_pos) {
2412 // Determine the locale-specific radix character by calling sprintf() to
2413 // print the number 1.5, then stripping off the digits. As far as I can
2414 // tell, this is the only portable, thread-safe way to get the C library
2415 // to divuldge the locale's radix character. No, localeconv() is NOT
2416 // thread-safe.
2417 char temp[16];
2418 int size = snprintf(temp, sizeof(temp), "%.1f", 1.5);
2419 GOOGLE_CHECK_EQ(temp[0], '1');
2420 GOOGLE_CHECK_EQ(temp[size - 1], '5');
2421 GOOGLE_CHECK_LE(size, 6);
2422
2423 // Now replace the '.' in the input with it.
2424 std::string result;
2425 result.reserve(strlen(input) + size - 3);
2426 result.append(input, radix_pos);
2427 result.append(temp + 1, size - 2);
2428 result.append(radix_pos + 1);
2429 return result;
2430 }
2431
2432 } // namespace
2433
NoLocaleStrtod(const char * str,char ** endptr)2434 double NoLocaleStrtod(const char *str, char **endptr) {
2435 // We cannot simply set the locale to "C" temporarily with setlocale()
2436 // as this is not thread-safe. Instead, we try to parse in the current
2437 // locale first. If parsing stops at a '.' character, then this is a
2438 // pretty good hint that we're actually in some other locale in which
2439 // '.' is not the radix character.
2440
2441 char *temp_endptr;
2442 double result = strtod(str, &temp_endptr);
2443 if (endptr != NULL) *endptr = temp_endptr;
2444 if (*temp_endptr != '.') return result;
2445
2446 // Parsing halted on a '.'. Perhaps we're in a different locale? Let's
2447 // try to replace the '.' with a locale-specific radix character and
2448 // try again.
2449 std::string localized = LocalizeRadix(str, temp_endptr);
2450 const char *localized_cstr = localized.c_str();
2451 char *localized_endptr;
2452 result = strtod(localized_cstr, &localized_endptr);
2453 if ((localized_endptr - localized_cstr) > (temp_endptr - str)) {
2454 // This attempt got further, so replacing the decimal must have helped.
2455 // Update endptr to point at the right location.
2456 if (endptr != NULL) {
2457 // size_diff is non-zero if the localized radix has multiple bytes.
2458 int size_diff = localized.size() - strlen(str);
2459 // const_cast is necessary to match the strtod() interface.
2460 *endptr = const_cast<char *>(
2461 str + (localized_endptr - localized_cstr - size_diff));
2462 }
2463 }
2464
2465 return result;
2466 }
2467
2468 } // namespace internal
2469
2470 } // namespace protobuf
2471 } // namespace google
2472