/*
 * (C) 1999 Lars Knoll (knoll@kde.org)
 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2012 Apple Inc. All rights
 * reserved.
 * Copyright (C) 2007-2009 Torch Mobile, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public License
 * along with this library; see the file COPYING.LIB. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */
22
23 #include "third_party/blink/renderer/platform/wtf/text/wtf_string.h"
24
25 #include <locale.h>
26 #include <stdarg.h>
27 #include <algorithm>
28 #include "base/callback.h"
29 #include "base/strings/string_util.h"
30 #include "build/build_config.h"
31 #include "third_party/blink/renderer/platform/wtf/dtoa.h"
32 #include "third_party/blink/renderer/platform/wtf/math_extras.h"
33 #include "third_party/blink/renderer/platform/wtf/size_assertions.h"
34 #include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h"
35 #include "third_party/blink/renderer/platform/wtf/text/case_map.h"
36 #include "third_party/blink/renderer/platform/wtf/text/character_names.h"
37 #include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
38 #include "third_party/blink/renderer/platform/wtf/text/unicode.h"
39 #include "third_party/blink/renderer/platform/wtf/text/utf8.h"
40 #include "third_party/blink/renderer/platform/wtf/vector.h"
41
42 namespace WTF {
43
44 ASSERT_SIZE(String, void*);
45
46 // Construct a string with UTF-16 data.
String(const UChar * characters,unsigned length)47 String::String(const UChar* characters, unsigned length)
48 : impl_(characters ? StringImpl::Create(characters, length) : nullptr) {}
49
50 // Construct a string with UTF-16 data, from a null-terminated source.
String(const UChar * str)51 String::String(const UChar* str) {
52 if (!str)
53 return;
54 impl_ = StringImpl::Create(str, LengthOfNullTerminatedString(str));
55 }
56
57 // Construct a string with latin1 data.
String(const LChar * characters,unsigned length)58 String::String(const LChar* characters, unsigned length)
59 : impl_(characters ? StringImpl::Create(characters, length) : nullptr) {}
60
String(const char * characters,unsigned length)61 String::String(const char* characters, unsigned length)
62 : impl_(characters
63 ? StringImpl::Create(reinterpret_cast<const LChar*>(characters),
64 length)
65 : nullptr) {}
66
67 #if defined(ARCH_CPU_64_BITS)
String(const char * characters,size_t length)68 String::String(const char* characters, size_t length)
69 : String(characters, SafeCast<unsigned>(length)) {}
70 #endif // defined(ARCH_CPU_64_BITS)
71
CodeUnitCompare(const String & a,const String & b)72 int CodeUnitCompare(const String& a, const String& b) {
73 return CodeUnitCompare(a.Impl(), b.Impl());
74 }
75
CodeUnitCompareIgnoringASCIICase(const String & a,const char * b)76 int CodeUnitCompareIgnoringASCIICase(const String& a, const char* b) {
77 return CodeUnitCompareIgnoringASCIICase(a.Impl(),
78 reinterpret_cast<const LChar*>(b));
79 }
80
Find(base::RepeatingCallback<bool (UChar)> match_callback,wtf_size_t index) const81 wtf_size_t String::Find(base::RepeatingCallback<bool(UChar)> match_callback,
82 wtf_size_t index) const {
83 return impl_ ? impl_->Find(match_callback, index) : kNotFound;
84 }
85
CharacterStartingAt(unsigned i) const86 UChar32 String::CharacterStartingAt(unsigned i) const {
87 if (!impl_ || i >= impl_->length())
88 return 0;
89 return impl_->CharacterStartingAt(i);
90 }
91
Ensure16Bit()92 void String::Ensure16Bit() {
93 if (IsNull())
94 return;
95 if (!Is8Bit())
96 return;
97 if (unsigned length = this->length())
98 impl_ = Make16BitFrom8BitSource(impl_->Characters8(), length).ReleaseImpl();
99 else
100 impl_ = StringImpl::empty16_bit_;
101 }
102
Truncate(unsigned length)103 void String::Truncate(unsigned length) {
104 if (impl_)
105 impl_ = impl_->Truncate(length);
106 }
107
Remove(unsigned start,unsigned length_to_remove)108 void String::Remove(unsigned start, unsigned length_to_remove) {
109 if (impl_)
110 impl_ = impl_->Remove(start, length_to_remove);
111 }
112
Substring(unsigned pos,unsigned len) const113 String String::Substring(unsigned pos, unsigned len) const {
114 if (!impl_)
115 return String();
116 return impl_->Substring(pos, len);
117 }
118
DeprecatedLower() const119 String String::DeprecatedLower() const {
120 if (!impl_)
121 return String();
122 return CaseMap::FastToLowerInvariant(impl_.get());
123 }
124
LowerASCII() const125 String String::LowerASCII() const {
126 if (!impl_)
127 return String();
128 return impl_->LowerASCII();
129 }
130
UpperASCII() const131 String String::UpperASCII() const {
132 if (!impl_)
133 return String();
134 return impl_->UpperASCII();
135 }
136
StripWhiteSpace() const137 String String::StripWhiteSpace() const {
138 if (!impl_)
139 return String();
140 return impl_->StripWhiteSpace();
141 }
142
StripWhiteSpace(IsWhiteSpaceFunctionPtr is_white_space) const143 String String::StripWhiteSpace(IsWhiteSpaceFunctionPtr is_white_space) const {
144 if (!impl_)
145 return String();
146 return impl_->StripWhiteSpace(is_white_space);
147 }
148
SimplifyWhiteSpace(StripBehavior strip_behavior) const149 String String::SimplifyWhiteSpace(StripBehavior strip_behavior) const {
150 if (!impl_)
151 return String();
152 return impl_->SimplifyWhiteSpace(strip_behavior);
153 }
154
SimplifyWhiteSpace(IsWhiteSpaceFunctionPtr is_white_space,StripBehavior strip_behavior) const155 String String::SimplifyWhiteSpace(IsWhiteSpaceFunctionPtr is_white_space,
156 StripBehavior strip_behavior) const {
157 if (!impl_)
158 return String();
159 return impl_->SimplifyWhiteSpace(is_white_space, strip_behavior);
160 }
161
RemoveCharacters(CharacterMatchFunctionPtr find_match) const162 String String::RemoveCharacters(CharacterMatchFunctionPtr find_match) const {
163 if (!impl_)
164 return String();
165 return impl_->RemoveCharacters(find_match);
166 }
167
FoldCase() const168 String String::FoldCase() const {
169 if (!impl_)
170 return String();
171 return impl_->FoldCase();
172 }
173
Format(const char * format,...)174 String String::Format(const char* format, ...) {
175 // vsnprintf is locale sensitive when converting floats to strings
176 // and we need it to always use a decimal point. Double check that
177 // the locale is compatible, and also that it is the default "C"
178 // locale so that we aren't just lucky. Android's locales work
179 // differently so can't check the same way there.
180 DCHECK_EQ(strcmp(localeconv()->decimal_point, "."), 0);
181 #if !defined(OS_ANDROID)
182 DCHECK_EQ(strcmp(setlocale(LC_NUMERIC, NULL), "C"), 0);
183 #endif // !OS_ANDROID
184
185 va_list args;
186
187 // TODO(esprehn): base uses 1024, maybe we should use a bigger size too.
188 static const unsigned kDefaultSize = 256;
189 Vector<char, kDefaultSize> buffer(kDefaultSize);
190
191 va_start(args, format);
192 int length = base::vsnprintf(buffer.data(), buffer.size(), format, args);
193 va_end(args);
194
195 // TODO(esprehn): This can only happen if there's an encoding error, what's
196 // the locale set to inside blink? Can this happen? We should probably CHECK
197 // instead.
198 if (length < 0)
199 return String();
200
201 if (static_cast<unsigned>(length) >= buffer.size()) {
202 // vsnprintf doesn't include the NUL terminator in the length so we need to
203 // add space for it when growing.
204 buffer.Grow(length + 1);
205
206 // We need to call va_end() and then va_start() each time we use args, as
207 // the contents of args is undefined after the call to vsnprintf according
208 // to http://man.cx/snprintf(3)
209 //
210 // Not calling va_end/va_start here happens to work on lots of systems, but
211 // fails e.g. on 64bit Linux.
212 va_start(args, format);
213 length = base::vsnprintf(buffer.data(), buffer.size(), format, args);
214 va_end(args);
215 }
216
217 CHECK_LT(static_cast<unsigned>(length), buffer.size());
218 return String(reinterpret_cast<const LChar*>(buffer.data()), length);
219 }
220
EncodeForDebugging() const221 String String::EncodeForDebugging() const {
222 if (IsNull())
223 return "<null>";
224
225 StringBuilder builder;
226 builder.Append('"');
227 for (unsigned index = 0; index < length(); ++index) {
228 // Print shorthands for select cases.
229 UChar character = (*impl_)[index];
230 switch (character) {
231 case '\t':
232 builder.Append("\\t");
233 break;
234 case '\n':
235 builder.Append("\\n");
236 break;
237 case '\r':
238 builder.Append("\\r");
239 break;
240 case '"':
241 builder.Append("\\\"");
242 break;
243 case '\\':
244 builder.Append("\\\\");
245 break;
246 default:
247 if (IsASCIIPrintable(character)) {
248 builder.Append(static_cast<char>(character));
249 } else {
250 // Print "\uXXXX" for control or non-ASCII characters.
251 builder.AppendFormat("\\u%04X", character);
252 }
253 break;
254 }
255 }
256 builder.Append('"');
257 return builder.ToString();
258 }
259
Number(float number)260 String String::Number(float number) {
261 return Number(static_cast<double>(number));
262 }
263
Number(double number,unsigned precision)264 String String::Number(double number, unsigned precision) {
265 NumberToStringBuffer buffer;
266 return String(NumberToFixedPrecisionString(number, precision, buffer));
267 }
268
NumberToStringECMAScript(double number)269 String String::NumberToStringECMAScript(double number) {
270 NumberToStringBuffer buffer;
271 return String(NumberToString(number, buffer));
272 }
273
NumberToStringFixedWidth(double number,unsigned decimal_places)274 String String::NumberToStringFixedWidth(double number,
275 unsigned decimal_places) {
276 NumberToStringBuffer buffer;
277 return String(NumberToFixedWidthString(number, decimal_places, buffer));
278 }
279
ToIntStrict(bool * ok) const280 int String::ToIntStrict(bool* ok) const {
281 if (!impl_) {
282 if (ok)
283 *ok = false;
284 return 0;
285 }
286 return impl_->ToInt(NumberParsingOptions::kStrict, ok);
287 }
288
ToUIntStrict(bool * ok) const289 unsigned String::ToUIntStrict(bool* ok) const {
290 if (!impl_) {
291 if (ok)
292 *ok = false;
293 return 0;
294 }
295 return impl_->ToUInt(NumberParsingOptions::kStrict, ok);
296 }
297
HexToUIntStrict(bool * ok) const298 unsigned String::HexToUIntStrict(bool* ok) const {
299 if (!impl_) {
300 if (ok)
301 *ok = false;
302 return 0;
303 }
304 return impl_->HexToUIntStrict(ok);
305 }
306
HexToUInt64Strict(bool * ok) const307 uint64_t String::HexToUInt64Strict(bool* ok) const {
308 if (!impl_) {
309 if (ok)
310 *ok = false;
311 return 0;
312 }
313 return impl_->HexToUInt64Strict(ok);
314 }
315
ToInt64Strict(bool * ok) const316 int64_t String::ToInt64Strict(bool* ok) const {
317 if (!impl_) {
318 if (ok)
319 *ok = false;
320 return 0;
321 }
322 return impl_->ToInt64(NumberParsingOptions::kStrict, ok);
323 }
324
ToUInt64Strict(bool * ok) const325 uint64_t String::ToUInt64Strict(bool* ok) const {
326 if (!impl_) {
327 if (ok)
328 *ok = false;
329 return 0;
330 }
331 return impl_->ToUInt64(NumberParsingOptions::kStrict, ok);
332 }
333
ToInt(bool * ok) const334 int String::ToInt(bool* ok) const {
335 if (!impl_) {
336 if (ok)
337 *ok = false;
338 return 0;
339 }
340 return impl_->ToInt(NumberParsingOptions::kLoose, ok);
341 }
342
ToUInt(bool * ok) const343 unsigned String::ToUInt(bool* ok) const {
344 if (!impl_) {
345 if (ok)
346 *ok = false;
347 return 0;
348 }
349 return impl_->ToUInt(NumberParsingOptions::kLoose, ok);
350 }
351
ToDouble(bool * ok) const352 double String::ToDouble(bool* ok) const {
353 if (!impl_) {
354 if (ok)
355 *ok = false;
356 return 0.0;
357 }
358 return impl_->ToDouble(ok);
359 }
360
ToFloat(bool * ok) const361 float String::ToFloat(bool* ok) const {
362 if (!impl_) {
363 if (ok)
364 *ok = false;
365 return 0.0f;
366 }
367 return impl_->ToFloat(ok);
368 }
369
IsolatedCopy() const370 String String::IsolatedCopy() const {
371 if (!impl_)
372 return String();
373 return impl_->IsolatedCopy();
374 }
375
IsSafeToSendToAnotherThread() const376 bool String::IsSafeToSendToAnotherThread() const {
377 return !impl_ || impl_->IsSafeToSendToAnotherThread();
378 }
379
Split(const StringView & separator,bool allow_empty_entries,Vector<String> & result) const380 void String::Split(const StringView& separator,
381 bool allow_empty_entries,
382 Vector<String>& result) const {
383 result.clear();
384
385 unsigned start_pos = 0;
386 wtf_size_t end_pos;
387 while ((end_pos = Find(separator, start_pos)) != kNotFound) {
388 if (allow_empty_entries || start_pos != end_pos)
389 result.push_back(Substring(start_pos, end_pos - start_pos));
390 start_pos = end_pos + separator.length();
391 }
392 if (allow_empty_entries || start_pos != length())
393 result.push_back(Substring(start_pos));
394 }
395
Split(UChar separator,bool allow_empty_entries,Vector<String> & result) const396 void String::Split(UChar separator,
397 bool allow_empty_entries,
398 Vector<String>& result) const {
399 result.clear();
400
401 unsigned start_pos = 0;
402 wtf_size_t end_pos;
403 while ((end_pos = find(separator, start_pos)) != kNotFound) {
404 if (allow_empty_entries || start_pos != end_pos)
405 result.push_back(Substring(start_pos, end_pos - start_pos));
406 start_pos = end_pos + 1;
407 }
408 if (allow_empty_entries || start_pos != length())
409 result.push_back(Substring(start_pos));
410 }
411
Ascii() const412 std::string String::Ascii() const {
413 // Printable ASCII characters 32..127 and the null character are
414 // preserved, characters outside of this range are converted to '?'.
415
416 unsigned length = this->length();
417 if (!length)
418 return std::string();
419
420 std::string ascii(length, '\0');
421 if (this->Is8Bit()) {
422 const LChar* characters = this->Characters8();
423
424 for (unsigned i = 0; i < length; ++i) {
425 LChar ch = characters[i];
426 ascii[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch;
427 }
428 return ascii;
429 }
430
431 const UChar* characters = this->Characters16();
432 for (unsigned i = 0; i < length; ++i) {
433 UChar ch = characters[i];
434 ascii[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : static_cast<char>(ch);
435 }
436
437 return ascii;
438 }
439
Latin1() const440 std::string String::Latin1() const {
441 // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
442 // preserved, characters outside of this range are converted to '?'.
443 unsigned length = this->length();
444
445 if (!length)
446 return std::string();
447
448 if (Is8Bit()) {
449 return std::string(reinterpret_cast<const char*>(this->Characters8()),
450 length);
451 }
452
453 const UChar* characters = this->Characters16();
454 std::string latin1(length, '\0');
455 for (unsigned i = 0; i < length; ++i) {
456 UChar ch = characters[i];
457 latin1[i] = ch > 0xff ? '?' : static_cast<char>(ch);
458 }
459
460 return latin1;
461 }
462
463 // Helper to write a three-byte UTF-8 code point to the buffer, caller must
464 // check room is available.
PutUTF8Triple(char * & buffer,UChar ch)465 static inline void PutUTF8Triple(char*& buffer, UChar ch) {
466 DCHECK_GE(ch, 0x0800);
467 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
468 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
469 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
470 }
471
Utf8(UTF8ConversionMode mode) const472 std::string String::Utf8(UTF8ConversionMode mode) const {
473 unsigned length = this->length();
474
475 if (!length)
476 return std::string();
477
478 // Allocate a buffer big enough to hold all the characters
479 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
480 // Optimization ideas, if we find this function is hot:
481 // * We could speculatively create a std::string to contain 'length'
482 // characters, and resize if necessary (i.e. if the buffer contains
483 // non-ascii characters). (Alternatively, scan the buffer first for
484 // ascii characters, so we know this will be sufficient).
485 // * We could allocate a std::string with an appropriate size to
486 // have a good chance of being able to write the string into the
487 // buffer without reallocing (say, 1.5 x length).
488 if (length > std::numeric_limits<unsigned>::max() / 3)
489 return std::string();
490 Vector<char, 1024> buffer_vector(length * 3);
491
492 char* buffer = buffer_vector.data();
493
494 if (Is8Bit()) {
495 const LChar* characters = this->Characters8();
496
497 unicode::ConversionResult result =
498 unicode::ConvertLatin1ToUTF8(&characters, characters + length, &buffer,
499 buffer + buffer_vector.size());
500 // (length * 3) should be sufficient for any conversion
501 DCHECK_NE(result, unicode::kTargetExhausted);
502 } else {
503 const UChar* characters = this->Characters16();
504
505 if (mode == kStrictUTF8ConversionReplacingUnpairedSurrogatesWithFFFD) {
506 const UChar* characters_end = characters + length;
507 char* buffer_end = buffer + buffer_vector.size();
508 while (characters < characters_end) {
509 // Use strict conversion to detect unpaired surrogates.
510 unicode::ConversionResult result = unicode::ConvertUTF16ToUTF8(
511 &characters, characters_end, &buffer, buffer_end, true);
512 DCHECK_NE(result, unicode::kTargetExhausted);
513 // Conversion fails when there is an unpaired surrogate. Put
514 // replacement character (U+FFFD) instead of the unpaired
515 // surrogate.
516 if (result != unicode::kConversionOK) {
517 DCHECK_LE(0xD800, *characters);
518 DCHECK_LE(*characters, 0xDFFF);
519 // There should be room left, since one UChar hasn't been
520 // converted.
521 DCHECK_LE(buffer + 3, buffer_end);
522 PutUTF8Triple(buffer, kReplacementCharacter);
523 ++characters;
524 }
525 }
526 } else {
527 bool strict = mode == kStrictUTF8Conversion;
528 unicode::ConversionResult result =
529 unicode::ConvertUTF16ToUTF8(&characters, characters + length, &buffer,
530 buffer + buffer_vector.size(), strict);
531 // (length * 3) should be sufficient for any conversion
532 DCHECK_NE(result, unicode::kTargetExhausted);
533
534 // Only produced from strict conversion.
535 if (result == unicode::kSourceIllegal) {
536 DCHECK(strict);
537 return std::string();
538 }
539
540 // Check for an unconverted high surrogate.
541 if (result == unicode::kSourceExhausted) {
542 if (strict)
543 return std::string();
544 // This should be one unpaired high surrogate. Treat it the same
545 // was as an unpaired high surrogate would have been handled in
546 // the middle of a string with non-strict conversion - which is
547 // to say, simply encode it to UTF-8.
548 DCHECK_EQ(characters + 1, this->Characters16() + length);
549 DCHECK_GE(*characters, 0xD800);
550 DCHECK_LE(*characters, 0xDBFF);
551 // There should be room left, since one UChar hasn't been
552 // converted.
553 DCHECK_LE(buffer + 3, buffer + buffer_vector.size());
554 PutUTF8Triple(buffer, *characters);
555 }
556 }
557 }
558
559 return std::string(buffer_vector.data(), buffer - buffer_vector.data());
560 }
561
Make8BitFrom16BitSource(const UChar * source,wtf_size_t length)562 String String::Make8BitFrom16BitSource(const UChar* source, wtf_size_t length) {
563 if (!length)
564 return g_empty_string;
565
566 LChar* destination;
567 String result = String::CreateUninitialized(length, destination);
568
569 CopyLCharsFromUCharSource(destination, source, length);
570
571 return result;
572 }
573
Make16BitFrom8BitSource(const LChar * source,wtf_size_t length)574 String String::Make16BitFrom8BitSource(const LChar* source, wtf_size_t length) {
575 if (!length)
576 return g_empty_string16_bit;
577
578 UChar* destination;
579 String result = String::CreateUninitialized(length, destination);
580
581 StringImpl::CopyChars(destination, source, length);
582
583 return result;
584 }
585
FromUTF8(const LChar * string_start,size_t string_length)586 String String::FromUTF8(const LChar* string_start, size_t string_length) {
587 wtf_size_t length = SafeCast<wtf_size_t>(string_length);
588
589 if (!string_start)
590 return String();
591
592 if (!length)
593 return g_empty_string;
594
595 if (CharactersAreAllASCII(string_start, length))
596 return StringImpl::Create(string_start, length);
597
598 Vector<UChar, 1024> buffer(length);
599 UChar* buffer_start = buffer.data();
600
601 UChar* buffer_current = buffer_start;
602 const char* string_current = reinterpret_cast<const char*>(string_start);
603 if (unicode::ConvertUTF8ToUTF16(
604 &string_current, reinterpret_cast<const char*>(string_start + length),
605 &buffer_current,
606 buffer_current + buffer.size()) != unicode::kConversionOK)
607 return String();
608
609 unsigned utf16_length =
610 static_cast<wtf_size_t>(buffer_current - buffer_start);
611 DCHECK_LT(utf16_length, length);
612 return StringImpl::Create(buffer_start, utf16_length);
613 }
614
FromUTF8(const LChar * string)615 String String::FromUTF8(const LChar* string) {
616 if (!string)
617 return String();
618 return FromUTF8(string, strlen(reinterpret_cast<const char*>(string)));
619 }
620
FromUTF8(base::StringPiece s)621 String String::FromUTF8(base::StringPiece s) {
622 return FromUTF8(reinterpret_cast<const LChar*>(s.data()), s.size());
623 }
624
FromUTF8WithLatin1Fallback(const LChar * string,size_t size)625 String String::FromUTF8WithLatin1Fallback(const LChar* string, size_t size) {
626 String utf8 = FromUTF8(string, size);
627 if (!utf8)
628 return String(string, SafeCast<wtf_size_t>(size));
629 return utf8;
630 }
631
operator <<(std::ostream & out,const String & string)632 std::ostream& operator<<(std::ostream& out, const String& string) {
633 return out << string.EncodeForDebugging().Utf8();
634 }
635
636 #ifndef NDEBUG
Show() const637 void String::Show() const {
638 DLOG(INFO) << *this;
639 }
640 #endif
641
642 } // namespace WTF
643