1 /*
2  * (C) 1999 Lars Knoll (knoll@kde.org)
3  * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2012 Apple Inc. All rights
4  * reserved.
5  * Copyright (C) 2007-2009 Torch Mobile, Inc.
6  *
7  * This library is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Library General Public
9  * License as published by the Free Software Foundation; either
10  * version 2 of the License, or (at your option) any later version.
11  *
12  * This library is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Library General Public License for more details.
16  *
17  * You should have received a copy of the GNU Library General Public License
18  * along with this library; see the file COPYING.LIB.  If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 #include "third_party/blink/renderer/platform/wtf/text/wtf_string.h"
24 
25 #include <locale.h>
26 #include <stdarg.h>
27 #include <algorithm>
28 #include "base/callback.h"
29 #include "base/strings/string_util.h"
30 #include "build/build_config.h"
31 #include "third_party/blink/renderer/platform/wtf/dtoa.h"
32 #include "third_party/blink/renderer/platform/wtf/math_extras.h"
33 #include "third_party/blink/renderer/platform/wtf/size_assertions.h"
34 #include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h"
35 #include "third_party/blink/renderer/platform/wtf/text/case_map.h"
36 #include "third_party/blink/renderer/platform/wtf/text/character_names.h"
37 #include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
38 #include "third_party/blink/renderer/platform/wtf/text/unicode.h"
39 #include "third_party/blink/renderer/platform/wtf/text/utf8.h"
40 #include "third_party/blink/renderer/platform/wtf/vector.h"
41 
42 namespace WTF {
43 
// Size assertion pairing String with void* (ASSERT_SIZE comes from
// size_assertions.h). Presumably this keeps String exactly one pointer wide —
// TODO confirm against the macro definition.
44 ASSERT_SIZE(String, void*);
45 
46 // Construct a string with UTF-16 data.
String(const UChar * characters,unsigned length)47 String::String(const UChar* characters, unsigned length)
48     : impl_(characters ? StringImpl::Create(characters, length) : nullptr) {}
49 
50 // Construct a string with UTF-16 data, from a null-terminated source.
String(const UChar * str)51 String::String(const UChar* str) {
52   if (!str)
53     return;
54   impl_ = StringImpl::Create(str, LengthOfNullTerminatedString(str));
55 }
56 
57 // Construct a string with latin1 data.
String(const LChar * characters,unsigned length)58 String::String(const LChar* characters, unsigned length)
59     : impl_(characters ? StringImpl::Create(characters, length) : nullptr) {}
60 
String(const char * characters,unsigned length)61 String::String(const char* characters, unsigned length)
62     : impl_(characters
63                 ? StringImpl::Create(reinterpret_cast<const LChar*>(characters),
64                                      length)
65                 : nullptr) {}
66 
67 #if defined(ARCH_CPU_64_BITS)
// size_t overload, only compiled when ARCH_CPU_64_BITS is defined. Narrows
// |length| to unsigned through SafeCast and delegates to the
// (const char*, unsigned) constructor. Note that the SafeCast is evaluated
// before the delegated constructor looks at |characters|.
String(const char * characters,size_t length)68 String::String(const char* characters, size_t length)
69     : String(characters, SafeCast<unsigned>(length)) {}
70 #endif  // defined(ARCH_CPU_64_BITS)
71 
CodeUnitCompare(const String & a,const String & b)72 int CodeUnitCompare(const String& a, const String& b) {
73   return CodeUnitCompare(a.Impl(), b.Impl());
74 }
75 
CodeUnitCompareIgnoringASCIICase(const String & a,const char * b)76 int CodeUnitCompareIgnoringASCIICase(const String& a, const char* b) {
77   return CodeUnitCompareIgnoringASCIICase(a.Impl(),
78                                           reinterpret_cast<const LChar*>(b));
79 }
80 
Find(base::RepeatingCallback<bool (UChar)> match_callback,wtf_size_t index) const81 wtf_size_t String::Find(base::RepeatingCallback<bool(UChar)> match_callback,
82                         wtf_size_t index) const {
83   return impl_ ? impl_->Find(match_callback, index) : kNotFound;
84 }
85 
CharacterStartingAt(unsigned i) const86 UChar32 String::CharacterStartingAt(unsigned i) const {
87   if (!impl_ || i >= impl_->length())
88     return 0;
89   return impl_->CharacterStartingAt(i);
90 }
91 
Ensure16Bit()92 void String::Ensure16Bit() {
93   if (IsNull())
94     return;
95   if (!Is8Bit())
96     return;
97   if (unsigned length = this->length())
98     impl_ = Make16BitFrom8BitSource(impl_->Characters8(), length).ReleaseImpl();
99   else
100     impl_ = StringImpl::empty16_bit_;
101 }
102 
Truncate(unsigned length)103 void String::Truncate(unsigned length) {
104   if (impl_)
105     impl_ = impl_->Truncate(length);
106 }
107 
Remove(unsigned start,unsigned length_to_remove)108 void String::Remove(unsigned start, unsigned length_to_remove) {
109   if (impl_)
110     impl_ = impl_->Remove(start, length_to_remove);
111 }
112 
Substring(unsigned pos,unsigned len) const113 String String::Substring(unsigned pos, unsigned len) const {
114   if (!impl_)
115     return String();
116   return impl_->Substring(pos, len);
117 }
118 
DeprecatedLower() const119 String String::DeprecatedLower() const {
120   if (!impl_)
121     return String();
122   return CaseMap::FastToLowerInvariant(impl_.get());
123 }
124 
LowerASCII() const125 String String::LowerASCII() const {
126   if (!impl_)
127     return String();
128   return impl_->LowerASCII();
129 }
130 
UpperASCII() const131 String String::UpperASCII() const {
132   if (!impl_)
133     return String();
134   return impl_->UpperASCII();
135 }
136 
StripWhiteSpace() const137 String String::StripWhiteSpace() const {
138   if (!impl_)
139     return String();
140   return impl_->StripWhiteSpace();
141 }
142 
StripWhiteSpace(IsWhiteSpaceFunctionPtr is_white_space) const143 String String::StripWhiteSpace(IsWhiteSpaceFunctionPtr is_white_space) const {
144   if (!impl_)
145     return String();
146   return impl_->StripWhiteSpace(is_white_space);
147 }
148 
SimplifyWhiteSpace(StripBehavior strip_behavior) const149 String String::SimplifyWhiteSpace(StripBehavior strip_behavior) const {
150   if (!impl_)
151     return String();
152   return impl_->SimplifyWhiteSpace(strip_behavior);
153 }
154 
SimplifyWhiteSpace(IsWhiteSpaceFunctionPtr is_white_space,StripBehavior strip_behavior) const155 String String::SimplifyWhiteSpace(IsWhiteSpaceFunctionPtr is_white_space,
156                                   StripBehavior strip_behavior) const {
157   if (!impl_)
158     return String();
159   return impl_->SimplifyWhiteSpace(is_white_space, strip_behavior);
160 }
161 
RemoveCharacters(CharacterMatchFunctionPtr find_match) const162 String String::RemoveCharacters(CharacterMatchFunctionPtr find_match) const {
163   if (!impl_)
164     return String();
165   return impl_->RemoveCharacters(find_match);
166 }
167 
FoldCase() const168 String String::FoldCase() const {
169   if (!impl_)
170     return String();
171   return impl_->FoldCase();
172 }
173 
Format(const char * format,...)174 String String::Format(const char* format, ...) {
175   // vsnprintf is locale sensitive when converting floats to strings
176   // and we need it to always use a decimal point. Double check that
177   // the locale is compatible, and also that it is the default "C"
178   // locale so that we aren't just lucky. Android's locales work
179   // differently so can't check the same way there.
180   DCHECK_EQ(strcmp(localeconv()->decimal_point, "."), 0);
181 #if !defined(OS_ANDROID)
182   DCHECK_EQ(strcmp(setlocale(LC_NUMERIC, NULL), "C"), 0);
183 #endif  // !OS_ANDROID
184 
185   va_list args;
186 
187   // TODO(esprehn): base uses 1024, maybe we should use a bigger size too.
188   static const unsigned kDefaultSize = 256;
189   Vector<char, kDefaultSize> buffer(kDefaultSize);
190 
191   va_start(args, format);
192   int length = base::vsnprintf(buffer.data(), buffer.size(), format, args);
193   va_end(args);
194 
195   // TODO(esprehn): This can only happen if there's an encoding error, what's
196   // the locale set to inside blink? Can this happen? We should probably CHECK
197   // instead.
198   if (length < 0)
199     return String();
200 
201   if (static_cast<unsigned>(length) >= buffer.size()) {
202     // vsnprintf doesn't include the NUL terminator in the length so we need to
203     // add space for it when growing.
204     buffer.Grow(length + 1);
205 
206     // We need to call va_end() and then va_start() each time we use args, as
207     // the contents of args is undefined after the call to vsnprintf according
208     // to http://man.cx/snprintf(3)
209     //
210     // Not calling va_end/va_start here happens to work on lots of systems, but
211     // fails e.g. on 64bit Linux.
212     va_start(args, format);
213     length = base::vsnprintf(buffer.data(), buffer.size(), format, args);
214     va_end(args);
215   }
216 
217   CHECK_LT(static_cast<unsigned>(length), buffer.size());
218   return String(reinterpret_cast<const LChar*>(buffer.data()), length);
219 }
220 
EncodeForDebugging() const221 String String::EncodeForDebugging() const {
222   if (IsNull())
223     return "<null>";
224 
225   StringBuilder builder;
226   builder.Append('"');
227   for (unsigned index = 0; index < length(); ++index) {
228     // Print shorthands for select cases.
229     UChar character = (*impl_)[index];
230     switch (character) {
231       case '\t':
232         builder.Append("\\t");
233         break;
234       case '\n':
235         builder.Append("\\n");
236         break;
237       case '\r':
238         builder.Append("\\r");
239         break;
240       case '"':
241         builder.Append("\\\"");
242         break;
243       case '\\':
244         builder.Append("\\\\");
245         break;
246       default:
247         if (IsASCIIPrintable(character)) {
248           builder.Append(static_cast<char>(character));
249         } else {
250           // Print "\uXXXX" for control or non-ASCII characters.
251           builder.AppendFormat("\\u%04X", character);
252         }
253         break;
254     }
255   }
256   builder.Append('"');
257   return builder.ToString();
258 }
259 
Number(float number)260 String String::Number(float number) {
261   return Number(static_cast<double>(number));
262 }
263 
Number(double number,unsigned precision)264 String String::Number(double number, unsigned precision) {
265   NumberToStringBuffer buffer;
266   return String(NumberToFixedPrecisionString(number, precision, buffer));
267 }
268 
NumberToStringECMAScript(double number)269 String String::NumberToStringECMAScript(double number) {
270   NumberToStringBuffer buffer;
271   return String(NumberToString(number, buffer));
272 }
273 
NumberToStringFixedWidth(double number,unsigned decimal_places)274 String String::NumberToStringFixedWidth(double number,
275                                         unsigned decimal_places) {
276   NumberToStringBuffer buffer;
277   return String(NumberToFixedWidthString(number, decimal_places, buffer));
278 }
279 
ToIntStrict(bool * ok) const280 int String::ToIntStrict(bool* ok) const {
281   if (!impl_) {
282     if (ok)
283       *ok = false;
284     return 0;
285   }
286   return impl_->ToInt(NumberParsingOptions::kStrict, ok);
287 }
288 
ToUIntStrict(bool * ok) const289 unsigned String::ToUIntStrict(bool* ok) const {
290   if (!impl_) {
291     if (ok)
292       *ok = false;
293     return 0;
294   }
295   return impl_->ToUInt(NumberParsingOptions::kStrict, ok);
296 }
297 
HexToUIntStrict(bool * ok) const298 unsigned String::HexToUIntStrict(bool* ok) const {
299   if (!impl_) {
300     if (ok)
301       *ok = false;
302     return 0;
303   }
304   return impl_->HexToUIntStrict(ok);
305 }
306 
HexToUInt64Strict(bool * ok) const307 uint64_t String::HexToUInt64Strict(bool* ok) const {
308   if (!impl_) {
309     if (ok)
310       *ok = false;
311     return 0;
312   }
313   return impl_->HexToUInt64Strict(ok);
314 }
315 
ToInt64Strict(bool * ok) const316 int64_t String::ToInt64Strict(bool* ok) const {
317   if (!impl_) {
318     if (ok)
319       *ok = false;
320     return 0;
321   }
322   return impl_->ToInt64(NumberParsingOptions::kStrict, ok);
323 }
324 
ToUInt64Strict(bool * ok) const325 uint64_t String::ToUInt64Strict(bool* ok) const {
326   if (!impl_) {
327     if (ok)
328       *ok = false;
329     return 0;
330   }
331   return impl_->ToUInt64(NumberParsingOptions::kStrict, ok);
332 }
333 
ToInt(bool * ok) const334 int String::ToInt(bool* ok) const {
335   if (!impl_) {
336     if (ok)
337       *ok = false;
338     return 0;
339   }
340   return impl_->ToInt(NumberParsingOptions::kLoose, ok);
341 }
342 
ToUInt(bool * ok) const343 unsigned String::ToUInt(bool* ok) const {
344   if (!impl_) {
345     if (ok)
346       *ok = false;
347     return 0;
348   }
349   return impl_->ToUInt(NumberParsingOptions::kLoose, ok);
350 }
351 
ToDouble(bool * ok) const352 double String::ToDouble(bool* ok) const {
353   if (!impl_) {
354     if (ok)
355       *ok = false;
356     return 0.0;
357   }
358   return impl_->ToDouble(ok);
359 }
360 
ToFloat(bool * ok) const361 float String::ToFloat(bool* ok) const {
362   if (!impl_) {
363     if (ok)
364       *ok = false;
365     return 0.0f;
366   }
367   return impl_->ToFloat(ok);
368 }
369 
IsolatedCopy() const370 String String::IsolatedCopy() const {
371   if (!impl_)
372     return String();
373   return impl_->IsolatedCopy();
374 }
375 
IsSafeToSendToAnotherThread() const376 bool String::IsSafeToSendToAnotherThread() const {
377   return !impl_ || impl_->IsSafeToSendToAnotherThread();
378 }
379 
Split(const StringView & separator,bool allow_empty_entries,Vector<String> & result) const380 void String::Split(const StringView& separator,
381                    bool allow_empty_entries,
382                    Vector<String>& result) const {
383   result.clear();
384 
385   unsigned start_pos = 0;
386   wtf_size_t end_pos;
387   while ((end_pos = Find(separator, start_pos)) != kNotFound) {
388     if (allow_empty_entries || start_pos != end_pos)
389       result.push_back(Substring(start_pos, end_pos - start_pos));
390     start_pos = end_pos + separator.length();
391   }
392   if (allow_empty_entries || start_pos != length())
393     result.push_back(Substring(start_pos));
394 }
395 
Split(UChar separator,bool allow_empty_entries,Vector<String> & result) const396 void String::Split(UChar separator,
397                    bool allow_empty_entries,
398                    Vector<String>& result) const {
399   result.clear();
400 
401   unsigned start_pos = 0;
402   wtf_size_t end_pos;
403   while ((end_pos = find(separator, start_pos)) != kNotFound) {
404     if (allow_empty_entries || start_pos != end_pos)
405       result.push_back(Substring(start_pos, end_pos - start_pos));
406     start_pos = end_pos + 1;
407   }
408   if (allow_empty_entries || start_pos != length())
409     result.push_back(Substring(start_pos));
410 }
411 
Ascii() const412 std::string String::Ascii() const {
413   // Printable ASCII characters 32..127 and the null character are
414   // preserved, characters outside of this range are converted to '?'.
415 
416   unsigned length = this->length();
417   if (!length)
418     return std::string();
419 
420   std::string ascii(length, '\0');
421   if (this->Is8Bit()) {
422     const LChar* characters = this->Characters8();
423 
424     for (unsigned i = 0; i < length; ++i) {
425       LChar ch = characters[i];
426       ascii[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch;
427     }
428     return ascii;
429   }
430 
431   const UChar* characters = this->Characters16();
432   for (unsigned i = 0; i < length; ++i) {
433     UChar ch = characters[i];
434     ascii[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : static_cast<char>(ch);
435   }
436 
437   return ascii;
438 }
439 
Latin1() const440 std::string String::Latin1() const {
441   // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
442   // preserved, characters outside of this range are converted to '?'.
443   unsigned length = this->length();
444 
445   if (!length)
446     return std::string();
447 
448   if (Is8Bit()) {
449     return std::string(reinterpret_cast<const char*>(this->Characters8()),
450                        length);
451   }
452 
453   const UChar* characters = this->Characters16();
454   std::string latin1(length, '\0');
455   for (unsigned i = 0; i < length; ++i) {
456     UChar ch = characters[i];
457     latin1[i] = ch > 0xff ? '?' : static_cast<char>(ch);
458   }
459 
460   return latin1;
461 }
462 
463 // Helper to write a three-byte UTF-8 code point to the buffer, caller must
464 // check room is available.
PutUTF8Triple(char * & buffer,UChar ch)465 static inline void PutUTF8Triple(char*& buffer, UChar ch) {
466   DCHECK_GE(ch, 0x0800);
467   *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
468   *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
469   *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
470 }
471 
Utf8(UTF8ConversionMode mode) const472 std::string String::Utf8(UTF8ConversionMode mode) const {
473   unsigned length = this->length();
474 
475   if (!length)
476     return std::string();
477 
478   // Allocate a buffer big enough to hold all the characters
479   // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
480   // Optimization ideas, if we find this function is hot:
481   //  * We could speculatively create a std::string to contain 'length'
482   //    characters, and resize if necessary (i.e. if the buffer contains
483   //    non-ascii characters). (Alternatively, scan the buffer first for
484   //    ascii characters, so we know this will be sufficient).
485   //  * We could allocate a std::string with an appropriate size to
486   //    have a good chance of being able to write the string into the
487   //    buffer without reallocing (say, 1.5 x length).
488   if (length > std::numeric_limits<unsigned>::max() / 3)
489     return std::string();
490   Vector<char, 1024> buffer_vector(length * 3);
491 
492   char* buffer = buffer_vector.data();
493 
494   if (Is8Bit()) {
495     const LChar* characters = this->Characters8();
496 
497     unicode::ConversionResult result =
498         unicode::ConvertLatin1ToUTF8(&characters, characters + length, &buffer,
499                                      buffer + buffer_vector.size());
500     // (length * 3) should be sufficient for any conversion
501     DCHECK_NE(result, unicode::kTargetExhausted);
502   } else {
503     const UChar* characters = this->Characters16();
504 
505     if (mode == kStrictUTF8ConversionReplacingUnpairedSurrogatesWithFFFD) {
506       const UChar* characters_end = characters + length;
507       char* buffer_end = buffer + buffer_vector.size();
508       while (characters < characters_end) {
509         // Use strict conversion to detect unpaired surrogates.
510         unicode::ConversionResult result = unicode::ConvertUTF16ToUTF8(
511             &characters, characters_end, &buffer, buffer_end, true);
512         DCHECK_NE(result, unicode::kTargetExhausted);
513         // Conversion fails when there is an unpaired surrogate.  Put
514         // replacement character (U+FFFD) instead of the unpaired
515         // surrogate.
516         if (result != unicode::kConversionOK) {
517           DCHECK_LE(0xD800, *characters);
518           DCHECK_LE(*characters, 0xDFFF);
519           // There should be room left, since one UChar hasn't been
520           // converted.
521           DCHECK_LE(buffer + 3, buffer_end);
522           PutUTF8Triple(buffer, kReplacementCharacter);
523           ++characters;
524         }
525       }
526     } else {
527       bool strict = mode == kStrictUTF8Conversion;
528       unicode::ConversionResult result =
529           unicode::ConvertUTF16ToUTF8(&characters, characters + length, &buffer,
530                                       buffer + buffer_vector.size(), strict);
531       // (length * 3) should be sufficient for any conversion
532       DCHECK_NE(result, unicode::kTargetExhausted);
533 
534       // Only produced from strict conversion.
535       if (result == unicode::kSourceIllegal) {
536         DCHECK(strict);
537         return std::string();
538       }
539 
540       // Check for an unconverted high surrogate.
541       if (result == unicode::kSourceExhausted) {
542         if (strict)
543           return std::string();
544         // This should be one unpaired high surrogate. Treat it the same
545         // was as an unpaired high surrogate would have been handled in
546         // the middle of a string with non-strict conversion - which is
547         // to say, simply encode it to UTF-8.
548         DCHECK_EQ(characters + 1, this->Characters16() + length);
549         DCHECK_GE(*characters, 0xD800);
550         DCHECK_LE(*characters, 0xDBFF);
551         // There should be room left, since one UChar hasn't been
552         // converted.
553         DCHECK_LE(buffer + 3, buffer + buffer_vector.size());
554         PutUTF8Triple(buffer, *characters);
555       }
556     }
557   }
558 
559   return std::string(buffer_vector.data(), buffer - buffer_vector.data());
560 }
561 
Make8BitFrom16BitSource(const UChar * source,wtf_size_t length)562 String String::Make8BitFrom16BitSource(const UChar* source, wtf_size_t length) {
563   if (!length)
564     return g_empty_string;
565 
566   LChar* destination;
567   String result = String::CreateUninitialized(length, destination);
568 
569   CopyLCharsFromUCharSource(destination, source, length);
570 
571   return result;
572 }
573 
Make16BitFrom8BitSource(const LChar * source,wtf_size_t length)574 String String::Make16BitFrom8BitSource(const LChar* source, wtf_size_t length) {
575   if (!length)
576     return g_empty_string16_bit;
577 
578   UChar* destination;
579   String result = String::CreateUninitialized(length, destination);
580 
581   StringImpl::CopyChars(destination, source, length);
582 
583   return result;
584 }
585 
FromUTF8(const LChar * string_start,size_t string_length)586 String String::FromUTF8(const LChar* string_start, size_t string_length) {
587   wtf_size_t length = SafeCast<wtf_size_t>(string_length);
588 
589   if (!string_start)
590     return String();
591 
592   if (!length)
593     return g_empty_string;
594 
595   if (CharactersAreAllASCII(string_start, length))
596     return StringImpl::Create(string_start, length);
597 
598   Vector<UChar, 1024> buffer(length);
599   UChar* buffer_start = buffer.data();
600 
601   UChar* buffer_current = buffer_start;
602   const char* string_current = reinterpret_cast<const char*>(string_start);
603   if (unicode::ConvertUTF8ToUTF16(
604           &string_current, reinterpret_cast<const char*>(string_start + length),
605           &buffer_current,
606           buffer_current + buffer.size()) != unicode::kConversionOK)
607     return String();
608 
609   unsigned utf16_length =
610       static_cast<wtf_size_t>(buffer_current - buffer_start);
611   DCHECK_LT(utf16_length, length);
612   return StringImpl::Create(buffer_start, utf16_length);
613 }
614 
FromUTF8(const LChar * string)615 String String::FromUTF8(const LChar* string) {
616   if (!string)
617     return String();
618   return FromUTF8(string, strlen(reinterpret_cast<const char*>(string)));
619 }
620 
FromUTF8(base::StringPiece s)621 String String::FromUTF8(base::StringPiece s) {
622   return FromUTF8(reinterpret_cast<const LChar*>(s.data()), s.size());
623 }
624 
FromUTF8WithLatin1Fallback(const LChar * string,size_t size)625 String String::FromUTF8WithLatin1Fallback(const LChar* string, size_t size) {
626   String utf8 = FromUTF8(string, size);
627   if (!utf8)
628     return String(string, SafeCast<wtf_size_t>(size));
629   return utf8;
630 }
631 
operator <<(std::ostream & out,const String & string)632 std::ostream& operator<<(std::ostream& out, const String& string) {
633   return out << string.EncodeForDebugging().Utf8();
634 }
635 
636 #ifndef NDEBUG
Show() const637 void String::Show() const {
638   DLOG(INFO) << *this;
639 }
640 #endif
641 
642 }  // namespace WTF
643