// Copyright 2017 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_OBJECTS_STRING_H_
#define V8_OBJECTS_STRING_H_

#include <memory>

#include "src/base/bits.h"
#include "src/base/export-template.h"
#include "src/base/strings.h"
#include "src/common/globals.h"
#include "src/objects/instance-type.h"
#include "src/objects/name.h"
#include "src/objects/smi.h"
#include "src/strings/unicode-decoder.h"

// Has to be the last include (doesn't have include guards):
#include "src/objects/object-macros.h"

namespace v8 {
namespace internal {

class SharedStringAccessGuardIfNeeded;

enum InstanceType : uint16_t;

// Type of the |allow_nulls| parameter of String::ToCString().
// NOTE(review): presumably selects whether embedded NUL characters are kept
// in the produced C string -- confirm against the ToCString implementation.
enum AllowNullsFlag { ALLOW_NULLS, DISALLOW_NULLS };
// Type of the |robustness_flag| parameter of String::ToCString(). Per the
// comment on ToCString, ROBUST_STRING_TRAVERSAL tolerates unexpected data
// without assert failures and does not allocate on the heap.
enum RobustnessFlag { ROBUST_STRING_TRAVERSAL, FAST_STRING_TRAVERSAL };

// The characteristics of a string are stored in its map.  Retrieving these
// few bits of information is moderately expensive, involving two memory
// loads where the second is dependent on the first.  To improve efficiency
// the shape of the string is given its own class so that it can be retrieved
// once and used for several string operations.  A StringShape is small enough
// to be passed by value and is immutable, but be aware that flattening a
// string can potentially alter its shape.  Also be aware that a GC caused by
// something else can alter the shape of a string due to ConsString
// shortcutting.  Keeping these restrictions in mind has proven to be error-
// prone and so we no longer put StringShapes in variables unless there is a
// concrete performance benefit at that particular point in the code.
class StringShape {
 public:
  // A shape can be built from a String (optionally with an explicit
  // PtrComprCageBase), from a Map, or directly from an InstanceType.
  inline explicit StringShape(const String s);
  inline explicit StringShape(const String s, PtrComprCageBase cage_base);
  inline explicit StringShape(Map s);
  inline explicit StringShape(InstanceType t);
  inline bool IsSequential() const;
  inline bool IsExternal() const;
  inline bool IsCons() const;
  inline bool IsSliced() const;
  inline bool IsThin() const;
  inline bool IsIndirect() const;
  inline bool IsUncachedExternal() const;
  inline bool IsExternalOneByte() const;
  inline bool IsExternalTwoByte() const;
  inline bool IsSequentialOneByte() const;
  inline bool IsSequentialTwoByte() const;
  inline bool IsInternalized() const;
  inline StringRepresentationTag representation_tag() const;
  inline uint32_t encoding_tag() const;
  inline uint32_t full_representation_tag() const;
  // Debug-only validity tracking: type() and valid() exist only in DEBUG
  // builds; invalidate() clears valid_ there and is a no-op otherwise.
#ifdef DEBUG
  inline uint32_t type() const { return type_; }
  inline void invalidate() { valid_ = false; }
  inline bool valid() const { return valid_; }
#else
  inline void invalidate() {}
#endif

  // Run different behavior for each concrete string class type, as defined by
  // the dispatcher.
  template <typename TDispatcher, typename TResult, typename... TArgs>
  inline TResult DispatchToSpecificTypeWithoutCast(TArgs&&... args);
  template <typename TDispatcher, typename TResult, typename... TArgs>
  inline TResult DispatchToSpecificType(String str, TArgs&&... args);

 private:
  uint32_t type_;
  // valid_ only exists in DEBUG builds; set_valid() is a no-op otherwise.
#ifdef DEBUG
  inline void set_valid() { valid_ = true; }
  bool valid_;
#else
  inline void set_valid() {}
#endif
};

#include "torque-generated/src/objects/string-tq.inc"

// The String abstract class captures JavaScript string values:
//
// Ecma-262:
//  4.3.16 String Value
//    A string value is a member of the type String and is a finite
//    ordered sequence of zero or more 16-bit unsigned integer values.
//
// All string values have a length field.
class String : public TorqueGeneratedString<String, Name> {
 public:
  enum Encoding { ONE_BYTE_ENCODING, TWO_BYTE_ENCODING };

  // Representation of the flat content of a String.
  // A non-flat string doesn't have flat content.
  // A flat string has content that's encoded as a sequence of either
  // one-byte chars or two-byte UC16.
  // Returned by String::GetFlatContent().
  // Not safe to use from concurrent background threads.
  // TODO(solanes): Move FlatContent into FlatStringReader, and make it private.
  // This would de-duplicate code, as well as taking advantage of the fact that
  // FlatStringReader is relocatable.
  class FlatContent {
   public:
    // Returns true if the string is flat and this structure contains content.
    bool IsFlat() const { return state_ != NON_FLAT; }
    // Returns true if the structure contains one-byte content.
    bool IsOneByte() const { return state_ == ONE_BYTE; }
    // Returns true if the structure contains two-byte content.
    bool IsTwoByte() const { return state_ == TWO_BYTE; }

    // Return the one byte content of the string. Only use if IsOneByte()
    // returns true.
ToOneByteVector()123 base::Vector<const uint8_t> ToOneByteVector() const { 124 DCHECK_EQ(ONE_BYTE, state_); 125 return base::Vector<const uint8_t>(onebyte_start, length_); 126 } 127 // Return the two-byte content of the string. Only use if IsTwoByte() 128 // returns true. ToUC16Vector()129 base::Vector<const base::uc16> ToUC16Vector() const { 130 DCHECK_EQ(TWO_BYTE, state_); 131 return base::Vector<const base::uc16>(twobyte_start, length_); 132 } 133 Get(int i)134 base::uc16 Get(int i) const { 135 DCHECK(i < length_); 136 DCHECK(state_ != NON_FLAT); 137 if (state_ == ONE_BYTE) return onebyte_start[i]; 138 return twobyte_start[i]; 139 } 140 UsesSameString(const FlatContent & other)141 bool UsesSameString(const FlatContent& other) const { 142 return onebyte_start == other.onebyte_start; 143 } 144 145 private: 146 enum State { NON_FLAT, ONE_BYTE, TWO_BYTE }; 147 148 // Constructors only used by String::GetFlatContent(). FlatContent(const uint8_t * start,int length,const DisallowGarbageCollection & no_gc)149 FlatContent(const uint8_t* start, int length, 150 const DisallowGarbageCollection& no_gc) 151 : onebyte_start(start), 152 length_(length), 153 state_(ONE_BYTE), 154 no_gc_(no_gc) {} FlatContent(const base::uc16 * start,int length,const DisallowGarbageCollection & no_gc)155 FlatContent(const base::uc16* start, int length, 156 const DisallowGarbageCollection& no_gc) 157 : twobyte_start(start), 158 length_(length), 159 state_(TWO_BYTE), 160 no_gc_(no_gc) {} FlatContent(const DisallowGarbageCollection & no_gc)161 explicit FlatContent(const DisallowGarbageCollection& no_gc) 162 : onebyte_start(nullptr), length_(0), state_(NON_FLAT), no_gc_(no_gc) {} 163 164 union { 165 const uint8_t* onebyte_start; 166 const base::uc16* twobyte_start; 167 }; 168 int length_; 169 State state_; 170 const DisallowGarbageCollection& no_gc_; 171 172 friend class String; 173 friend class IterableSubString; 174 }; 175 176 template <typename IsolateT> 177 void MakeThin(IsolateT* isolate, String 
canonical); 178 179 template <typename Char> 180 V8_INLINE base::Vector<const Char> GetCharVector( 181 const DisallowGarbageCollection& no_gc); 182 183 // Get chars from sequential or external strings. May only be called when a 184 // SharedStringAccessGuard is not needed (i.e. on the main thread or on 185 // read-only strings). 186 template <typename Char> 187 inline const Char* GetChars(PtrComprCageBase cage_base, 188 const DisallowGarbageCollection& no_gc) const; 189 190 // Get chars from sequential or external strings. 191 template <typename Char> 192 inline const Char* GetChars( 193 PtrComprCageBase cage_base, const DisallowGarbageCollection& no_gc, 194 const SharedStringAccessGuardIfNeeded& access_guard) const; 195 196 // Returns the address of the character at an offset into this string. 197 // Requires: this->IsFlat() 198 const byte* AddressOfCharacterAt(int start_index, 199 const DisallowGarbageCollection& no_gc); 200 201 // Forward declare the non-atomic (set_)length defined in torque. 202 using TorqueGeneratedString::length; 203 using TorqueGeneratedString::set_length; 204 DECL_RELEASE_ACQUIRE_INT_ACCESSORS(length) 205 206 // Returns whether this string has only one-byte chars, i.e. all of them can 207 // be one-byte encoded. This might be the case even if the string is 208 // two-byte. Such strings may appear when the embedder prefers 209 // two-byte external representations even for one-byte data. 210 DECL_GETTER(IsOneByteRepresentation, bool) 211 DECL_GETTER(IsTwoByteRepresentation, bool) 212 213 // Cons and slices have an encoding flag that may not represent the actual 214 // encoding of the underlying string. This is taken into account here. 215 // This function is static because that helps it get inlined. 216 // Requires: string.IsFlat() 217 static inline bool IsOneByteRepresentationUnderneath(String string); 218 219 // Get and set individual two byte chars in the string. 
inline void Set(int index, uint16_t value);
  // Gets an individual two-byte char from the string.  Repeated calls
  // to this method are not efficient unless the string is flat.
  // If it is called from a background thread, the LocalIsolate version should
  // be used.
  V8_INLINE uint16_t Get(int index) const;
  V8_INLINE uint16_t Get(int index, Isolate* isolate) const;
  V8_INLINE uint16_t Get(int index, LocalIsolate* local_isolate) const;
  // Method to pass down the access_guard. Useful for recursive calls such as
  // ThinStrings where we go String::Get into ThinString::Get into String::Get
  // again for the internalized string.
  V8_INLINE uint16_t
  Get(int index, PtrComprCageBase cage_base,
      const SharedStringAccessGuardIfNeeded& access_guard) const;

  // ES6 section 7.1.3.1 ToNumber Applied to the String Type
  static Handle<Object> ToNumber(Isolate* isolate, Handle<String> subject);

  // Flattens the string.  Checks first inline to see if it is
  // necessary.  Does nothing if the string is not a cons string.
  // Flattening allocates a sequential string with the same data as
  // the given string and mutates the cons string to a degenerate
  // form, where the first component is the new sequential string and
  // the second component is the empty string.  If allocation fails,
  // this function returns a failure.  If flattening succeeds, this
  // function returns the sequential string that is now the first
  // component of the cons string.
  // NOTE(review): the Flatten() overloads declared below return a plain
  // Handle<String>, so the "returns a failure" sentence may be stale --
  // confirm against the implementation.
  //
  // Degenerate cons strings are handled specially by the garbage
  // collector (see IsShortcutCandidate).
static inline Handle<String> Flatten(
      Isolate* isolate, Handle<String> string,
      AllocationType allocation = AllocationType::kYoung);
  // Same as above, taking a LocalIsolate.
  static inline Handle<String> Flatten(
      LocalIsolate* isolate, Handle<String> string,
      AllocationType allocation = AllocationType::kYoung);

  // Tries to return the content of a flat string as a structure holding either
  // a flat vector of char or of base::uc16.
  // If the string isn't flat, and therefore doesn't have flat content, the
  // returned structure will report so, and can't provide a vector of either
  // kind.
  V8_EXPORT_PRIVATE FlatContent
  GetFlatContent(const DisallowGarbageCollection& no_gc);

  // Returns the parent of a sliced string or first part of a flat cons string.
  // Requires: StringShape(this).IsIndirect() && this->IsFlat()
  inline String GetUnderlying() const;

  // String relational comparison, implemented according to ES6 section 7.2.11
  // Abstract Relational Comparison (step 5): The comparison of Strings uses a
  // simple lexicographic ordering on sequences of code unit values. There is no
  // attempt to use the more complex, semantically oriented definitions of
  // character or string equality and collating order defined in the Unicode
  // specification. Therefore String values that are canonically equal according
  // to the Unicode standard could test as unequal. In effect this algorithm
  // assumes that both Strings are already in normalized form. Also, note that
  // for strings containing supplementary characters, lexicographic ordering on
  // sequences of UTF-16 code unit values differs from that on sequences of code
  // point values.
  V8_WARN_UNUSED_RESULT static ComparisonResult Compare(Isolate* isolate,
                                                        Handle<String> x,
                                                        Handle<String> y);

  // Perform ES6 21.1.3.8, including checking arguments.
static Object IndexOf(Isolate* isolate, Handle<Object> receiver,
                        Handle<Object> search, Handle<Object> position);
  // Perform string match of pattern (|search|) on subject (|receiver|),
  // starting at start index.
  // Caller must ensure that 0 <= start_index <= receiver->length(), as this
  // does not check any arguments.
  static int IndexOf(Isolate* isolate, Handle<String> receiver,
                     Handle<String> search, int start_index);

  static Object LastIndexOf(Isolate* isolate, Handle<Object> receiver,
                            Handle<Object> search, Handle<Object> position);

  // Encapsulates logic related to a match and its capture groups as required
  // by GetSubstitution.
  class Match {
   public:
    virtual Handle<String> GetMatch() = 0;
    virtual Handle<String> GetPrefix() = 0;
    virtual Handle<String> GetSuffix() = 0;

    // A named capture can be unmatched (either not specified in the pattern,
    // or specified but unmatched in the current string), or matched.
    enum CaptureState { UNMATCHED, MATCHED };

    virtual int CaptureCount() = 0;
    virtual bool HasNamedCaptures() = 0;
    virtual MaybeHandle<String> GetCapture(int i, bool* capture_exists) = 0;
    // Reports through |state| whether the named capture was matched.
    virtual MaybeHandle<String> GetNamedCapture(Handle<String> name,
                                                CaptureState* state) = 0;

    virtual ~Match() = default;
  };

  // ES#sec-getsubstitution
  // GetSubstitution(matched, str, position, captures, replacement)
  // Expand the $-expressions in the string and return a new string with
  // the result.
  // A {start_index} can be passed to specify where to start scanning the
  // replacement string.
  V8_WARN_UNUSED_RESULT static MaybeHandle<String> GetSubstitution(
      Isolate* isolate, Match* match, Handle<String> replacement,
      int start_index = 0);

  // String equality operations.
inline bool Equals(String other) const;
  inline static bool Equals(Isolate* isolate, Handle<String> one,
                            Handle<String> two);

  // Comparison mode for IsEqualTo(): whole string or prefix only.
  // NOTE(review): kNoLengthCheck presumably skips the length comparison --
  // confirm against IsEqualToImpl.
  enum class EqualityType { kWholeString, kPrefix, kNoLengthCheck };

  // Check if this string matches the given vector of characters, either as a
  // whole string or just a prefix.
  //
  // The Isolate is passed as "evidence" that this call is on the main thread,
  // and to distinguish from the LocalIsolate overload.
  template <EqualityType kEqType = EqualityType::kWholeString, typename Char>
  inline bool IsEqualTo(base::Vector<const Char> str, Isolate* isolate) const;

  // Check if this string matches the given vector of characters, either as a
  // whole string or just a prefix.
  //
  // This is main-thread only, like the Isolate* overload, but additionally
  // computes the PtrComprCageBase for IsEqualToImpl.
  template <EqualityType kEqType = EqualityType::kWholeString, typename Char>
  inline bool IsEqualTo(base::Vector<const Char> str) const;

  // Check if this string matches the given vector of characters, either as a
  // whole string or just a prefix.
  //
  // The LocalIsolate is passed to provide access to the string access lock,
  // which is taken when reading the string's contents on a background thread.
  template <EqualityType kEqType = EqualityType::kWholeString, typename Char>
  inline bool IsEqualTo(base::Vector<const Char> str,
                        LocalIsolate* isolate) const;

  V8_EXPORT_PRIVATE bool HasOneBytePrefix(base::Vector<const char> str);
  V8_EXPORT_PRIVATE inline bool IsOneByteEqualTo(base::Vector<const char> str);

  // Return a UTF8 representation of the string.  The string is null
  // terminated but may optionally contain nulls.  Length is returned
  // in length_output if length_output is not a null pointer.  The string
  // should be nearly flat, otherwise the performance of this method may
  // be very slow (quadratic in the length).  Setting robustness_flag to
  // ROBUST_STRING_TRAVERSAL invokes behaviour that is robust.  This means it
  // handles unexpected data without causing assert failures and it does not
  // do any heap allocations.  This is useful when printing stack traces.
  std::unique_ptr<char[]> ToCString(AllowNullsFlag allow_nulls,
                                    RobustnessFlag robustness_flag, int offset,
                                    int length, int* length_output = nullptr);
  // Overload without |offset| and |length|.
  V8_EXPORT_PRIVATE std::unique_ptr<char[]> ToCString(
      AllowNullsFlag allow_nulls = DISALLOW_NULLS,
      RobustnessFlag robustness_flag = FAST_STRING_TRAVERSAL,
      int* length_output = nullptr);

  // Externalization.
  V8_EXPORT_PRIVATE bool MakeExternal(
      v8::String::ExternalStringResource* resource);
  V8_EXPORT_PRIVATE bool MakeExternal(
      v8::String::ExternalOneByteStringResource* resource);
  bool SupportsExternalization();

  // Conversion.
  // "array index": an index allowed by the ES spec for JSArrays.
  inline bool AsArrayIndex(uint32_t* index);

  // This is used for calculating array indices but differs from an
  // Array Index in the regard that this does not support the full
  // array index range. This only supports positive numbers less than
  // or equal to INT_MAX.
  //
  // String::AsArrayIndex might be a better fit if you're looking to
  // calculate the array index.
  //
  // if val < 0 or val > INT_MAX, returns -1
  // if 0 <= val <= INT_MAX, returns val
  static int32_t ToArrayIndex(Address addr);

  // "integer index": the string is the decimal representation of an
  // integer in the range of a size_t. Useful for TypedArray accesses.
  inline bool AsIntegerIndex(size_t* index);

  // Trimming.
enum TrimMode { kTrim, kTrimStart, kTrimEnd };

  V8_EXPORT_PRIVATE void PrintOn(FILE* out);
  V8_EXPORT_PRIVATE void PrintOn(std::ostream& out);

  // For use during stack traces.  Performs rudimentary sanity check.
  bool LooksValid();

  // Printing utility functions.
  // - PrintUC16 prints the raw string contents to the given stream.
  //   Non-printable characters are formatted as hex, but otherwise the string
  //   is printed as-is.
  // - StringShortPrint and StringPrint have extra formatting: they add a
  //   prefix and suffix depending on the string kind, may add other information
  //   such as the string heap object address, may truncate long strings, etc.
  const char* PrefixForDebugPrint() const;
  const char* SuffixForDebugPrint() const;
  void StringShortPrint(StringStream* accumulator);
  void PrintUC16(std::ostream& os, int start = 0, int end = -1);
  void PrintUC16(StringStream* accumulator, int start, int end);

  // Dispatched behavior.
  // ToAsciiArray() is only declared in DEBUG or OBJECT_PRINT builds.
#if defined(DEBUG) || defined(OBJECT_PRINT)
  char* ToAsciiArray();
#endif
  DECL_PRINTER(String)
  DECL_VERIFIER(String)

  inline bool IsFlat() const;
  inline bool IsFlat(PtrComprCageBase cage_base) const;

  // Max char codes. The *U constants hold the same values as unsigned types.
  static const int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar;
  static const uint32_t kMaxOneByteCharCodeU = unibrow::Latin1::kMaxChar;
  static const int kMaxUtf16CodeUnit = 0xffff;
  static const uint32_t kMaxUtf16CodeUnitU = kMaxUtf16CodeUnit;
  static const base::uc32 kMaxCodePoint = 0x10ffff;

  // Maximal string length.
  // The max length is different on 32 and 64 bit platforms. Max length for
  // 32-bit platforms is ~268.4M chars. On 64-bit platforms, max length is
  // ~536.8M chars.
  // See include/v8.h for the definition.
static const int kMaxLength = v8::String::kMaxLength;
  // There are several defining limits imposed by our current implementation:
  // - any string's length must fit into a Smi.
  static_assert(kMaxLength <= kSmiMaxValue,
                "String length must fit into a Smi");
  // - adding two string lengths must still fit into a 32-bit int without
  //   overflow
  static_assert(kMaxLength * 2 <= kMaxInt,
                "String::kMaxLength * 2 must fit into an int32");
  // - any heap object's size in bytes must be able to fit into a Smi, because
  //   its space on the heap might be filled with a Filler; for strings this
  //   means SeqTwoByteString::kMaxSize must be able to fit into a Smi.
  //   (kMaxLength * 2 is the character payload of the longest two-byte
  //   string, i.e. two bytes per character.)
  static_assert(kMaxLength * 2 + kHeaderSize <= kSmiMaxValue,
                "String object size in bytes must fit into a Smi");
  // - any heap object's size in bytes must be able to fit into an int, because
  //   that's what our object handling code uses almost everywhere.
  static_assert(kMaxLength * 2 + kHeaderSize <= kMaxInt,
                "String object size in bytes must fit into an int");

  // Max length for computing hash. For strings longer than this limit the
  // string length is used as the hash value.
  static const int kMaxHashCalcLength = 16383;

  // Limit for truncation in short printing.
  static const int kMaxShortPrintLength = 1024;

  // Helper function for flattening strings.
477 template <typename sinkchar> 478 EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE) 479 static void WriteToFlat(String source, sinkchar* sink, int from, int to); 480 template <typename sinkchar> 481 static void WriteToFlat(String source, sinkchar* sink, int from, int to, 482 PtrComprCageBase cage_base, 483 const SharedStringAccessGuardIfNeeded&); 484 IsAscii(const char * chars,int length)485 static inline bool IsAscii(const char* chars, int length) { 486 return IsAscii(reinterpret_cast<const uint8_t*>(chars), length); 487 } 488 IsAscii(const uint8_t * chars,int length)489 static inline bool IsAscii(const uint8_t* chars, int length) { 490 return NonAsciiStart(chars, length) >= length; 491 } 492 NonOneByteStart(const base::uc16 * chars,int length)493 static inline int NonOneByteStart(const base::uc16* chars, int length) { 494 DCHECK(IsAligned(reinterpret_cast<Address>(chars), sizeof(base::uc16))); 495 const uint16_t* start = chars; 496 const uint16_t* limit = chars + length; 497 498 if (static_cast<size_t>(length) >= kUIntptrSize) { 499 // Check unaligned chars. 500 while (!IsAligned(reinterpret_cast<Address>(chars), kUIntptrSize)) { 501 if (*chars > unibrow::Latin1::kMaxChar) { 502 return static_cast<int>(chars - start); 503 } 504 ++chars; 505 } 506 507 // Check aligned words. 508 STATIC_ASSERT(unibrow::Latin1::kMaxChar == 0xFF); 509 #ifdef V8_TARGET_LITTLE_ENDIAN 510 const uintptr_t non_one_byte_mask = kUintptrAllBitsSet / 0xFFFF * 0xFF00; 511 #else 512 const uintptr_t non_one_byte_mask = kUintptrAllBitsSet / 0xFFFF * 0x00FF; 513 #endif 514 while (chars + sizeof(uintptr_t) <= limit) { 515 if (*reinterpret_cast<const uintptr_t*>(chars) & non_one_byte_mask) { 516 break; 517 } 518 chars += (sizeof(uintptr_t) / sizeof(base::uc16)); 519 } 520 } 521 522 // Check remaining unaligned chars, or find non-one-byte char in word. 
523 while (chars < limit) { 524 if (*chars > unibrow::Latin1::kMaxChar) { 525 return static_cast<int>(chars - start); 526 } 527 ++chars; 528 } 529 530 return static_cast<int>(chars - start); 531 } 532 IsOneByte(const base::uc16 * chars,int length)533 static inline bool IsOneByte(const base::uc16* chars, int length) { 534 return NonOneByteStart(chars, length) >= length; 535 } 536 537 // May only be called when a SharedStringAccessGuard is not needed (i.e. on 538 // the main thread or on read-only strings). 539 template <class Visitor> 540 static inline ConsString VisitFlat(Visitor* visitor, String string, 541 int offset = 0); 542 543 template <class Visitor> 544 static inline ConsString VisitFlat( 545 Visitor* visitor, String string, int offset, 546 const SharedStringAccessGuardIfNeeded& access_guard); 547 548 template <typename IsolateT> 549 static Handle<FixedArray> CalculateLineEnds(IsolateT* isolate, 550 Handle<String> string, 551 bool include_ending_line); 552 553 private: 554 friend class Name; 555 friend class StringTableInsertionKey; 556 friend class InternalizedStringKey; 557 558 // Implementation of the Get() public methods. Do not use directly. 559 V8_INLINE uint16_t 560 GetImpl(int index, PtrComprCageBase cage_base, 561 const SharedStringAccessGuardIfNeeded& access_guard) const; 562 563 // Implementation of the IsEqualTo() public methods. Do not use directly. 564 template <EqualityType kEqType, typename Char> 565 V8_INLINE bool IsEqualToImpl( 566 base::Vector<const Char> str, PtrComprCageBase cage_base, 567 const SharedStringAccessGuardIfNeeded& access_guard) const; 568 569 // Out-of-line IsEqualToImpl for ConsString. 
template <typename Char>
  V8_NOINLINE static bool IsConsStringEqualToImpl(
      ConsString string, int slice_offset, base::Vector<const Char> str,
      PtrComprCageBase cage_base,
      const SharedStringAccessGuardIfNeeded& access_guard);

  // NOTE(review): presumably the out-of-line slow path of Flatten() for cons
  // strings -- confirm against the definition.
  V8_EXPORT_PRIVATE static Handle<String> SlowFlatten(
      Isolate* isolate, Handle<ConsString> cons, AllocationType allocation);

  // Slow case of String::Equals.  This implementation works on any strings
  // but it is most efficient on strings that are almost flat.
  V8_EXPORT_PRIVATE bool SlowEquals(String other) const;
  V8_EXPORT_PRIVATE bool SlowEquals(
      String other, const SharedStringAccessGuardIfNeeded&) const;

  V8_EXPORT_PRIVATE static bool SlowEquals(Isolate* isolate, Handle<String> one,
                                           Handle<String> two);

  // Slow case of AsArrayIndex.
  V8_EXPORT_PRIVATE bool SlowAsArrayIndex(uint32_t* index);
  // NOTE(review): by analogy, presumably the slow case of AsIntegerIndex.
  V8_EXPORT_PRIVATE bool SlowAsIntegerIndex(size_t* index);

  // Compute and set the hash code.
V8_EXPORT_PRIVATE uint32_t ComputeAndSetHash();
  V8_EXPORT_PRIVATE uint32_t
  ComputeAndSetHash(const SharedStringAccessGuardIfNeeded&);

  TQ_OBJECT_CONSTRUCTORS(String)
};

// Explicit instantiation declarations of String::WriteToFlat for uint8_t and
// uint16_t sinks, with and without the cage base / access guard parameters.
// clang-format off
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
void String::WriteToFlat(String source, uint8_t* sink, int from, int to);
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
void String::WriteToFlat(String source, uint16_t* sink, int from, int to);
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
void String::WriteToFlat(String source, uint8_t* sink, int from, int to,
                         PtrComprCageBase cage_base,
                         const SharedStringAccessGuardIfNeeded&);
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
void String::WriteToFlat(String source, uint16_t* sink, int from, int to,
                         PtrComprCageBase cage_base,
                         const SharedStringAccessGuardIfNeeded&);
// clang-format on

// Exposes begin()/end() iterators over a string starting at |first|.
// NOTE(review): the default length of -1 presumably means "up to the end of
// the string" -- confirm against the inline constructor.
class SubStringRange {
 public:
  inline SubStringRange(String string, const DisallowGarbageCollection& no_gc,
                        int first = 0, int length = -1);
  class iterator;
  inline iterator begin();
  inline iterator end();

 private:
  String string_;
  int first_;
  int length_;
  // Reference to the DisallowGarbageCollection scope passed at construction.
  const DisallowGarbageCollection& no_gc_;
};

// The SeqString abstract class captures sequential string values.
class SeqString : public TorqueGeneratedSeqString<SeqString, String> {
 public:
  // Truncate the string in-place if possible and return the result.
  // In case of new_length == 0, the empty string is returned without
  // truncating the original string.
V8_WARN_UNUSED_RESULT static Handle<String> Truncate(Handle<SeqString> string,
                                                       int new_length);

  TQ_OBJECT_CONSTRUCTORS(SeqString)
};

class InternalizedString
    : public TorqueGeneratedInternalizedString<InternalizedString, String> {
 public:
  // TODO(neis): Possibly move some stuff from String here.

  TQ_OBJECT_CONSTRUCTORS(InternalizedString)
};

// The OneByteString class captures sequential one-byte string objects.
// Each character in the OneByteString is a one-byte character.
class SeqOneByteString
    : public TorqueGeneratedSeqOneByteString<SeqOneByteString, SeqString> {
 public:
  static const bool kHasOneByteEncoding = true;
  using Char = uint8_t;

  // Dispatched behavior. The non SharedStringAccessGuardIfNeeded method is also
  // defined for convenience and it will check that the access guard is not
  // needed.
  inline uint8_t Get(int index) const;
  inline uint8_t Get(int index, PtrComprCageBase cage_base,
                     const SharedStringAccessGuardIfNeeded& access_guard) const;
  inline void SeqOneByteStringSet(int index, uint16_t value);

  // Get the address of the characters in this string.
  inline Address GetCharsAddress() const;

  // Get a pointer to the characters of the string. May only be called when a
  // SharedStringAccessGuard is not needed (i.e. on the main thread or on
  // read-only strings).
  inline uint8_t* GetChars(const DisallowGarbageCollection& no_gc) const;

  // Get a pointer to the characters of the string.
  inline uint8_t* GetChars(
      const DisallowGarbageCollection& no_gc,
      const SharedStringAccessGuardIfNeeded& access_guard) const;

  // Clear uninitialized padding space. This ensures that the snapshot content
  // is deterministic.
  void clear_padding();

  // Garbage collection support.  This method is called by the
  // garbage collector to compute the actual size of a OneByteString
  // instance.
  inline int SeqOneByteStringSize(InstanceType instance_type);

  // Maximal memory usage for a single sequential one-byte string.
  // One byte per character, so the payload limit equals kMaxLength; kMaxSize
  // adds the header and rounds up via OBJECT_POINTER_ALIGN.
  static const int kMaxCharsSize = kMaxLength;
  static const int kMaxSize = OBJECT_POINTER_ALIGN(kMaxCharsSize + kHeaderSize);
  STATIC_ASSERT((kMaxSize - kHeaderSize) >= String::kMaxLength);

  int AllocatedSize();

  class BodyDescriptor;

  TQ_OBJECT_CONSTRUCTORS(SeqOneByteString)
};

// The TwoByteString class captures sequential unicode string objects.
// Each character in the TwoByteString is a two-byte uint16_t.
class SeqTwoByteString
    : public TorqueGeneratedSeqTwoByteString<SeqTwoByteString, SeqString> {
 public:
  static const bool kHasOneByteEncoding = false;
  using Char = uint16_t;

  // Dispatched behavior.
  inline uint16_t Get(
      int index, PtrComprCageBase cage_base,
      const SharedStringAccessGuardIfNeeded& access_guard) const;
  inline void SeqTwoByteStringSet(int index, uint16_t value);

  // Get the address of the characters in this string.
  inline Address GetCharsAddress() const;

  // Get a pointer to the characters of the string. May only be called when a
  // SharedStringAccessGuard is not needed (i.e. on the main thread or on
  // read-only strings).
  inline base::uc16* GetChars(const DisallowGarbageCollection& no_gc) const;

  // Get a pointer to the characters of the string.
  inline base::uc16* GetChars(
      const DisallowGarbageCollection& no_gc,
      const SharedStringAccessGuardIfNeeded& access_guard) const;

  // Clear uninitialized padding space. This ensures that the snapshot content
  // is deterministic.
  void clear_padding();

  // Garbage collection support.  This method is called by the
  // garbage collector to compute the actual size of a TwoByteString
  // instance.
inline int SeqTwoByteStringSize(InstanceType instance_type);

  // Maximal memory usage for a single sequential two-byte string.
  // Two bytes per character, so the payload limit is kMaxLength * 2; kMaxSize
  // adds the header and rounds up via OBJECT_POINTER_ALIGN.
  static const int kMaxCharsSize = kMaxLength * 2;
  static const int kMaxSize = OBJECT_POINTER_ALIGN(kMaxCharsSize + kHeaderSize);
  STATIC_ASSERT(static_cast<int>((kMaxSize - kHeaderSize) / sizeof(uint16_t)) >=
                String::kMaxLength);

  int AllocatedSize();

  class BodyDescriptor;

  TQ_OBJECT_CONSTRUCTORS(SeqTwoByteString)
};

// The ConsString class describes string values built by using the
// addition operator on strings.  A ConsString is a pair where the
// first and second components are pointers to other string values.
// One or both components of a ConsString can be pointers to other
// ConsStrings, creating a binary tree of ConsStrings where the leaves
// are non-ConsString string values.  The string value represented by
// a ConsString can be obtained by concatenating the leaf string
// values in a left-to-right depth-first traversal of the tree.
class ConsString : public TorqueGeneratedConsString<ConsString, String> {
 public:
  // Doesn't check that the result is a string, even in debug mode.  This is
  // useful during GC where the mark bits confuse the checks.
  inline Object unchecked_first();

  // Doesn't check that the result is a string, even in debug mode.  This is
  // useful during GC where the mark bits confuse the checks.
  inline Object unchecked_second();

  // Dispatched behavior.
  V8_EXPORT_PRIVATE uint16_t
  Get(int index, PtrComprCageBase cage_base,
      const SharedStringAccessGuardIfNeeded& access_guard) const;

  // Minimum length for a cons string.
773 static const int kMinLength = 13; 774 775 class BodyDescriptor; 776 777 DECL_VERIFIER(ConsString) 778 779 TQ_OBJECT_CONSTRUCTORS(ConsString) 780 }; 781 782 // The ThinString class describes string objects that are just references 783 // to another string object. They are used for in-place internalization when 784 // the original string cannot actually be internalized in-place: in these 785 // cases, the original string is converted to a ThinString pointing at its 786 // internalized version (which is allocated as a new object). 787 // In terms of memory layout and most algorithms operating on strings, 788 // ThinStrings can be thought of as "one-part cons strings". 789 class ThinString : public TorqueGeneratedThinString<ThinString, String> { 790 public: 791 DECL_GETTER(unchecked_actual, HeapObject) 792 793 V8_EXPORT_PRIVATE uint16_t 794 Get(int index, PtrComprCageBase cage_base, 795 const SharedStringAccessGuardIfNeeded& access_guard) const; 796 797 DECL_VERIFIER(ThinString) 798 799 class BodyDescriptor; 800 801 TQ_OBJECT_CONSTRUCTORS(ThinString) 802 }; 803 804 // The Sliced String class describes strings that are substrings of another 805 // sequential string. The motivation is to save time and memory when creating 806 // a substring. A Sliced String is described as a pointer to the parent, 807 // the offset from the start of the parent string and the length. Using 808 // a Sliced String therefore requires unpacking of the parent string and 809 // adding the offset to the start address. A substring of a Sliced String 810 // are not nested since the double indirection is simplified when creating 811 // such a substring. 812 // Currently missing features are: 813 // - truncating sliced string to enable otherwise unneeded parent to be GC'ed. 814 class SlicedString : public TorqueGeneratedSlicedString<SlicedString, String> { 815 public: 816 inline void set_parent(String parent, 817 WriteBarrierMode mode = UPDATE_WRITE_BARRIER); 818 // Dispatched behavior. 
819 V8_EXPORT_PRIVATE uint16_t 820 Get(int index, PtrComprCageBase cage_base, 821 const SharedStringAccessGuardIfNeeded& access_guard) const; 822 823 // Minimum length for a sliced string. 824 static const int kMinLength = 13; 825 826 class BodyDescriptor; 827 828 DECL_VERIFIER(SlicedString) 829 830 TQ_OBJECT_CONSTRUCTORS(SlicedString) 831 }; 832 833 // The ExternalString class describes string values that are backed by 834 // a string resource that lies outside the V8 heap. ExternalStrings 835 // consist of the length field common to all strings, a pointer to the 836 // external resource. It is important to ensure (externally) that the 837 // resource is not deallocated while the ExternalString is live in the 838 // V8 heap. 839 // 840 // The API expects that all ExternalStrings are created through the 841 // API. Therefore, ExternalStrings should not be used internally. 842 class ExternalString 843 : public TorqueGeneratedExternalString<ExternalString, String> { 844 public: 845 DECL_VERIFIER(ExternalString) 846 847 // Size of uncached external strings. 848 static const int kUncachedSize = 849 kResourceOffset + FIELD_SIZE(kResourceOffset); 850 851 inline void AllocateExternalPointerEntries(Isolate* isolate); 852 853 // Return whether the external string data pointer is not cached. 854 inline bool is_uncached() const; 855 // Size in bytes of the external payload. 856 int ExternalPayloadSize() const; 857 858 // Used in the serializer/deserializer. 859 DECL_GETTER(resource_as_address, Address) 860 inline void set_address_as_resource(Isolate* isolate, Address address); 861 inline uint32_t GetResourceRefForDeserialization(); 862 inline void SetResourceRefForSerialization(uint32_t ref); 863 864 // Disposes string's resource object if it has not already been disposed. 
865 inline void DisposeResource(Isolate* isolate); 866 867 STATIC_ASSERT(kResourceOffset == Internals::kStringResourceOffset); 868 static const int kSizeOfAllExternalStrings = kHeaderSize; 869 870 private: 871 // Hide generated accessors. 872 DECL_ACCESSORS(resource, void*) 873 DECL_ACCESSORS(resource_data, void*) 874 875 TQ_OBJECT_CONSTRUCTORS(ExternalString) 876 }; 877 878 // The ExternalOneByteString class is an external string backed by an 879 // one-byte string. 880 class ExternalOneByteString 881 : public TorqueGeneratedExternalOneByteString<ExternalOneByteString, 882 ExternalString> { 883 public: 884 static const bool kHasOneByteEncoding = true; 885 886 using Resource = v8::String::ExternalOneByteStringResource; 887 888 // The underlying resource. 889 DECL_GETTER(resource, const Resource*) 890 891 // It is assumed that the previous resource is null. If it is not null, then 892 // it is the responsability of the caller the handle the previous resource. 893 inline void SetResource(Isolate* isolate, const Resource* buffer); 894 895 // Used only during serialization. 896 inline void set_resource(Isolate* isolate, const Resource* buffer); 897 898 // Update the pointer cache to the external character array. 899 // The cached pointer is always valid, as the external character array does = 900 // not move during lifetime. Deserialization is the only exception, after 901 // which the pointer cache has to be refreshed. 902 inline void update_data_cache(Isolate* isolate); 903 904 inline const uint8_t* GetChars(PtrComprCageBase cage_base) const; 905 906 // Dispatched behavior. 907 inline uint8_t Get(int index, PtrComprCageBase cage_base, 908 const SharedStringAccessGuardIfNeeded& access_guard) const; 909 910 class BodyDescriptor; 911 912 STATIC_ASSERT(kSize == kSizeOfAllExternalStrings); 913 914 TQ_OBJECT_CONSTRUCTORS(ExternalOneByteString) 915 916 private: 917 // The underlying resource as a non-const pointer. 
918 DECL_GETTER(mutable_resource, Resource*) 919 }; 920 921 // The ExternalTwoByteString class is an external string backed by a UTF-16 922 // encoded string. 923 class ExternalTwoByteString 924 : public TorqueGeneratedExternalTwoByteString<ExternalTwoByteString, 925 ExternalString> { 926 public: 927 static const bool kHasOneByteEncoding = false; 928 929 using Resource = v8::String::ExternalStringResource; 930 931 // The underlying string resource. 932 DECL_GETTER(resource, const Resource*) 933 934 // It is assumed that the previous resource is null. If it is not null, then 935 // it is the responsability of the caller the handle the previous resource. 936 inline void SetResource(Isolate* isolate, const Resource* buffer); 937 938 // Used only during serialization. 939 inline void set_resource(Isolate* isolate, const Resource* buffer); 940 941 // Update the pointer cache to the external character array. 942 // The cached pointer is always valid, as the external character array does = 943 // not move during lifetime. Deserialization is the only exception, after 944 // which the pointer cache has to be refreshed. 945 inline void update_data_cache(Isolate* isolate); 946 947 inline const uint16_t* GetChars(PtrComprCageBase cage_base) const; 948 949 // Dispatched behavior. 950 inline uint16_t Get( 951 int index, PtrComprCageBase cage_base, 952 const SharedStringAccessGuardIfNeeded& access_guard) const; 953 954 // For regexp code. 955 inline const uint16_t* ExternalTwoByteStringGetData(unsigned start); 956 957 class BodyDescriptor; 958 959 STATIC_ASSERT(kSize == kSizeOfAllExternalStrings); 960 961 TQ_OBJECT_CONSTRUCTORS(ExternalTwoByteString) 962 963 private: 964 // The underlying resource as a non-const pointer. 965 DECL_GETTER(mutable_resource, Resource*) 966 }; 967 968 // A flat string reader provides random access to the contents of a 969 // string independent of the character width of the string. The handle 970 // must be valid as long as the reader is being used. 
971 // Not safe to use from concurrent background threads. 972 class V8_EXPORT_PRIVATE FlatStringReader : public Relocatable { 973 public: 974 FlatStringReader(Isolate* isolate, Handle<String> str); 975 void PostGarbageCollection() override; 976 inline base::uc32 Get(int index) const; 977 template <typename Char> 978 inline Char Get(int index) const; 979 int length() { return length_; } 980 981 private: 982 Handle<String> str_; 983 bool is_one_byte_; 984 int length_; 985 const void* start_; 986 }; 987 988 // This maintains an off-stack representation of the stack frames required 989 // to traverse a ConsString, allowing an entirely iterative and restartable 990 // traversal of the entire string 991 class ConsStringIterator { 992 public: 993 inline ConsStringIterator() = default; 994 inline explicit ConsStringIterator(ConsString cons_string, int offset = 0) { 995 Reset(cons_string, offset); 996 } 997 ConsStringIterator(const ConsStringIterator&) = delete; 998 ConsStringIterator& operator=(const ConsStringIterator&) = delete; 999 inline void Reset(ConsString cons_string, int offset = 0) { 1000 depth_ = 0; 1001 // Next will always return nullptr. 1002 if (cons_string.is_null()) return; 1003 Initialize(cons_string, offset); 1004 } 1005 // Returns nullptr when complete. 1006 inline String Next(int* offset_out) { 1007 *offset_out = 0; 1008 if (depth_ == 0) return String(); 1009 return Continue(offset_out); 1010 } 1011 1012 private: 1013 static const int kStackSize = 32; 1014 // Use a mask instead of doing modulo operations for stack wrapping. 
1015 static const int kDepthMask = kStackSize - 1; 1016 static_assert(base::bits::IsPowerOfTwo(kStackSize), 1017 "kStackSize must be power of two"); 1018 static inline int OffsetForDepth(int depth); 1019 1020 inline void PushLeft(ConsString string); 1021 inline void PushRight(ConsString string); 1022 inline void AdjustMaximumDepth(); 1023 inline void Pop(); 1024 inline bool StackBlown() { return maximum_depth_ - depth_ == kStackSize; } 1025 V8_EXPORT_PRIVATE void Initialize(ConsString cons_string, int offset); 1026 V8_EXPORT_PRIVATE String Continue(int* offset_out); 1027 String NextLeaf(bool* blew_stack); 1028 String Search(int* offset_out); 1029 1030 // Stack must always contain only frames for which right traversal 1031 // has not yet been performed. 1032 ConsString frames_[kStackSize]; 1033 ConsString root_; 1034 int depth_; 1035 int maximum_depth_; 1036 int consumed_; 1037 }; 1038 1039 class StringCharacterStream; 1040 1041 template <typename Char> 1042 struct CharTraits; 1043 1044 template <> 1045 struct CharTraits<uint8_t> { 1046 using String = SeqOneByteString; 1047 using ExternalString = ExternalOneByteString; 1048 }; 1049 1050 template <> 1051 struct CharTraits<uint16_t> { 1052 using String = SeqTwoByteString; 1053 using ExternalString = ExternalTwoByteString; 1054 }; 1055 1056 } // namespace internal 1057 } // namespace v8 1058 1059 #include "src/objects/object-macros-undef.h" 1060 1061 #endif // V8_OBJECTS_STRING_H_ 1062