1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/parsing/scanner-character-streams.h"
6 
7 #include <memory>
8 #include <vector>
9 
10 #include "include/v8-callbacks.h"
11 #include "include/v8-primitive.h"
12 #include "src/base/strings.h"
13 #include "src/common/globals.h"
14 #include "src/execution/isolate-utils.h"
15 #include "src/handles/handles.h"
16 #include "src/logging/runtime-call-stats-scope.h"
17 #include "src/objects/objects-inl.h"
18 #include "src/parsing/scanner.h"
19 #include "src/strings/unicode-inl.h"
20 
21 namespace v8 {
22 namespace internal {
23 
24 class V8_NODISCARD ScopedExternalStringLock {
25  public:
ScopedExternalStringLock(ExternalString string)26   explicit ScopedExternalStringLock(ExternalString string) {
27     DCHECK(!string.is_null());
28     if (string.IsExternalOneByteString()) {
29       resource_ = ExternalOneByteString::cast(string).resource();
30     } else {
31       DCHECK(string.IsExternalTwoByteString());
32       resource_ = ExternalTwoByteString::cast(string).resource();
33     }
34     DCHECK(resource_);
35     resource_->Lock();
36   }
37 
38   // Copying a lock increases the locking depth.
ScopedExternalStringLock(const ScopedExternalStringLock & other)39   ScopedExternalStringLock(const ScopedExternalStringLock& other) V8_NOEXCEPT
40       : resource_(other.resource_) {
41     resource_->Lock();
42   }
43 
~ScopedExternalStringLock()44   ~ScopedExternalStringLock() { resource_->Unlock(); }
45 
46  private:
47   // Not nullptr.
48   const v8::String::ExternalStringResourceBase* resource_;
49 };
50 
51 namespace {
52 const unibrow::uchar kUtf8Bom = 0xFEFF;
53 }  // namespace
54 
// A non-owning, half-open span [start, end) of Char elements. It only stores
// the two pointers; the pointed-to data must outlive any use of the range.
template <typename Char>
struct Range {
  const Char* start;
  const Char* end;

  // Number of Char elements between start and end. Callable on const ranges.
  size_t length() const { return static_cast<size_t>(end - start); }

  // True when the address of start is not a multiple of sizeof(Char). Always
  // false for single-byte Char types.
  bool unaligned_start() const {
    return reinterpret_cast<uintptr_t>(start) % sizeof(Char) != 0;
  }
};
65 
66 // A Char stream backed by an on-heap SeqOneByteString or SeqTwoByteString.
67 template <typename Char>
68 class OnHeapStream {
69  public:
70   using String = typename CharTraits<Char>::String;
71 
OnHeapStream(Handle<String> string,size_t start_offset,size_t end)72   OnHeapStream(Handle<String> string, size_t start_offset, size_t end)
73       : string_(string), start_offset_(start_offset), length_(end) {}
74 
OnHeapStream(const OnHeapStream &)75   OnHeapStream(const OnHeapStream&) V8_NOEXCEPT : start_offset_(0), length_(0) {
76     UNREACHABLE();
77   }
78 
79   // The no_gc argument is only here because of the templated way this class
80   // is used along with other implementations that require V8 heap access.
GetDataAt(size_t pos,RuntimeCallStats * stats,DisallowGarbageCollection * no_gc)81   Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
82                         DisallowGarbageCollection* no_gc) {
83     return {&string_->GetChars(*no_gc)[start_offset_ + std::min(length_, pos)],
84             &string_->GetChars(*no_gc)[start_offset_ + length_]};
85   }
86 
87   static const bool kCanBeCloned = false;
88   static const bool kCanAccessHeap = true;
89 
90  private:
91   Handle<String> string_;
92   const size_t start_offset_;
93   const size_t length_;
94 };
95 
96 // A Char stream backed by an off-heap ExternalOneByteString or
97 // ExternalTwoByteString.
98 template <typename Char>
99 class ExternalStringStream {
100   using ExternalString = typename CharTraits<Char>::ExternalString;
101 
102  public:
ExternalStringStream(ExternalString string,size_t start_offset,size_t length)103   ExternalStringStream(ExternalString string, size_t start_offset,
104                        size_t length)
105       : lock_(string),
106         data_(string.GetChars(GetPtrComprCageBase(string)) + start_offset),
107         length_(length) {}
108 
ExternalStringStream(const ExternalStringStream & other)109   ExternalStringStream(const ExternalStringStream& other) V8_NOEXCEPT
110       : lock_(other.lock_),
111         data_(other.data_),
112         length_(other.length_) {}
113 
114   // The no_gc argument is only here because of the templated way this class
115   // is used along with other implementations that require V8 heap access.
GetDataAt(size_t pos,RuntimeCallStats * stats,DisallowGarbageCollection * no_gc=nullptr)116   Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
117                         DisallowGarbageCollection* no_gc = nullptr) {
118     return {&data_[std::min(length_, pos)], &data_[length_]};
119   }
120 
121   static const bool kCanBeCloned = true;
122   static const bool kCanAccessHeap = false;
123 
124  private:
125   ScopedExternalStringLock lock_;
126   const Char* const data_;
127   const size_t length_;
128 };
129 
130 // A Char stream backed by a C array. Testing only.
131 template <typename Char>
132 class TestingStream {
133  public:
TestingStream(const Char * data,size_t length)134   TestingStream(const Char* data, size_t length)
135       : data_(data), length_(length) {}
136   // The no_gc argument is only here because of the templated way this class
137   // is used along with other implementations that require V8 heap access.
GetDataAt(size_t pos,RuntimeCallStats * stats,DisallowGarbageCollection * no_gc=nullptr)138   Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
139                         DisallowGarbageCollection* no_gc = nullptr) {
140     return {&data_[std::min(length_, pos)], &data_[length_]};
141   }
142 
143   static const bool kCanBeCloned = true;
144   static const bool kCanAccessHeap = false;
145 
146  private:
147   const Char* const data_;
148   const size_t length_;
149 };
150 
151 // A Char stream backed by multiple source-stream provided off-heap chunks.
152 template <typename Char>
153 class ChunkedStream {
154  public:
ChunkedStream(ScriptCompiler::ExternalSourceStream * source)155   explicit ChunkedStream(ScriptCompiler::ExternalSourceStream* source)
156       : source_(source) {}
157 
ChunkedStream(const ChunkedStream &)158   ChunkedStream(const ChunkedStream&) V8_NOEXCEPT {
159     // TODO(rmcilroy): Implement cloning for chunked streams.
160     UNREACHABLE();
161   }
162 
163   // The no_gc argument is only here because of the templated way this class
164   // is used along with other implementations that require V8 heap access.
GetDataAt(size_t pos,RuntimeCallStats * stats,DisallowGarbageCollection * no_gc=nullptr)165   Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
166                         DisallowGarbageCollection* no_gc = nullptr) {
167     Chunk chunk = FindChunk(pos, stats);
168     size_t buffer_end = chunk.length;
169     size_t buffer_pos = std::min(buffer_end, pos - chunk.position);
170     return {&chunk.data[buffer_pos], &chunk.data[buffer_end]};
171   }
172 
~ChunkedStream()173   ~ChunkedStream() {
174     for (Chunk& chunk : chunks_) delete[] chunk.data;
175   }
176 
177   static const bool kCanBeCloned = false;
178   static const bool kCanAccessHeap = false;
179 
180  private:
181   struct Chunk {
Chunkv8::internal::ChunkedStream::Chunk182     Chunk(const Char* const data, size_t position, size_t length)
183         : data(data), position(position), length(length) {}
184     const Char* const data;
185     // The logical position of data.
186     const size_t position;
187     const size_t length;
end_positionv8::internal::ChunkedStream::Chunk188     size_t end_position() const { return position + length; }
189   };
190 
FindChunk(size_t position,RuntimeCallStats * stats)191   Chunk FindChunk(size_t position, RuntimeCallStats* stats) {
192     while (V8_UNLIKELY(chunks_.empty())) FetchChunk(size_t{0}, stats);
193 
194     // Walk forwards while the position is in front of the current chunk.
195     while (position >= chunks_.back().end_position() &&
196            chunks_.back().length > 0) {
197       FetchChunk(chunks_.back().end_position(), stats);
198     }
199 
200     // Walk backwards.
201     for (auto reverse_it = chunks_.rbegin(); reverse_it != chunks_.rend();
202          ++reverse_it) {
203       if (reverse_it->position <= position) return *reverse_it;
204     }
205 
206     UNREACHABLE();
207   }
208 
ProcessChunk(const uint8_t * data,size_t position,size_t length)209   virtual void ProcessChunk(const uint8_t* data, size_t position,
210                             size_t length) {
211     // Incoming data has to be aligned to Char size.
212     DCHECK_EQ(0, length % sizeof(Char));
213     chunks_.emplace_back(reinterpret_cast<const Char*>(data), position,
214                          length / sizeof(Char));
215   }
216 
FetchChunk(size_t position,RuntimeCallStats * stats)217   void FetchChunk(size_t position, RuntimeCallStats* stats) {
218     const uint8_t* data = nullptr;
219     size_t length;
220     {
221       RCS_SCOPE(stats, RuntimeCallCounterId::kGetMoreDataCallback);
222       length = source_->GetMoreData(&data);
223     }
224     ProcessChunk(data, position, length);
225   }
226 
227   ScriptCompiler::ExternalSourceStream* source_;
228 
229  protected:
230   std::vector<struct Chunk> chunks_;
231 };
232 
233 // Provides a buffered utf-16 view on the bytes from the underlying ByteStream.
234 // Chars are buffered if either the underlying stream isn't utf-16 or the
235 // underlying utf-16 stream might move (is on-heap).
236 template <template <typename T> class ByteStream>
237 class BufferedCharacterStream : public Utf16CharacterStream {
238  public:
239   template <class... TArgs>
BufferedCharacterStream(size_t pos,TArgs...args)240   BufferedCharacterStream(size_t pos, TArgs... args) : byte_stream_(args...) {
241     buffer_pos_ = pos;
242   }
243 
can_be_cloned() const244   bool can_be_cloned() const final {
245     return ByteStream<uint16_t>::kCanBeCloned;
246   }
247 
Clone() const248   std::unique_ptr<Utf16CharacterStream> Clone() const override {
249     CHECK(can_be_cloned());
250     return std::unique_ptr<Utf16CharacterStream>(
251         new BufferedCharacterStream<ByteStream>(*this));
252   }
253 
254  protected:
ReadBlock()255   bool ReadBlock() final {
256     size_t position = pos();
257     buffer_pos_ = position;
258     buffer_start_ = &buffer_[0];
259     buffer_cursor_ = buffer_start_;
260 
261     DisallowGarbageCollection no_gc;
262     Range<uint8_t> range =
263         byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc);
264     if (range.length() == 0) {
265       buffer_end_ = buffer_start_;
266       return false;
267     }
268 
269     size_t length = std::min({kBufferSize, range.length()});
270     i::CopyChars(buffer_, range.start, length);
271     buffer_end_ = &buffer_[length];
272     return true;
273   }
274 
can_access_heap() const275   bool can_access_heap() const final {
276     return ByteStream<uint8_t>::kCanAccessHeap;
277   }
278 
279  private:
BufferedCharacterStream(const BufferedCharacterStream<ByteStream> & other)280   BufferedCharacterStream(const BufferedCharacterStream<ByteStream>& other)
281       : byte_stream_(other.byte_stream_) {}
282 
283   static const size_t kBufferSize = 512;
284   base::uc16 buffer_[kBufferSize];
285   ByteStream<uint8_t> byte_stream_;
286 };
287 
288 // Provides a unbuffered utf-16 view on the bytes from the underlying
289 // ByteStream.
290 template <template <typename T> class ByteStream>
291 class UnbufferedCharacterStream : public Utf16CharacterStream {
292  public:
293   template <class... TArgs>
UnbufferedCharacterStream(size_t pos,TArgs...args)294   UnbufferedCharacterStream(size_t pos, TArgs... args) : byte_stream_(args...) {
295     buffer_pos_ = pos;
296   }
297 
can_access_heap() const298   bool can_access_heap() const final {
299     return ByteStream<uint16_t>::kCanAccessHeap;
300   }
301 
can_be_cloned() const302   bool can_be_cloned() const final {
303     return ByteStream<uint16_t>::kCanBeCloned;
304   }
305 
Clone() const306   std::unique_ptr<Utf16CharacterStream> Clone() const override {
307     return std::unique_ptr<Utf16CharacterStream>(
308         new UnbufferedCharacterStream<ByteStream>(*this));
309   }
310 
311  protected:
ReadBlock()312   bool ReadBlock() final {
313     size_t position = pos();
314     buffer_pos_ = position;
315     DisallowGarbageCollection no_gc;
316     Range<uint16_t> range =
317         byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc);
318     buffer_start_ = range.start;
319     buffer_end_ = range.end;
320     buffer_cursor_ = buffer_start_;
321     if (range.length() == 0) return false;
322 
323     DCHECK(!range.unaligned_start());
324     DCHECK_LE(buffer_start_, buffer_end_);
325     return true;
326   }
327 
UnbufferedCharacterStream(const UnbufferedCharacterStream<ByteStream> & other)328   UnbufferedCharacterStream(const UnbufferedCharacterStream<ByteStream>& other)
329       : byte_stream_(other.byte_stream_) {}
330 
331   ByteStream<uint16_t> byte_stream_;
332 };
333 
334 // Provides a unbuffered utf-16 view on the bytes from the underlying
335 // ByteStream.
336 class RelocatingCharacterStream final
337     : public UnbufferedCharacterStream<OnHeapStream> {
338  public:
339   template <class... TArgs>
RelocatingCharacterStream(Isolate * isolate,size_t pos,TArgs...args)340   RelocatingCharacterStream(Isolate* isolate, size_t pos, TArgs... args)
341       : UnbufferedCharacterStream<OnHeapStream>(pos, args...),
342         isolate_(isolate) {
343     isolate->heap()->AddGCEpilogueCallback(UpdateBufferPointersCallback,
344                                            v8::kGCTypeAll, this);
345   }
346 
347  private:
~RelocatingCharacterStream()348   ~RelocatingCharacterStream() final {
349     isolate_->heap()->RemoveGCEpilogueCallback(UpdateBufferPointersCallback,
350                                                this);
351   }
352 
UpdateBufferPointersCallback(v8::Isolate * v8_isolate,v8::GCType type,v8::GCCallbackFlags flags,void * stream)353   static void UpdateBufferPointersCallback(v8::Isolate* v8_isolate,
354                                            v8::GCType type,
355                                            v8::GCCallbackFlags flags,
356                                            void* stream) {
357     reinterpret_cast<RelocatingCharacterStream*>(stream)
358         ->UpdateBufferPointers();
359   }
360 
UpdateBufferPointers()361   void UpdateBufferPointers() {
362     DisallowGarbageCollection no_gc;
363     Range<uint16_t> range =
364         byte_stream_.GetDataAt(buffer_pos_, runtime_call_stats(), &no_gc);
365     if (range.start != buffer_start_) {
366       buffer_cursor_ = (buffer_cursor_ - buffer_start_) + range.start;
367       buffer_start_ = range.start;
368       buffer_end_ = range.end;
369     }
370   }
371 
372   Isolate* isolate_;
373 };
374 
375 // ----------------------------------------------------------------------------
376 // BufferedUtf16CharacterStreams
377 //
378 // A buffered character stream based on a random access character
379 // source (ReadBlock can be called with pos() pointing to any position,
380 // even positions before the current).
381 //
382 // TODO(verwaest): Remove together with Utf8 external streaming streams.
383 class BufferedUtf16CharacterStream : public Utf16CharacterStream {
384  public:
385   BufferedUtf16CharacterStream();
386 
387  protected:
388   static const size_t kBufferSize = 512;
389 
390   bool ReadBlock() final;
391 
392   // FillBuffer should read up to kBufferSize characters at position and store
393   // them into buffer_[0..]. It returns the number of characters stored.
394   virtual size_t FillBuffer(size_t position) = 0;
395 
396   // Fixed sized buffer that this class reads from.
397   // The base class' buffer_start_ should always point to buffer_.
398   base::uc16 buffer_[kBufferSize];
399 };
400 
BufferedUtf16CharacterStream()401 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
402     : Utf16CharacterStream(buffer_, buffer_, buffer_, 0) {}
403 
ReadBlock()404 bool BufferedUtf16CharacterStream::ReadBlock() {
405   DCHECK_EQ(buffer_start_, buffer_);
406 
407   size_t position = pos();
408   buffer_pos_ = position;
409   buffer_cursor_ = buffer_;
410   buffer_end_ = buffer_ + FillBuffer(position);
411   DCHECK_EQ(pos(), position);
412   DCHECK_LE(buffer_end_, buffer_start_ + kBufferSize);
413   return buffer_cursor_ < buffer_end_;
414 }
415 
416 // ----------------------------------------------------------------------------
417 // Windows1252CharacterStream - chunked streaming of windows-1252 data.
418 //
// Similar to BufferedCharacterStream, but additionally translates the
// windows-1252 code points that differ from their latin-1 equivalents.
421 
422 namespace {
423 
424 static const base::uc16 kWindows1252ToUC16[256] = {
425     0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,  // 00-07
426     0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,  // 08-0F
427     0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,  // 10-17
428     0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,  // 18-1F
429     0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,  // 20-27
430     0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,  // 28-2F
431     0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,  // 30-37
432     0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,  // 38-3F
433     0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,  // 40-47
434     0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,  // 48-4F
435     0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,  // 50-57
436     0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,  // 58-5F
437     0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,  // 60-67
438     0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,  // 68-6F
439     0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,  // 70-77
440     0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F,  // 78-7F
441     0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,  // 80-87
442     0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,  // 88-8F
443     0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,  // 90-97
444     0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178,  // 98-9F
445     0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7,  // A0-A7
446     0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,  // A8-AF
447     0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7,  // B0-B7
448     0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,  // B8-BF
449     0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,  // C0-C7
450     0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,  // C8-CF
451     0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7,  // D0-D7
452     0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF,  // D8-DF
453     0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,  // E0-E7
454     0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,  // E8-EF
455     0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7,  // F0-F7
456     0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF   // F8-FF
457 };
458 
459 }  // namespace
460 
461 class Windows1252CharacterStream final : public Utf16CharacterStream {
462  public:
Windows1252CharacterStream(size_t pos,ScriptCompiler::ExternalSourceStream * source_stream)463   Windows1252CharacterStream(
464       size_t pos, ScriptCompiler::ExternalSourceStream* source_stream)
465       : byte_stream_(source_stream) {
466     buffer_pos_ = pos;
467   }
468 
can_be_cloned() const469   bool can_be_cloned() const final {
470     return ChunkedStream<uint16_t>::kCanBeCloned;
471   }
472 
Clone() const473   std::unique_ptr<Utf16CharacterStream> Clone() const override {
474     CHECK(can_be_cloned());
475     return std::unique_ptr<Utf16CharacterStream>(
476         new Windows1252CharacterStream(*this));
477   }
478 
479  protected:
ReadBlock()480   bool ReadBlock() final {
481     size_t position = pos();
482     buffer_pos_ = position;
483     buffer_start_ = &buffer_[0];
484     buffer_cursor_ = buffer_start_;
485 
486     DisallowGarbageCollection no_gc;
487     Range<uint8_t> range =
488         byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc);
489     if (range.length() == 0) {
490       buffer_end_ = buffer_start_;
491       return false;
492     }
493 
494     size_t length = std::min({kBufferSize, range.length()});
495     std::transform(range.start, range.start + length, &buffer_[0],
496                    [](uint8_t c) { return kWindows1252ToUC16[c]; });
497     buffer_end_ = &buffer_[length];
498     return true;
499   }
500 
can_access_heap() const501   bool can_access_heap() const final {
502     return ChunkedStream<uint8_t>::kCanAccessHeap;
503   }
504 
505  private:
Windows1252CharacterStream(const Windows1252CharacterStream & other)506   Windows1252CharacterStream(const Windows1252CharacterStream& other)
507       V8_NOEXCEPT : byte_stream_(other.byte_stream_) {}
508 
509   static const size_t kBufferSize = 512;
510   base::uc16 buffer_[kBufferSize];
511   ChunkedStream<uint8_t> byte_stream_;
512 };
513 
514 // ----------------------------------------------------------------------------
515 // Utf8ExternalStreamingStream - chunked streaming of Utf-8 data.
516 //
517 // This implementation is fairly complex, since data arrives in chunks which
518 // may 'cut' arbitrarily into utf-8 characters. Also, seeking to a given
519 // character position is tricky because the byte position cannot be derived
520 // from the character position.
521 //
522 // TODO(verwaest): Decode utf8 chunks into utf16 chunks on the blink side
523 // instead so we don't need to buffer.
524 
525 class Utf8ExternalStreamingStream final : public BufferedUtf16CharacterStream {
526  public:
Utf8ExternalStreamingStream(ScriptCompiler::ExternalSourceStream * source_stream)527   Utf8ExternalStreamingStream(
528       ScriptCompiler::ExternalSourceStream* source_stream)
529       : current_({0, {0, 0, 0, unibrow::Utf8::State::kAccept}}),
530         source_stream_(source_stream) {}
~Utf8ExternalStreamingStream()531   ~Utf8ExternalStreamingStream() final {
532     for (const Chunk& chunk : chunks_) delete[] chunk.data;
533   }
534 
can_access_heap() const535   bool can_access_heap() const final { return false; }
536 
can_be_cloned() const537   bool can_be_cloned() const final { return false; }
538 
Clone() const539   std::unique_ptr<Utf16CharacterStream> Clone() const override {
540     UNREACHABLE();
541   }
542 
543  protected:
544   size_t FillBuffer(size_t position) final;
545 
546  private:
547   // A position within the data stream. It stores:
548   // - The 'physical' position (# of bytes in the stream),
549   // - the 'logical' position (# of ucs-2 characters, also within the stream),
550   // - a possibly incomplete utf-8 char at the current 'physical' position.
551   struct StreamPosition {
552     size_t bytes;
553     size_t chars;
554     uint32_t incomplete_char;
555     unibrow::Utf8::State state;
556   };
557 
558   // Position contains a StreamPosition and the index of the chunk the position
559   // points into. (The chunk_no could be derived from pos, but that'd be
560   // an expensive search through all chunks.)
561   struct Position {
562     size_t chunk_no;
563     StreamPosition pos;
564   };
565 
566   // A chunk in the list of chunks, containing:
567   // - The chunk data (data pointer and length), and
568   // - the position at the first byte of the chunk.
569   struct Chunk {
570     const uint8_t* data;
571     size_t length;
572     StreamPosition start;
573   };
574 
575   // Within the current chunk, skip forward from current_ towards position.
576   bool SkipToPosition(size_t position);
577   // Within the current chunk, fill the buffer_ (while it has capacity).
578   void FillBufferFromCurrentChunk();
579   // Fetch a new chunk (assuming current_ is at the end of the current data).
580   bool FetchChunk();
581   // Search through the chunks and set current_ to point to the given position.
582   // (This call is potentially expensive.)
583   void SearchPosition(size_t position);
584 
585   std::vector<Chunk> chunks_;
586   Position current_;
587   ScriptCompiler::ExternalSourceStream* source_stream_;
588 };
589 
SkipToPosition(size_t position)590 bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) {
591   DCHECK_LE(current_.pos.chars, position);  // We can only skip forward.
592 
593   // Already there? Then return immediately.
594   if (current_.pos.chars == position) return true;
595 
596   const Chunk& chunk = chunks_[current_.chunk_no];
597   DCHECK(current_.pos.bytes >= chunk.start.bytes);
598 
599   unibrow::Utf8::State state = chunk.start.state;
600   uint32_t incomplete_char = chunk.start.incomplete_char;
601   size_t it = current_.pos.bytes - chunk.start.bytes;
602   const uint8_t* cursor = &chunk.data[it];
603   const uint8_t* end = &chunk.data[chunk.length];
604 
605   size_t chars = current_.pos.chars;
606 
607   if (V8_UNLIKELY(current_.pos.bytes < 3 && chars == 0)) {
608     while (cursor < end) {
609       unibrow::uchar t =
610           unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
611       if (t == unibrow::Utf8::kIncomplete) continue;
612       if (t != kUtf8Bom) {
613         chars++;
614         if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
615       }
616       break;
617     }
618   }
619 
620   while (cursor < end && chars < position) {
621     unibrow::uchar t =
622         unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
623     if (t != unibrow::Utf8::kIncomplete) {
624       chars++;
625       if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
626     }
627   }
628 
629   current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);
630   current_.pos.chars = chars;
631   current_.pos.incomplete_char = incomplete_char;
632   current_.pos.state = state;
633   current_.chunk_no += (cursor == end);
634 
635   return current_.pos.chars == position;
636 }
637 
FillBufferFromCurrentChunk()638 void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
639   DCHECK_LT(current_.chunk_no, chunks_.size());
640   DCHECK_EQ(buffer_start_, buffer_cursor_);
641   DCHECK_LT(buffer_end_ + 1, buffer_start_ + kBufferSize);
642 
643   const Chunk& chunk = chunks_[current_.chunk_no];
644 
645   // The buffer_ is writable, but buffer_*_ members are const. So we get a
646   // non-const pointer into buffer that points to the same char as buffer_end_.
647   uint16_t* output_cursor = buffer_ + (buffer_end_ - buffer_start_);
648   DCHECK_EQ(output_cursor, buffer_end_);
649 
650   unibrow::Utf8::State state = current_.pos.state;
651   uint32_t incomplete_char = current_.pos.incomplete_char;
652 
653   // If the current chunk is the last (empty) chunk we'll have to process
654   // any left-over, partial characters.
655   if (chunk.length == 0) {
656     unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
657     if (t != unibrow::Utf8::kBufferEmpty) {
658       DCHECK_EQ(t, unibrow::Utf8::kBadChar);
659       *output_cursor = static_cast<base::uc16>(t);
660       buffer_end_++;
661       current_.pos.chars++;
662       current_.pos.incomplete_char = 0;
663       current_.pos.state = state;
664     }
665     return;
666   }
667 
668   size_t it = current_.pos.bytes - chunk.start.bytes;
669   const uint8_t* cursor = chunk.data + it;
670   const uint8_t* end = chunk.data + chunk.length;
671 
672   // Deal with possible BOM.
673   if (V8_UNLIKELY(current_.pos.bytes < 3 && current_.pos.chars == 0)) {
674     while (cursor < end) {
675       unibrow::uchar t =
676           unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
677       if (V8_LIKELY(t < kUtf8Bom)) {
678         *(output_cursor++) =
679             static_cast<base::uc16>(t);  // The most frequent case.
680       } else if (t == unibrow::Utf8::kIncomplete) {
681         continue;
682       } else if (t == kUtf8Bom) {
683         // BOM detected at beginning of the stream. Don't copy it.
684       } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
685         *(output_cursor++) = static_cast<base::uc16>(t);
686       } else {
687         *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
688         *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
689       }
690       break;
691     }
692   }
693 
694   const uint16_t* max_buffer_end = buffer_start_ + kBufferSize;
695   while (cursor < end && output_cursor + 1 < max_buffer_end) {
696     unibrow::uchar t =
697         unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
698     if (V8_LIKELY(t <= unibrow::Utf16::kMaxNonSurrogateCharCode)) {
699       *(output_cursor++) =
700           static_cast<base::uc16>(t);  // The most frequent case.
701     } else if (t == unibrow::Utf8::kIncomplete) {
702       continue;
703     } else {
704       *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
705       *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
706     }
707     // Fast path for ascii sequences.
708     size_t remaining = end - cursor;
709     size_t max_buffer = max_buffer_end - output_cursor;
710     int max_length = static_cast<int>(std::min(remaining, max_buffer));
711     DCHECK_EQ(state, unibrow::Utf8::State::kAccept);
712     int ascii_length = NonAsciiStart(cursor, max_length);
713     CopyChars(output_cursor, cursor, ascii_length);
714     cursor += ascii_length;
715     output_cursor += ascii_length;
716   }
717 
718   current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);
719   current_.pos.chars += (output_cursor - buffer_end_);
720   current_.pos.incomplete_char = incomplete_char;
721   current_.pos.state = state;
722   current_.chunk_no += (cursor == end);
723 
724   buffer_end_ = output_cursor;
725 }
726 
FetchChunk()727 bool Utf8ExternalStreamingStream::FetchChunk() {
728   RCS_SCOPE(runtime_call_stats(), RuntimeCallCounterId::kGetMoreDataCallback);
729   DCHECK_EQ(current_.chunk_no, chunks_.size());
730   DCHECK(chunks_.empty() || chunks_.back().length != 0);
731 
732   const uint8_t* chunk = nullptr;
733   size_t length = source_stream_->GetMoreData(&chunk);
734   chunks_.push_back({chunk, length, current_.pos});
735   return length > 0;
736 }
737 
SearchPosition(size_t position)738 void Utf8ExternalStreamingStream::SearchPosition(size_t position) {
739   // If current_ already points to the right position, we're done.
740   //
741   // This is expected to be the common case, since we typically call
742   // FillBuffer right after the current buffer.
743   if (current_.pos.chars == position) return;
744 
745   // No chunks. Fetch at least one, so we can assume !chunks_.empty() below.
746   if (chunks_.empty()) {
747     DCHECK_EQ(current_.chunk_no, 0u);
748     DCHECK_EQ(current_.pos.bytes, 0u);
749     DCHECK_EQ(current_.pos.chars, 0u);
750     FetchChunk();
751   }
752 
753   // Search for the last chunk whose start position is less or equal to
754   // position.
755   size_t chunk_no = chunks_.size() - 1;
756   while (chunk_no > 0 && chunks_[chunk_no].start.chars > position) {
757     chunk_no--;
758   }
759 
760   // Did we find the terminating (zero-length) chunk? Then we're seeking
761   // behind the end of the data, and position does not exist.
762   // Set current_ to point to the terminating chunk.
763   if (chunks_[chunk_no].length == 0) {
764     current_ = {chunk_no, chunks_[chunk_no].start};
765     return;
766   }
767 
768   // Did we find the non-last chunk? Then our position must be within chunk_no.
769   if (chunk_no + 1 < chunks_.size()) {
770     // Fancy-pants optimization for ASCII chunks within a utf-8 stream.
771     // (Many web sites declare utf-8 encoding, but use only (or almost only) the
772     //  ASCII subset for their JavaScript sources. We can exploit this, by
773     //  checking whether the # bytes in a chunk are equal to the # chars, and if
774     //  so avoid the expensive SkipToPosition.)
775     bool ascii_only_chunk =
776         chunks_[chunk_no].start.incomplete_char == 0 &&
777         (chunks_[chunk_no + 1].start.bytes - chunks_[chunk_no].start.bytes) ==
778             (chunks_[chunk_no + 1].start.chars - chunks_[chunk_no].start.chars);
779     if (ascii_only_chunk) {
780       size_t skip = position - chunks_[chunk_no].start.chars;
781       current_ = {chunk_no,
782                   {chunks_[chunk_no].start.bytes + skip,
783                    chunks_[chunk_no].start.chars + skip, 0,
784                    unibrow::Utf8::State::kAccept}};
785     } else {
786       current_ = {chunk_no, chunks_[chunk_no].start};
787       SkipToPosition(position);
788     }
789 
790     // Since position was within the chunk, SkipToPosition should have found
791     // something.
792     DCHECK_EQ(position, current_.pos.chars);
793     return;
794   }
795 
796   // What's left: We're in the last, non-terminating chunk. Our position
797   // may be in the chunk, but it may also be in 'future' chunks, which we'll
798   // have to obtain.
799   DCHECK_EQ(chunk_no, chunks_.size() - 1);
800   current_ = {chunk_no, chunks_[chunk_no].start};
801   bool have_more_data = true;
802   bool found = SkipToPosition(position);
803   while (have_more_data && !found) {
804     DCHECK_EQ(current_.chunk_no, chunks_.size());
805     have_more_data = FetchChunk();
806     found = have_more_data && SkipToPosition(position);
807   }
808 
809   // We'll return with a postion != the desired position only if we're out
810   // of data. In that case, we'll point to the terminating chunk.
811   DCHECK_EQ(found, current_.pos.chars == position);
812   DCHECK_EQ(have_more_data, chunks_.back().length != 0);
813   DCHECK_IMPLIES(!found, !have_more_data);
814   DCHECK_IMPLIES(!found, current_.chunk_no == chunks_.size() - 1);
815 }
816 
FillBuffer(size_t position)817 size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) {
818   buffer_cursor_ = buffer_;
819   buffer_end_ = buffer_;
820 
821   SearchPosition(position);
822   bool out_of_data = current_.chunk_no != chunks_.size() &&
823                      chunks_[current_.chunk_no].length == 0 &&
824                      current_.pos.incomplete_char == 0;
825 
826   if (out_of_data) return 0;
827 
828   // Fill the buffer, until we have at least one char (or are out of data).
829   // (The embedder might give us 1-byte blocks within a utf-8 char, so we
830   //  can't guarantee progress with one chunk. Thus we iterate.)
831   while (!out_of_data && buffer_cursor_ == buffer_end_) {
832     // At end of current data, but there might be more? Then fetch it.
833     if (current_.chunk_no == chunks_.size()) {
834       out_of_data = !FetchChunk();
835     }
836     FillBufferFromCurrentChunk();
837   }
838 
839   DCHECK_EQ(current_.pos.chars - position,
840             static_cast<size_t>(buffer_end_ - buffer_cursor_));
841   return buffer_end_ - buffer_cursor_;
842 }
843 
844 // ----------------------------------------------------------------------------
845 // ScannerStream: Create stream instances.
846 
For(Isolate * isolate,Handle<String> data)847 Utf16CharacterStream* ScannerStream::For(Isolate* isolate,
848                                          Handle<String> data) {
849   return ScannerStream::For(isolate, data, 0, data->length());
850 }
851 
For(Isolate * isolate,Handle<String> data,int start_pos,int end_pos)852 Utf16CharacterStream* ScannerStream::For(Isolate* isolate, Handle<String> data,
853                                          int start_pos, int end_pos) {
854   DCHECK_GE(start_pos, 0);
855   DCHECK_LE(start_pos, end_pos);
856   DCHECK_LE(end_pos, data->length());
857   size_t start_offset = 0;
858   if (data->IsSlicedString()) {
859     SlicedString string = SlicedString::cast(*data);
860     start_offset = string.offset();
861     String parent = string.parent();
862     if (parent.IsThinString()) parent = ThinString::cast(parent).actual();
863     data = handle(parent, isolate);
864   } else {
865     data = String::Flatten(isolate, data);
866   }
867   if (data->IsExternalOneByteString()) {
868     return new BufferedCharacterStream<ExternalStringStream>(
869         static_cast<size_t>(start_pos), ExternalOneByteString::cast(*data),
870         start_offset, static_cast<size_t>(end_pos));
871   } else if (data->IsExternalTwoByteString()) {
872     return new UnbufferedCharacterStream<ExternalStringStream>(
873         static_cast<size_t>(start_pos), ExternalTwoByteString::cast(*data),
874         start_offset, static_cast<size_t>(end_pos));
875   } else if (data->IsSeqOneByteString()) {
876     return new BufferedCharacterStream<OnHeapStream>(
877         static_cast<size_t>(start_pos), Handle<SeqOneByteString>::cast(data),
878         start_offset, static_cast<size_t>(end_pos));
879   } else if (data->IsSeqTwoByteString()) {
880     return new RelocatingCharacterStream(
881         isolate, static_cast<size_t>(start_pos),
882         Handle<SeqTwoByteString>::cast(data), start_offset,
883         static_cast<size_t>(end_pos));
884   } else {
885     UNREACHABLE();
886   }
887 }
888 
ForTesting(const char * data)889 std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
890     const char* data) {
891   return ScannerStream::ForTesting(data, strlen(data));
892 }
893 
ForTesting(const char * data,size_t length)894 std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
895     const char* data, size_t length) {
896   if (data == nullptr) {
897     DCHECK_EQ(length, 0);
898 
899     // We don't want to pass in a null pointer into the the character stream,
900     // because then the one-past-the-end pointer is undefined, so instead pass
901     // through this static array.
902     static const char non_null_empty_string[1] = {0};
903     data = non_null_empty_string;
904   }
905 
906   return std::unique_ptr<Utf16CharacterStream>(
907       new BufferedCharacterStream<TestingStream>(
908           0, reinterpret_cast<const uint8_t*>(data), length));
909 }
910 
ForTesting(const uint16_t * data,size_t length)911 std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
912     const uint16_t* data, size_t length) {
913   if (data == nullptr) {
914     DCHECK_EQ(length, 0);
915 
916     // We don't want to pass in a null pointer into the the character stream,
917     // because then the one-past-the-end pointer is undefined, so instead pass
918     // through this static array.
919     static const uint16_t non_null_empty_uint16_t_string[1] = {0};
920     data = non_null_empty_uint16_t_string;
921   }
922 
923   return std::unique_ptr<Utf16CharacterStream>(
924       new UnbufferedCharacterStream<TestingStream>(0, data, length));
925 }
926 
For(ScriptCompiler::ExternalSourceStream * source_stream,v8::ScriptCompiler::StreamedSource::Encoding encoding)927 Utf16CharacterStream* ScannerStream::For(
928     ScriptCompiler::ExternalSourceStream* source_stream,
929     v8::ScriptCompiler::StreamedSource::Encoding encoding) {
930   switch (encoding) {
931     case v8::ScriptCompiler::StreamedSource::TWO_BYTE:
932       return new UnbufferedCharacterStream<ChunkedStream>(
933           static_cast<size_t>(0), source_stream);
934     case v8::ScriptCompiler::StreamedSource::ONE_BYTE:
935       return new BufferedCharacterStream<ChunkedStream>(static_cast<size_t>(0),
936                                                         source_stream);
937     case v8::ScriptCompiler::StreamedSource::WINDOWS_1252:
938       return new Windows1252CharacterStream(static_cast<size_t>(0),
939                                             source_stream);
940     case v8::ScriptCompiler::StreamedSource::UTF8:
941       return new Utf8ExternalStreamingStream(source_stream);
942   }
943   UNREACHABLE();
944 }
945 
946 }  // namespace internal
947 }  // namespace v8
948