1 // Copyright 2014 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_STRINGS_UNICODE_DECODER_H_
6 #define V8_STRINGS_UNICODE_DECODER_H_
7 
8 #include "src/strings/unicode.h"
9 #include "src/utils/vector.h"
10 
11 namespace v8 {
12 namespace internal {
13 
14 // The return value may point to the first aligned word containing the first
15 // non-one-byte character, rather than directly to the non-one-byte character.
16 // If the return value is >= the passed length, the entire string was
17 // one-byte.
NonAsciiStart(const uint8_t * chars,int length)18 inline int NonAsciiStart(const uint8_t* chars, int length) {
19   const uint8_t* start = chars;
20   const uint8_t* limit = chars + length;
21 
22   if (static_cast<size_t>(length) >= kIntptrSize) {
23     // Check unaligned bytes.
24     while (!IsAligned(reinterpret_cast<intptr_t>(chars), kIntptrSize)) {
25       if (*chars > unibrow::Utf8::kMaxOneByteChar) {
26         return static_cast<int>(chars - start);
27       }
28       ++chars;
29     }
30     // Check aligned words.
31     DCHECK_EQ(unibrow::Utf8::kMaxOneByteChar, 0x7F);
32     const uintptr_t non_one_byte_mask = kUintptrAllBitsSet / 0xFF * 0x80;
33     while (chars + sizeof(uintptr_t) <= limit) {
34       if (*reinterpret_cast<const uintptr_t*>(chars) & non_one_byte_mask) {
35         return static_cast<int>(chars - start);
36       }
37       chars += sizeof(uintptr_t);
38     }
39   }
40   // Check remaining unaligned bytes.
41   while (chars < limit) {
42     if (*chars > unibrow::Utf8::kMaxOneByteChar) {
43       return static_cast<int>(chars - start);
44     }
45     ++chars;
46   }
47 
48   return static_cast<int>(chars - start);
49 }
50 
51 class V8_EXPORT_PRIVATE Utf8Decoder final {
52  public:
53   enum class Encoding : uint8_t { kAscii, kLatin1, kUtf16 };
54 
55   explicit Utf8Decoder(const Vector<const uint8_t>& chars);
56 
is_ascii()57   bool is_ascii() const { return encoding_ == Encoding::kAscii; }
is_one_byte()58   bool is_one_byte() const { return encoding_ <= Encoding::kLatin1; }
utf16_length()59   int utf16_length() const { return utf16_length_; }
non_ascii_start()60   int non_ascii_start() const { return non_ascii_start_; }
61 
62   template <typename Char>
63   V8_EXPORT_PRIVATE void Decode(Char* out, const Vector<const uint8_t>& data);
64 
65  private:
66   Encoding encoding_;
67   int non_ascii_start_;
68   int utf16_length_;
69 };
70 
71 }  // namespace internal
72 }  // namespace v8
73 
74 #endif  // V8_STRINGS_UNICODE_DECODER_H_
75