1 // Copyright 2007-2010 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_STRINGS_UNICODE_INL_H_
6 #define V8_STRINGS_UNICODE_INL_H_
7 
8 #include "src/base/logging.h"
9 #include "src/strings/unicode.h"
10 #include "src/utils/utils.h"
11 
12 namespace unibrow {
13 
14 #ifndef V8_INTL_SUPPORT
15 template <class T, int s>
get(uchar code_point)16 bool Predicate<T, s>::get(uchar code_point) {
17   CacheEntry entry = entries_[code_point & kMask];
18   if (entry.code_point() == code_point) return entry.value();
19   return CalculateValue(code_point);
20 }
21 
22 template <class T, int s>
CalculateValue(uchar code_point)23 bool Predicate<T, s>::CalculateValue(uchar code_point) {
24   bool result = T::Is(code_point);
25   entries_[code_point & kMask] = CacheEntry(code_point, result);
26   return result;
27 }
28 
29 template <class T, int s>
get(uchar c,uchar n,uchar * result)30 int Mapping<T, s>::get(uchar c, uchar n, uchar* result) {
31   CacheEntry entry = entries_[c & kMask];
32   if (entry.code_point_ == c) {
33     if (entry.offset_ == 0) {
34       return 0;
35     } else {
36       result[0] = c + entry.offset_;
37       return 1;
38     }
39   } else {
40     return CalculateValue(c, n, result);
41   }
42 }
43 
44 template <class T, int s>
CalculateValue(uchar c,uchar n,uchar * result)45 int Mapping<T, s>::CalculateValue(uchar c, uchar n, uchar* result) {
46   bool allow_caching = true;
47   int length = T::Convert(c, n, result, &allow_caching);
48   if (allow_caching) {
49     if (length == 1) {
50       entries_[c & kMask] = CacheEntry(c, result[0] - c);
51       return 1;
52     } else {
53       entries_[c & kMask] = CacheEntry(c, 0);
54       return 0;
55     }
56   } else {
57     return length;
58   }
59 }
60 #endif  // !V8_INTL_SUPPORT
61 
HasUnpairedSurrogate(const uint16_t * code_units,size_t length)62 bool Utf16::HasUnpairedSurrogate(const uint16_t* code_units, size_t length) {
63   for (size_t i = 0; i < length; ++i) {
64     const int code_unit = code_units[i];
65     if (IsLeadSurrogate(code_unit)) {
66       // The current code unit is a leading surrogate. Check if it is followed
67       // by a trailing surrogate.
68       if (i == length - 1) return true;
69       if (!IsTrailSurrogate(code_units[i + 1])) return true;
70       // Skip the paired trailing surrogate.
71       ++i;
72     } else if (IsTrailSurrogate(code_unit)) {
73       // All paired trailing surrogates are skipped above, so this branch is
74       // only for those that are unpaired.
75       return true;
76     }
77   }
78   return false;
79 }
80 
81 // Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they
82 // stream in. This **must** be followed by a call to ValueOfIncrementalFinish
83 // when the stream is complete, to ensure incomplete sequences are handled.
ValueOfIncremental(const byte ** cursor,State * state,Utf8IncrementalBuffer * buffer)84 uchar Utf8::ValueOfIncremental(const byte** cursor, State* state,
85                                Utf8IncrementalBuffer* buffer) {
86   DCHECK_NOT_NULL(buffer);
87   State old_state = *state;
88   byte next = **cursor;
89   *cursor += 1;
90 
91   if (V8_LIKELY(next <= kMaxOneByteChar && old_state == State::kAccept)) {
92     DCHECK_EQ(0u, *buffer);
93     return static_cast<uchar>(next);
94   }
95 
96   // So we're at the lead byte of a 2/3/4 sequence, or we're at a continuation
97   // char in that sequence.
98   Utf8DfaDecoder::Decode(next, state, buffer);
99 
100   switch (*state) {
101     case State::kAccept: {
102       uchar t = *buffer;
103       *buffer = 0;
104       return t;
105     }
106 
107     case State::kReject:
108       *state = State::kAccept;
109       *buffer = 0;
110 
111       // If we hit a bad byte, we need to determine if we were trying to start
112       // a sequence or continue one. If we were trying to start a sequence,
113       // that means it's just an invalid lead byte and we need to continue to
114       // the next (which we already did above). If we were already in a
115       // sequence, we need to reprocess this same byte after resetting to the
116       // initial state.
117       if (old_state != State::kAccept) {
118         // We were trying to continue a sequence, so let's reprocess this byte
119         // next time.
120         *cursor -= 1;
121       }
122       return kBadChar;
123 
124     default:
125       return kIncomplete;
126   }
127 }
128 
EncodeOneByte(char * str,uint8_t c)129 unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
130   static const int kMask = ~(1 << 6);
131   if (c <= kMaxOneByteChar) {
132     str[0] = c;
133     return 1;
134   }
135   str[0] = 0xC0 | (c >> 6);
136   str[1] = 0x80 | (c & kMask);
137   return 2;
138 }
139 
140 // Encode encodes the UTF-16 code units c and previous into the given str
141 // buffer, and combines surrogate code units into single code points. If
142 // replace_invalid is set to true, orphan surrogate code units will be replaced
143 // with kBadChar.
Encode(char * str,uchar c,int previous,bool replace_invalid)144 unsigned Utf8::Encode(char* str, uchar c, int previous, bool replace_invalid) {
145   static const int kMask = ~(1 << 6);
146   if (c <= kMaxOneByteChar) {
147     str[0] = c;
148     return 1;
149   } else if (c <= kMaxTwoByteChar) {
150     str[0] = 0xC0 | (c >> 6);
151     str[1] = 0x80 | (c & kMask);
152     return 2;
153   } else if (c <= kMaxThreeByteChar) {
154     DCHECK(!Utf16::IsLeadSurrogate(Utf16::kNoPreviousCharacter));
155     if (Utf16::IsSurrogatePair(previous, c)) {
156       const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
157       return Encode(str - kUnmatchedSize,
158                     Utf16::CombineSurrogatePair(previous, c),
159                     Utf16::kNoPreviousCharacter, replace_invalid) -
160              kUnmatchedSize;
161     } else if (replace_invalid &&
162                (Utf16::IsLeadSurrogate(c) || Utf16::IsTrailSurrogate(c))) {
163       c = kBadChar;
164     }
165     str[0] = 0xE0 | (c >> 12);
166     str[1] = 0x80 | ((c >> 6) & kMask);
167     str[2] = 0x80 | (c & kMask);
168     return 3;
169   } else {
170     str[0] = 0xF0 | (c >> 18);
171     str[1] = 0x80 | ((c >> 12) & kMask);
172     str[2] = 0x80 | ((c >> 6) & kMask);
173     str[3] = 0x80 | (c & kMask);
174     return 4;
175   }
176 }
177 
ValueOf(const byte * bytes,size_t length,size_t * cursor)178 uchar Utf8::ValueOf(const byte* bytes, size_t length, size_t* cursor) {
179   if (length <= 0) return kBadChar;
180   byte first = bytes[0];
181   // Characters between 0000 and 007F are encoded as a single character
182   if (V8_LIKELY(first <= kMaxOneByteChar)) {
183     *cursor += 1;
184     return first;
185   }
186   return CalculateValue(bytes, length, cursor);
187 }
188 
Length(uchar c,int previous)189 unsigned Utf8::Length(uchar c, int previous) {
190   if (c <= kMaxOneByteChar) {
191     return 1;
192   } else if (c <= kMaxTwoByteChar) {
193     return 2;
194   } else if (c <= kMaxThreeByteChar) {
195     DCHECK(!Utf16::IsLeadSurrogate(Utf16::kNoPreviousCharacter));
196     if (Utf16::IsSurrogatePair(previous, c)) {
197       return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
198     }
199     return 3;
200   } else {
201     return 4;
202   }
203 }
204 
IsValidCharacter(uchar c)205 bool Utf8::IsValidCharacter(uchar c) {
206   return c < 0xD800u || (c >= 0xE000u && c < 0xFDD0u) ||
207          (c > 0xFDEFu && c <= 0x10FFFFu && (c & 0xFFFEu) != 0xFFFEu &&
208           c != kBadChar);
209 }
210 
211 }  // namespace unibrow
212 
213 #endif  // V8_STRINGS_UNICODE_INL_H_
214