1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "new-regexp/regexp-macro-assembler.h"
6
7 #include "new-regexp/regexp-stack.h"
8
9 #ifdef V8_INTL_SUPPORT
10 #include "unicode/uchar.h"
11 #include "unicode/unistr.h"
12 #endif // V8_INTL_SUPPORT
13
14 namespace v8 {
15 namespace internal {
16
RegExpMacroAssembler(Isolate * isolate,Zone * zone)17 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
18 : slow_safe_compiler_(false),
19 global_mode_(NOT_GLOBAL),
20 isolate_(isolate),
21 zone_(zone) {}
22
// Out-of-line destructor definition; the compiler-generated behavior is used
// and no additional cleanup is performed here.
23 RegExpMacroAssembler::~RegExpMacroAssembler() = default;
24
CaseInsensitiveCompareUC16(Address byte_offset1,Address byte_offset2,size_t byte_length,Isolate * isolate)25 int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
26 Address byte_offset2,
27 size_t byte_length,
28 Isolate* isolate) {
29 // This function is not allowed to cause a garbage collection.
30 // A GC might move the calling generated code and invalidate the
31 // return address on the stack.
32 DCHECK_EQ(0, byte_length % 2);
33
34 #ifdef V8_INTL_SUPPORT
35 int32_t length = (int32_t)(byte_length >> 1);
36 icu::UnicodeString uni_str_1(reinterpret_cast<const char16_t*>(byte_offset1),
37 length);
38 return uni_str_1.caseCompare(reinterpret_cast<const char16_t*>(byte_offset2),
39 length, U_FOLD_CASE_DEFAULT) == 0;
40 #else
41 uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
42 uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
43 size_t length = byte_length >> 1;
44 DCHECK_NOT_NULL(isolate);
45 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
46 isolate->regexp_macro_assembler_canonicalize();
47 for (size_t i = 0; i < length; i++) {
48 unibrow::uchar c1 = substring1[i];
49 unibrow::uchar c2 = substring2[i];
50 if (c1 != c2) {
51 unibrow::uchar s1[1] = {c1};
52 canonicalize->get(c1, '\0', s1);
53 if (s1[0] != c2) {
54 unibrow::uchar s2[1] = {c2};
55 canonicalize->get(c2, '\0', s2);
56 if (s1[0] != s2[0]) {
57 return 0;
58 }
59 }
60 }
61 }
62 return 1;
63 #endif // V8_INTL_SUPPORT
64 }
65
66
CheckNotInSurrogatePair(int cp_offset,Label * on_failure)67 void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
68 Label* on_failure) {
69 Label ok;
70 // Check that current character is not a trail surrogate.
71 LoadCurrentCharacter(cp_offset, &ok);
72 CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
73 // Check that previous character is not a lead surrogate.
74 LoadCurrentCharacter(cp_offset - 1, &ok);
75 CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
76 Bind(&ok);
77 }
78
CheckPosition(int cp_offset,Label * on_outside_input)79 void RegExpMacroAssembler::CheckPosition(int cp_offset,
80 Label* on_outside_input) {
81 LoadCurrentCharacter(cp_offset, on_outside_input, true);
82 }
83
LoadCurrentCharacter(int cp_offset,Label * on_end_of_input,bool check_bounds,int characters,int eats_at_least)84 void RegExpMacroAssembler::LoadCurrentCharacter(int cp_offset,
85 Label* on_end_of_input,
86 bool check_bounds,
87 int characters,
88 int eats_at_least) {
89 // By default, eats_at_least = characters.
90 if (eats_at_least == kUseCharactersValue) {
91 eats_at_least = characters;
92 }
93
94 LoadCurrentCharacterImpl(cp_offset, on_end_of_input, check_bounds, characters,
95 eats_at_least);
96 }
97
CheckSpecialCharacterClass(uc16 type,Label * on_no_match)98 bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
99 Label* on_no_match) {
100 return false;
101 }
102
NativeRegExpMacroAssembler(Isolate * isolate,Zone * zone)103 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
104 Zone* zone)
105 : RegExpMacroAssembler(isolate, zone) {}
106
// Out-of-line destructor definition; the compiler-generated behavior is used
// and no additional cleanup is performed here.
107 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() = default;
108
LoadCurrentCharacterImpl(int cp_offset,Label * on_end_of_input,bool check_bounds,int characters,int eats_at_least)109 void NativeRegExpMacroAssembler::LoadCurrentCharacterImpl(
110 int cp_offset, Label* on_end_of_input, bool check_bounds, int characters,
111 int eats_at_least) {
112 // It's possible to preload a small number of characters when each success
113 // path requires a large number of characters, but not the reverse.
114 DCHECK_GE(eats_at_least, characters);
115
116 DCHECK(base::IsInRange(cp_offset, kMinCPOffset, kMaxCPOffset));
117 if (check_bounds) {
118 if (cp_offset >= 0) {
119 CheckPosition(cp_offset + eats_at_least - 1, on_end_of_input);
120 } else {
121 CheckPosition(cp_offset, on_end_of_input);
122 }
123 }
124 LoadCurrentCharacterUnchecked(cp_offset, characters);
125 }
126
CanReadUnaligned()127 bool NativeRegExpMacroAssembler::CanReadUnaligned() {
128 return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
129 }
130
131 #ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
132
133 // This method may only be called after an interrupt.
CheckStackGuardState(Isolate * isolate,int start_index,RegExp::CallOrigin call_origin,Address * return_address,Code re_code,Address * subject,const byte ** input_start,const byte ** input_end)134 int NativeRegExpMacroAssembler::CheckStackGuardState(
135 Isolate* isolate, int start_index, RegExp::CallOrigin call_origin,
136 Address* return_address, Code re_code, Address* subject,
137 const byte** input_start, const byte** input_end) {
138 DisallowHeapAllocation no_gc;
139 Address old_pc = PointerAuthentication::AuthenticatePC(return_address, 0);
140 DCHECK_LE(re_code.raw_instruction_start(), old_pc);
141 DCHECK_LE(old_pc, re_code.raw_instruction_end());
142
143 StackLimitCheck check(isolate);
144 bool js_has_overflowed = check.JsHasOverflowed();
145
146 if (call_origin == RegExp::CallOrigin::kFromJs) {
147 // Direct calls from JavaScript can be interrupted in two ways:
148 // 1. A real stack overflow, in which case we let the caller throw the
149 // exception.
150 // 2. The stack guard was used to interrupt execution for another purpose,
151 // forcing the call through the runtime system.
152
153 // Bug(v8:9540) Investigate why this method is called from JS although no
154 // stackoverflow or interrupt is pending on ARM64. We return 0 in this case
155 // to continue execution normally.
156 if (js_has_overflowed) {
157 return EXCEPTION;
158 } else if (check.InterruptRequested()) {
159 return RETRY;
160 } else {
161 return 0;
162 }
163 }
164 DCHECK(call_origin == RegExp::CallOrigin::kFromRuntime);
165
166 // Prepare for possible GC.
167 HandleScope handles(isolate);
168 Handle<Code> code_handle(re_code, isolate);
169 Handle<String> subject_handle(String::cast(Object(*subject)), isolate);
170 bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject_handle);
171 int return_value = 0;
172
173 if (js_has_overflowed) {
174 AllowHeapAllocation yes_gc;
175 isolate->StackOverflow();
176 return_value = EXCEPTION;
177 } else if (check.InterruptRequested()) {
178 AllowHeapAllocation yes_gc;
179 Object result = isolate->stack_guard()->HandleInterrupts();
180 if (result.IsException(isolate)) return_value = EXCEPTION;
181 }
182
183 if (*code_handle != re_code) { // Return address no longer valid
184 // Overwrite the return address on the stack.
185 intptr_t delta = code_handle->address() - re_code.address();
186 Address new_pc = old_pc + delta;
187 // TODO(v8:10026): avoid replacing a signed pointer.
188 PointerAuthentication::ReplacePC(return_address, new_pc, 0);
189 }
190
191 // If we continue, we need to update the subject string addresses.
192 if (return_value == 0) {
193 // String encoding might have changed.
194 if (String::IsOneByteRepresentationUnderneath(*subject_handle) !=
195 is_one_byte) {
196 // If we changed between an LATIN1 and an UC16 string, the specialized
197 // code cannot be used, and we need to restart regexp matching from
198 // scratch (including, potentially, compiling a new version of the code).
199 return_value = RETRY;
200 } else {
201 *subject = subject_handle->ptr();
202 intptr_t byte_length = *input_end - *input_start;
203 *input_start = subject_handle->AddressOfCharacterAt(start_index, no_gc);
204 *input_end = *input_start + byte_length;
205 }
206 }
207 return return_value;
208 }
209
210 // Returns a {Result} sentinel, or the number of successful matches.
Match(Handle<JSRegExp> regexp,Handle<String> subject,int * offsets_vector,int offsets_vector_length,int previous_index,Isolate * isolate)211 int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp,
212 Handle<String> subject,
213 int* offsets_vector,
214 int offsets_vector_length,
215 int previous_index, Isolate* isolate) {
216 DCHECK(subject->IsFlat());
217 DCHECK_LE(0, previous_index);
218 DCHECK_LE(previous_index, subject->length());
219
220 // No allocations before calling the regexp, but we can't use
221 // DisallowHeapAllocation, since regexps might be preempted, and another
222 // thread might do allocation anyway.
223
224 String subject_ptr = *subject;
225 // Character offsets into string.
226 int start_offset = previous_index;
227 int char_length = subject_ptr.length() - start_offset;
228 int slice_offset = 0;
229
230 // The string has been flattened, so if it is a cons string it contains the
231 // full string in the first part.
232 if (StringShape(subject_ptr).IsCons()) {
233 DCHECK_EQ(0, ConsString::cast(subject_ptr).second().length());
234 subject_ptr = ConsString::cast(subject_ptr).first();
235 } else if (StringShape(subject_ptr).IsSliced()) {
236 SlicedString slice = SlicedString::cast(subject_ptr);
237 subject_ptr = slice.parent();
238 slice_offset = slice.offset();
239 }
240 if (StringShape(subject_ptr).IsThin()) {
241 subject_ptr = ThinString::cast(subject_ptr).actual();
242 }
243 // Ensure that an underlying string has the same representation.
244 bool is_one_byte = subject_ptr.IsOneByteRepresentation();
245 DCHECK(subject_ptr.IsExternalString() || subject_ptr.IsSeqString());
246 // String is now either Sequential or External
247 int char_size_shift = is_one_byte ? 0 : 1;
248
249 DisallowHeapAllocation no_gc;
250 const byte* input_start =
251 subject_ptr.AddressOfCharacterAt(start_offset + slice_offset, no_gc);
252 int byte_length = char_length << char_size_shift;
253 const byte* input_end = input_start + byte_length;
254 return Execute(*subject, start_offset, input_start, input_end, offsets_vector,
255 offsets_vector_length, isolate, *regexp);
256 }
257
258 // Returns a {Result} sentinel, or the number of successful matches.
259 // TODO(pthier): The JSRegExp object is passed to native irregexp code to match
260 // the signature of the interpreter. We should get rid of JS objects passed to
261 // internal methods.
Execute(String input,int start_offset,const byte * input_start,const byte * input_end,int * output,int output_size,Isolate * isolate,JSRegExp regexp)262 int NativeRegExpMacroAssembler::Execute(
263 String input, // This needs to be the unpacked (sliced, cons) string.
264 int start_offset, const byte* input_start, const byte* input_end,
265 int* output, int output_size, Isolate* isolate, JSRegExp regexp) {
266 // Ensure that the minimum stack has been allocated.
267 RegExpStackScope stack_scope(isolate);
268 Address stack_base = stack_scope.stack()->stack_base();
269
270 bool is_one_byte = String::IsOneByteRepresentationUnderneath(input);
271 Code code = Code::cast(regexp.Code(is_one_byte));
272 RegExp::CallOrigin call_origin = RegExp::CallOrigin::kFromRuntime;
273
274 using RegexpMatcherSig = int(
275 Address input_string, int start_offset, // NOLINT(readability/casting)
276 const byte* input_start, const byte* input_end, int* output,
277 int output_size, Address stack_base, int call_origin, Isolate* isolate,
278 Address regexp);
279
280 auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(code);
281 int result =
282 fn.Call(input.ptr(), start_offset, input_start, input_end, output,
283 output_size, stack_base, call_origin, isolate, regexp.ptr());
284 DCHECK(result >= RETRY);
285
286 if (result == EXCEPTION && !isolate->has_pending_exception()) {
287 // We detected a stack overflow (on the backtrack stack) in RegExp code,
288 // but haven't created the exception yet. Additionally, we allow heap
289 // allocation because even though it invalidates {input_start} and
290 // {input_end}, we are about to return anyway.
291 AllowHeapAllocation allow_allocation;
292 isolate->StackOverflow();
293 }
294 return result;
295 }
296
297 #endif // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
298
299 // clang-format off
// Table of 256 bytes (32 rows of 8), one entry per character code 0x00-0xFF.
// An entry is 0xFF for the ASCII characters '0'-'9', 'A'-'Z', '_' and
// 'a'-'z', and 0x00 for every other code, including the whole upper
// (Latin-1) half of the table.
// NOTE(review): presumably indexed by character code to test word-character
// membership - confirm against the users of this table.
300 const byte NativeRegExpMacroAssembler::word_character_map[] = {
301 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
302 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
303 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
304 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
305
306 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
307 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
308 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // '0' - '7'
309 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // '8' - '9'
310
311 0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'A' - 'G'
312 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'H' - 'O'
313 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'P' - 'W'
314 0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu, // 'X' - 'Z', '_'
315
316 0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'a' - 'g'
317 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'h' - 'o'
318 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'p' - 'w'
319 0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
320 // Latin-1 range
321 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
322 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
323 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
324 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
325
326 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
327 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
328 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
329 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
330
331 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
332 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
333 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
334 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
335
336 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
337 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
338 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
339 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
340 };
341 // clang-format on
342
GrowStack(Address stack_pointer,Address * stack_base,Isolate * isolate)343 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
344 Address* stack_base,
345 Isolate* isolate) {
346 RegExpStack* regexp_stack = isolate->regexp_stack();
347 size_t size = regexp_stack->stack_capacity();
348 Address old_stack_base = regexp_stack->stack_base();
349 DCHECK(old_stack_base == *stack_base);
350 DCHECK(stack_pointer <= old_stack_base);
351 DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
352 Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
353 if (new_stack_base == kNullAddress) {
354 return kNullAddress;
355 }
356 *stack_base = new_stack_base;
357 intptr_t stack_content_size = old_stack_base - stack_pointer;
358 return new_stack_base - stack_content_size;
359 }
360
361 } // namespace internal
362 } // namespace v8
363