1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "irregexp/imported/regexp-macro-assembler.h"
6
7 #include "irregexp/imported/regexp-stack.h"
8 #include "irregexp/imported/special-case.h"
9
10 #ifdef V8_INTL_SUPPORT
11 #include "unicode/uchar.h"
12 #include "unicode/unistr.h"
13 #endif // V8_INTL_SUPPORT
14
15 namespace v8 {
16 namespace internal {
17
RegExpMacroAssembler(Isolate * isolate,Zone * zone)18 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
19 : slow_safe_compiler_(false),
20 global_mode_(NOT_GLOBAL),
21 isolate_(isolate),
22 zone_(zone) {}
23
24 RegExpMacroAssembler::~RegExpMacroAssembler() = default;
25
CaseInsensitiveCompareNonUnicode(Address byte_offset1,Address byte_offset2,size_t byte_length,Isolate * isolate)26 int RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(Address byte_offset1,
27 Address byte_offset2,
28 size_t byte_length,
29 Isolate* isolate) {
30 #ifdef V8_INTL_SUPPORT
31 // This function is not allowed to cause a garbage collection.
32 // A GC might move the calling generated code and invalidate the
33 // return address on the stack.
34 DisallowGarbageCollection no_gc;
35 DCHECK_EQ(0, byte_length % 2);
36 size_t length = byte_length / 2;
37 uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
38 uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
39
40 for (size_t i = 0; i < length; i++) {
41 UChar32 c1 = RegExpCaseFolding::Canonicalize(substring1[i]);
42 UChar32 c2 = RegExpCaseFolding::Canonicalize(substring2[i]);
43 if (c1 != c2) {
44 return 0;
45 }
46 }
47 return 1;
48 #else
49 return CaseInsensitiveCompareUnicode(byte_offset1, byte_offset2, byte_length,
50 isolate);
51 #endif
52 }
53
CaseInsensitiveCompareUnicode(Address byte_offset1,Address byte_offset2,size_t byte_length,Isolate * isolate)54 int RegExpMacroAssembler::CaseInsensitiveCompareUnicode(Address byte_offset1,
55 Address byte_offset2,
56 size_t byte_length,
57 Isolate* isolate) {
58 // This function is not allowed to cause a garbage collection.
59 // A GC might move the calling generated code and invalidate the
60 // return address on the stack.
61 DisallowGarbageCollection no_gc;
62 DCHECK_EQ(0, byte_length % 2);
63
64 #ifdef V8_INTL_SUPPORT
65 int32_t length = static_cast<int32_t>(byte_length >> 1);
66 icu::UnicodeString uni_str_1(reinterpret_cast<const char16_t*>(byte_offset1),
67 length);
68 return uni_str_1.caseCompare(reinterpret_cast<const char16_t*>(byte_offset2),
69 length, U_FOLD_CASE_DEFAULT) == 0;
70 #else
71 uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
72 uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
73 size_t length = byte_length >> 1;
74 DCHECK_NOT_NULL(isolate);
75 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
76 isolate->regexp_macro_assembler_canonicalize();
77 for (size_t i = 0; i < length; i++) {
78 unibrow::uchar c1 = substring1[i];
79 unibrow::uchar c2 = substring2[i];
80 if (c1 != c2) {
81 unibrow::uchar s1[1] = {c1};
82 canonicalize->get(c1, '\0', s1);
83 if (s1[0] != c2) {
84 unibrow::uchar s2[1] = {c2};
85 canonicalize->get(c2, '\0', s2);
86 if (s1[0] != s2[0]) {
87 return 0;
88 }
89 }
90 }
91 }
92 return 1;
93 #endif // V8_INTL_SUPPORT
94 }
95
CheckNotInSurrogatePair(int cp_offset,Label * on_failure)96 void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
97 Label* on_failure) {
98 Label ok;
99 // Check that current character is not a trail surrogate.
100 LoadCurrentCharacter(cp_offset, &ok);
101 CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
102 // Check that previous character is not a lead surrogate.
103 LoadCurrentCharacter(cp_offset - 1, &ok);
104 CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
105 Bind(&ok);
106 }
107
CheckPosition(int cp_offset,Label * on_outside_input)108 void RegExpMacroAssembler::CheckPosition(int cp_offset,
109 Label* on_outside_input) {
110 LoadCurrentCharacter(cp_offset, on_outside_input, true);
111 }
112
LoadCurrentCharacter(int cp_offset,Label * on_end_of_input,bool check_bounds,int characters,int eats_at_least)113 void RegExpMacroAssembler::LoadCurrentCharacter(int cp_offset,
114 Label* on_end_of_input,
115 bool check_bounds,
116 int characters,
117 int eats_at_least) {
118 // By default, eats_at_least = characters.
119 if (eats_at_least == kUseCharactersValue) {
120 eats_at_least = characters;
121 }
122
123 LoadCurrentCharacterImpl(cp_offset, on_end_of_input, check_bounds, characters,
124 eats_at_least);
125 }
126
CheckSpecialCharacterClass(uc16 type,Label * on_no_match)127 bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
128 Label* on_no_match) {
129 return false;
130 }
131
NativeRegExpMacroAssembler(Isolate * isolate,Zone * zone)132 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
133 Zone* zone)
134 : RegExpMacroAssembler(isolate, zone) {}
135
136 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() = default;
137
LoadCurrentCharacterImpl(int cp_offset,Label * on_end_of_input,bool check_bounds,int characters,int eats_at_least)138 void NativeRegExpMacroAssembler::LoadCurrentCharacterImpl(
139 int cp_offset, Label* on_end_of_input, bool check_bounds, int characters,
140 int eats_at_least) {
141 // It's possible to preload a small number of characters when each success
142 // path requires a large number of characters, but not the reverse.
143 DCHECK_GE(eats_at_least, characters);
144
145 DCHECK(base::IsInRange(cp_offset, kMinCPOffset, kMaxCPOffset));
146 if (check_bounds) {
147 if (cp_offset >= 0) {
148 CheckPosition(cp_offset + eats_at_least - 1, on_end_of_input);
149 } else {
150 CheckPosition(cp_offset, on_end_of_input);
151 }
152 }
153 LoadCurrentCharacterUnchecked(cp_offset, characters);
154 }
155
CanReadUnaligned()156 bool NativeRegExpMacroAssembler::CanReadUnaligned() {
157 return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
158 }
159
160 #ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
161
162 // This method may only be called after an interrupt.
CheckStackGuardState(Isolate * isolate,int start_index,RegExp::CallOrigin call_origin,Address * return_address,Code re_code,Address * subject,const byte ** input_start,const byte ** input_end)163 int NativeRegExpMacroAssembler::CheckStackGuardState(
164 Isolate* isolate, int start_index, RegExp::CallOrigin call_origin,
165 Address* return_address, Code re_code, Address* subject,
166 const byte** input_start, const byte** input_end) {
167 DisallowGarbageCollection no_gc;
168 Address old_pc = PointerAuthentication::AuthenticatePC(return_address, 0);
169 DCHECK_LE(re_code.raw_instruction_start(), old_pc);
170 DCHECK_LE(old_pc, re_code.raw_instruction_end());
171
172 StackLimitCheck check(isolate);
173 bool js_has_overflowed = check.JsHasOverflowed();
174
175 if (call_origin == RegExp::CallOrigin::kFromJs) {
176 // Direct calls from JavaScript can be interrupted in two ways:
177 // 1. A real stack overflow, in which case we let the caller throw the
178 // exception.
179 // 2. The stack guard was used to interrupt execution for another purpose,
180 // forcing the call through the runtime system.
181
182 // Bug(v8:9540) Investigate why this method is called from JS although no
183 // stackoverflow or interrupt is pending on ARM64. We return 0 in this case
184 // to continue execution normally.
185 if (js_has_overflowed) {
186 return EXCEPTION;
187 } else if (check.InterruptRequested()) {
188 return RETRY;
189 } else {
190 return 0;
191 }
192 }
193 DCHECK(call_origin == RegExp::CallOrigin::kFromRuntime);
194
195 // Prepare for possible GC.
196 HandleScope handles(isolate);
197 Handle<Code> code_handle(re_code, isolate);
198 Handle<String> subject_handle(String::cast(Object(*subject)), isolate);
199 bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject_handle);
200 int return_value = 0;
201
202 {
203 DisableGCMole no_gc_mole;
204 if (js_has_overflowed) {
205 AllowGarbageCollection yes_gc;
206 isolate->StackOverflow();
207 return_value = EXCEPTION;
208 } else if (check.InterruptRequested()) {
209 AllowGarbageCollection yes_gc;
210 Object result = isolate->stack_guard()->HandleInterrupts();
211 if (result.IsException(isolate)) return_value = EXCEPTION;
212 }
213
214 if (*code_handle != re_code) { // Return address no longer valid
215 // Overwrite the return address on the stack.
216 intptr_t delta = code_handle->address() - re_code.address();
217 Address new_pc = old_pc + delta;
218 // TODO(v8:10026): avoid replacing a signed pointer.
219 PointerAuthentication::ReplacePC(return_address, new_pc, 0);
220 }
221 }
222
223 // If we continue, we need to update the subject string addresses.
224 if (return_value == 0) {
225 // String encoding might have changed.
226 if (String::IsOneByteRepresentationUnderneath(*subject_handle) !=
227 is_one_byte) {
228 // If we changed between an LATIN1 and an UC16 string, the specialized
229 // code cannot be used, and we need to restart regexp matching from
230 // scratch (including, potentially, compiling a new version of the code).
231 return_value = RETRY;
232 } else {
233 *subject = subject_handle->ptr();
234 intptr_t byte_length = *input_end - *input_start;
235 *input_start = subject_handle->AddressOfCharacterAt(start_index, no_gc);
236 *input_end = *input_start + byte_length;
237 }
238 }
239 return return_value;
240 }
241
242 // Returns a {Result} sentinel, or the number of successful matches.
Match(Handle<JSRegExp> regexp,Handle<String> subject,int * offsets_vector,int offsets_vector_length,int previous_index,Isolate * isolate)243 int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp,
244 Handle<String> subject,
245 int* offsets_vector,
246 int offsets_vector_length,
247 int previous_index, Isolate* isolate) {
248 DCHECK(subject->IsFlat());
249 DCHECK_LE(0, previous_index);
250 DCHECK_LE(previous_index, subject->length());
251
252 // No allocations before calling the regexp, but we can't use
253 // DisallowGarbageCollection, since regexps might be preempted, and another
254 // thread might do allocation anyway.
255
256 String subject_ptr = *subject;
257 // Character offsets into string.
258 int start_offset = previous_index;
259 int char_length = subject_ptr.length() - start_offset;
260 int slice_offset = 0;
261
262 // The string has been flattened, so if it is a cons string it contains the
263 // full string in the first part.
264 if (StringShape(subject_ptr).IsCons()) {
265 DCHECK_EQ(0, ConsString::cast(subject_ptr).second().length());
266 subject_ptr = ConsString::cast(subject_ptr).first();
267 } else if (StringShape(subject_ptr).IsSliced()) {
268 SlicedString slice = SlicedString::cast(subject_ptr);
269 subject_ptr = slice.parent();
270 slice_offset = slice.offset();
271 }
272 if (StringShape(subject_ptr).IsThin()) {
273 subject_ptr = ThinString::cast(subject_ptr).actual();
274 }
275 // Ensure that an underlying string has the same representation.
276 bool is_one_byte = subject_ptr.IsOneByteRepresentation();
277 DCHECK(subject_ptr.IsExternalString() || subject_ptr.IsSeqString());
278 // String is now either Sequential or External
279 int char_size_shift = is_one_byte ? 0 : 1;
280
281 DisallowGarbageCollection no_gc;
282 const byte* input_start =
283 subject_ptr.AddressOfCharacterAt(start_offset + slice_offset, no_gc);
284 int byte_length = char_length << char_size_shift;
285 const byte* input_end = input_start + byte_length;
286 return Execute(*subject, start_offset, input_start, input_end, offsets_vector,
287 offsets_vector_length, isolate, *regexp);
288 }
289
290 // Returns a {Result} sentinel, or the number of successful matches.
291 // TODO(pthier): The JSRegExp object is passed to native irregexp code to match
292 // the signature of the interpreter. We should get rid of JS objects passed to
293 // internal methods.
Execute(String input,int start_offset,const byte * input_start,const byte * input_end,int * output,int output_size,Isolate * isolate,JSRegExp regexp)294 int NativeRegExpMacroAssembler::Execute(
295 String input, // This needs to be the unpacked (sliced, cons) string.
296 int start_offset, const byte* input_start, const byte* input_end,
297 int* output, int output_size, Isolate* isolate, JSRegExp regexp) {
298 // Ensure that the minimum stack has been allocated.
299 RegExpStackScope stack_scope(isolate);
300 Address stack_base = stack_scope.stack()->stack_base();
301
302 bool is_one_byte = String::IsOneByteRepresentationUnderneath(input);
303 Code code = Code::cast(regexp.Code(is_one_byte));
304 RegExp::CallOrigin call_origin = RegExp::CallOrigin::kFromRuntime;
305
306 using RegexpMatcherSig = int(
307 Address input_string, int start_offset, const byte* input_start,
308 const byte* input_end, int* output, int output_size, Address stack_base,
309 int call_origin, Isolate* isolate, Address regexp);
310
311 auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(code);
312 int result =
313 fn.Call(input.ptr(), start_offset, input_start, input_end, output,
314 output_size, stack_base, call_origin, isolate, regexp.ptr());
315 DCHECK_GE(result, SMALLEST_REGEXP_RESULT);
316
317 if (result == EXCEPTION && !isolate->has_pending_exception()) {
318 // We detected a stack overflow (on the backtrack stack) in RegExp code,
319 // but haven't created the exception yet. Additionally, we allow heap
320 // allocation because even though it invalidates {input_start} and
321 // {input_end}, we are about to return anyway.
322 AllowGarbageCollection allow_allocation;
323 isolate->StackOverflow();
324 }
325 return result;
326 }
327
328 #endif // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
329
330 // clang-format off
331 const byte NativeRegExpMacroAssembler::word_character_map[] = {
332 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
333 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
334 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
335 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
336
337 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
338 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
339 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // '0' - '7'
340 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // '8' - '9'
341
342 0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'A' - 'G'
343 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'H' - 'O'
344 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'P' - 'W'
345 0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu, // 'X' - 'Z', '_'
346
347 0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'a' - 'g'
348 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'h' - 'o'
349 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'p' - 'w'
350 0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
351 // Latin-1 range
352 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
353 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
354 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
355 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
356
357 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
358 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
359 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
360 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
361
362 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
363 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
364 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
365 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
366
367 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
368 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
369 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
370 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
371 };
372 // clang-format on
373
GrowStack(Address stack_pointer,Address * stack_base,Isolate * isolate)374 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
375 Address* stack_base,
376 Isolate* isolate) {
377 RegExpStack* regexp_stack = isolate->regexp_stack();
378 size_t size = regexp_stack->stack_capacity();
379 Address old_stack_base = regexp_stack->stack_base();
380 DCHECK(old_stack_base == *stack_base);
381 DCHECK(stack_pointer <= old_stack_base);
382 DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
383 Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
384 if (new_stack_base == kNullAddress) {
385 return kNullAddress;
386 }
387 *stack_base = new_stack_base;
388 intptr_t stack_content_size = old_stack_base - stack_pointer;
389 return new_stack_base - stack_content_size;
390 }
391
392 } // namespace internal
393 } // namespace v8
394