1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "irregexp/imported/regexp-macro-assembler.h"
6 
7 #include "irregexp/imported/regexp-stack.h"
8 #include "irregexp/imported/special-case.h"
9 
10 #ifdef V8_INTL_SUPPORT
11 #include "unicode/uchar.h"
12 #include "unicode/unistr.h"
13 #endif  // V8_INTL_SUPPORT
14 
15 namespace v8 {
16 namespace internal {
17 
RegExpMacroAssembler(Isolate * isolate,Zone * zone)18 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
19     : slow_safe_compiler_(false),
20       global_mode_(NOT_GLOBAL),
21       isolate_(isolate),
22       zone_(zone) {}
23 
24 RegExpMacroAssembler::~RegExpMacroAssembler() = default;
25 
CaseInsensitiveCompareNonUnicode(Address byte_offset1,Address byte_offset2,size_t byte_length,Isolate * isolate)26 int RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(Address byte_offset1,
27                                                            Address byte_offset2,
28                                                            size_t byte_length,
29                                                            Isolate* isolate) {
30 #ifdef V8_INTL_SUPPORT
31   // This function is not allowed to cause a garbage collection.
32   // A GC might move the calling generated code and invalidate the
33   // return address on the stack.
34   DisallowGarbageCollection no_gc;
35   DCHECK_EQ(0, byte_length % 2);
36   size_t length = byte_length / 2;
37   uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
38   uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
39 
40   for (size_t i = 0; i < length; i++) {
41     UChar32 c1 = RegExpCaseFolding::Canonicalize(substring1[i]);
42     UChar32 c2 = RegExpCaseFolding::Canonicalize(substring2[i]);
43     if (c1 != c2) {
44       return 0;
45     }
46   }
47   return 1;
48 #else
49   return CaseInsensitiveCompareUnicode(byte_offset1, byte_offset2, byte_length,
50                                        isolate);
51 #endif
52 }
53 
CaseInsensitiveCompareUnicode(Address byte_offset1,Address byte_offset2,size_t byte_length,Isolate * isolate)54 int RegExpMacroAssembler::CaseInsensitiveCompareUnicode(Address byte_offset1,
55                                                         Address byte_offset2,
56                                                         size_t byte_length,
57                                                         Isolate* isolate) {
58   // This function is not allowed to cause a garbage collection.
59   // A GC might move the calling generated code and invalidate the
60   // return address on the stack.
61   DisallowGarbageCollection no_gc;
62   DCHECK_EQ(0, byte_length % 2);
63 
64 #ifdef V8_INTL_SUPPORT
65   int32_t length = static_cast<int32_t>(byte_length >> 1);
66   icu::UnicodeString uni_str_1(reinterpret_cast<const char16_t*>(byte_offset1),
67                                length);
68   return uni_str_1.caseCompare(reinterpret_cast<const char16_t*>(byte_offset2),
69                                length, U_FOLD_CASE_DEFAULT) == 0;
70 #else
71   uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
72   uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
73   size_t length = byte_length >> 1;
74   DCHECK_NOT_NULL(isolate);
75   unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
76       isolate->regexp_macro_assembler_canonicalize();
77   for (size_t i = 0; i < length; i++) {
78     unibrow::uchar c1 = substring1[i];
79     unibrow::uchar c2 = substring2[i];
80     if (c1 != c2) {
81       unibrow::uchar s1[1] = {c1};
82       canonicalize->get(c1, '\0', s1);
83       if (s1[0] != c2) {
84         unibrow::uchar s2[1] = {c2};
85         canonicalize->get(c2, '\0', s2);
86         if (s1[0] != s2[0]) {
87           return 0;
88         }
89       }
90     }
91   }
92   return 1;
93 #endif  // V8_INTL_SUPPORT
94 }
95 
CheckNotInSurrogatePair(int cp_offset,Label * on_failure)96 void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
97                                                    Label* on_failure) {
98   Label ok;
99   // Check that current character is not a trail surrogate.
100   LoadCurrentCharacter(cp_offset, &ok);
101   CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
102   // Check that previous character is not a lead surrogate.
103   LoadCurrentCharacter(cp_offset - 1, &ok);
104   CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
105   Bind(&ok);
106 }
107 
CheckPosition(int cp_offset,Label * on_outside_input)108 void RegExpMacroAssembler::CheckPosition(int cp_offset,
109                                          Label* on_outside_input) {
110   LoadCurrentCharacter(cp_offset, on_outside_input, true);
111 }
112 
LoadCurrentCharacter(int cp_offset,Label * on_end_of_input,bool check_bounds,int characters,int eats_at_least)113 void RegExpMacroAssembler::LoadCurrentCharacter(int cp_offset,
114                                                 Label* on_end_of_input,
115                                                 bool check_bounds,
116                                                 int characters,
117                                                 int eats_at_least) {
118   // By default, eats_at_least = characters.
119   if (eats_at_least == kUseCharactersValue) {
120     eats_at_least = characters;
121   }
122 
123   LoadCurrentCharacterImpl(cp_offset, on_end_of_input, check_bounds, characters,
124                            eats_at_least);
125 }
126 
CheckSpecialCharacterClass(uc16 type,Label * on_no_match)127 bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
128                                                       Label* on_no_match) {
129   return false;
130 }
131 
NativeRegExpMacroAssembler(Isolate * isolate,Zone * zone)132 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
133                                                        Zone* zone)
134     : RegExpMacroAssembler(isolate, zone) {}
135 
136 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() = default;
137 
LoadCurrentCharacterImpl(int cp_offset,Label * on_end_of_input,bool check_bounds,int characters,int eats_at_least)138 void NativeRegExpMacroAssembler::LoadCurrentCharacterImpl(
139     int cp_offset, Label* on_end_of_input, bool check_bounds, int characters,
140     int eats_at_least) {
141   // It's possible to preload a small number of characters when each success
142   // path requires a large number of characters, but not the reverse.
143   DCHECK_GE(eats_at_least, characters);
144 
145   DCHECK(base::IsInRange(cp_offset, kMinCPOffset, kMaxCPOffset));
146   if (check_bounds) {
147     if (cp_offset >= 0) {
148       CheckPosition(cp_offset + eats_at_least - 1, on_end_of_input);
149     } else {
150       CheckPosition(cp_offset, on_end_of_input);
151     }
152   }
153   LoadCurrentCharacterUnchecked(cp_offset, characters);
154 }
155 
CanReadUnaligned()156 bool NativeRegExpMacroAssembler::CanReadUnaligned() {
157   return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
158 }
159 
160 #ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
161 
162 // This method may only be called after an interrupt.
CheckStackGuardState(Isolate * isolate,int start_index,RegExp::CallOrigin call_origin,Address * return_address,Code re_code,Address * subject,const byte ** input_start,const byte ** input_end)163 int NativeRegExpMacroAssembler::CheckStackGuardState(
164     Isolate* isolate, int start_index, RegExp::CallOrigin call_origin,
165     Address* return_address, Code re_code, Address* subject,
166     const byte** input_start, const byte** input_end) {
167   DisallowGarbageCollection no_gc;
168   Address old_pc = PointerAuthentication::AuthenticatePC(return_address, 0);
169   DCHECK_LE(re_code.raw_instruction_start(), old_pc);
170   DCHECK_LE(old_pc, re_code.raw_instruction_end());
171 
172   StackLimitCheck check(isolate);
173   bool js_has_overflowed = check.JsHasOverflowed();
174 
175   if (call_origin == RegExp::CallOrigin::kFromJs) {
176     // Direct calls from JavaScript can be interrupted in two ways:
177     // 1. A real stack overflow, in which case we let the caller throw the
178     //    exception.
179     // 2. The stack guard was used to interrupt execution for another purpose,
180     //    forcing the call through the runtime system.
181 
182     // Bug(v8:9540) Investigate why this method is called from JS although no
183     // stackoverflow or interrupt is pending on ARM64. We return 0 in this case
184     // to continue execution normally.
185     if (js_has_overflowed) {
186       return EXCEPTION;
187     } else if (check.InterruptRequested()) {
188       return RETRY;
189     } else {
190       return 0;
191     }
192   }
193   DCHECK(call_origin == RegExp::CallOrigin::kFromRuntime);
194 
195   // Prepare for possible GC.
196   HandleScope handles(isolate);
197   Handle<Code> code_handle(re_code, isolate);
198   Handle<String> subject_handle(String::cast(Object(*subject)), isolate);
199   bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject_handle);
200   int return_value = 0;
201 
202   {
203     DisableGCMole no_gc_mole;
204     if (js_has_overflowed) {
205       AllowGarbageCollection yes_gc;
206       isolate->StackOverflow();
207       return_value = EXCEPTION;
208     } else if (check.InterruptRequested()) {
209       AllowGarbageCollection yes_gc;
210       Object result = isolate->stack_guard()->HandleInterrupts();
211       if (result.IsException(isolate)) return_value = EXCEPTION;
212     }
213 
214     if (*code_handle != re_code) {  // Return address no longer valid
215       // Overwrite the return address on the stack.
216       intptr_t delta = code_handle->address() - re_code.address();
217       Address new_pc = old_pc + delta;
218       // TODO(v8:10026): avoid replacing a signed pointer.
219       PointerAuthentication::ReplacePC(return_address, new_pc, 0);
220     }
221   }
222 
223   // If we continue, we need to update the subject string addresses.
224   if (return_value == 0) {
225     // String encoding might have changed.
226     if (String::IsOneByteRepresentationUnderneath(*subject_handle) !=
227         is_one_byte) {
228       // If we changed between an LATIN1 and an UC16 string, the specialized
229       // code cannot be used, and we need to restart regexp matching from
230       // scratch (including, potentially, compiling a new version of the code).
231       return_value = RETRY;
232     } else {
233       *subject = subject_handle->ptr();
234       intptr_t byte_length = *input_end - *input_start;
235       *input_start = subject_handle->AddressOfCharacterAt(start_index, no_gc);
236       *input_end = *input_start + byte_length;
237     }
238   }
239   return return_value;
240 }
241 
242 // Returns a {Result} sentinel, or the number of successful matches.
Match(Handle<JSRegExp> regexp,Handle<String> subject,int * offsets_vector,int offsets_vector_length,int previous_index,Isolate * isolate)243 int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp,
244                                       Handle<String> subject,
245                                       int* offsets_vector,
246                                       int offsets_vector_length,
247                                       int previous_index, Isolate* isolate) {
248   DCHECK(subject->IsFlat());
249   DCHECK_LE(0, previous_index);
250   DCHECK_LE(previous_index, subject->length());
251 
252   // No allocations before calling the regexp, but we can't use
253   // DisallowGarbageCollection, since regexps might be preempted, and another
254   // thread might do allocation anyway.
255 
256   String subject_ptr = *subject;
257   // Character offsets into string.
258   int start_offset = previous_index;
259   int char_length = subject_ptr.length() - start_offset;
260   int slice_offset = 0;
261 
262   // The string has been flattened, so if it is a cons string it contains the
263   // full string in the first part.
264   if (StringShape(subject_ptr).IsCons()) {
265     DCHECK_EQ(0, ConsString::cast(subject_ptr).second().length());
266     subject_ptr = ConsString::cast(subject_ptr).first();
267   } else if (StringShape(subject_ptr).IsSliced()) {
268     SlicedString slice = SlicedString::cast(subject_ptr);
269     subject_ptr = slice.parent();
270     slice_offset = slice.offset();
271   }
272   if (StringShape(subject_ptr).IsThin()) {
273     subject_ptr = ThinString::cast(subject_ptr).actual();
274   }
275   // Ensure that an underlying string has the same representation.
276   bool is_one_byte = subject_ptr.IsOneByteRepresentation();
277   DCHECK(subject_ptr.IsExternalString() || subject_ptr.IsSeqString());
278   // String is now either Sequential or External
279   int char_size_shift = is_one_byte ? 0 : 1;
280 
281   DisallowGarbageCollection no_gc;
282   const byte* input_start =
283       subject_ptr.AddressOfCharacterAt(start_offset + slice_offset, no_gc);
284   int byte_length = char_length << char_size_shift;
285   const byte* input_end = input_start + byte_length;
286   return Execute(*subject, start_offset, input_start, input_end, offsets_vector,
287                  offsets_vector_length, isolate, *regexp);
288 }
289 
290 // Returns a {Result} sentinel, or the number of successful matches.
291 // TODO(pthier): The JSRegExp object is passed to native irregexp code to match
292 // the signature of the interpreter. We should get rid of JS objects passed to
293 // internal methods.
Execute(String input,int start_offset,const byte * input_start,const byte * input_end,int * output,int output_size,Isolate * isolate,JSRegExp regexp)294 int NativeRegExpMacroAssembler::Execute(
295     String input,  // This needs to be the unpacked (sliced, cons) string.
296     int start_offset, const byte* input_start, const byte* input_end,
297     int* output, int output_size, Isolate* isolate, JSRegExp regexp) {
298   // Ensure that the minimum stack has been allocated.
299   RegExpStackScope stack_scope(isolate);
300   Address stack_base = stack_scope.stack()->stack_base();
301 
302   bool is_one_byte = String::IsOneByteRepresentationUnderneath(input);
303   Code code = Code::cast(regexp.Code(is_one_byte));
304   RegExp::CallOrigin call_origin = RegExp::CallOrigin::kFromRuntime;
305 
306   using RegexpMatcherSig = int(
307       Address input_string, int start_offset, const byte* input_start,
308       const byte* input_end, int* output, int output_size, Address stack_base,
309       int call_origin, Isolate* isolate, Address regexp);
310 
311   auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(code);
312   int result =
313       fn.Call(input.ptr(), start_offset, input_start, input_end, output,
314               output_size, stack_base, call_origin, isolate, regexp.ptr());
315   DCHECK_GE(result, SMALLEST_REGEXP_RESULT);
316 
317   if (result == EXCEPTION && !isolate->has_pending_exception()) {
318     // We detected a stack overflow (on the backtrack stack) in RegExp code,
319     // but haven't created the exception yet. Additionally, we allow heap
320     // allocation because even though it invalidates {input_start} and
321     // {input_end}, we are about to return anyway.
322     AllowGarbageCollection allow_allocation;
323     isolate->StackOverflow();
324   }
325   return result;
326 }
327 
328 #endif  // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
329 
330 // clang-format off
// Table with 256 entries, one per character code 0x00-0xFF. An entry is 0xFF
// for the ASCII word characters ('0'-'9', 'A'-'Z', '_', 'a'-'z') and 0x00 for
// every other code, including the whole upper (Latin-1) half. Each row below
// covers eight consecutive character codes.
const byte NativeRegExpMacroAssembler::word_character_map[] = {
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // '0' - '7'
    0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // '8' - '9'

    0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'A' - 'G'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'H' - 'O'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'P' - 'W'
    0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu,  // 'X' - 'Z', '_'

    0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'a' - 'g'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'h' - 'o'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'p' - 'w'
    0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
    // Latin-1 range
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
};
372 // clang-format on
373 
GrowStack(Address stack_pointer,Address * stack_base,Isolate * isolate)374 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
375                                               Address* stack_base,
376                                               Isolate* isolate) {
377   RegExpStack* regexp_stack = isolate->regexp_stack();
378   size_t size = regexp_stack->stack_capacity();
379   Address old_stack_base = regexp_stack->stack_base();
380   DCHECK(old_stack_base == *stack_base);
381   DCHECK(stack_pointer <= old_stack_base);
382   DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
383   Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
384   if (new_stack_base == kNullAddress) {
385     return kNullAddress;
386   }
387   *stack_base = new_stack_base;
388   intptr_t stack_content_size = old_stack_base - stack_pointer;
389   return new_stack_base - stack_content_size;
390 }
391 
392 }  // namespace internal
393 }  // namespace v8
394