1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "new-regexp/regexp-macro-assembler.h"
6 
7 #include "new-regexp/regexp-stack.h"
8 
9 #ifdef V8_INTL_SUPPORT
10 #include "unicode/uchar.h"
11 #include "unicode/unistr.h"
12 #endif  // V8_INTL_SUPPORT
13 
14 namespace v8 {
15 namespace internal {
16 
RegExpMacroAssembler(Isolate * isolate,Zone * zone)17 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
18     : slow_safe_compiler_(false),
19       global_mode_(NOT_GLOBAL),
20       isolate_(isolate),
21       zone_(zone) {}
22 
23 RegExpMacroAssembler::~RegExpMacroAssembler() = default;
24 
CaseInsensitiveCompareUC16(Address byte_offset1,Address byte_offset2,size_t byte_length,Isolate * isolate)25 int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
26                                                      Address byte_offset2,
27                                                      size_t byte_length,
28                                                      Isolate* isolate) {
29   // This function is not allowed to cause a garbage collection.
30   // A GC might move the calling generated code and invalidate the
31   // return address on the stack.
32   DCHECK_EQ(0, byte_length % 2);
33 
34 #ifdef V8_INTL_SUPPORT
35   int32_t length = (int32_t)(byte_length >> 1);
36   icu::UnicodeString uni_str_1(reinterpret_cast<const char16_t*>(byte_offset1),
37                                length);
38   return uni_str_1.caseCompare(reinterpret_cast<const char16_t*>(byte_offset2),
39                                length, U_FOLD_CASE_DEFAULT) == 0;
40 #else
41   uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
42   uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
43   size_t length = byte_length >> 1;
44   DCHECK_NOT_NULL(isolate);
45   unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
46       isolate->regexp_macro_assembler_canonicalize();
47   for (size_t i = 0; i < length; i++) {
48     unibrow::uchar c1 = substring1[i];
49     unibrow::uchar c2 = substring2[i];
50     if (c1 != c2) {
51       unibrow::uchar s1[1] = {c1};
52       canonicalize->get(c1, '\0', s1);
53       if (s1[0] != c2) {
54         unibrow::uchar s2[1] = {c2};
55         canonicalize->get(c2, '\0', s2);
56         if (s1[0] != s2[0]) {
57           return 0;
58         }
59       }
60     }
61   }
62   return 1;
63 #endif  // V8_INTL_SUPPORT
64 }
65 
66 
CheckNotInSurrogatePair(int cp_offset,Label * on_failure)67 void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
68                                                    Label* on_failure) {
69   Label ok;
70   // Check that current character is not a trail surrogate.
71   LoadCurrentCharacter(cp_offset, &ok);
72   CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
73   // Check that previous character is not a lead surrogate.
74   LoadCurrentCharacter(cp_offset - 1, &ok);
75   CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
76   Bind(&ok);
77 }
78 
CheckPosition(int cp_offset,Label * on_outside_input)79 void RegExpMacroAssembler::CheckPosition(int cp_offset,
80                                          Label* on_outside_input) {
81   LoadCurrentCharacter(cp_offset, on_outside_input, true);
82 }
83 
LoadCurrentCharacter(int cp_offset,Label * on_end_of_input,bool check_bounds,int characters,int eats_at_least)84 void RegExpMacroAssembler::LoadCurrentCharacter(int cp_offset,
85                                                 Label* on_end_of_input,
86                                                 bool check_bounds,
87                                                 int characters,
88                                                 int eats_at_least) {
89   // By default, eats_at_least = characters.
90   if (eats_at_least == kUseCharactersValue) {
91     eats_at_least = characters;
92   }
93 
94   LoadCurrentCharacterImpl(cp_offset, on_end_of_input, check_bounds, characters,
95                            eats_at_least);
96 }
97 
CheckSpecialCharacterClass(uc16 type,Label * on_no_match)98 bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
99                                                       Label* on_no_match) {
100   return false;
101 }
102 
NativeRegExpMacroAssembler(Isolate * isolate,Zone * zone)103 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
104                                                        Zone* zone)
105     : RegExpMacroAssembler(isolate, zone) {}
106 
107 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() = default;
108 
LoadCurrentCharacterImpl(int cp_offset,Label * on_end_of_input,bool check_bounds,int characters,int eats_at_least)109 void NativeRegExpMacroAssembler::LoadCurrentCharacterImpl(
110     int cp_offset, Label* on_end_of_input, bool check_bounds, int characters,
111     int eats_at_least) {
112   // It's possible to preload a small number of characters when each success
113   // path requires a large number of characters, but not the reverse.
114   DCHECK_GE(eats_at_least, characters);
115 
116   DCHECK(base::IsInRange(cp_offset, kMinCPOffset, kMaxCPOffset));
117   if (check_bounds) {
118     if (cp_offset >= 0) {
119       CheckPosition(cp_offset + eats_at_least - 1, on_end_of_input);
120     } else {
121       CheckPosition(cp_offset, on_end_of_input);
122     }
123   }
124   LoadCurrentCharacterUnchecked(cp_offset, characters);
125 }
126 
CanReadUnaligned()127 bool NativeRegExpMacroAssembler::CanReadUnaligned() {
128   return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
129 }
130 
131 #ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
132 
133 // This method may only be called after an interrupt.
CheckStackGuardState(Isolate * isolate,int start_index,RegExp::CallOrigin call_origin,Address * return_address,Code re_code,Address * subject,const byte ** input_start,const byte ** input_end)134 int NativeRegExpMacroAssembler::CheckStackGuardState(
135     Isolate* isolate, int start_index, RegExp::CallOrigin call_origin,
136     Address* return_address, Code re_code, Address* subject,
137     const byte** input_start, const byte** input_end) {
138   DisallowHeapAllocation no_gc;
139   Address old_pc = PointerAuthentication::AuthenticatePC(return_address, 0);
140   DCHECK_LE(re_code.raw_instruction_start(), old_pc);
141   DCHECK_LE(old_pc, re_code.raw_instruction_end());
142 
143   StackLimitCheck check(isolate);
144   bool js_has_overflowed = check.JsHasOverflowed();
145 
146   if (call_origin == RegExp::CallOrigin::kFromJs) {
147     // Direct calls from JavaScript can be interrupted in two ways:
148     // 1. A real stack overflow, in which case we let the caller throw the
149     //    exception.
150     // 2. The stack guard was used to interrupt execution for another purpose,
151     //    forcing the call through the runtime system.
152 
153     // Bug(v8:9540) Investigate why this method is called from JS although no
154     // stackoverflow or interrupt is pending on ARM64. We return 0 in this case
155     // to continue execution normally.
156     if (js_has_overflowed) {
157       return EXCEPTION;
158     } else if (check.InterruptRequested()) {
159       return RETRY;
160     } else {
161       return 0;
162     }
163   }
164   DCHECK(call_origin == RegExp::CallOrigin::kFromRuntime);
165 
166   // Prepare for possible GC.
167   HandleScope handles(isolate);
168   Handle<Code> code_handle(re_code, isolate);
169   Handle<String> subject_handle(String::cast(Object(*subject)), isolate);
170   bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject_handle);
171   int return_value = 0;
172 
173   if (js_has_overflowed) {
174     AllowHeapAllocation yes_gc;
175     isolate->StackOverflow();
176     return_value = EXCEPTION;
177   } else if (check.InterruptRequested()) {
178     AllowHeapAllocation yes_gc;
179     Object result = isolate->stack_guard()->HandleInterrupts();
180     if (result.IsException(isolate)) return_value = EXCEPTION;
181   }
182 
183   if (*code_handle != re_code) {  // Return address no longer valid
184     // Overwrite the return address on the stack.
185     intptr_t delta = code_handle->address() - re_code.address();
186     Address new_pc = old_pc + delta;
187     // TODO(v8:10026): avoid replacing a signed pointer.
188     PointerAuthentication::ReplacePC(return_address, new_pc, 0);
189   }
190 
191   // If we continue, we need to update the subject string addresses.
192   if (return_value == 0) {
193     // String encoding might have changed.
194     if (String::IsOneByteRepresentationUnderneath(*subject_handle) !=
195         is_one_byte) {
196       // If we changed between an LATIN1 and an UC16 string, the specialized
197       // code cannot be used, and we need to restart regexp matching from
198       // scratch (including, potentially, compiling a new version of the code).
199       return_value = RETRY;
200     } else {
201       *subject = subject_handle->ptr();
202       intptr_t byte_length = *input_end - *input_start;
203       *input_start = subject_handle->AddressOfCharacterAt(start_index, no_gc);
204       *input_end = *input_start + byte_length;
205     }
206   }
207   return return_value;
208 }
209 
210 // Returns a {Result} sentinel, or the number of successful matches.
Match(Handle<JSRegExp> regexp,Handle<String> subject,int * offsets_vector,int offsets_vector_length,int previous_index,Isolate * isolate)211 int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp,
212                                       Handle<String> subject,
213                                       int* offsets_vector,
214                                       int offsets_vector_length,
215                                       int previous_index, Isolate* isolate) {
216   DCHECK(subject->IsFlat());
217   DCHECK_LE(0, previous_index);
218   DCHECK_LE(previous_index, subject->length());
219 
220   // No allocations before calling the regexp, but we can't use
221   // DisallowHeapAllocation, since regexps might be preempted, and another
222   // thread might do allocation anyway.
223 
224   String subject_ptr = *subject;
225   // Character offsets into string.
226   int start_offset = previous_index;
227   int char_length = subject_ptr.length() - start_offset;
228   int slice_offset = 0;
229 
230   // The string has been flattened, so if it is a cons string it contains the
231   // full string in the first part.
232   if (StringShape(subject_ptr).IsCons()) {
233     DCHECK_EQ(0, ConsString::cast(subject_ptr).second().length());
234     subject_ptr = ConsString::cast(subject_ptr).first();
235   } else if (StringShape(subject_ptr).IsSliced()) {
236     SlicedString slice = SlicedString::cast(subject_ptr);
237     subject_ptr = slice.parent();
238     slice_offset = slice.offset();
239   }
240   if (StringShape(subject_ptr).IsThin()) {
241     subject_ptr = ThinString::cast(subject_ptr).actual();
242   }
243   // Ensure that an underlying string has the same representation.
244   bool is_one_byte = subject_ptr.IsOneByteRepresentation();
245   DCHECK(subject_ptr.IsExternalString() || subject_ptr.IsSeqString());
246   // String is now either Sequential or External
247   int char_size_shift = is_one_byte ? 0 : 1;
248 
249   DisallowHeapAllocation no_gc;
250   const byte* input_start =
251       subject_ptr.AddressOfCharacterAt(start_offset + slice_offset, no_gc);
252   int byte_length = char_length << char_size_shift;
253   const byte* input_end = input_start + byte_length;
254   return Execute(*subject, start_offset, input_start, input_end, offsets_vector,
255                  offsets_vector_length, isolate, *regexp);
256 }
257 
258 // Returns a {Result} sentinel, or the number of successful matches.
259 // TODO(pthier): The JSRegExp object is passed to native irregexp code to match
260 // the signature of the interpreter. We should get rid of JS objects passed to
261 // internal methods.
Execute(String input,int start_offset,const byte * input_start,const byte * input_end,int * output,int output_size,Isolate * isolate,JSRegExp regexp)262 int NativeRegExpMacroAssembler::Execute(
263     String input,  // This needs to be the unpacked (sliced, cons) string.
264     int start_offset, const byte* input_start, const byte* input_end,
265     int* output, int output_size, Isolate* isolate, JSRegExp regexp) {
266   // Ensure that the minimum stack has been allocated.
267   RegExpStackScope stack_scope(isolate);
268   Address stack_base = stack_scope.stack()->stack_base();
269 
270   bool is_one_byte = String::IsOneByteRepresentationUnderneath(input);
271   Code code = Code::cast(regexp.Code(is_one_byte));
272   RegExp::CallOrigin call_origin = RegExp::CallOrigin::kFromRuntime;
273 
274   using RegexpMatcherSig = int(
275       Address input_string, int start_offset,  // NOLINT(readability/casting)
276       const byte* input_start, const byte* input_end, int* output,
277       int output_size, Address stack_base, int call_origin, Isolate* isolate,
278       Address regexp);
279 
280   auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(code);
281   int result =
282       fn.Call(input.ptr(), start_offset, input_start, input_end, output,
283               output_size, stack_base, call_origin, isolate, regexp.ptr());
284   DCHECK(result >= RETRY);
285 
286   if (result == EXCEPTION && !isolate->has_pending_exception()) {
287     // We detected a stack overflow (on the backtrack stack) in RegExp code,
288     // but haven't created the exception yet. Additionally, we allow heap
289     // allocation because even though it invalidates {input_start} and
290     // {input_end}, we are about to return anyway.
291     AllowHeapAllocation allow_allocation;
292     isolate->StackOverflow();
293   }
294   return result;
295 }
296 
297 #endif  // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
298 
299 // clang-format off
300 const byte NativeRegExpMacroAssembler::word_character_map[] = {
301     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
302     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
303     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
304     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
305 
306     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
307     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
308     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // '0' - '7'
309     0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // '8' - '9'
310 
311     0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'A' - 'G'
312     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'H' - 'O'
313     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'P' - 'W'
314     0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu,  // 'X' - 'Z', '_'
315 
316     0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'a' - 'g'
317     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'h' - 'o'
318     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'p' - 'w'
319     0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
320     // Latin-1 range
321     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
322     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
323     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
324     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
325 
326     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
327     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
328     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
329     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
330 
331     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
332     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
333     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
334     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
335 
336     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
337     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
338     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
339     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
340 };
341 // clang-format on
342 
GrowStack(Address stack_pointer,Address * stack_base,Isolate * isolate)343 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
344                                               Address* stack_base,
345                                               Isolate* isolate) {
346   RegExpStack* regexp_stack = isolate->regexp_stack();
347   size_t size = regexp_stack->stack_capacity();
348   Address old_stack_base = regexp_stack->stack_base();
349   DCHECK(old_stack_base == *stack_base);
350   DCHECK(stack_pointer <= old_stack_base);
351   DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
352   Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
353   if (new_stack_base == kNullAddress) {
354     return kNullAddress;
355   }
356   *stack_base = new_stack_base;
357   intptr_t stack_content_size = old_stack_base - stack_pointer;
358   return new_stack_base - stack_content_size;
359 }
360 
361 }  // namespace internal
362 }  // namespace v8
363