// Copyright 2012 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 
5 #ifndef V8_REGEXP_REGEXP_H_
6 #define V8_REGEXP_REGEXP_H_
7 
8 #include "irregexp/imported/regexp-error.h"
9 #include "irregexp/RegExpShim.h"
10 
11 namespace v8 {
12 namespace internal {
13 
// Forward declarations. This header only refers to these types through
// pointers, so their full definitions are not needed here.
class RegExpNode;
class RegExpTree;
16 
// The kind of output a regexp compilation produces: bytecode or native code.
enum class RegExpCompilationTarget : int { kBytecode, kNative };
18 
19 // TODO(jgruber): Do not expose in regexp.h.
20 // TODO(jgruber): Consider splitting between ParseData and CompileData.
21 struct RegExpCompileData {
22   // The parsed AST as produced by the RegExpParser.
23   RegExpTree* tree = nullptr;
24 
25   // The compiled Node graph as produced by RegExpTree::ToNode methods.
26   RegExpNode* node = nullptr;
27 
28   // Either the generated code as produced by the compiler or a trampoline
29   // to the interpreter.
30   Handle<Object> code;
31 
32   // True, iff the pattern is a 'simple' atom with zero captures. In other
33   // words, the pattern consists of a string with no metacharacters and special
34   // regexp features, and can be implemented as a standard string search.
35   bool simple = true;
36 
37   // True, iff the pattern is anchored at the start of the string with '^'.
38   bool contains_anchor = false;
39 
40   // Only use if the pattern contains named captures. If so, this contains a
41   // mapping of capture names to capture indices.
42   Handle<FixedArray> capture_name_map;
43 
44   // The error message. Only used if an error occurred during parsing or
45   // compilation.
46   RegExpError error = RegExpError::kNone;
47 
48   // The position at which the error was detected. Only used if an
49   // error occurred.
50   int error_pos = 0;
51 
52   // The number of capture groups, without the global capture \0.
53   int capture_count = 0;
54 
55   // The number of registers used by the generated code.
56   int register_count = 0;
57 
58   // The compilation target (bytecode or native code).
59   RegExpCompilationTarget compilation_target;
60 };
61 
62 class RegExp final : public AllStatic {
63  public:
64   // Whether the irregexp engine generates interpreter bytecode.
CanGenerateBytecode()65   static bool CanGenerateBytecode() {
66     return FLAG_regexp_interpret_all || FLAG_regexp_tier_up;
67   }
68 
69   // Parses the RegExp pattern and prepares the JSRegExp object with
70   // generic data and choice of implementation - as well as what
71   // the implementation wants to store in the data field.
72   // Returns false if compilation fails.
73   V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Compile(
74       Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
75       JSRegExp::Flags flags, uint32_t backtrack_limit);
76 
77   // Ensures that a regexp is fully compiled and ready to be executed on a
78   // subject string.  Returns true on success. Return false on failure, and
79   // then an exception will be pending.
80   V8_WARN_UNUSED_RESULT static bool EnsureFullyCompiled(Isolate* isolate,
81                                                         Handle<JSRegExp> re,
82                                                         Handle<String> subject);
83 
84   enum CallOrigin : int {
85     kFromRuntime = 0,
86     kFromJs = 1,
87   };
88 
89   enum class ExecQuirks {
90     kNone,
91     // Used to work around an issue in the RegExpPrototypeSplit fast path,
92     // which diverges from the spec by not creating a sticky copy of the RegExp
93     // instance and calling `exec` in a loop. If called in this context, we
94     // must not update the last_match_info on a successful match at the subject
95     // string end. See crbug.com/1075514 for more information.
96     kTreatMatchAtEndAsFailure,
97   };
98 
99   // See ECMA-262 section 15.10.6.2.
100   // This function calls the garbage collector if necessary.
101   V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Exec(
102       Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
103       int index, Handle<RegExpMatchInfo> last_match_info,
104       ExecQuirks exec_quirks = ExecQuirks::kNone);
105 
106   V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object>
107   ExperimentalOneshotExec(Isolate* isolate, Handle<JSRegExp> regexp,
108                           Handle<String> subject, int index,
109                           Handle<RegExpMatchInfo> last_match_info,
110                           ExecQuirks exec_quirks = ExecQuirks::kNone);
111 
112   // Integral return values used throughout regexp code layers.
113   static constexpr int kInternalRegExpFailure = 0;
114   static constexpr int kInternalRegExpSuccess = 1;
115   static constexpr int kInternalRegExpException = -1;
116   static constexpr int kInternalRegExpRetry = -2;
117   static constexpr int kInternalRegExpFallbackToExperimental = -3;
118   static constexpr int kInternalRegExpSmallestResult = -3;
119 
120   enum IrregexpResult : int32_t {
121     RE_FAILURE = kInternalRegExpFailure,
122     RE_SUCCESS = kInternalRegExpSuccess,
123     RE_EXCEPTION = kInternalRegExpException,
124     RE_RETRY = kInternalRegExpRetry,
125     RE_FALLBACK_TO_EXPERIMENTAL = kInternalRegExpFallbackToExperimental,
126   };
127 
128   // Set last match info.  If match is nullptr, then setting captures is
129   // omitted.
130   static Handle<RegExpMatchInfo> SetLastMatchInfo(
131       Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
132       Handle<String> subject, int capture_count, int32_t* match);
133 
134   V8_EXPORT_PRIVATE static bool CompileForTesting(Isolate* isolate, Zone* zone,
135                                                   RegExpCompileData* input,
136                                                   JSRegExp::Flags flags,
137                                                   Handle<String> pattern,
138                                                   Handle<String> sample_subject,
139                                                   bool is_one_byte);
140 
141   V8_EXPORT_PRIVATE static void DotPrintForTesting(const char* label,
142                                                    RegExpNode* node);
143 
144   static const int kRegExpTooLargeToOptimize = 20 * KB;
145 
146   V8_WARN_UNUSED_RESULT
147   static MaybeHandle<Object> ThrowRegExpException(Isolate* isolate,
148                                                   Handle<JSRegExp> re,
149                                                   Handle<String> pattern,
150                                                   RegExpError error);
151   static void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
152                                    RegExpError error_text);
153 
154   static bool IsUnmodifiedRegExp(Isolate* isolate, Handle<JSRegExp> regexp);
155 };
156 
157 // Uses a special global mode of irregexp-generated code to perform a global
158 // search and return multiple results at once. As such, this is essentially an
159 // iterator over multiple results (retrieved batch-wise in advance).
160 class RegExpGlobalCache final {
161  public:
162   RegExpGlobalCache(Handle<JSRegExp> regexp, Handle<String> subject,
163                     Isolate* isolate);
164 
165   ~RegExpGlobalCache();
166 
167   // Fetch the next entry in the cache for global regexp match results.
168   // This does not set the last match info.  Upon failure, nullptr is
169   // returned. The cause can be checked with Result().  The previous result is
170   // still in available in memory when a failure happens.
171   int32_t* FetchNext();
172 
173   int32_t* LastSuccessfulMatch();
174 
HasException()175   bool HasException() { return num_matches_ < 0; }
176 
177  private:
178   int AdvanceZeroLength(int last_index);
179 
180   int num_matches_;
181   int max_matches_;
182   int current_match_index_;
183   int registers_per_match_;
184   // Pointer to the last set of captures.
185   int32_t* register_array_;
186   int register_array_size_;
187   Handle<JSRegExp> regexp_;
188   Handle<String> subject_;
189   Isolate* isolate_;
190 };
191 
192 // Caches results for specific regexp queries on the isolate. At the time of
193 // writing, this is used during global calls to RegExp.prototype.exec and
194 // @@split.
195 class RegExpResultsCache final : public AllStatic {
196  public:
197   enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };
198 
199   // Attempt to retrieve a cached result.  On failure, 0 is returned as a Smi.
200   // On success, the returned result is guaranteed to be a COW-array.
201   static Object Lookup(Heap* heap, String key_string, Object key_pattern,
202                        FixedArray* last_match_out, ResultsCacheType type);
203   // Attempt to add value_array to the cache specified by type.  On success,
204   // value_array is turned into a COW-array.
205   static void Enter(Isolate* isolate, Handle<String> key_string,
206                     Handle<Object> key_pattern, Handle<FixedArray> value_array,
207                     Handle<FixedArray> last_match_cache, ResultsCacheType type);
208   static void Clear(FixedArray cache);
209 
210   static constexpr int kRegExpResultsCacheSize = 0x100;
211 
212  private:
213   static constexpr int kStringOffset = 0;
214   static constexpr int kPatternOffset = 1;
215   static constexpr int kArrayOffset = 2;
216   static constexpr int kLastMatchOffset = 3;
217   static constexpr int kArrayEntriesPerCacheEntry = 4;
218 };
219 
220 }  // namespace internal
221 }  // namespace v8
222 
223 #endif  // V8_REGEXP_REGEXP_H_
224