1 // Copyright 2017 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include <limits.h>
6 #include <stddef.h>
7 #include <stdint.h>
8 
9 #include <functional>
10 #include <string>
11 
12 #include "include/v8-exception.h"
13 #include "include/v8-isolate.h"
14 #include "include/v8-local-handle.h"
15 #include "include/v8-primitive.h"
16 #include "include/v8-script.h"
17 #include "src/heap/factory.h"
18 #include "src/objects/objects-inl.h"
19 #include "src/regexp/regexp.h"
20 #include "test/fuzzer/fuzzer-support.h"
21 
22 // This is a hexdump of test/fuzzer/regexp_builtins/mjsunit.js generated using
23 // `xxd -i mjsunit.js`. It contains the `assertEquals` JS function used below.
24 #include "test/fuzzer/regexp_builtins/mjsunit.js.h"
25 
26 namespace v8 {
27 namespace internal {
28 namespace {
29 
30 constexpr bool kVerbose = false;  // For debugging, verbose error messages.
31 constexpr uint32_t kRegExpBuiltinsFuzzerHashSeed = 83;
32 
33 #define REGEXP_BUILTINS(V)   \
34   V(Exec, exec)              \
35   V(Match, Symbol.match)     \
36   V(Replace, Symbol.replace) \
37   V(Search, Symbol.search)   \
38   V(Split, Symbol.split)     \
39   V(Test, test)
40 
41 struct FuzzerArgs {
FuzzerArgsv8::internal::__anon8c079ef10111::FuzzerArgs42   FuzzerArgs(const uint8_t* input_data, size_t input_length,
43              v8::Local<v8::Context> context, Isolate* isolate)
44       : input_cursor(0),
45         input_data(input_data),
46         input_length(input_length),
47         context(context),
48         isolate(isolate) {}
49 
50   size_t input_cursor;
51   const uint8_t* const input_data;
52   const size_t input_length;
53   v8::Local<v8::Context> context;
54   Isolate* const isolate;
55 };
56 
57 enum RegExpBuiltin {
58 #define CASE(name, ...) kRegExpPrototype##name,
59   REGEXP_BUILTINS(CASE)
60 #undef CASE
61       kRegExpBuiltinCount,
62 };
63 
64 #define CASE(name, ...) void TestRegExpPrototype##name(FuzzerArgs* args);
REGEXP_BUILTINS(CASE)65 REGEXP_BUILTINS(CASE)
66 #undef CASE
67 
68 v8::Local<v8::String> v8_str(v8::Isolate* isolate, const char* s) {
69   return v8::String::NewFromUtf8(isolate, s).ToLocalChecked();
70 }
71 
CompileRun(v8::Local<v8::Context> context,const char * source)72 v8::MaybeLocal<v8::Value> CompileRun(v8::Local<v8::Context> context,
73                                      const char* source) {
74   v8::Local<v8::Script> script;
75   v8::MaybeLocal<v8::Script> maybe_script =
76       v8::Script::Compile(context, v8_str(context->GetIsolate(), source));
77 
78   if (!maybe_script.ToLocal(&script)) return v8::MaybeLocal<v8::Value>();
79   return script->Run(context);
80 }
81 
RandomByte(FuzzerArgs * args)82 uint8_t RandomByte(FuzzerArgs* args) {
83   // Silently wraps to the beginning of input data. Ideally, input data should
84   // be long enough to avoid that.
85   const size_t index = args->input_cursor;
86   CHECK(index < args->input_length);
87   args->input_cursor = (index + 1) % args->input_length;
88   return args->input_data[index];
89 }
90 
CompileMjsunit(const FuzzerArgs * args)91 void CompileMjsunit(const FuzzerArgs* args) {
92   std::string source(
93       reinterpret_cast<const char*>(test_fuzzer_regexp_builtins_mjsunit_js),
94       test_fuzzer_regexp_builtins_mjsunit_js_len);
95   CompileRun(args->context, source.c_str()).ToLocalChecked();
96 }
97 
NaiveEscape(const std::string & input,char escaped_char)98 std::string NaiveEscape(const std::string& input, char escaped_char) {
99   std::string out;
100   for (size_t i = 0; i < input.size(); i++) {
101     // Just omit newlines and \0 chars and naively replace other escaped chars.
102     const char c = input[i];
103     if (c == '\r' || c == '\n' || c == '\0') continue;
104     out += (input[i] == escaped_char) ? '_' : c;
105   }
106   // Disallow trailing backslashes as they mess with our naive source string
107   // concatenation.
108   if (!out.empty() && out.back() == '\\') out.back() = '_';
109 
110   return out;
111 }
112 
GenerateRandomString(FuzzerArgs * args,size_t length)113 std::string GenerateRandomString(FuzzerArgs* args, size_t length) {
114   // Limited to an ASCII subset for now.
115   std::string s(length, '\0');
116   for (size_t i = 0; i < length; i++) {
117     s[i] = static_cast<char>((RandomByte(args) % 0x5F) + 0x20);
118   }
119 
120   return s;
121 }
122 
GenerateRandomPattern(FuzzerArgs * args)123 std::string GenerateRandomPattern(FuzzerArgs* args) {
124   const int kMaxPatternLength = 16;
125   std::string s =
126       GenerateRandomString(args, (RandomByte(args) % kMaxPatternLength) + 1);
127   // A leading '*' would be a comment instead of a regexp literal.
128   if (s[0] == '*') s[0] = '.';
129   return s;
130 }
131 
PickRandomPresetPattern(FuzzerArgs * args)132 std::string PickRandomPresetPattern(FuzzerArgs* args) {
133   static const char* preset_patterns[] = {
134       ".",         // Always matches.
135       "\\P{Any}",  // Never matches.
136       "^",         // Zero-width assertion, matches once.
137       "(?=.)",     // Zero-width assertion, matches at every position.
138       "\\b",       // Zero-width assertion, matches at each word boundary.
139       "()",      // Zero-width assertion, matches at every position with groups.
140       "(?<a>)",  // Likewise but with named groups.
141       "((((.).).).)", "(?<a>(?<b>(?<c>(?<d>.).).).)",
142       // Copied from
143       // https://cs.chromium.org/chromium/src/testing/libfuzzer/fuzzers/dicts/regexp.dict
144       "?", "abc", "()", "[]", "abc|def", "abc|def|ghi", "^xxx$",
145       "ab\\b\\d\\bcd", "\\w|\\d", "a*?", "abc+", "abc+?", "xyz?", "xyz??",
146       "xyz{0,1}", "xyz{0,1}?", "xyz{93}", "xyz{1,32}", "xyz{1,32}?", "xyz{1,}",
147       "xyz{1,}?", "a\\fb\\nc\\rd\\te\\vf", "a\\nb\\bc", "(?:foo)", "(?: foo )",
148       "foo|(bar|baz)|quux", "foo(?=bar)baz", "foo(?!bar)baz", "foo(?<=bar)baz",
149       "foo(?<!bar)baz", "()", "(?=)", "[]", "[x]", "[xyz]", "[a-zA-Z0-9]",
150       "[-123]", "[^123]", "]", "}", "[a-b-c]", "[x\\dz]", "[\\d-z]",
151       "[\\d-\\d]", "[z-\\d]", "\\cj\\cJ\\ci\\cI\\ck\\cK", "\\c!", "\\c_",
152       "\\c~", "[\\c!]", "[\\c_]", "[\\c~]", "[\\ca]", "[\\cz]", "[\\cA]",
153       "[\\cZ]", "[\\c1]", "\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ",
154       "[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]", "\\8", "\\9", "\\11", "\\11a",
155       "\\011", "\\118", "\\111", "\\1111", "(x)(x)(x)\\1", "(x)(x)(x)\\2",
156       "(x)(x)(x)\\3", "(x)(x)(x)\\4", "(x)(x)(x)\\1*", "(x)(x)(x)\\3*",
157       "(x)(x)(x)\\4*", "(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\10",
158       "(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\11", "(a)\\1", "(a\\1)", "(\\1a)",
159       "(\\2)(\\1)", "(?=a){0,10}a", "(?=a){1,10}a", "(?=a){9,10}a", "(?!a)?a",
160       "\\1(a)", "(?!(a))\\1", "(?!\\1(a\\1)\\1)\\1",
161       "\\1\\2(a(?:\\1(b\\1\\2))\\2)\\1", "[\\0]", "[\\11]", "[\\11a]",
162       "[\\011]", "[\\00011]", "[\\118]", "[\\111]", "[\\1111]", "\\x60",
163       "\\x3z", "\\c", "\\u0034", "\\u003z", "foo[z]*", "\\u{12345}",
164       "\\u{12345}\\u{23456}", "\\u{12345}{3}", "\\u{12345}*", "\\ud808\\udf45*",
165       "[\\ud808\\udf45-\\ud809\\udccc]", "a", "a|b", "a\\n", "a$", "a\\b!",
166       "a\\Bb", "a*?", "a?", "a??", "a{0,1}?", "a{1,2}?", "a+?", "(a)", "(a)\\1",
167       "(\\1a)", "\\1(a)", "a\\s", "a\\S", "a\\D", "a\\w", "a\\W", "a.", "a\\q",
168       "a[a]", "a[^a]", "a[a-z]", "a(?:b)", "a(?=b)", "a(?!b)", "\\x60",
169       "\\u0060", "\\cA", "\\q", "\\1112", "(a)\\1", "(?!a)?a\\1",
170       "(?:(?=a))a\\1", "a{}", "a{,}", "a{", "a{z}", "a{12z}", "a{12,",
171       "a{12,3b", "{}", "{,}", "{", "{z}", "{1z}", "{12,", "{12,3b", "a", "abc",
172       "a[bc]d", "a|bc", "ab|c", "a||bc", "(?:ab)", "(?:ab|cde)", "(?:ab)|cde",
173       "(ab)", "(ab|cde)", "(ab)\\1", "(ab|cde)\\1", "(?:ab)?", "(?:ab)+", "a?",
174       "a+", "a??", "a*?", "a+?", "(?:a?)?", "(?:a+)?", "(?:a?)+", "(?:a*)+",
175       "(?:a+)+", "(?:a?)*", "(?:a*)*", "(?:a+)*", "a{0}", "(?:a+){0,0}", "a*b",
176       "a+b", "a*b|c", "a+b|c", "(?:a{5,1000000}){3,1000000}", "(?:ab){4,7}",
177       "a\\bc", "a\\sc", "a\\Sc", "a(?=b)c", "a(?=bbb|bb)c", "a(?!bbb|bb)c",
178       "\xe2\x81\xa3", "[\xe2\x81\xa3]", "\xed\xb0\x80", "\xed\xa0\x80",
179       "(\xed\xb0\x80)\x01", "((\xed\xa0\x80))\x02", "\xf0\x9f\x92\xa9", "\x01",
180       "\x0f", "[-\xf0\x9f\x92\xa9]+", "[\xf0\x9f\x92\xa9-\xf4\x8f\xbf\xbf]",
181       "(?<=)", "(?<=a)", "(?<!)", "(?<!a)", "(?<a>)", "(?<a>.)",
182       "(?<a>.)\\k<a>", "\\p{Script=Greek}", "\\P{sc=Greek}",
183       "\\p{Script_Extensions=Greek}", "\\P{scx=Greek}",
184       "\\p{General_Category=Decimal_Number}", "\\P{gc=Decimal_Number}",
185       "\\p{gc=Nd}", "\\P{Decimal_Number}", "\\p{Nd}", "\\P{Any}",
186       "\\p{Changes_When_NFKC_Casefolded}",
187   };
188   static constexpr int preset_pattern_count = arraysize(preset_patterns);
189   STATIC_ASSERT(preset_pattern_count < 0xFF);
190 
191   return std::string(preset_patterns[RandomByte(args) % preset_pattern_count]);
192 }
193 
PickPattern(FuzzerArgs * args)194 std::string PickPattern(FuzzerArgs* args) {
195   if ((RandomByte(args) & 3) == 0) {
196     return NaiveEscape(GenerateRandomPattern(args), '/');
197   } else {
198     return PickRandomPresetPattern(args);
199   }
200 }
201 
GenerateRandomString(FuzzerArgs * args)202 std::string GenerateRandomString(FuzzerArgs* args) {
203   const int kMaxStringLength = 64;
204   return GenerateRandomString(args, RandomByte(args) % kMaxStringLength);
205 }
206 
PickSubjectString(FuzzerArgs * args)207 std::string PickSubjectString(FuzzerArgs* args) {
208   if ((RandomByte(args) & 0xF) == 0) {
209     // Sometimes we have a two-byte subject string.
210     return "f\\uD83D\\uDCA9ba\\u2603";
211   } else {
212     return NaiveEscape(GenerateRandomString(args), '\'');
213   }
214 }
215 
PickReplacementForReplace(FuzzerArgs * args)216 std::string PickReplacementForReplace(FuzzerArgs* args) {
217   static const char* candidates[] = {
218       "'X'",
219       "'$1$2$3'",
220       "'$$$&$`$\\'$1'",
221       "() => 'X'",
222       "(arg0, arg1, arg2, arg3, arg4) => arg0 + arg1 + arg2 + arg3 + arg4",
223       "() => 42",
224   };
225   static const int candidate_count = arraysize(candidates);
226 
227   if ((RandomByte(args) & 1) == 0) {
228     return candidates[RandomByte(args) % candidate_count];
229   } else {
230     return std::string("'") + NaiveEscape(GenerateRandomString(args), '\'') +
231            std::string("'");
232   }
233 }
234 
PickLimitForSplit(FuzzerArgs * args)235 std::string PickLimitForSplit(FuzzerArgs* args) {
236   // clang-format off
237   switch (RandomByte(args) & 0x3) {
238   case 0: return "undefined";
239   case 1: return "'not a number'";
240   case 2: return std::to_string(Smi::kMaxValue + RandomByte(args));
241   case 3: return std::to_string(RandomByte(args));
242   default: UNREACHABLE();
243   }  // clang-format on
244 }
245 
GenerateRandomFlags(FuzzerArgs * args)246 std::string GenerateRandomFlags(FuzzerArgs* args) {
247   // TODO(mbid,v8:10765): Find a way to generate the kLinear flag sometimes,
248   // but only for patterns that are supported by the experimental engine.
249   constexpr size_t kFlagCount = JSRegExp::kFlagCount;
250   CHECK_EQ(JSRegExp::kHasIndices, 1 << (kFlagCount - 1));
251   CHECK_EQ(JSRegExp::kLinear, 1 << (kFlagCount - 2));
252   CHECK_EQ(JSRegExp::kDotAll, 1 << (kFlagCount - 3));
253   STATIC_ASSERT((1 << kFlagCount) - 1 <= 0xFF);
254 
255   const size_t flags = RandomByte(args) & ((1 << kFlagCount) - 1);
256 
257   int cursor = 0;
258   char buffer[kFlagCount] = {'\0'};
259 
260   if (flags & JSRegExp::kGlobal) buffer[cursor++] = 'g';
261   if (flags & JSRegExp::kIgnoreCase) buffer[cursor++] = 'i';
262   if (flags & JSRegExp::kMultiline) buffer[cursor++] = 'm';
263   if (flags & JSRegExp::kSticky) buffer[cursor++] = 'y';
264   if (flags & JSRegExp::kUnicode) buffer[cursor++] = 'u';
265   if (flags & JSRegExp::kDotAll) buffer[cursor++] = 's';
266   if (flags & JSRegExp::kHasIndices) buffer[cursor++] = 'd';
267 
268   return std::string(buffer, cursor);
269 }
270 
GenerateRandomLastIndex(FuzzerArgs * args)271 std::string GenerateRandomLastIndex(FuzzerArgs* args) {
272   static const char* candidates[] = {
273       "undefined",  "-1",         "0",
274       "1",          "2",          "3",
275       "4",          "5",          "6",
276       "7",          "8",          "9",
277       "50",         "4294967296", "2147483647",
278       "2147483648", "NaN",        "Not a Number",
279   };
280   static const int candidate_count = arraysize(candidates);
281   return candidates[RandomByte(args) % candidate_count];
282 }
283 
RunTest(FuzzerArgs * args)284 void RunTest(FuzzerArgs* args) {
285   switch (RandomByte(args) % kRegExpBuiltinCount) {
286 #define CASE(name, ...)              \
287   case kRegExpPrototype##name:       \
288     TestRegExpPrototype##name(args); \
289     break;
290     REGEXP_BUILTINS(CASE)
291 #undef CASE
292     default:
293       UNREACHABLE();
294   }
295 }
296 
GenerateSourceString(FuzzerArgs * args,const std::string & test)297 std::string GenerateSourceString(FuzzerArgs* args, const std::string& test) {
298   std::string pattern = PickPattern(args);
299   std::string flags = GenerateRandomFlags(args);
300   std::string last_index = GenerateRandomLastIndex(args);
301   std::string subject = PickSubjectString(args);
302 
303   // clang-format off
304   std::stringstream ss;
305   ss << "function test() {\n"
306      << "  const re = /" << pattern<< "/"
307                          << flags << ";\n"
308      << "  re.lastIndex = " << last_index << ";\n"
309      << "  const str = '" << subject << "';\n"
310      << "  let result = null;\n"
311      << "  let exception = null;\n"
312      << "  try {\n"
313      << "    result = " << test << "\n"
314      << "  } catch (e) {\n"
315      << "    exception = e;\n"
316      << "  }\n"
317      << "  return { result: result, re: re, exception: exception };\n"
318      << "}\n"
319      << "%SetForceSlowPath(false);\n"
320      << "test();  // Run once ahead of time to compile the regexp.\n"
321      << "const fast = test();\n"
322      << "%SetForceSlowPath(true);\n"
323      << "const slow = test();\n"
324      << "%SetForceSlowPath(false);\n";
325   // clang-format on
326 
327   std::string source = ss.str();
328   if (kVerbose) {
329     fprintf(stderr, "Generated source:\n```\n%s\n```\n", source.c_str());
330   }
331 
332   return source;
333 }
334 
PrintExceptionMessage(v8::Isolate * isolate,v8::TryCatch * try_catch)335 void PrintExceptionMessage(v8::Isolate* isolate, v8::TryCatch* try_catch) {
336   CHECK(try_catch->HasCaught());
337   static const int kBufferLength = 256;
338   char buffer[kBufferLength + 1];
339   try_catch->Message()->Get()->WriteOneByte(
340       isolate, reinterpret_cast<uint8_t*>(&buffer[0]), 0, kBufferLength);
341   fprintf(stderr, "%s\n", buffer);
342 }
343 
ResultsAreIdentical(FuzzerArgs * args)344 bool ResultsAreIdentical(FuzzerArgs* args) {
345   std::string source =
346       "assertEquals(fast.exception, slow.exception);\n"
347       "assertEquals(fast.result, slow.result);\n"
348       "if (fast.result !== null) {\n"
349       "  assertEquals(fast.result.groups, slow.result.groups);\n"
350       "  assertEquals(fast.result.indices, slow.result.indices);\n"
351       "  if (fast.result.indices !== undefined) {\n"
352       "    assertEquals(fast.result.indices.groups,\n"
353       "                 slow.result.indices.groups);\n"
354       "  }\n"
355       "}\n"
356       "assertEquals(fast.re.lastIndex, slow.re.lastIndex);\n";
357 
358   v8::Local<v8::Value> result;
359   v8::Isolate* isolate = reinterpret_cast<v8::Isolate*>(args->isolate);
360   v8::TryCatch try_catch(isolate);
361   if (!CompileRun(args->context, source.c_str()).ToLocal(&result)) {
362     PrintExceptionMessage(isolate, &try_catch);
363     args->isolate->clear_pending_exception();
364     return false;
365   }
366 
367   return true;
368 }
369 
CompileRunAndVerify(FuzzerArgs * args,const std::string & source)370 void CompileRunAndVerify(FuzzerArgs* args, const std::string& source) {
371   v8::Local<v8::Value> result;
372   v8::Isolate* isolate = reinterpret_cast<v8::Isolate*>(args->isolate);
373   v8::TryCatch try_catch(isolate);
374   if (!CompileRun(args->context, source.c_str()).ToLocal(&result)) {
375     args->isolate->clear_pending_exception();
376     // No need to verify result if an exception was thrown here, since that
377     // implies a syntax error somewhere in the pattern or string. We simply
378     // ignore those.
379     if (kVerbose) {
380       PrintExceptionMessage(isolate, &try_catch);
381       fprintf(stderr, "Failed to run script:\n```\n%s\n```\n", source.c_str());
382     }
383     return;
384   }
385 
386   if (!ResultsAreIdentical(args)) {
387     uint32_t raw_hash_field = StringHasher::HashSequentialString(
388         args->input_data, static_cast<int>(args->input_length),
389         kRegExpBuiltinsFuzzerHashSeed);
390     FATAL("!ResultAreIdentical(args); RegExpBuiltinsFuzzerHash=%x",
391           raw_hash_field);
392   }
393 }
394 
TestRegExpPrototypeExec(FuzzerArgs * args)395 void TestRegExpPrototypeExec(FuzzerArgs* args) {
396   std::string test = "re.exec(str);";
397   std::string source = GenerateSourceString(args, test);
398   CompileRunAndVerify(args, source);
399 }
400 
TestRegExpPrototypeMatch(FuzzerArgs * args)401 void TestRegExpPrototypeMatch(FuzzerArgs* args) {
402   std::string test = "re[Symbol.match](str);";
403   std::string source = GenerateSourceString(args, test);
404   CompileRunAndVerify(args, source);
405 }
406 
TestRegExpPrototypeReplace(FuzzerArgs * args)407 void TestRegExpPrototypeReplace(FuzzerArgs* args) {
408   std::string replacement = PickReplacementForReplace(args);
409   std::string test = "re[Symbol.replace](str, " + replacement + ");";
410   std::string source = GenerateSourceString(args, test);
411   CompileRunAndVerify(args, source);
412 }
413 
TestRegExpPrototypeSearch(FuzzerArgs * args)414 void TestRegExpPrototypeSearch(FuzzerArgs* args) {
415   std::string test = "re[Symbol.search](str);";
416   std::string source = GenerateSourceString(args, test);
417   CompileRunAndVerify(args, source);
418 }
419 
TestRegExpPrototypeSplit(FuzzerArgs * args)420 void TestRegExpPrototypeSplit(FuzzerArgs* args) {
421   std::string limit = PickLimitForSplit(args);
422   std::string test = "re[Symbol.split](str, " + limit + ");";
423   std::string source = GenerateSourceString(args, test);
424   CompileRunAndVerify(args, source);
425 }
426 
TestRegExpPrototypeTest(FuzzerArgs * args)427 void TestRegExpPrototypeTest(FuzzerArgs* args) {
428   std::string test = "re.test(str);";
429   std::string source = GenerateSourceString(args, test);
430   CompileRunAndVerify(args, source);
431 }
432 
433 #undef REGEXP_BUILTINS
434 
435 }  // namespace
436 
LLVMFuzzerTestOneInput(const uint8_t * data,size_t size)437 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
438   if (size < 64) return 0;  // Need a minimal amount of randomness to do stuff.
439 
440   // Flag definitions.
441 
442   FLAG_allow_natives_syntax = true;
443 
444   // V8 setup.
445 
446   v8_fuzzer::FuzzerSupport* support = v8_fuzzer::FuzzerSupport::Get();
447   v8::Isolate* isolate = support->GetIsolate();
448   Isolate* i_isolate = reinterpret_cast<i::Isolate*>(isolate);
449   v8::Isolate::Scope isolate_scope(isolate);
450   v8::HandleScope handle_scope(isolate);
451   v8::Local<v8::Context> context = support->GetContext();
452   v8::Context::Scope context_scope(context);
453   v8::TryCatch try_catch(isolate);
454 
455   CHECK(!i_isolate->has_pending_exception());
456 
457   // And run.
458 
459   FuzzerArgs args(data, size, context, i_isolate);
460   CompileMjsunit(&args);
461   RunTest(&args);
462 
463   CHECK(!i_isolate->has_pending_exception());
464   return 0;
465 }
466 
467 }  // namespace internal
468 }  // namespace v8
469