1 /*
2 * base.c contributed by Mark Griffin
3 * Copyright (c) 2019-2020 K.Kosako
4 */
5 #include <stdio.h>
6 #include <unistd.h>
7 #include <stdlib.h>
8 #include <string.h>
9 #include <sys/types.h>
10 #include <sys/stat.h>
11 #include <fcntl.h>
12 #include <time.h>
13 #include "oniguruma.h"
14
/* Limits handed to the regex library before compiling / searching. */
#define PARSE_DEPTH_LIMIT       8         /* onig_set_parse_depth_limit() */
#define CALL_MAX_NEST_LEVEL     8         /* onig_set_subexp_call_max_nest_level() */
#define SUBEXP_CALL_LIMIT       500       /* onig_set_subexp_call_limit_in_search() */
#define BASE_RETRY_LIMIT        20000     /* retry limit for subjects shorter than BASE_LENGTH */
#define BASE_LENGTH             2048      /* at or above this length the retry limit is scaled down */
#define MATCH_STACK_LIMIT       10000000  /* onig_set_match_stack_limit_size() */
#define MAX_REM_SIZE            1048576   /* cap on the bytes that follow the pattern */
#define MAX_SLOW_REM_SIZE       1024      /* tighter cap when the pattern is reported as possibly slow */
#define SLOW_RETRY_LIMIT        2000      /* retry limit used when the slowness score is >= 2 */
24
25 //#define EXEC_PRINT_INTERVAL 500000
26 //#define DUMP_DATA_INTERVAL 100000
27 //#define STAT_PATH "fuzzer.stat_log"
28
/* Option bits that are passed to onig_new(); every other bit is masked off. */
#define OPTIONS_AT_COMPILE \
  ( ONIG_OPTION_IGNORECASE \
  | ONIG_OPTION_EXTEND \
  | ONIG_OPTION_MULTILINE \
  | ONIG_OPTION_SINGLELINE \
  | ONIG_OPTION_FIND_LONGEST \
  | ONIG_OPTION_FIND_NOT_EMPTY \
  | ONIG_OPTION_NEGATE_SINGLELINE \
  | ONIG_OPTION_DONT_CAPTURE_GROUP \
  | ONIG_OPTION_CAPTURE_GROUP \
  | ONIG_OPTION_WORD_IS_ASCII \
  | ONIG_OPTION_DIGIT_IS_ASCII \
  | ONIG_OPTION_SPACE_IS_ASCII \
  | ONIG_OPTION_POSIX_IS_ASCII \
  | ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER \
  | ONIG_OPTION_TEXT_SEGMENT_WORD )

/* Option bits that are passed to onig_search(); every other bit is masked off. */
#define OPTIONS_AT_RUNTIME \
  ( ONIG_OPTION_NOTBOL \
  | ONIG_OPTION_NOTEOL \
  | ONIG_OPTION_CHECK_VALIDITY_OF_STRING \
  | ONIG_OPTION_NOT_BEGIN_STRING \
  | ONIG_OPTION_NOT_END_STRING \
  | ONIG_OPTION_NOT_BEGIN_POSITION )
32
33
/*
 * ADJUST_LEN(enc, len): round `len` down, in place, to a multiple of
 * ONIGENC_MBC_MINLEN(enc).  `len` must be a modifiable integer lvalue and is
 * left untouched when the minimum length is 1.
 *
 * The argument is parenthesized and the temporary has a macro-private name so
 * that the expansion cannot capture or shadow a caller variable.
 */
#define ADJUST_LEN(enc, len) do {\
  int adjust_len_min_ = ONIGENC_MBC_MINLEN(enc);\
  if (adjust_len_min_ != 1) { (len) -= (len) % adjust_len_min_; }\
} while (0)
38
39 typedef unsigned char uint8_t;
40
41 #ifdef DUMP_INPUT
/*
 * Overwrite the start of the file "dump-input" with `len` bytes of `data`
 * followed by the three bytes 'E' 'N' 'D', then flush.
 *
 * The stream is opened on first use and kept open across calls.  The file is
 * not truncated between calls, so the END marker is what delimits the most
 * recent input when an earlier one was longer.
 *
 * If the file cannot be opened or rewound the call does nothing; opening is
 * attempted again on the next call.
 */
static void
dump_input(unsigned char* data, size_t len)
{
  static FILE* DumpFp;
  static const char end[] = { 'E', 'N', 'D' };

  if (DumpFp == NULL) {
    DumpFp = fopen("dump-input", "w");
    if (DumpFp == NULL)
      return;
  }

  if (fseek(DumpFp, 0, SEEK_SET) != 0)
    return;

  fwrite(data, sizeof(unsigned char), len, DumpFp);
  fwrite(end, sizeof(char), sizeof(end), DumpFp);
  fflush(DumpFp);
}
56 #endif
57
58 #ifdef DUMP_DATA_INTERVAL
/*
 * Write `len` bytes of `data` to a freshly created/truncated file at `path`.
 * If the file cannot be opened the call silently does nothing.
 */
static void
dump_file(char* path, unsigned char* data, size_t len)
{
  FILE* fp;

  fp = fopen(path, "w");
  if (fp == NULL)
    return;

  fwrite(data, sizeof(unsigned char), len, fp);
  fclose(fp);
}
68 #endif
69
70 #ifdef STANDALONE
71 #include <ctype.h>
72
/*
 * Print `len` bytes of `data` to `fp` as a brace-enclosed, comma-separated
 * list, eight entries per line.  Printable bytes are written as character
 * literals, everything else as 0xNN.
 *
 * Backslash and single quote are written with a leading backslash so that
 * every emitted character literal is well-formed.
 */
static void
dump_data(FILE* fp, unsigned char* data, int len)
{
  int i;

  fprintf(fp, "{\n");
  for (i = 0; i < len; i++) {
    unsigned char c = data[i];

    if (c == '\\' || c == '\'') {
      fprintf(fp, " '\\%c'", c);
    }
    else if (isprint((int )c)) {
      fprintf(fp, " '%c'", c);
    }
    else {
      fprintf(fp, "0x%02x", (int )c);
    }

    /* separator: nothing after the last entry, line break after every 8th */
    if (i == len - 1)
      fprintf(fp, "\n");
    else if (i % 8 == 7)
      fprintf(fp, ",\n");
    else
      fprintf(fp, ", ");
  }
  fprintf(fp, "};\n");
}
104
105 #else
106
/*
 * Write the current local time to `fp` formatted as "MM/DD HH:MM:SS",
 * without a trailing newline.  Nothing is written if the time cannot be
 * converted or formatted.
 */
static void
output_current_time(FILE* fp)
{
  char d[64];
  time_t t;
  struct tm* tm;

  t = time(NULL);
  tm = localtime(&t);

  /* strftime() leaves the buffer indeterminate when it returns 0 */
  if (tm == NULL || strftime(d, sizeof(d), "%m/%d %H:%M:%S", tm) == 0)
    d[0] = '\0';

  fprintf(fp, "%s", d);
}
118
119 #endif
120
121 static int
search(regex_t * reg,unsigned char * str,unsigned char * end,OnigOptionType options,int backward,int sl)122 search(regex_t* reg, unsigned char* str, unsigned char* end, OnigOptionType options, int backward, int sl)
123 {
124 int r;
125 unsigned char *start, *range;
126 OnigRegion *region;
127 unsigned int retry_limit;
128 size_t len;
129
130 region = onig_region_new();
131
132 len = (size_t )(end - str);
133 if (len < BASE_LENGTH) {
134 if (sl >= 2)
135 retry_limit = (unsigned int )SLOW_RETRY_LIMIT;
136 else
137 retry_limit = (unsigned int )BASE_RETRY_LIMIT;
138 }
139 else
140 retry_limit = (unsigned int )(BASE_RETRY_LIMIT * BASE_LENGTH / len);
141
142 #ifdef STANDALONE
143 fprintf(stdout, "retry limit: %u\n", retry_limit);
144 #endif
145
146 onig_set_retry_limit_in_search(retry_limit);
147 onig_set_match_stack_limit_size(MATCH_STACK_LIMIT);
148 onig_set_subexp_call_limit_in_search(SUBEXP_CALL_LIMIT);
149
150 if (backward != 0) {
151 start = end;
152 range = str;
153 }
154 else {
155 start = str;
156 range = end;
157 }
158
159 r = onig_search(reg, str, end, start, range, region, (options & OPTIONS_AT_RUNTIME));
160 if (r >= 0) {
161 #ifdef STANDALONE
162 int i;
163
164 fprintf(stdout, "match at %d (%s)\n", r,
165 ONIGENC_NAME(onig_get_encoding(reg)));
166 for (i = 0; i < region->num_regs; i++) {
167 fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]);
168 }
169 #endif
170 }
171 else if (r == ONIG_MISMATCH) {
172 #ifdef STANDALONE
173 fprintf(stdout, "search fail (%s)\n",
174 ONIGENC_NAME(onig_get_encoding(reg)));
175 #endif
176 }
177 else { /* error */
178 #ifdef STANDALONE
179 char s[ONIG_MAX_ERROR_MESSAGE_LEN];
180
181 onig_error_code_to_str((UChar* )s, r);
182 fprintf(stdout, "ERROR: %s\n", s);
183 fprintf(stdout, " (%s)\n", ONIGENC_NAME(onig_get_encoding(reg)));
184 #endif
185 onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
186
187 if (r == ONIGERR_STACK_BUG ||
188 r == ONIGERR_UNDEFINED_BYTECODE ||
189 r == ONIGERR_UNEXPECTED_BYTECODE)
190 return -2;
191
192 return -1;
193 }
194
195 onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
196 return 0;
197 }
198
/* Run statistics, reported periodically when EXEC_PRINT_INTERVAL is defined. */
static long INPUT_COUNT         = 0;  /* calls to LLVMFuzzerTestOneInput() */
static long EXEC_COUNT          = 0;  /* calls to exec() */
static long EXEC_COUNT_INTERVAL = 0;  /* calls to exec() since the last report */
static long REGEX_SUCCESS_COUNT = 0;  /* patterns that compiled successfully */
static long VALID_STRING_COUNT  = 0;  /* subjects that were valid in the chosen encoding */
204
205 static int
exec(OnigEncoding enc,OnigOptionType options,OnigSyntaxType * syntax,char * apattern,char * apattern_end,char * astr,UChar * end,int backward,int sl)206 exec(OnigEncoding enc, OnigOptionType options, OnigSyntaxType* syntax,
207 char* apattern, char* apattern_end, char* astr, UChar* end, int backward,
208 int sl)
209 {
210 int r;
211 regex_t* reg;
212 OnigErrorInfo einfo;
213 UChar* pattern = (UChar* )apattern;
214 UChar* str = (UChar* )astr;
215 UChar* pattern_end = (UChar* )apattern_end;
216
217 EXEC_COUNT++;
218 EXEC_COUNT_INTERVAL++;
219
220 onig_initialize(&enc, 1);
221 #ifdef PARSE_DEPTH_LIMIT
222 onig_set_parse_depth_limit(PARSE_DEPTH_LIMIT);
223 #endif
224 onig_set_subexp_call_max_nest_level(CALL_MAX_NEST_LEVEL);
225
226 r = onig_new(®, pattern, pattern_end,
227 (options & OPTIONS_AT_COMPILE), enc, syntax, &einfo);
228 if (r != ONIG_NORMAL) {
229 char s[ONIG_MAX_ERROR_MESSAGE_LEN];
230 onig_error_code_to_str((UChar* )s, r, &einfo);
231 #ifdef STANDALONE
232 fprintf(stdout, "ERROR: %s\n", s);
233 #endif
234 onig_end();
235
236 if (r == ONIGERR_PARSER_BUG ||
237 r == ONIGERR_STACK_BUG ||
238 r == ONIGERR_UNDEFINED_BYTECODE ||
239 r == ONIGERR_UNEXPECTED_BYTECODE) {
240 return -2;
241 }
242 else
243 return -1;
244 }
245 REGEX_SUCCESS_COUNT++;
246
247 r = search(reg, pattern, pattern_end, options, backward, sl);
248 if (r == -2) return -2;
249
250 if (onigenc_is_valid_mbc_string(enc, str, end) != 0) {
251 VALID_STRING_COUNT++;
252 r = search(reg, str, end, options, backward, sl);
253 if (r == -2) return -2;
254 }
255
256 onig_free(reg);
257 onig_end();
258 return 0;
259 }
260
261 static int
alloc_exec(OnigEncoding enc,OnigOptionType options,OnigSyntaxType * syntax,int backward,int pattern_size,size_t rem_size,unsigned char * data)262 alloc_exec(OnigEncoding enc, OnigOptionType options, OnigSyntaxType* syntax,
263 int backward, int pattern_size, size_t rem_size, unsigned char *data)
264 {
265 extern int onig_detect_can_be_slow_pattern(const UChar* pattern, const UChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax);
266
267 int r;
268 int sl;
269 unsigned char *pattern;
270 unsigned char *pattern_end;
271 unsigned char *str_null_end;
272
273 pattern = (unsigned char *)malloc(pattern_size != 0 ? pattern_size : 1);
274 memcpy(pattern, data, pattern_size);
275 pattern_end = pattern + pattern_size;
276 data += pattern_size;
277 rem_size -= pattern_size;
278
279 if (rem_size > MAX_REM_SIZE) rem_size = MAX_REM_SIZE;
280
281 sl = onig_detect_can_be_slow_pattern(pattern, pattern_end, options, enc, syntax);
282 if (sl > 0) {
283 if (rem_size > MAX_SLOW_REM_SIZE)
284 rem_size = MAX_SLOW_REM_SIZE;
285 }
286
287 ADJUST_LEN(enc, rem_size);
288 #ifdef STANDALONE
289 fprintf(stdout, "rem_size: %ld\n", rem_size);
290 #endif
291
292 unsigned char *str = (unsigned char*)malloc(rem_size != 0 ? rem_size : 1);
293 memcpy(str, data, rem_size);
294 str_null_end = str + rem_size;
295
296 r = exec(enc, options, syntax,
297 (char *)pattern, (char *)pattern_end,
298 (char *)str, str_null_end, backward, sl);
299
300 free(pattern);
301 free(str);
302 return r;
303 }
304
/*
 * Minimum input size accepted by LLVMFuzzerTestOneInput(): one encoding
 * byte, three option bytes, one pattern-size byte and one direction byte,
 * plus one syntax byte when SYNTAX_TEST is defined.
 */
#ifndef SYNTAX_TEST
#define NUM_CONTROL_BYTES 6
#else
#define NUM_CONTROL_BYTES 7
#endif
310
LLVMFuzzerTestOneInput(const uint8_t * Data,size_t Size)311 int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size)
312 {
313 #if !defined(UTF16_BE) && !defined(UTF16_LE)
314 static OnigEncoding encodings[] = {
315 ONIG_ENCODING_UTF8,
316 ONIG_ENCODING_UTF8,
317 ONIG_ENCODING_UTF8,
318 ONIG_ENCODING_UTF8,
319 ONIG_ENCODING_UTF8,
320 ONIG_ENCODING_UTF8,
321 ONIG_ENCODING_UTF8,
322 ONIG_ENCODING_UTF8,
323 ONIG_ENCODING_ASCII,
324 ONIG_ENCODING_EUC_JP,
325 ONIG_ENCODING_EUC_TW,
326 ONIG_ENCODING_EUC_KR,
327 ONIG_ENCODING_EUC_CN,
328 ONIG_ENCODING_SJIS,
329 ONIG_ENCODING_KOI8_R,
330 ONIG_ENCODING_CP1251,
331 ONIG_ENCODING_BIG5,
332 ONIG_ENCODING_GB18030,
333 ONIG_ENCODING_UTF16_BE,
334 ONIG_ENCODING_UTF16_LE,
335 ONIG_ENCODING_UTF16_BE,
336 ONIG_ENCODING_UTF16_LE,
337 ONIG_ENCODING_UTF32_BE,
338 ONIG_ENCODING_UTF32_LE,
339 ONIG_ENCODING_UTF32_BE,
340 ONIG_ENCODING_UTF32_LE,
341 ONIG_ENCODING_ISO_8859_1,
342 ONIG_ENCODING_ISO_8859_2,
343 ONIG_ENCODING_ISO_8859_3,
344 ONIG_ENCODING_ISO_8859_4,
345 ONIG_ENCODING_ISO_8859_5,
346 ONIG_ENCODING_ISO_8859_6,
347 ONIG_ENCODING_ISO_8859_7,
348 ONIG_ENCODING_ISO_8859_8,
349 ONIG_ENCODING_ISO_8859_9,
350 ONIG_ENCODING_ISO_8859_10,
351 ONIG_ENCODING_ISO_8859_11,
352 ONIG_ENCODING_ISO_8859_13,
353 ONIG_ENCODING_ISO_8859_14,
354 ONIG_ENCODING_ISO_8859_15,
355 ONIG_ENCODING_ISO_8859_16
356 };
357 unsigned char encoding_choice;
358 #endif
359
360 #ifdef SYNTAX_TEST
361 static OnigSyntaxType* syntaxes[] = {
362 ONIG_SYNTAX_POSIX_EXTENDED,
363 ONIG_SYNTAX_EMACS,
364 ONIG_SYNTAX_GREP,
365 ONIG_SYNTAX_GNU_REGEX,
366 ONIG_SYNTAX_JAVA,
367 ONIG_SYNTAX_PERL_NG,
368 ONIG_SYNTAX_ONIGURUMA
369 };
370
371 #ifdef STANDALONE
372 static char* syntax_names[] = {
373 "Posix Extended",
374 "Emacs",
375 "Grep",
376 "GNU Regex",
377 "Java",
378 "Perl+NG",
379 "Oniguruma"
380 };
381 #endif
382
383 unsigned char syntax_choice;
384 #endif
385
386 int r;
387 int backward;
388 int pattern_size;
389 size_t rem_size;
390 unsigned char *data;
391 unsigned char pattern_size_choice;
392 OnigOptionType options;
393 OnigEncoding enc;
394 OnigSyntaxType* syntax;
395
396 #ifndef STANDALONE
397 static FILE* STAT_FP;
398 #endif
399
400 INPUT_COUNT++;
401
402 #ifdef DUMP_DATA_INTERVAL
403 if (INPUT_COUNT % DUMP_DATA_INTERVAL == 0) {
404 char path[20];
405 sprintf(path, "dump-%ld", INPUT_COUNT);
406 dump_file(path, (unsigned char* )Data, Size);
407 }
408 #endif
409
410 if (Size < NUM_CONTROL_BYTES) return 0;
411
412 rem_size = Size;
413 data = (unsigned char* )(Data);
414
415 #ifdef UTF16_BE
416 enc = ONIG_ENCODING_UTF16_BE;
417 #else
418 #ifdef UTF16_LE
419 enc = ONIG_ENCODING_UTF16_LE;
420 #else
421 encoding_choice = data[0];
422 data++;
423 rem_size--;
424
425 int num_encodings = sizeof(encodings)/sizeof(encodings[0]);
426 enc = encodings[encoding_choice % num_encodings];
427 #endif
428 #endif
429
430 #ifdef SYNTAX_TEST
431 syntax_choice = data[0];
432 data++;
433 rem_size--;
434
435 int num_syntaxes = sizeof(syntaxes)/sizeof(syntaxes[0]);
436 syntax = syntaxes[syntax_choice % num_syntaxes];
437 #else
438 syntax = ONIG_SYNTAX_DEFAULT;
439 #endif
440
441 if ((data[2] & 0xc0) == 0)
442 options = data[0] | (data[1] << 8) | (data[2] << 16);
443 else
444 options = data[0] & ONIG_OPTION_IGNORECASE;
445
446 data++; rem_size--;
447 data++; rem_size--;
448 data++; rem_size--;
449
450 pattern_size_choice = data[0];
451 data++; rem_size--;
452
453 backward = (data[0] == 0xbb);
454 data++; rem_size--;
455
456 if (backward != 0) {
457 options = options & ~ONIG_OPTION_FIND_LONGEST;
458 }
459
460 if (rem_size == 0)
461 pattern_size = 0;
462 else {
463 pattern_size = (int )pattern_size_choice % rem_size;
464 ADJUST_LEN(enc, pattern_size);
465 }
466
467 #ifdef STANDALONE
468 dump_data(stdout, data, pattern_size);
469 #ifdef SYNTAX_TEST
470 fprintf(stdout,
471 "enc: %s, syntax: %s, options: %u, pattern_size: %d, back:%d\n",
472 ONIGENC_NAME(enc),
473 syntax_names[syntax_choice % num_syntaxes],
474 options,
475 pattern_size, backward);
476 #else
477 fprintf(stdout, "enc: %s, options: %u, pattern_size: %d, back:%d\n",
478 ONIGENC_NAME(enc), options, pattern_size, backward);
479 #endif
480 #endif
481
482 #ifdef DUMP_INPUT
483 dump_input((unsigned char* )Data, Size);
484 #endif
485
486 r = alloc_exec(enc, options, syntax, backward, pattern_size,
487 rem_size, data);
488 if (r == -2) exit(-2);
489
490 #ifndef STANDALONE
491 #ifdef EXEC_PRINT_INTERVAL
492 if (EXEC_COUNT_INTERVAL == EXEC_PRINT_INTERVAL) {
493 float fexec, freg, fvalid;
494
495 if (STAT_FP == 0) {
496 #ifdef STAT_PATH
497 STAT_FP = fopen(STAT_PATH, "a");
498 #else
499 STAT_FP = stdout;
500 #endif
501 }
502
503 output_current_time(STAT_FP);
504
505 if (INPUT_COUNT != 0) { // overflow check
506 fexec = (float )EXEC_COUNT / INPUT_COUNT;
507 freg = (float )REGEX_SUCCESS_COUNT / INPUT_COUNT;
508 fvalid = (float )VALID_STRING_COUNT / INPUT_COUNT;
509
510 fprintf(STAT_FP, ": %ld: EXEC:%.2f, REG:%.2f, VALID:%.2f\n",
511 EXEC_COUNT, fexec, freg, fvalid);
512 fflush(STAT_FP);
513 }
514 else {
515 fprintf(STAT_FP, ": ignore (input count overflow)\n");
516 }
517
518 EXEC_COUNT_INTERVAL = 0;
519 }
520 else if (EXEC_COUNT == 1) {
521 output_current_time(stdout);
522 fprintf(stdout, ": ------------ START ------------\n");
523 }
524 #endif
525 #endif
526
527 return r;
528 }
529
530 #ifdef STANDALONE
531
532 #define MAX_INPUT_DATA_SIZE 4194304
533
main(int argc,char * argv[])534 extern int main(int argc, char* argv[])
535 {
536 size_t max_size;
537 size_t n;
538 uint8_t Data[MAX_INPUT_DATA_SIZE];
539
540 if (argc > 1) {
541 max_size = (size_t )atoi(argv[1]);
542 }
543 else {
544 max_size = sizeof(Data);
545 }
546
547 n = read(0, Data, max_size);
548 fprintf(stdout, "read size: %ld, max_size: %ld\n", n, max_size);
549
550 LLVMFuzzerTestOneInput(Data, n);
551 return 0;
552 }
553 #endif /* STANDALONE */
554