1 /*
2  * base.c  contributed by Mark Griffin
3  * Copyright (c) 2019-2020  K.Kosako
4  */
5 #include <stdio.h>
6 #include <unistd.h>
7 #include <stdlib.h>
8 #include <string.h>
9 #include <sys/types.h>
10 #include <sys/stat.h>
11 #include <fcntl.h>
12 #include <time.h>
13 #include "oniguruma.h"
14 
/* Limits handed to the Oniguruma parser before a pattern is compiled. */
#define PARSE_DEPTH_LIMIT           8   /* onig_set_parse_depth_limit() */
#define CALL_MAX_NEST_LEVEL         8   /* onig_set_subexp_call_max_nest_level() */

/* Limits applied to every search. */
#define SUBEXP_CALL_LIMIT         500   /* onig_set_subexp_call_limit_in_search() */
#define MATCH_STACK_LIMIT    10000000   /* onig_set_match_stack_limit_size() */

/*
 * Retry budget: BASE_RETRY_LIMIT is used for subjects shorter than
 * BASE_LENGTH bytes and is scaled down by BASE_LENGTH / length for longer
 * ones.  SLOW_RETRY_LIMIT replaces it for short subjects when the slow
 * pattern detector returned 2 or more.
 */
#define BASE_RETRY_LIMIT        20000
#define BASE_LENGTH              2048
#define SLOW_RETRY_LIMIT         2000

/*
 * Upper bounds on the number of subject-string bytes taken from the input:
 * the general cap, and the tighter cap used when the slow pattern detector
 * returned a positive value.
 */
#define MAX_REM_SIZE          1048576
#define MAX_SLOW_REM_SIZE        1024

/* Optional instrumentation switches; uncomment to enable. */
//#define EXEC_PRINT_INTERVAL    500000
//#define DUMP_DATA_INTERVAL     100000
//#define STAT_PATH              "fuzzer.stat_log"

/* Option bits that are forwarded to onig_new(). */
#define OPTIONS_AT_COMPILE \
  (ONIG_OPTION_IGNORECASE | ONIG_OPTION_EXTEND | \
   ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE | \
   ONIG_OPTION_FIND_LONGEST | ONIG_OPTION_FIND_NOT_EMPTY | \
   ONIG_OPTION_NEGATE_SINGLELINE | ONIG_OPTION_DONT_CAPTURE_GROUP | \
   ONIG_OPTION_CAPTURE_GROUP | ONIG_OPTION_WORD_IS_ASCII | \
   ONIG_OPTION_DIGIT_IS_ASCII | ONIG_OPTION_SPACE_IS_ASCII | \
   ONIG_OPTION_POSIX_IS_ASCII | \
   ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER | \
   ONIG_OPTION_TEXT_SEGMENT_WORD )

/* Option bits that are forwarded to onig_search(). */
#define OPTIONS_AT_RUNTIME \
  (ONIG_OPTION_NOTBOL | ONIG_OPTION_NOTEOL | \
   ONIG_OPTION_CHECK_VALIDITY_OF_STRING | ONIG_OPTION_NOT_BEGIN_STRING | \
   ONIG_OPTION_NOT_END_STRING | ONIG_OPTION_NOT_BEGIN_POSITION)
32 
33 
/*
 * Round the lvalue `len` down to a multiple of the minimum character width
 * of encoding `enc` (ONIGENC_MBC_MINLEN), so a buffer of that length never
 * ends in the middle of a fixed-width code unit.  `len` is evaluated more
 * than once and must be a modifiable lvalue.
 *
 * The temporary has a macro-private name so that it cannot capture a caller
 * variable, and `len` is parenthesized so any lvalue expression is accepted.
 */
#define ADJUST_LEN(enc, len) do {\
  int adjust_len_mlen_ = ONIGENC_MBC_MINLEN(enc);\
  if (adjust_len_mlen_ != 1) { (len) -= (len) % adjust_len_mlen_; }\
} while (0)

typedef unsigned char uint8_t;
40 
41 #ifdef DUMP_INPUT
/*
 * Write the current fuzz input to the file "dump-input" followed by the
 * three bytes "END".
 *
 * The file is opened once and kept open; every call rewinds to offset 0 and
 * overwrites from there, so bytes of a longer, earlier input may remain
 * after the "END" marker.  The stream is flushed before returning.
 *
 * Dumping is best effort: if the file cannot be opened the call does
 * nothing (and the open is attempted again on the next call).
 */
static void
dump_input(unsigned char* data, size_t len)
{
  static FILE* DumpFp;
  static char end[] = { 'E', 'N', 'D' };

  if (DumpFp == NULL) {
    DumpFp = fopen("dump-input", "w");
    if (DumpFp == NULL)
      return;
  }

  fseek(DumpFp, 0, SEEK_SET);
  fwrite(data, sizeof(unsigned char), len, DumpFp);
  fwrite(end,  sizeof(char), sizeof(end), DumpFp);
  fflush(DumpFp);
}
56 #endif
57 
58 #ifdef DUMP_DATA_INTERVAL
/*
 * Write `len` bytes of `data` to a freshly created/truncated file `path`.
 *
 * Dumping is best effort: if the file cannot be opened nothing is written
 * and the call returns silently.
 */
static void
dump_file(char* path, unsigned char* data, size_t len)
{
  FILE* fp;

  fp = fopen(path, "w");
  if (fp == NULL)
    return;

  fwrite(data, sizeof(unsigned char), len, fp);
  fclose(fp);
}
68 #endif
69 
70 #ifdef STANDALONE
71 #include <ctype.h>
72 
/*
 * Print `len` bytes of `data` to `fp` as the body of a C array initializer.
 *
 * Printable bytes are written as character literals with a leading space
 * (a backslash is written escaped), all other bytes as two-digit hex.
 * Elements are separated by ", ", with a line break after every eighth
 * element; the list is enclosed in "{" and "};" lines.
 */
static void
dump_data(FILE* fp, unsigned char* data, int len)
{
  int idx;

  fputs("{\n", fp);

  for (idx = 0; idx < len; idx++) {
    const unsigned char byte = data[idx];
    const char* sep;

    if (! isprint((int )byte))
      fprintf(fp, "0x%02x", (int )byte);
    else if (byte == '\\')
      fputs(" '\\\\'", fp);
    else
      fprintf(fp, " '%c'", byte);

    /* last element: plain newline; otherwise a comma, wrapping every 8 */
    if (idx + 1 == len)
      sep = "\n";
    else if (idx % 8 == 7)
      sep = ",\n";
    else
      sep = ", ";

    fputs(sep, fp);
  }

  fputs("};\n", fp);
}
104 
105 #else
106 
/*
 * Write the current local time to `fp` formatted as "MM/DD HH:MM:SS",
 * without a trailing newline.
 */
static void
output_current_time(FILE* fp)
{
  char stamp[64];
  time_t now = time(NULL);
  struct tm* local = localtime(&now);

  strftime(stamp, sizeof(stamp), "%m/%d %H:%M:%S", local);
  fputs(stamp, fp);
}
118 
119 #endif
120 
121 static int
search(regex_t * reg,unsigned char * str,unsigned char * end,OnigOptionType options,int backward,int sl)122 search(regex_t* reg, unsigned char* str, unsigned char* end, OnigOptionType options, int backward, int sl)
123 {
124   int r;
125   unsigned char *start, *range;
126   OnigRegion *region;
127   unsigned int retry_limit;
128   size_t len;
129 
130   region = onig_region_new();
131 
132   len = (size_t )(end - str);
133   if (len < BASE_LENGTH) {
134     if (sl >= 2)
135       retry_limit = (unsigned int )SLOW_RETRY_LIMIT;
136     else
137       retry_limit = (unsigned int )BASE_RETRY_LIMIT;
138   }
139   else
140     retry_limit = (unsigned int )(BASE_RETRY_LIMIT * BASE_LENGTH / len);
141 
142 #ifdef STANDALONE
143   fprintf(stdout, "retry limit: %u\n", retry_limit);
144 #endif
145 
146   onig_set_retry_limit_in_search(retry_limit);
147   onig_set_match_stack_limit_size(MATCH_STACK_LIMIT);
148   onig_set_subexp_call_limit_in_search(SUBEXP_CALL_LIMIT);
149 
150   if (backward != 0) {
151     start = end;
152     range = str;
153   }
154   else {
155     start = str;
156     range = end;
157   }
158 
159   r = onig_search(reg, str, end, start, range, region, (options & OPTIONS_AT_RUNTIME));
160   if (r >= 0) {
161 #ifdef STANDALONE
162     int i;
163 
164     fprintf(stdout, "match at %d  (%s)\n", r,
165             ONIGENC_NAME(onig_get_encoding(reg)));
166     for (i = 0; i < region->num_regs; i++) {
167       fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]);
168     }
169 #endif
170   }
171   else if (r == ONIG_MISMATCH) {
172 #ifdef STANDALONE
173     fprintf(stdout, "search fail (%s)\n",
174             ONIGENC_NAME(onig_get_encoding(reg)));
175 #endif
176   }
177   else { /* error */
178 #ifdef STANDALONE
179     char s[ONIG_MAX_ERROR_MESSAGE_LEN];
180 
181     onig_error_code_to_str((UChar* )s, r);
182     fprintf(stdout, "ERROR: %s\n", s);
183     fprintf(stdout, "  (%s)\n", ONIGENC_NAME(onig_get_encoding(reg)));
184 #endif
185     onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
186 
187     if (r == ONIGERR_STACK_BUG ||
188         r == ONIGERR_UNDEFINED_BYTECODE ||
189         r == ONIGERR_UNEXPECTED_BYTECODE)
190       return -2;
191 
192     return -1;
193   }
194 
195   onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
196   return 0;
197 }
198 
/* Run statistics, accumulated for the lifetime of the process. */
static long INPUT_COUNT         = 0;  /* calls to LLVMFuzzerTestOneInput() */
static long EXEC_COUNT          = 0;  /* calls to exec() */
static long EXEC_COUNT_INTERVAL = 0;  /* exec() calls since the last stat line */
static long REGEX_SUCCESS_COUNT = 0;  /* patterns accepted by onig_new() */
static long VALID_STRING_COUNT  = 0;  /* subjects valid in the chosen encoding */
204 
205 static int
exec(OnigEncoding enc,OnigOptionType options,OnigSyntaxType * syntax,char * apattern,char * apattern_end,char * astr,UChar * end,int backward,int sl)206 exec(OnigEncoding enc, OnigOptionType options, OnigSyntaxType* syntax,
207      char* apattern, char* apattern_end, char* astr, UChar* end, int backward,
208      int sl)
209 {
210   int r;
211   regex_t* reg;
212   OnigErrorInfo einfo;
213   UChar* pattern = (UChar* )apattern;
214   UChar* str     = (UChar* )astr;
215   UChar* pattern_end = (UChar* )apattern_end;
216 
217   EXEC_COUNT++;
218   EXEC_COUNT_INTERVAL++;
219 
220   onig_initialize(&enc, 1);
221 #ifdef PARSE_DEPTH_LIMIT
222   onig_set_parse_depth_limit(PARSE_DEPTH_LIMIT);
223 #endif
224   onig_set_subexp_call_max_nest_level(CALL_MAX_NEST_LEVEL);
225 
226   r = onig_new(&reg, pattern, pattern_end,
227                (options & OPTIONS_AT_COMPILE), enc, syntax, &einfo);
228   if (r != ONIG_NORMAL) {
229     char s[ONIG_MAX_ERROR_MESSAGE_LEN];
230     onig_error_code_to_str((UChar* )s, r, &einfo);
231 #ifdef STANDALONE
232     fprintf(stdout, "ERROR: %s\n", s);
233 #endif
234     onig_end();
235 
236     if (r == ONIGERR_PARSER_BUG ||
237         r == ONIGERR_STACK_BUG  ||
238         r == ONIGERR_UNDEFINED_BYTECODE ||
239         r == ONIGERR_UNEXPECTED_BYTECODE) {
240       return -2;
241     }
242     else
243       return -1;
244   }
245   REGEX_SUCCESS_COUNT++;
246 
247   r = search(reg, pattern, pattern_end, options, backward, sl);
248   if (r == -2) return -2;
249 
250   if (onigenc_is_valid_mbc_string(enc, str, end) != 0) {
251     VALID_STRING_COUNT++;
252     r = search(reg, str, end, options, backward, sl);
253     if (r == -2) return -2;
254   }
255 
256   onig_free(reg);
257   onig_end();
258   return 0;
259 }
260 
261 static int
alloc_exec(OnigEncoding enc,OnigOptionType options,OnigSyntaxType * syntax,int backward,int pattern_size,size_t rem_size,unsigned char * data)262 alloc_exec(OnigEncoding enc, OnigOptionType options, OnigSyntaxType* syntax,
263            int backward, int pattern_size, size_t rem_size, unsigned char *data)
264 {
265   extern int onig_detect_can_be_slow_pattern(const UChar* pattern, const UChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax);
266 
267   int r;
268   int sl;
269   unsigned char *pattern;
270   unsigned char *pattern_end;
271   unsigned char *str_null_end;
272 
273   pattern = (unsigned char *)malloc(pattern_size != 0 ? pattern_size : 1);
274   memcpy(pattern, data, pattern_size);
275   pattern_end = pattern + pattern_size;
276   data += pattern_size;
277   rem_size -= pattern_size;
278 
279   if (rem_size > MAX_REM_SIZE) rem_size = MAX_REM_SIZE;
280 
281   sl = onig_detect_can_be_slow_pattern(pattern, pattern_end, options, enc, syntax);
282   if (sl > 0) {
283     if (rem_size > MAX_SLOW_REM_SIZE)
284       rem_size = MAX_SLOW_REM_SIZE;
285   }
286 
287   ADJUST_LEN(enc, rem_size);
288 #ifdef STANDALONE
289   fprintf(stdout, "rem_size: %ld\n", rem_size);
290 #endif
291 
292   unsigned char *str = (unsigned char*)malloc(rem_size != 0 ? rem_size : 1);
293   memcpy(str, data, rem_size);
294   str_null_end = str + rem_size;
295 
296   r = exec(enc, options, syntax,
297            (char *)pattern, (char *)pattern_end,
298            (char *)str, str_null_end, backward, sl);
299 
300   free(pattern);
301   free(str);
302   return r;
303 }
304 
/*
 * Minimum input size accepted by LLVMFuzzerTestOneInput(); shorter inputs
 * are ignored.  SYNTAX_TEST builds read one additional control byte (the
 * syntax choice).
 */
#if defined(SYNTAX_TEST)
#  define NUM_CONTROL_BYTES    7
#else
#  define NUM_CONTROL_BYTES    6
#endif
310 
LLVMFuzzerTestOneInput(const uint8_t * Data,size_t Size)311 int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size)
312 {
313 #if !defined(UTF16_BE) && !defined(UTF16_LE)
314   static OnigEncoding encodings[] = {
315     ONIG_ENCODING_UTF8,
316     ONIG_ENCODING_UTF8,
317     ONIG_ENCODING_UTF8,
318     ONIG_ENCODING_UTF8,
319     ONIG_ENCODING_UTF8,
320     ONIG_ENCODING_UTF8,
321     ONIG_ENCODING_UTF8,
322     ONIG_ENCODING_UTF8,
323     ONIG_ENCODING_ASCII,
324     ONIG_ENCODING_EUC_JP,
325     ONIG_ENCODING_EUC_TW,
326     ONIG_ENCODING_EUC_KR,
327     ONIG_ENCODING_EUC_CN,
328     ONIG_ENCODING_SJIS,
329     ONIG_ENCODING_KOI8_R,
330     ONIG_ENCODING_CP1251,
331     ONIG_ENCODING_BIG5,
332     ONIG_ENCODING_GB18030,
333     ONIG_ENCODING_UTF16_BE,
334     ONIG_ENCODING_UTF16_LE,
335     ONIG_ENCODING_UTF16_BE,
336     ONIG_ENCODING_UTF16_LE,
337     ONIG_ENCODING_UTF32_BE,
338     ONIG_ENCODING_UTF32_LE,
339     ONIG_ENCODING_UTF32_BE,
340     ONIG_ENCODING_UTF32_LE,
341     ONIG_ENCODING_ISO_8859_1,
342     ONIG_ENCODING_ISO_8859_2,
343     ONIG_ENCODING_ISO_8859_3,
344     ONIG_ENCODING_ISO_8859_4,
345     ONIG_ENCODING_ISO_8859_5,
346     ONIG_ENCODING_ISO_8859_6,
347     ONIG_ENCODING_ISO_8859_7,
348     ONIG_ENCODING_ISO_8859_8,
349     ONIG_ENCODING_ISO_8859_9,
350     ONIG_ENCODING_ISO_8859_10,
351     ONIG_ENCODING_ISO_8859_11,
352     ONIG_ENCODING_ISO_8859_13,
353     ONIG_ENCODING_ISO_8859_14,
354     ONIG_ENCODING_ISO_8859_15,
355     ONIG_ENCODING_ISO_8859_16
356   };
357   unsigned char encoding_choice;
358 #endif
359 
360 #ifdef SYNTAX_TEST
361   static OnigSyntaxType* syntaxes[] = {
362     ONIG_SYNTAX_POSIX_EXTENDED,
363     ONIG_SYNTAX_EMACS,
364     ONIG_SYNTAX_GREP,
365     ONIG_SYNTAX_GNU_REGEX,
366     ONIG_SYNTAX_JAVA,
367     ONIG_SYNTAX_PERL_NG,
368     ONIG_SYNTAX_ONIGURUMA
369   };
370 
371 #ifdef STANDALONE
372   static char* syntax_names[] = {
373     "Posix Extended",
374     "Emacs",
375     "Grep",
376     "GNU Regex",
377     "Java",
378     "Perl+NG",
379     "Oniguruma"
380   };
381 #endif
382 
383   unsigned char syntax_choice;
384 #endif
385 
386   int r;
387   int backward;
388   int pattern_size;
389   size_t rem_size;
390   unsigned char *data;
391   unsigned char pattern_size_choice;
392   OnigOptionType  options;
393   OnigEncoding    enc;
394   OnigSyntaxType* syntax;
395 
396 #ifndef STANDALONE
397   static FILE* STAT_FP;
398 #endif
399 
400   INPUT_COUNT++;
401 
402 #ifdef DUMP_DATA_INTERVAL
403   if (INPUT_COUNT % DUMP_DATA_INTERVAL == 0) {
404     char path[20];
405     sprintf(path, "dump-%ld", INPUT_COUNT);
406     dump_file(path, (unsigned char* )Data, Size);
407   }
408 #endif
409 
410   if (Size < NUM_CONTROL_BYTES) return 0;
411 
412   rem_size = Size;
413   data = (unsigned char* )(Data);
414 
415 #ifdef UTF16_BE
416   enc = ONIG_ENCODING_UTF16_BE;
417 #else
418 #ifdef UTF16_LE
419   enc = ONIG_ENCODING_UTF16_LE;
420 #else
421   encoding_choice = data[0];
422   data++;
423   rem_size--;
424 
425   int num_encodings = sizeof(encodings)/sizeof(encodings[0]);
426   enc = encodings[encoding_choice % num_encodings];
427 #endif
428 #endif
429 
430 #ifdef SYNTAX_TEST
431   syntax_choice = data[0];
432   data++;
433   rem_size--;
434 
435   int num_syntaxes = sizeof(syntaxes)/sizeof(syntaxes[0]);
436   syntax = syntaxes[syntax_choice % num_syntaxes];
437 #else
438   syntax = ONIG_SYNTAX_DEFAULT;
439 #endif
440 
441   if ((data[2] & 0xc0) == 0)
442     options = data[0] | (data[1] << 8) | (data[2] << 16);
443   else
444     options = data[0] & ONIG_OPTION_IGNORECASE;
445 
446   data++; rem_size--;
447   data++; rem_size--;
448   data++; rem_size--;
449 
450   pattern_size_choice = data[0];
451   data++; rem_size--;
452 
453   backward = (data[0] == 0xbb);
454   data++; rem_size--;
455 
456   if (backward != 0) {
457     options = options & ~ONIG_OPTION_FIND_LONGEST;
458   }
459 
460   if (rem_size == 0)
461     pattern_size = 0;
462   else {
463     pattern_size = (int )pattern_size_choice % rem_size;
464     ADJUST_LEN(enc, pattern_size);
465   }
466 
467 #ifdef STANDALONE
468   dump_data(stdout, data, pattern_size);
469 #ifdef SYNTAX_TEST
470   fprintf(stdout,
471           "enc: %s, syntax: %s, options: %u, pattern_size: %d, back:%d\n",
472           ONIGENC_NAME(enc),
473           syntax_names[syntax_choice % num_syntaxes],
474           options,
475           pattern_size, backward);
476 #else
477   fprintf(stdout, "enc: %s, options: %u, pattern_size: %d, back:%d\n",
478           ONIGENC_NAME(enc), options, pattern_size, backward);
479 #endif
480 #endif
481 
482 #ifdef DUMP_INPUT
483   dump_input((unsigned char* )Data, Size);
484 #endif
485 
486   r = alloc_exec(enc, options, syntax, backward, pattern_size,
487                  rem_size, data);
488   if (r == -2) exit(-2);
489 
490 #ifndef STANDALONE
491 #ifdef EXEC_PRINT_INTERVAL
492   if (EXEC_COUNT_INTERVAL == EXEC_PRINT_INTERVAL) {
493     float fexec, freg, fvalid;
494 
495     if (STAT_FP == 0) {
496 #ifdef STAT_PATH
497       STAT_FP = fopen(STAT_PATH, "a");
498 #else
499       STAT_FP = stdout;
500 #endif
501     }
502 
503     output_current_time(STAT_FP);
504 
505     if (INPUT_COUNT != 0) { // overflow check
506       fexec  = (float )EXEC_COUNT / INPUT_COUNT;
507       freg   = (float )REGEX_SUCCESS_COUNT / INPUT_COUNT;
508       fvalid = (float )VALID_STRING_COUNT / INPUT_COUNT;
509 
510       fprintf(STAT_FP, ": %ld: EXEC:%.2f, REG:%.2f, VALID:%.2f\n",
511               EXEC_COUNT, fexec, freg, fvalid);
512       fflush(STAT_FP);
513     }
514     else {
515       fprintf(STAT_FP, ": ignore (input count overflow)\n");
516     }
517 
518     EXEC_COUNT_INTERVAL = 0;
519   }
520   else if (EXEC_COUNT == 1) {
521     output_current_time(stdout);
522     fprintf(stdout, ": ------------ START ------------\n");
523   }
524 #endif
525 #endif
526 
527   return r;
528 }
529 
530 #ifdef STANDALONE
531 
532 #define MAX_INPUT_DATA_SIZE  4194304
533 
main(int argc,char * argv[])534 extern int main(int argc, char* argv[])
535 {
536   size_t max_size;
537   size_t n;
538   uint8_t Data[MAX_INPUT_DATA_SIZE];
539 
540   if (argc > 1) {
541     max_size = (size_t )atoi(argv[1]);
542   }
543   else {
544     max_size = sizeof(Data);
545   }
546 
547   n = read(0, Data, max_size);
548   fprintf(stdout, "read size: %ld, max_size: %ld\n", n, max_size);
549 
550   LLVMFuzzerTestOneInput(Data, n);
551   return 0;
552 }
553 #endif /* STANDALONE */
554