1 /*
2  * regset.c
3  * Copyright (c) 2019  K.Kosako
4  */
5 #include <stdio.h>
6 #include <unistd.h>
7 #include <stdlib.h>
8 #include <string.h>
9 #include <sys/types.h>
10 #include <sys/stat.h>
11 #include <fcntl.h>
12 #include <time.h>
13 
14 #include "oniguruma.h"
15 
16 
17 #define RETRY_LIMIT   5000
18 
19 #ifdef STANDALONE
20 //#define CHECK_EACH_REGEX_SEARCH_TIME
21 #endif
22 
23 #define MAX_REG_NUM   256
24 
25 typedef unsigned char uint8_t;
26 static OnigEncoding ENC;
27 
/*
 * Write the current local time to `fp` formatted as "MM/DD HH:MM:SS".
 * No trailing newline is emitted.
 */
static void
output_current_time(FILE* fp)
{
  char stamp[64];
  time_t now = time(NULL);
  struct tm* local = localtime(&now);

  strftime(stamp, sizeof(stamp), "%m/%d %H:%M:%S", local);
  fputs(stamp, fp);
}
39 
40 #ifdef CHECK_EACH_REGEX_SEARCH_TIME
/*
 * Return the elapsed time from `ts` to `te` in seconds, as a double.
 * The whole-second and nanosecond differences are computed separately
 * and then summed.
 */
static double
get_sec(struct timespec* ts, struct timespec* te)
{
  double whole_sec = (double )(te->tv_sec - ts->tv_sec);
  double frac_sec  = (double )(te->tv_nsec - ts->tv_nsec) / 1000000000.0;

  return whole_sec + frac_sec;
}
50 
51 static int
check_each_regex_search_time(OnigRegSet * set,unsigned char * str,unsigned char * end)52 check_each_regex_search_time(OnigRegSet* set, unsigned char* str, unsigned char* end)
53 {
54   int n;
55   int i;
56   int r;
57   OnigRegion* region;
58 
59   n = onig_regset_number_of_regex(set);
60   region = onig_region_new();
61 
62   for (i = 0; i < n; i++) {
63     regex_t* reg;
64     unsigned char* start;
65     unsigned char* range;
66     struct timespec ts1, ts2;
67     double t;
68 
69     reg = onig_regset_get_regex(set, i);
70     start = str;
71     range = end;
72 
73     clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts1);
74 
75     r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE);
76 
77     clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts2);
78     t = get_sec(&ts1, &ts2);
79 
80     fprintf(stdout, "regex search time %d: %6.2lfmsec.\n", i, t * 1000.0);
81   }
82 
83   onig_region_free(region, 1);
84   return 0;
85 }
86 #endif
87 
88 static int
search(OnigRegSet * set,OnigRegSetLead lead,unsigned char * str,unsigned char * end)89 search(OnigRegSet* set, OnigRegSetLead lead, unsigned char* str, unsigned char* end)
90 {
91   int r;
92   int match_pos;
93   unsigned char *start, *range;
94 
95   start = str;
96   range = end;
97   r = onig_regset_search(set, str, end, start, range, lead,
98                          ONIG_OPTION_NONE, &match_pos);
99   if (r >= 0) {
100 #ifdef STANDALONE
101     int i;
102     int match_index;
103     OnigRegion* region;
104 
105     match_index = r;
106     fprintf(stdout, "match reg index: %d, pos: %d  (%s)\n",
107             match_index, match_pos, ONIGENC_NAME(ENC));
108     region = onig_regset_get_region(set, match_index);
109     if (region == 0) {
110       fprintf(stdout, "ERROR: can't get region.\n");
111       return -1;
112     }
113 
114     for (i = 0; i < region->num_regs; i++) {
115       fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]);
116     }
117 #endif
118   }
119   else if (r == ONIG_MISMATCH) {
120 #ifdef STANDALONE
121     fprintf(stdout, "search fail (%s)\n", ONIGENC_NAME(ENC));
122 #endif
123   }
124   else { /* error */
125 #ifdef STANDALONE
126     char s[ONIG_MAX_ERROR_MESSAGE_LEN];
127 
128     onig_error_code_to_str((UChar* )s, r);
129     fprintf(stdout, "ERROR: %s\n", s);
130     fprintf(stdout, "  (%s)\n", ONIGENC_NAME(ENC));
131 #endif
132     return -1;
133   }
134 
135   return 0;
136 }
137 
/* Run statistics, reported periodically by LLVMFuzzerTestOneInput(). */
static long INPUT_COUNT         = 0L;  /* calls to LLVMFuzzerTestOneInput() */
static long EXEC_COUNT          = 0L;  /* calls to exec() */
static long EXEC_COUNT_INTERVAL = 0L;  /* exec() calls since the last report */
static long REGEX_SUCCESS_COUNT = 0L;  /* runs where the whole regset was built */
static long VALID_STRING_COUNT  = 0L;  /* runs whose subject passed the encoding check */
143 
144 static int
exec(OnigEncoding enc,int reg_num,int init_reg_num,UChar * pat[],UChar * pat_end[],OnigRegSetLead lead,UChar * str,UChar * end)145 exec(OnigEncoding enc, int reg_num, int init_reg_num,
146      UChar* pat[], UChar* pat_end[],
147      OnigRegSetLead lead, UChar* str, UChar* end)
148 {
149   int r;
150   int i, j;
151   OnigRegSet* set;
152   regex_t* reg;
153   OnigOptionType options;
154   OnigErrorInfo einfo;
155   regex_t* regs[MAX_REG_NUM];
156 
157   EXEC_COUNT++;
158   EXEC_COUNT_INTERVAL++;
159 
160   options = (EXEC_COUNT % 4 == 0) ? ONIG_OPTION_IGNORECASE : ONIG_OPTION_NONE;
161 
162   onig_initialize(&enc, 1);
163   onig_set_retry_limit_in_search(RETRY_LIMIT);
164 
165   for (i = 0; i < init_reg_num; i++) {
166     r = onig_new(&regs[i], pat[i], pat_end[i], options, ENC,
167                  ONIG_SYNTAX_DEFAULT, &einfo);
168     if (r != 0) {
169 #ifdef STANDALONE
170       char s[ONIG_MAX_ERROR_MESSAGE_LEN];
171 
172       onig_error_code_to_str((UChar* )s, r, &einfo);
173       fprintf(stdout, "ERROR: index: %d, %s\n", i, s);
174 #endif
175 
176       for (j = 0; j < i; j++) onig_free(regs[j]);
177 
178       onig_end();
179 
180       if (r == ONIGERR_PARSER_BUG ||
181           r == ONIGERR_STACK_BUG  ||
182           r == ONIGERR_UNDEFINED_BYTECODE ||
183           r == ONIGERR_UNEXPECTED_BYTECODE) {
184         return -2;
185       }
186       else
187         return -1;
188     }
189   }
190 
191   r = onig_regset_new(&set, init_reg_num, regs);
192   if (r != 0) {
193     for (i = 0; i < init_reg_num; i++) {
194       onig_free(regs[i]);
195     }
196     onig_end();
197     return -1;
198   }
199 
200   for (i = init_reg_num; i < reg_num; i++) {
201     r = onig_new(&reg, pat[i], pat_end[i], options, ENC,
202                  ONIG_SYNTAX_DEFAULT, &einfo);
203     if (r != 0) {
204 #ifdef STANDALONE
205       char s[ONIG_MAX_ERROR_MESSAGE_LEN];
206 
207       onig_error_code_to_str((UChar* )s, r, &einfo);
208       fprintf(stdout, "ERROR: index: %d, %s\n", i, s);
209 #endif
210       onig_regset_free(set);
211       onig_end();
212 
213       if (r == ONIGERR_PARSER_BUG ||
214           r == ONIGERR_STACK_BUG  ||
215           r == ONIGERR_UNDEFINED_BYTECODE ||
216           r == ONIGERR_UNEXPECTED_BYTECODE) {
217         return -2;
218       }
219       else
220         return -1;
221     }
222 
223     r = onig_regset_add(set, reg);
224     if (r != 0) {
225       onig_regset_free(set);
226       onig_end();
227       fprintf(stdout, "ERROR: onig_regset_add(): %d\n", i);
228       return r;
229     }
230   }
231 
232   REGEX_SUCCESS_COUNT++;
233 
234   if (onigenc_is_valid_mbc_string(enc, str, end) != 0) {
235     VALID_STRING_COUNT++;
236     r = search(set, lead, str, end);
237 #ifdef CHECK_EACH_REGEX_SEARCH_TIME
238     r = check_each_regex_search_time(set, str, end);
239 #endif
240   }
241 
242   onig_regset_free(set);
243   onig_end();
244   return 0;
245 }
246 
/* Upper bound, in bytes, on the size of each pattern cut from the input. */
#define MAX_PATTERN_SIZE      30
/* Number of leading input bytes consumed as control values. */
#define NUM_CONTROL_BYTES      3

/* Number of exec() calls between two statistics lines. */
#define EXEC_PRINT_INTERVAL  2000000

/* Largest regex count and initial regex count seen so far. */
static int MaxRegNum     = 0;
static int MaxInitRegNum = 0;
254 
255 extern int
LLVMFuzzerTestOneInput(const uint8_t * Data,size_t Size)256 LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size)
257 {
258   int r, i;
259   int pattern_size;
260   unsigned char *str_null_end;
261   size_t remaining_size;
262   unsigned char *data;
263   unsigned int reg_num;
264   unsigned int init_reg_num;
265   unsigned char* pat[256];
266   unsigned char* pat_end[256];
267   int len;
268   unsigned int lead_num;
269   OnigRegSetLead lead;
270 
271   INPUT_COUNT++;
272 
273   if (Size < NUM_CONTROL_BYTES) return 0;
274 
275   remaining_size = Size;
276   data = (unsigned char* )(Data);
277 
278   reg_num = data[0];
279   data++;
280   remaining_size--;
281 
282   init_reg_num = data[0];
283   data++;
284   remaining_size--;
285 
286   lead_num = data[0];
287   data++;
288   remaining_size--;
289   lead = (lead_num % 2 == 0 ? ONIG_REGSET_POSITION_LEAD : ONIG_REGSET_REGEX_LEAD);
290 
291   if (remaining_size < reg_num * 2) {
292     reg_num = reg_num % 15;  // zero is OK.
293   }
294 
295   init_reg_num %= (reg_num + 1);
296 
297   if (MaxRegNum < reg_num)
298     MaxRegNum = reg_num;
299 
300   if (MaxInitRegNum < init_reg_num)
301     MaxInitRegNum = init_reg_num;
302 
303   if (reg_num == 0)
304     pattern_size = 1;
305   else
306     pattern_size = remaining_size / (reg_num * 2);
307 
308   if (pattern_size > MAX_PATTERN_SIZE)
309     pattern_size = MAX_PATTERN_SIZE;
310 
311   len = pattern_size * reg_num;
312   if (len == 0) len = 1;
313 
314   for (i = 0; i < reg_num; i++) {
315     pat[i] = (unsigned char* )malloc(pattern_size);
316     memcpy(pat[i], data, pattern_size);
317     pat_end[i] = pat[i] + pattern_size;
318     data += pattern_size;
319     remaining_size -= pattern_size;
320   }
321 
322   unsigned char *str = (unsigned char*)malloc(remaining_size != 0 ? remaining_size : 1);
323   memcpy(str, data, remaining_size);
324   str_null_end = str + remaining_size;
325 
326 #ifdef STANDALONE
327   fprintf(stdout, "reg num: %d, pattern size: %d, lead: %s\n",
328           reg_num, pattern_size,
329           lead == ONIG_REGSET_POSITION_LEAD ? "position" : "regex");
330 
331   if (reg_num != 0) {
332     unsigned char* p;
333     i = 0;
334     p = pat[0];
335     while (p < pat_end[0]) {
336       fprintf(stdout, " 0x%02x", (int )*p++);
337       i++;
338       if (i % 8 == 0) fprintf(stdout, "\n");
339     }
340     fprintf(stdout, "\n");
341   }
342 #endif
343 
344   ENC = ONIG_ENCODING_UTF8;
345 
346   r = exec(ENC, reg_num, init_reg_num, pat, pat_end, lead, str, str_null_end);
347 
348   for (i = 0; i < reg_num; i++) {
349     free(pat[i]);
350   }
351   free(str);
352 
353   if (r == -2) {
354     //output_data("parser-bug", Data, Size);
355     exit(-2);
356   }
357 
358   if (EXEC_COUNT_INTERVAL == EXEC_PRINT_INTERVAL) {
359     float fexec, freg, fvalid;
360 
361     fexec  = (float )EXEC_COUNT / INPUT_COUNT;
362     freg   = (float )REGEX_SUCCESS_COUNT / INPUT_COUNT;
363     fvalid = (float )VALID_STRING_COUNT / INPUT_COUNT;
364 
365     output_current_time(stdout);
366     fprintf(stdout, ": %ld: EXEC:%.2f, REG:%.2f, VALID:%.2f MAX REG:%d-%d\n",
367             EXEC_COUNT, fexec, freg, fvalid, MaxRegNum, MaxInitRegNum);
368 
369     EXEC_COUNT_INTERVAL = 0;
370   }
371   else if (EXEC_COUNT == 1) {
372     output_current_time(stdout);
373     fprintf(stdout, ": ------------ START ------------\n");
374   }
375 
376   return r;
377 }
378 
379 #ifdef STANDALONE
380 
main(int argc,char * argv[])381 extern int main(int argc, char* argv[])
382 {
383   size_t n;
384   uint8_t Data[10000];
385 
386   n = read(0, Data, sizeof(Data));
387   fprintf(stdout, "n: %ld\n", n);
388   LLVMFuzzerTestOneInput(Data, n);
389 
390   return 0;
391 }
392 #endif /* STANDALONE */
393