1 /*
2   bench.c - simple regex benchmark program
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 #ifdef HAVE_CONFIG_H
10 #include <config.h>
11 #endif /* HAVE_CONFIG_H */
12 
13 #include <stdio.h>
14 #include <stdlib.h>
15 #ifdef HAVE_GETOPT_H
16 #include <getopt.h>
17 #endif /* HAVE_GETOPT_H */
18 #include <time.h>
19 #include <unistd.h>
20 #include <math.h>
21 #include <sys/types.h>
22 
23 #if 0
24 #include <hackerlab/rx-posix/regex.h>
25 #else
26 #include <regex.h>
27 #endif
28 
29 /* T distribution for alpha = 0.025 (for 95% confidence).  XXX - is
30    this correct? */
31 double t_distribution[] = {
32   12.71,
33   4.303,
34   3.182,
35   2.776,
36   2.571,
37   2.447,
38   2.365,
39   2.306,
40   2.262,
41   2.228,
42   2.201,
43   2.179,
44   2.160,
45   2.145,
46   2.131,
47   2.120,
48   2.110,
49   2.101,
50   2.093,
51   2.086,
52   2.080,
53   2.074,
54   2.069,
55   2.064,
56   2.060,
57   2.056,
58   2.052,
59   2.048,
60   2.045,
61   2.042
62 };
63 
64 void
stats(double * sample_data,int samples,int len)65 stats(double *sample_data, int samples, int len)
66 {
67   double mean, tmp1, tmp2, variance, stddev, error, percent;
68   int i;
69 
70   mean = 0;
71   for (i = 0; i < samples; i++)
72     mean += sample_data[i];
73   mean = mean/i;
74   printf("# mean: %.5f\n", mean);
75 
76   tmp1 = 0;
77   for (i = 0; i < samples; i++) {
78     tmp2 = sample_data[i] - mean;
79     tmp1 += tmp2*tmp2;
80   }
81   if (samples > 1)
82     variance = tmp1 / (samples-1);
83   else
84     variance = 0;
85   stddev = sqrt(variance);
86   printf("# variance: %.16f\n", variance);
87   printf("# standard deviation: %.16f\n", stddev);
88 
89   error = t_distribution[samples-1] * stddev / sqrt(samples);
90   if (mean != 0)
91     percent = 100*error/mean;
92   else
93     percent = 0;
94   printf("# error: �%.16f (�%.4f%%)\n", error, percent);
95 
96   printf("%d\t%.5f\t%.5f\n", len, mean, error);
97 
98   fflush(stdout);
99 }
100 
101 void
run_tests(int len,int samples,double * sample_data,int repeats,regex_t * reobj,char * str,char * tmpbuf)102 run_tests(int len, int samples, double *sample_data, int repeats,
103 	  regex_t *reobj, char *str, char *tmpbuf)
104 {
105   int i, j, errcode;
106   clock_t c1, c2;
107   regmatch_t pmatch[10];
108 
109 
110   printf("# len = %d\n", len);
111   fflush(stdout);
112   for (i = 0; i < samples; i++) {
113     c1 = clock();
114     for (j = 0; j < repeats; j++)
115       if ((errcode = tre_regexec(reobj, str, 10, pmatch, 0))) {
116 	tre_regerror(errcode, reobj, tmpbuf, 255);
117 	printf("error: %s\n", tmpbuf);
118       }
119     c2 = clock();
120 
121     sample_data[i] = (double)(c2-c1)/(CLOCKS_PER_SEC*repeats);
122 
123     printf("# sample: %.5f sec, clocks: %ld\n",
124 	   (double)(c2-c1)/(CLOCKS_PER_SEC*repeats),
125 	   (long)(c2-c1));
126     fflush(stdout);
127   }
128   fflush(stdout);
129 
130   for (i = 0; i < 10; i += 2) {
131     printf("# pmatch[%d].rm_so = %d\n", i/2, (int)pmatch[i/2].rm_so);
132     printf("# pmatch[%d].rm_eo = %d\n", i/2, (int)pmatch[i/2].rm_eo);
133   }
134 }
135 
136 
137 int
main(int argc,char ** argv)138 main(int argc, char **argv)
139 {
140   regex_t reobj;
141   char *str;
142   char tmpbuf[256];
143   int i, j;
144   int max_len = 1024*1024*10;
145   int steps = 20;
146   int repeats = 10;
147   int samples = 20;
148   int len;
149   clock_t c1, c2;
150   int opt;
151   double sample_data[30];
152 
153   int test_id = -1;
154 
155   while ((opt = getopt(argc, argv, "r:l:s:j:t:")) != -1) {
156     switch (opt) {
157     case 't':
158       test_id = atoi(optarg);
159       break;
160     case 'l':
161       max_len = atoi(optarg);
162       break;
163     case 'j':
164       steps = atoi(optarg);
165       break;
166     case 's':
167       samples = atoi(optarg);
168       break;
169     case 'r':
170       repeats = atoi(optarg);
171       break;
172     default:
173       printf("P�lli.\n");
174       return 1;
175     }
176   }
177 
178   /* XXX - Check that the correct results are returned.  For example, GNU
179            regex-0.12 returns incorrect results for very long strings in
180 	   test number 1. */
181 
182   switch (test_id) {
183   case 0:
184     printf("# pattern: \"a*\"\n");
185     printf("# string:  \"aaaaaa...\"\n");
186     len = 0;
187     tre_regcomp(&reobj, "a*", REG_EXTENDED);
188     while (len <= max_len) {
189 
190       str = malloc(sizeof(char) * (len+1));
191       for (i = 0; i < len; i++)
192 	str[i] = 'a';
193       str[len-1] = '\0';
194 
195       run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
196       stats(sample_data, samples, len);
197       len = len + (max_len/steps);
198       free(str);
199     }
200     break;
201 
202 
203   case 1:
204     printf("# pattern: \"(a)*\"\n");
205     printf("# string:  \"aaaaaa...\"\n");
206     len = 0;
207     tre_regcomp(&reobj, "(a)*", REG_EXTENDED);
208     while (len <= max_len) {
209 
210       str = malloc(sizeof(char) * (len+1));
211       for (i = 0; i < len; i++)
212 	str[i] = 'a';
213       str[len-1] = '\0';
214 
215       run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
216       stats(sample_data, samples, len);
217       len = len + (max_len/steps);
218       free(str);
219     }
220     break;
221 
222 
223   case 2:
224     printf("# pattern: \"(a*)\"\n");
225     printf("# string:  \"aaaaaa...\"\n");    len = 0;
226     tre_regcomp(&reobj, "(a*)", REG_EXTENDED);
227     while (len <= max_len) {
228 
229       str = malloc(sizeof(char) * (len+1));
230       for (i = 0; i < len; i++)
231 	str[i] = 'a';
232       str[len-1] = '\0';
233 
234       run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
235       stats(sample_data, samples, len);
236       len = len + (max_len/steps);
237       free(str);
238     }
239     break;
240 
241   case 3:
242     printf("# pattern: \"(a*)*|b*\"\n");
243     printf("# string:  \"aaaaaa...b\"\n");
244     len = 0;
245     tre_regcomp(&reobj, "(a*)*|b*", REG_EXTENDED);
246     while (len <= max_len) {
247       str = malloc(sizeof(char) * (len+1));
248       for (i = 0; i < len-1; i++)
249 	str[i] = 'a';
250       if (len > 0)
251 	str[len-1] = 'b';
252       str[len] = '\0';
253 
254       run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
255       stats(sample_data, samples, len);
256       len = len + (max_len/steps);
257       free(str);
258     }
259     break;
260 
261   case 4:
262     printf("# pattern: \"(a|a|a|...|a)\"\n");
263     printf("# string:  \"aaaaaa...\"\n");
264     len = 1024*1024;
265     str = malloc(sizeof(char) * (len+1));
266     for (i = 0; i < len-1; i++)
267       str[i] = 'a';
268     str[len] = '\0';
269     len = 0;
270     while (len <= max_len) {
271       tmpbuf[0] = '(';
272       for (i = 1; i < (len*2); i++) {
273 	tmpbuf[i] = 'a';
274 	if (i < len*2-2) {
275 	  i++;
276 	  tmpbuf[i] = '|';
277 	}
278       }
279       printf("# i = %d\n", i);
280       tmpbuf[i] = ')';
281       tmpbuf[i+1] = '*';
282       tmpbuf[i+2] = '\0';
283       printf("# pat = %s\n", tmpbuf);
284       tre_regcomp(&reobj, tmpbuf, REG_EXTENDED);
285 
286       run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
287       stats(sample_data, samples, len);
288       len = len + (max_len/steps);
289       tre_regfree(&reobj);
290     }
291     free(str);
292     break;
293 
294   case 5:
295     printf("# pattern: \"foobar\"\n");
296     printf("# string:  \"aaaaaa...foobar\"\n");
297     len = 0;
298     tre_regcomp(&reobj, "foobar", REG_EXTENDED);
299     while (len <= max_len) {
300       str = malloc(sizeof(char) * (len+7));
301       for (i = 0; i < len; i++) {
302 	if (i*i % 3)
303 	  str[i] = 'a';
304 	else
305 	  str[i] = 'a';
306       }
307       str[len+0] = 'f';
308       str[len+1] = 'o';
309       str[len+2] = 'o';
310       str[len+3] = 'b';
311       str[len+4] = 'a';
312       str[len+5] = 'r';
313       str[len+6] = '\0';
314 
315       run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
316       stats(sample_data, samples, len);
317       len = len + (max_len/steps);
318       free(str);
319     }
320     break;
321 
322 
323   case 6:
324     printf("# pattern: \"a*foobar\"\n");
325     printf("# string:  \"aaaaaa...foobar\"\n");
326     len = 0;
327     tre_regcomp(&reobj, "a*foobar", REG_EXTENDED);
328     while (len <= max_len) {
329       str = malloc(sizeof(char) * (len+7));
330       for (i = 0; i < len; i++) {
331 	str[i] = 'a';
332       }
333       str[len+0] = 'f';
334       str[len+1] = 'o';
335       str[len+2] = 'o';
336       str[len+3] = 'b';
337       str[len+4] = 'a';
338       str[len+5] = 'r';
339       str[len+6] = '\0';
340 
341       run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
342       stats(sample_data, samples, len);
343       len = len + (max_len/steps);
344       free(str);
345     }
346     break;
347 
348 
349   case 7:
350     printf("# pattern: \"(a)*foobar\"\n");
351     printf("# string:  \"aaaaabbaaab...foobar\"\n");
352     len = 0;
353     tre_regcomp(&reobj, "(a)*foobar", REG_EXTENDED);
354     while (len <= max_len) {
355       str = malloc(sizeof(char) * (len+7));
356       for (i = 0; i < len; i++) {
357 	/* Without this GNU regex won't find a match! */
358 	if (i*(i-1) % 3)
359 	  str[i] = 'b';
360 	else
361 	  str[i] = 'a';
362       }
363       str[len+0] = 'f';
364       str[len+1] = 'o';
365       str[len+2] = 'o';
366       str[len+3] = 'b';
367       str[len+4] = 'a';
368       str[len+5] = 'r';
369       str[len+6] = '\0';
370 
371       run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
372       stats(sample_data, samples, len);
373       len = len + (max_len/steps);
374       free(str);
375     }
376     break;
377 
378 
379   case 8:
380     printf("# pattern: \"(a|b)*foobar\"\n");
381     printf("# string:  \"aaaaabbaaab...foobar\"\n");
382     len = 0;
383     tre_regcomp(&reobj, "(a|b)*foobar", REG_EXTENDED);
384     while (len <= max_len) {
385       str = malloc(sizeof(char) * (len+7));
386       for (i = 0; i < len; i++) {
387 	if (i*(i-1) % 3)
388 	  str[i] = 'b';
389 	else
390 	  str[i] = 'a';
391 	/* Without this GNU regex won't find a match! */
392 	if (i % (1024*1024*10 - 100))
393 	  str[i] = 'f';
394       }
395       str[len+0] = 'f';
396       str[len+1] = 'o';
397       str[len+2] = 'o';
398       str[len+3] = 'b';
399       str[len+4] = 'a';
400       str[len+5] = 'r';
401       str[len+6] = '\0';
402 
403       run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
404       stats(sample_data, samples, len);
405       len = len + (max_len/steps);
406       free(str);
407     }
408     break;
409 
410 
411   case 9:
412     printf("# pattern: hand-coded a*\n");
413     printf("# string:  \"aaaaaa...\"\n");
414     len = 0;
415     while (len <= max_len) {
416       printf("# len = %d\n", len);
417 
418       str = malloc(sizeof(char)*(len+1));
419       for (i = 0; i < len; i++)
420 	str[i] = 'a';
421       str[len-1] = '\0';
422 
423       for (i = 0; i < samples; i++) {
424 	c1 = clock();
425 	for (j = 0; j < repeats; j++) {
426 	  char *s;
427 	  int l;
428 
429 	  s = str;
430 	  l = 0;
431 
432 
433 	  while (s != '\0') {
434 	    if (*s == 'a') {
435 	      s++;
436 	      l++;
437 	    } else
438 	      break;
439 	  }
440 	}
441       	c2 = clock();
442 	sample_data[i] = (double)(c2-c1)/(CLOCKS_PER_SEC*repeats);
443 
444 	printf("# sample: %.5f sec, clocks: %ld\n",
445 	       (double)(c2-c1)/(CLOCKS_PER_SEC*repeats),
446 	       (long)(c2-c1));
447 	fflush(stdout);
448       }
449       fflush(stdout);
450 
451       stats(sample_data, samples, len);
452       len = len + (max_len/steps);
453       free(str);
454     }
455     break;
456 
457 
458   default:
459     printf("Pelle.\n");
460     return 1;
461   }
462 
463   tre_regfree(&reobj);
464 
465   return 0;
466 }
467