1 /*
2   bench.c - simple regex benchmark program
3 
4   Copyright (c) 2001-2006 Ville Laurikari <vl@iki.fi>.
5 
6   This software is free; you can redistribute it and/or modify it
7   under the terms of the GNU Lesser General Public License as
8   published by the Free Software Foundation; either version 2.1 of the
9   License, or (at your option) any later version.
10 
11   This software is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   Lesser General Public License for more details.
15 
16   You should have received a copy of the GNU Lesser General Public
17   License along with this software; if not, write to the Free Software
18   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
19 
20 */
21 
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif /* HAVE_CONFIG_H */
25 
26 #include <stdio.h>
27 #include <stdlib.h>
28 #ifdef HAVE_GETOPT_H
29 #include <getopt.h>
30 #endif /* HAVE_GETOPT_H */
31 #include <time.h>
32 #include <unistd.h>
33 #include <math.h>
34 #include <sys/types.h>
35 
36 #if 0
37 #include <hackerlab/rx-posix/regex.h>
38 #else
39 #include <regex.h>
40 #endif
41 
42 /* T distribution for alpha = 0.025 (for 95% confidence).  XXX - is
43    this correct? */
44 double t_distribution[] = {
45   12.71,
46   4.303,
47   3.182,
48   2.776,
49   2.571,
50   2.447,
51   2.365,
52   2.306,
53   2.262,
54   2.228,
55   2.201,
56   2.179,
57   2.160,
58   2.145,
59   2.131,
60   2.120,
61   2.110,
62   2.101,
63   2.093,
64   2.086,
65   2.080,
66   2.074,
67   2.069,
68   2.064,
69   2.060,
70   2.056,
71   2.052,
72   2.048,
73   2.045,
74   2.042
75 };
76 
77 void
stats(double * sample_data,int samples,int len)78 stats(double *sample_data, int samples, int len)
79 {
80   double mean, tmp1, tmp2, variance, stddev, error, percent;
81   int i;
82 
83   mean = 0;
84   for (i = 0; i < samples; i++)
85     mean += sample_data[i];
86   mean = mean/i;
87   printf("# mean: %.5f\n", mean);
88 
89   tmp1 = 0;
90   for (i = 0; i < samples; i++) {
91     tmp2 = sample_data[i] - mean;
92     tmp1 += tmp2*tmp2;
93   }
94   if (samples > 1)
95     variance = tmp1 / (samples-1);
96   else
97     variance = 0;
98   stddev = sqrt(variance);
99   printf("# variance: %.16f\n", variance);
100   printf("# standard deviation: %.16f\n", stddev);
101 
102   error = t_distribution[samples-1] * stddev / sqrt(samples);
103   if (mean != 0)
104     percent = 100*error/mean;
105   else
106     percent = 0;
107   printf("# error: �%.16f (�%.4f%%)\n", error, percent);
108 
109   printf("%d\t%.5f\t%.5f\n", len, mean, error);
110 
111   fflush(stdout);
112 }
113 
114 void
run_tests(int len,int samples,double * sample_data,int repeats,regex_t * reobj,char * str,char * tmpbuf)115 run_tests(int len, int samples, double *sample_data, int repeats,
116 	  regex_t *reobj, char *str, char *tmpbuf)
117 {
118   int i, j, errcode;
119   clock_t c1, c2;
120   regmatch_t pmatch[10];
121 
122 
123   printf("# len = %d\n", len);
124   fflush(stdout);
125   for (i = 0; i < samples; i++) {
126     c1 = clock();
127     for (j = 0; j < repeats; j++)
128       if ((errcode = regexec(reobj, str, 10, pmatch, 0))) {
129 	regerror(errcode, reobj, tmpbuf, 255);
130 	printf("error: %s\n", tmpbuf);
131       }
132     c2 = clock();
133 
134     sample_data[i] = (double)(c2-c1)/(CLOCKS_PER_SEC*repeats);
135 
136     printf("# sample: %.5f sec, clocks: %ld\n",
137 	   (double)(c2-c1)/(CLOCKS_PER_SEC*repeats),
138 	   (long)(c2-c1));
139     fflush(stdout);
140   }
141   fflush(stdout);
142 
143   for (i = 0; i < 10; i += 2) {
144     printf("# pmatch[%d].rm_so = %d\n", i/2, (int)pmatch[i/2].rm_so);
145     printf("# pmatch[%d].rm_eo = %d\n", i/2, (int)pmatch[i/2].rm_eo);
146   }
147 }
148 
149 
150 int
main(int argc,char ** argv)151 main(int argc, char **argv)
152 {
153   regex_t reobj;
154   char *str;
155   char tmpbuf[256];
156   int i, j;
157   int max_len = 1024*1024*10;
158   int steps = 20;
159   int repeats = 10;
160   int samples = 20;
161   int len;
162   clock_t c1, c2;
163   int opt;
164   double sample_data[30];
165 
166   int test_id = -1;
167 
168   while ((opt = getopt(argc, argv, "r:l:s:j:t:")) != -1) {
169     switch (opt) {
170     case 't':
171       test_id = atoi(optarg);
172       break;
173     case 'l':
174       max_len = atoi(optarg);
175       break;
176     case 'j':
177       steps = atoi(optarg);
178       break;
179     case 's':
180       samples = atoi(optarg);
181       break;
182     case 'r':
183       repeats = atoi(optarg);
184       break;
185     default:
186       printf("P�lli.\n");
187       return 1;
188     }
189   }
190 
191   /* XXX - Check that the correct results are returned.  For example, GNU
192            regex-0.12 returns incorrect results for very long strings in
193 	   test number 1. */
194 
195   switch (test_id) {
196   case 0:
197     printf("# pattern: \"a*\"\n");
198     printf("# string:  \"aaaaaa...\"\n");
199     len = 0;
200     regcomp(&reobj, "a*", REG_EXTENDED);
201     while (len <= max_len) {
202 
203       str = malloc(sizeof(char) * (len+1));
204       for (i = 0; i < len; i++)
205 	str[i] = 'a';
206       str[len-1] = '\0';
207 
208       run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
209       stats(sample_data, samples, len);
210       len = len + (max_len/steps);
211       free(str);
212     }
213     break;
214 
215 
216   case 1:
217     printf("# pattern: \"(a)*\"\n");
218     printf("# string:  \"aaaaaa...\"\n");
219     len = 0;
220     regcomp(&reobj, "(a)*", REG_EXTENDED);
221     while (len <= max_len) {
222 
223       str = malloc(sizeof(char) * (len+1));
224       for (i = 0; i < len; i++)
225 	str[i] = 'a';
226       str[len-1] = '\0';
227 
228       run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
229       stats(sample_data, samples, len);
230       len = len + (max_len/steps);
231       free(str);
232     }
233     break;
234 
235 
236   case 2:
237     printf("# pattern: \"(a*)\"\n");
238     printf("# string:  \"aaaaaa...\"\n");    len = 0;
239     regcomp(&reobj, "(a*)", REG_EXTENDED);
240     while (len <= max_len) {
241 
242       str = malloc(sizeof(char) * (len+1));
243       for (i = 0; i < len; i++)
244 	str[i] = 'a';
245       str[len-1] = '\0';
246 
247       run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
248       stats(sample_data, samples, len);
249       len = len + (max_len/steps);
250       free(str);
251     }
252     break;
253 
254   case 3:
255     printf("# pattern: \"(a*)*|b*\"\n");
256     printf("# string:  \"aaaaaa...b\"\n");
257     len = 0;
258     regcomp(&reobj, "(a*)*|b*", REG_EXTENDED);
259     while (len <= max_len) {
260       str = malloc(sizeof(char) * (len+1));
261       for (i = 0; i < len-1; i++)
262 	str[i] = 'a';
263       if (len > 0)
264 	str[len-1] = 'b';
265       str[len] = '\0';
266 
267       run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
268       stats(sample_data, samples, len);
269       len = len + (max_len/steps);
270       free(str);
271     }
272     break;
273 
274   case 4:
275     printf("# pattern: \"(a|a|a|...|a)\"\n");
276     printf("# string:  \"aaaaaa...\"\n");
277     len = 1024*1024;
278     str = malloc(sizeof(char) * (len+1));
279     for (i = 0; i < len-1; i++)
280       str[i] = 'a';
281     str[len] = '\0';
282     len = 0;
283     while (len <= max_len) {
284       tmpbuf[0] = '(';
285       for (i = 1; i < (len*2); i++) {
286 	tmpbuf[i] = 'a';
287 	if (i < len*2-2) {
288 	  i++;
289 	  tmpbuf[i] = '|';
290 	}
291       }
292       printf("# i = %d\n", i);
293       tmpbuf[i] = ')';
294       tmpbuf[i+1] = '*';
295       tmpbuf[i+2] = '\0';
296       printf("# pat = %s\n", tmpbuf);
297       regcomp(&reobj, tmpbuf, REG_EXTENDED);
298 
299       run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
300       stats(sample_data, samples, len);
301       len = len + (max_len/steps);
302       regfree(&reobj);
303     }
304     free(str);
305     break;
306 
307   case 5:
308     printf("# pattern: \"foobar\"\n");
309     printf("# string:  \"aaaaaa...foobar\"\n");
310     len = 0;
311     regcomp(&reobj, "foobar", REG_EXTENDED);
312     while (len <= max_len) {
313       str = malloc(sizeof(char) * (len+7));
314       for (i = 0; i < len; i++) {
315 	if (i*i % 3)
316 	  str[i] = 'a';
317 	else
318 	  str[i] = 'a';
319       }
320       str[len+0] = 'f';
321       str[len+1] = 'o';
322       str[len+2] = 'o';
323       str[len+3] = 'b';
324       str[len+4] = 'a';
325       str[len+5] = 'r';
326       str[len+6] = '\0';
327 
328       run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
329       stats(sample_data, samples, len);
330       len = len + (max_len/steps);
331       free(str);
332     }
333     break;
334 
335 
336   case 6:
337     printf("# pattern: \"a*foobar\"\n");
338     printf("# string:  \"aaaaaa...foobar\"\n");
339     len = 0;
340     regcomp(&reobj, "a*foobar", REG_EXTENDED);
341     while (len <= max_len) {
342       str = malloc(sizeof(char) * (len+7));
343       for (i = 0; i < len; i++) {
344 	str[i] = 'a';
345       }
346       str[len+0] = 'f';
347       str[len+1] = 'o';
348       str[len+2] = 'o';
349       str[len+3] = 'b';
350       str[len+4] = 'a';
351       str[len+5] = 'r';
352       str[len+6] = '\0';
353 
354       run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
355       stats(sample_data, samples, len);
356       len = len + (max_len/steps);
357       free(str);
358     }
359     break;
360 
361 
362   case 7:
363     printf("# pattern: \"(a)*foobar\"\n");
364     printf("# string:  \"aaaaabbaaab...foobar\"\n");
365     len = 0;
366     regcomp(&reobj, "(a)*foobar", REG_EXTENDED);
367     while (len <= max_len) {
368       str = malloc(sizeof(char) * (len+7));
369       for (i = 0; i < len; i++) {
370 	/* Without this GNU regex won't find a match! */
371 	if (i*(i-1) % 3)
372 	  str[i] = 'b';
373 	else
374 	  str[i] = 'a';
375       }
376       str[len+0] = 'f';
377       str[len+1] = 'o';
378       str[len+2] = 'o';
379       str[len+3] = 'b';
380       str[len+4] = 'a';
381       str[len+5] = 'r';
382       str[len+6] = '\0';
383 
384       run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
385       stats(sample_data, samples, len);
386       len = len + (max_len/steps);
387       free(str);
388     }
389     break;
390 
391 
392   case 8:
393     printf("# pattern: \"(a|b)*foobar\"\n");
394     printf("# string:  \"aaaaabbaaab...foobar\"\n");
395     len = 0;
396     regcomp(&reobj, "(a|b)*foobar", REG_EXTENDED);
397     while (len <= max_len) {
398       str = malloc(sizeof(char) * (len+7));
399       for (i = 0; i < len; i++) {
400 	if (i*(i-1) % 3)
401 	  str[i] = 'b';
402 	else
403 	  str[i] = 'a';
404 	/* Without this GNU regex won't find a match! */
405 	if (i % (1024*1024*10 - 100))
406 	  str[i] = 'f';
407       }
408       str[len+0] = 'f';
409       str[len+1] = 'o';
410       str[len+2] = 'o';
411       str[len+3] = 'b';
412       str[len+4] = 'a';
413       str[len+5] = 'r';
414       str[len+6] = '\0';
415 
416       run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
417       stats(sample_data, samples, len);
418       len = len + (max_len/steps);
419       free(str);
420     }
421     break;
422 
423 
424   case 9:
425     printf("# pattern: hand-coded a*\n");
426     printf("# string:  \"aaaaaa...\"\n");
427     len = 0;
428     while (len <= max_len) {
429       printf("# len = %d\n", len);
430 
431       str = malloc(sizeof(char)*(len+1));
432       for (i = 0; i < len; i++)
433 	str[i] = 'a';
434       str[len-1] = '\0';
435 
436       for (i = 0; i < samples; i++) {
437 	c1 = clock();
438 	for (j = 0; j < repeats; j++) {
439 	  char *s;
440 	  int l;
441 
442 	  s = str;
443 	  l = 0;
444 
445 
446 	  while (s != '\0') {
447 	    if (*s == 'a') {
448 	      s++;
449 	      l++;
450 	    } else
451 	      break;
452 	  }
453 	}
454       	c2 = clock();
455 	sample_data[i] = (double)(c2-c1)/(CLOCKS_PER_SEC*repeats);
456 
457 	printf("# sample: %.5f sec, clocks: %ld\n",
458 	       (double)(c2-c1)/(CLOCKS_PER_SEC*repeats),
459 	       (long)(c2-c1));
460 	fflush(stdout);
461       }
462       fflush(stdout);
463 
464       stats(sample_data, samples, len);
465       len = len + (max_len/steps);
466       free(str);
467     }
468     break;
469 
470 
471   default:
472     printf("Pelle.\n");
473     return 1;
474   }
475 
476   regfree(&reobj);
477 
478   return 0;
479 }
480