1 /*
2 bench.c - simple regex benchmark program
3
4 Copyright (c) 2001-2006 Ville Laurikari <vl@iki.fi>.
5
6 This software is free; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as
8 published by the Free Software Foundation; either version 2.1 of the
9 License, or (at your option) any later version.
10
11 This software is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with this software; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
20 */
21
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif /* HAVE_CONFIG_H */
25
26 #include <stdio.h>
27 #include <stdlib.h>
28 #ifdef HAVE_GETOPT_H
29 #include <getopt.h>
30 #endif /* HAVE_GETOPT_H */
31 #include <time.h>
32 #include <unistd.h>
33 #include <math.h>
34 #include <sys/types.h>
35
36 #if 0
37 #include <hackerlab/rx-posix/regex.h>
38 #else
39 #include <regex.h>
40 #endif
41
42 /* T distribution for alpha = 0.025 (for 95% confidence). XXX - is
43 this correct? */
44 double t_distribution[] = {
45 12.71,
46 4.303,
47 3.182,
48 2.776,
49 2.571,
50 2.447,
51 2.365,
52 2.306,
53 2.262,
54 2.228,
55 2.201,
56 2.179,
57 2.160,
58 2.145,
59 2.131,
60 2.120,
61 2.110,
62 2.101,
63 2.093,
64 2.086,
65 2.080,
66 2.074,
67 2.069,
68 2.064,
69 2.060,
70 2.056,
71 2.052,
72 2.048,
73 2.045,
74 2.042
75 };
76
77 void
stats(double * sample_data,int samples,int len)78 stats(double *sample_data, int samples, int len)
79 {
80 double mean, tmp1, tmp2, variance, stddev, error, percent;
81 int i;
82
83 mean = 0;
84 for (i = 0; i < samples; i++)
85 mean += sample_data[i];
86 mean = mean/i;
87 printf("# mean: %.5f\n", mean);
88
89 tmp1 = 0;
90 for (i = 0; i < samples; i++) {
91 tmp2 = sample_data[i] - mean;
92 tmp1 += tmp2*tmp2;
93 }
94 if (samples > 1)
95 variance = tmp1 / (samples-1);
96 else
97 variance = 0;
98 stddev = sqrt(variance);
99 printf("# variance: %.16f\n", variance);
100 printf("# standard deviation: %.16f\n", stddev);
101
102 error = t_distribution[samples-1] * stddev / sqrt(samples);
103 if (mean != 0)
104 percent = 100*error/mean;
105 else
106 percent = 0;
107 printf("# error: �%.16f (�%.4f%%)\n", error, percent);
108
109 printf("%d\t%.5f\t%.5f\n", len, mean, error);
110
111 fflush(stdout);
112 }
113
114 void
run_tests(int len,int samples,double * sample_data,int repeats,regex_t * reobj,char * str,char * tmpbuf)115 run_tests(int len, int samples, double *sample_data, int repeats,
116 regex_t *reobj, char *str, char *tmpbuf)
117 {
118 int i, j, errcode;
119 clock_t c1, c2;
120 regmatch_t pmatch[10];
121
122
123 printf("# len = %d\n", len);
124 fflush(stdout);
125 for (i = 0; i < samples; i++) {
126 c1 = clock();
127 for (j = 0; j < repeats; j++)
128 if ((errcode = regexec(reobj, str, 10, pmatch, 0))) {
129 regerror(errcode, reobj, tmpbuf, 255);
130 printf("error: %s\n", tmpbuf);
131 }
132 c2 = clock();
133
134 sample_data[i] = (double)(c2-c1)/(CLOCKS_PER_SEC*repeats);
135
136 printf("# sample: %.5f sec, clocks: %ld\n",
137 (double)(c2-c1)/(CLOCKS_PER_SEC*repeats),
138 (long)(c2-c1));
139 fflush(stdout);
140 }
141 fflush(stdout);
142
143 for (i = 0; i < 10; i += 2) {
144 printf("# pmatch[%d].rm_so = %d\n", i/2, (int)pmatch[i/2].rm_so);
145 printf("# pmatch[%d].rm_eo = %d\n", i/2, (int)pmatch[i/2].rm_eo);
146 }
147 }
148
149
150 int
main(int argc,char ** argv)151 main(int argc, char **argv)
152 {
153 regex_t reobj;
154 char *str;
155 char tmpbuf[256];
156 int i, j;
157 int max_len = 1024*1024*10;
158 int steps = 20;
159 int repeats = 10;
160 int samples = 20;
161 int len;
162 clock_t c1, c2;
163 int opt;
164 double sample_data[30];
165
166 int test_id = -1;
167
168 while ((opt = getopt(argc, argv, "r:l:s:j:t:")) != -1) {
169 switch (opt) {
170 case 't':
171 test_id = atoi(optarg);
172 break;
173 case 'l':
174 max_len = atoi(optarg);
175 break;
176 case 'j':
177 steps = atoi(optarg);
178 break;
179 case 's':
180 samples = atoi(optarg);
181 break;
182 case 'r':
183 repeats = atoi(optarg);
184 break;
185 default:
186 printf("P�lli.\n");
187 return 1;
188 }
189 }
190
191 /* XXX - Check that the correct results are returned. For example, GNU
192 regex-0.12 returns incorrect results for very long strings in
193 test number 1. */
194
195 switch (test_id) {
196 case 0:
197 printf("# pattern: \"a*\"\n");
198 printf("# string: \"aaaaaa...\"\n");
199 len = 0;
200 regcomp(&reobj, "a*", REG_EXTENDED);
201 while (len <= max_len) {
202
203 str = malloc(sizeof(char) * (len+1));
204 for (i = 0; i < len; i++)
205 str[i] = 'a';
206 str[len-1] = '\0';
207
208 run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
209 stats(sample_data, samples, len);
210 len = len + (max_len/steps);
211 free(str);
212 }
213 break;
214
215
216 case 1:
217 printf("# pattern: \"(a)*\"\n");
218 printf("# string: \"aaaaaa...\"\n");
219 len = 0;
220 regcomp(&reobj, "(a)*", REG_EXTENDED);
221 while (len <= max_len) {
222
223 str = malloc(sizeof(char) * (len+1));
224 for (i = 0; i < len; i++)
225 str[i] = 'a';
226 str[len-1] = '\0';
227
228 run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
229 stats(sample_data, samples, len);
230 len = len + (max_len/steps);
231 free(str);
232 }
233 break;
234
235
236 case 2:
237 printf("# pattern: \"(a*)\"\n");
238 printf("# string: \"aaaaaa...\"\n"); len = 0;
239 regcomp(&reobj, "(a*)", REG_EXTENDED);
240 while (len <= max_len) {
241
242 str = malloc(sizeof(char) * (len+1));
243 for (i = 0; i < len; i++)
244 str[i] = 'a';
245 str[len-1] = '\0';
246
247 run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
248 stats(sample_data, samples, len);
249 len = len + (max_len/steps);
250 free(str);
251 }
252 break;
253
254 case 3:
255 printf("# pattern: \"(a*)*|b*\"\n");
256 printf("# string: \"aaaaaa...b\"\n");
257 len = 0;
258 regcomp(&reobj, "(a*)*|b*", REG_EXTENDED);
259 while (len <= max_len) {
260 str = malloc(sizeof(char) * (len+1));
261 for (i = 0; i < len-1; i++)
262 str[i] = 'a';
263 if (len > 0)
264 str[len-1] = 'b';
265 str[len] = '\0';
266
267 run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
268 stats(sample_data, samples, len);
269 len = len + (max_len/steps);
270 free(str);
271 }
272 break;
273
274 case 4:
275 printf("# pattern: \"(a|a|a|...|a)\"\n");
276 printf("# string: \"aaaaaa...\"\n");
277 len = 1024*1024;
278 str = malloc(sizeof(char) * (len+1));
279 for (i = 0; i < len-1; i++)
280 str[i] = 'a';
281 str[len] = '\0';
282 len = 0;
283 while (len <= max_len) {
284 tmpbuf[0] = '(';
285 for (i = 1; i < (len*2); i++) {
286 tmpbuf[i] = 'a';
287 if (i < len*2-2) {
288 i++;
289 tmpbuf[i] = '|';
290 }
291 }
292 printf("# i = %d\n", i);
293 tmpbuf[i] = ')';
294 tmpbuf[i+1] = '*';
295 tmpbuf[i+2] = '\0';
296 printf("# pat = %s\n", tmpbuf);
297 regcomp(&reobj, tmpbuf, REG_EXTENDED);
298
299 run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
300 stats(sample_data, samples, len);
301 len = len + (max_len/steps);
302 regfree(&reobj);
303 }
304 free(str);
305 break;
306
307 case 5:
308 printf("# pattern: \"foobar\"\n");
309 printf("# string: \"aaaaaa...foobar\"\n");
310 len = 0;
311 regcomp(&reobj, "foobar", REG_EXTENDED);
312 while (len <= max_len) {
313 str = malloc(sizeof(char) * (len+7));
314 for (i = 0; i < len; i++) {
315 if (i*i % 3)
316 str[i] = 'a';
317 else
318 str[i] = 'a';
319 }
320 str[len+0] = 'f';
321 str[len+1] = 'o';
322 str[len+2] = 'o';
323 str[len+3] = 'b';
324 str[len+4] = 'a';
325 str[len+5] = 'r';
326 str[len+6] = '\0';
327
328 run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
329 stats(sample_data, samples, len);
330 len = len + (max_len/steps);
331 free(str);
332 }
333 break;
334
335
336 case 6:
337 printf("# pattern: \"a*foobar\"\n");
338 printf("# string: \"aaaaaa...foobar\"\n");
339 len = 0;
340 regcomp(&reobj, "a*foobar", REG_EXTENDED);
341 while (len <= max_len) {
342 str = malloc(sizeof(char) * (len+7));
343 for (i = 0; i < len; i++) {
344 str[i] = 'a';
345 }
346 str[len+0] = 'f';
347 str[len+1] = 'o';
348 str[len+2] = 'o';
349 str[len+3] = 'b';
350 str[len+4] = 'a';
351 str[len+5] = 'r';
352 str[len+6] = '\0';
353
354 run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
355 stats(sample_data, samples, len);
356 len = len + (max_len/steps);
357 free(str);
358 }
359 break;
360
361
362 case 7:
363 printf("# pattern: \"(a)*foobar\"\n");
364 printf("# string: \"aaaaabbaaab...foobar\"\n");
365 len = 0;
366 regcomp(&reobj, "(a)*foobar", REG_EXTENDED);
367 while (len <= max_len) {
368 str = malloc(sizeof(char) * (len+7));
369 for (i = 0; i < len; i++) {
370 /* Without this GNU regex won't find a match! */
371 if (i*(i-1) % 3)
372 str[i] = 'b';
373 else
374 str[i] = 'a';
375 }
376 str[len+0] = 'f';
377 str[len+1] = 'o';
378 str[len+2] = 'o';
379 str[len+3] = 'b';
380 str[len+4] = 'a';
381 str[len+5] = 'r';
382 str[len+6] = '\0';
383
384 run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
385 stats(sample_data, samples, len);
386 len = len + (max_len/steps);
387 free(str);
388 }
389 break;
390
391
392 case 8:
393 printf("# pattern: \"(a|b)*foobar\"\n");
394 printf("# string: \"aaaaabbaaab...foobar\"\n");
395 len = 0;
396 regcomp(&reobj, "(a|b)*foobar", REG_EXTENDED);
397 while (len <= max_len) {
398 str = malloc(sizeof(char) * (len+7));
399 for (i = 0; i < len; i++) {
400 if (i*(i-1) % 3)
401 str[i] = 'b';
402 else
403 str[i] = 'a';
404 /* Without this GNU regex won't find a match! */
405 if (i % (1024*1024*10 - 100))
406 str[i] = 'f';
407 }
408 str[len+0] = 'f';
409 str[len+1] = 'o';
410 str[len+2] = 'o';
411 str[len+3] = 'b';
412 str[len+4] = 'a';
413 str[len+5] = 'r';
414 str[len+6] = '\0';
415
416 run_tests(len, samples, sample_data, repeats, &reobj, str, tmpbuf);
417 stats(sample_data, samples, len);
418 len = len + (max_len/steps);
419 free(str);
420 }
421 break;
422
423
424 case 9:
425 printf("# pattern: hand-coded a*\n");
426 printf("# string: \"aaaaaa...\"\n");
427 len = 0;
428 while (len <= max_len) {
429 printf("# len = %d\n", len);
430
431 str = malloc(sizeof(char)*(len+1));
432 for (i = 0; i < len; i++)
433 str[i] = 'a';
434 str[len-1] = '\0';
435
436 for (i = 0; i < samples; i++) {
437 c1 = clock();
438 for (j = 0; j < repeats; j++) {
439 char *s;
440 int l;
441
442 s = str;
443 l = 0;
444
445
446 while (s != '\0') {
447 if (*s == 'a') {
448 s++;
449 l++;
450 } else
451 break;
452 }
453 }
454 c2 = clock();
455 sample_data[i] = (double)(c2-c1)/(CLOCKS_PER_SEC*repeats);
456
457 printf("# sample: %.5f sec, clocks: %ld\n",
458 (double)(c2-c1)/(CLOCKS_PER_SEC*repeats),
459 (long)(c2-c1));
460 fflush(stdout);
461 }
462 fflush(stdout);
463
464 stats(sample_data, samples, len);
465 len = len + (max_len/steps);
466 free(str);
467 }
468 break;
469
470
471 default:
472 printf("Pelle.\n");
473 return 1;
474 }
475
476 regfree(&reobj);
477
478 return 0;
479 }
480