1 /*
2 * bench_packedmatrix.c
3 *
4 * Application to test functionality of packedmatrix.c.
5 *
6 * Copyright (C) 2011 Carlo Wood <carlo@alinoe.com>
7 * RSA-1024 0x624ACAD5 1997-01-26 Sign & Encrypt
8 * Fingerprint16 = 32 EC A7 B6 AC DB 65 A6 F6 F6 55 DD 1C DC FF 61
9 *
10 * This program is free software: you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation, either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program. If not, see <http://www.gnu.org/licenses/>.
22 */
23
24 #ifndef __STDC_FORMAT_MACROS
25 #define __STDC_FORMAT_MACROS 1
26 #endif
27
28 #include <m4ri/config.h>
29
30 #ifdef HAVE_LIBPAPI
31 #define _GNU_SOURCE
32 #include <sys/types.h> // papi.h needs caddr_t
33 #include <papi.h>
34 #include <errno.h>
35 #endif
36
37 #include <stdlib.h>
38 #include <ctype.h>
39 #include <inttypes.h>
40
41 #include "cpucycles.h"
42 #include <m4ri/m4ri.h>
43 #include "benchmarking.h"
44
45 struct test_params {
46 rci_t m;
47 rci_t n;
48 rci_t k;
49 rci_t l;
50 rci_t row[3];
51 int rows;
52 rci_t col[3];
53 int cols;
54 wi_t wrd[3];
55 int wrds;
56 uint64_t count;
57 int cutoff;
58 int boolean;
59 int integer;
60 char const* funcname;
61 };
62
63 typedef int (*run_type)(void*, unsigned long long*, int*);
64
65 static unsigned long long loop_calibration[32];
66
67 #ifdef HAVE_LIBPAPI
68
69 #define BENCHMARK_PREFIX(mzd_func) \
70 int run_##mzd_func(void *_p, unsigned long long *data, int *data_len) { \
71 *data_len = MIN(papi_array_len + 1, *data_len); \
72 struct test_params *p = (struct test_params *)_p; \
73 int papi_res; \
74 do
75 #define TIME_BEGIN(mzd_func_with_ARGS) \
76 do { \
77 int array_len = *data_len - 1; \
78 mzd_func_with_ARGS; \
79 unsigned long long t0 = PAPI_get_virt_usec(); \
80 papi_res = PAPI_start_counters((int*)papi_events, array_len)
81 #define TIME_END \
82 PAPI_stop_counters((long long*)&data[1], array_len); \
83 t0 = PAPI_get_virt_usec() - t0; \
84 data[0] = t0; \
85 for (int nv = 0; nv <= array_len; ++nv) \
86 { \
87 if (data[nv] < loop_calibration[nv]) \
88 loop_calibration[nv] = data[nv]; \
89 data[nv] -= loop_calibration[nv]; \
90 } \
91 } while(0)
92 #define BENCHMARK_POSTFIX \
93 while(0); \
94 return papi_res; \
95 }
96
97 #else // HAVE_LIBPAPI
98
99 #define BENCHMARK_PREFIX(mzd_func) \
100 int run_##mzd_func(void *_p, unsigned long long *data, int *data_len) { \
101 *data_len = 2; \
102 struct test_params *p = (struct test_params *)_p; \
103 do
104 #define TIME_BEGIN(mzd_func_with_ARGS) \
105 do { \
106 mzd_func_with_ARGS; \
107 data[0] = walltime(0); \
108 data[1] = cpucycles()
109 #define TIME_END \
110 data[1] = cpucycles() - data[1]; \
111 data[0] = walltime(data[0]); \
112 } while(0)
113 #define BENCHMARK_POSTFIX \
114 while(0); \
115 return 0; \
116 }
117
118 #endif // HAVE_LIBPAPI
119
120 #define TIME(mzd_func_with_ARGS) \
121 TIME_BEGIN(mzd_func_with_ARGS); \
122 for (uint64_t i = 0; i < loop_count; ++i) { mzd_func_with_ARGS; } \
123 TIME_END
124
125 mzd_t* volatile vA;
126 rci_t volatile vrowa;
127 rci_t volatile vcola;
128 rci_t volatile vrowb;
129 rci_t volatile vcolb;
130 wi_t volatile vstartblock;
131 int volatile vn;
132 int volatile vint;
133 word volatile vword;
134 BIT volatile vbit;
135
BENCHMARK_PREFIX(bench_nothing)136 BENCHMARK_PREFIX(bench_nothing)
137 {
138 mzd_t* const A = mzd_init(64, 64);
139 mzd_randomize(A);
140 uint64_t volatile loop_count = p->count;
141
142 TIME();
143
144 mzd_free(A);
145 }
146 BENCHMARK_POSTFIX
147
BENCHMARK_PREFIX(_mzd_row_swap)148 BENCHMARK_PREFIX(_mzd_row_swap)
149 {
150 mzd_t* const A = mzd_init(p->m, p->n);
151 mzd_randomize(A);
152
153 vA = A;
154 vrowa = p->row[0];
155 vrowb = p->row[1];
156 vstartblock = p->wrd[0];
157
158 uint64_t const loop_count = p->count;
159
160 TIME(_mzd_row_swap(vA, vrowa, vrowb, vstartblock));
161
162 mzd_free(A);
163 }
164 BENCHMARK_POSTFIX
165
BENCHMARK_PREFIX(mzd_row_swap)166 BENCHMARK_PREFIX(mzd_row_swap)
167 {
168 mzd_t* const A = mzd_init(p->m, p->n);
169 mzd_randomize(A);
170 rci_t const rowa = p->row[0];
171 rci_t const rowb = p->row[1];
172 uint64_t const loop_count = p->count;
173
174 TIME(mzd_row_swap(A, rowa, rowb));
175
176 mzd_free(A);
177 }
178 BENCHMARK_POSTFIX
179
BENCHMARK_PREFIX(mzd_copy_row)180 BENCHMARK_PREFIX(mzd_copy_row)
181 {
182 mzd_t* const A = mzd_init(p->m, p->n);
183 mzd_t* const B = mzd_init(p->m, p->n);
184 mzd_randomize(A);
185 rci_t const rowa = p->row[0];
186 rci_t const rowb = p->row[1];
187 uint64_t const loop_count = p->count;
188
189 TIME(mzd_copy_row(B, rowb, A, rowa));
190
191 mzd_free(A);
192 mzd_free(B);
193 }
194 BENCHMARK_POSTFIX
195
BENCHMARK_PREFIX(mzd_col_swap)196 BENCHMARK_PREFIX(mzd_col_swap)
197 {
198 mzd_t* const A = mzd_init(p->m, p->n);
199 mzd_randomize(A);
200 rci_t const cola = p->col[0];
201 rci_t const colb = p->col[1];
202 uint64_t const loop_count = p->count;
203
204 TIME(mzd_col_swap(A, cola, colb));
205
206 mzd_free(A);
207 }
208 BENCHMARK_POSTFIX
209
BENCHMARK_PREFIX(mzd_col_swap_in_rows)210 BENCHMARK_PREFIX(mzd_col_swap_in_rows)
211 {
212 mzd_t* const A = mzd_init(p->m, p->n);
213 mzd_randomize(A);
214
215 vA = A;
216 vcola = p->col[0];
217 vcolb = p->col[1];
218 vrowa = p->row[0];
219 vrowb = p->row[1];
220
221 uint64_t const loop_count = p->count;
222
223 TIME(mzd_col_swap_in_rows(vA, vcola, vcolb, vrowa, vrowb));
224
225 mzd_free(A);
226 }
227 BENCHMARK_POSTFIX
228
BENCHMARK_PREFIX(mzd_read_bit)229 BENCHMARK_PREFIX(mzd_read_bit)
230 {
231 mzd_t* const A = mzd_init(p->m, p->n);
232 mzd_randomize(A);
233
234 vA = A;
235 vrowa = p->row[0];
236 vcola = p->col[0];
237
238 uint64_t const loop_count = p->count;
239
240 TIME(vbit = mzd_read_bit(vA, vrowa, vcola));
241
242 mzd_free(A);
243 }
244 BENCHMARK_POSTFIX
245
BENCHMARK_PREFIX(mzd_write_bit)246 BENCHMARK_PREFIX(mzd_write_bit)
247 {
248 mzd_t* const A = mzd_init(p->m, p->n);
249
250 vA = A;
251 vrowa = p->row[0];
252 vcola = p->col[0];
253 vbit = 0;
254
255 uint64_t const loop_count = p->count;
256
257 TIME(mzd_write_bit(vA, vrowa, vcola, vbit); vbit = !vbit);
258
259 mzd_free(A);
260 }
261 BENCHMARK_POSTFIX
262
BENCHMARK_PREFIX(mzd_row_add_offset)263 BENCHMARK_PREFIX(mzd_row_add_offset)
264 {
265 mzd_t* const A = mzd_init(p->m, p->n);
266 mzd_randomize(A);
267
268 vA = A;
269 vrowa = p->row[0];
270 vrowb = p->row[1];
271 vcola = p->col[0];
272
273 uint64_t const loop_count = p->count;
274
275 TIME(mzd_row_add_offset(vA, vrowa, vrowb, vcola));
276
277 mzd_free(A);
278 }
279 BENCHMARK_POSTFIX
280
BENCHMARK_PREFIX(mzd_row_add)281 BENCHMARK_PREFIX(mzd_row_add)
282 {
283 mzd_t* const A = mzd_init(p->m, p->n);
284 mzd_randomize(A);
285 rci_t const rowa = p->row[0];
286 rci_t const rowb = p->row[1];
287 uint64_t const loop_count = p->count;
288
289 TIME(mzd_row_add(A, rowa, rowb));
290
291 mzd_free(A);
292 }
293 BENCHMARK_POSTFIX
294
BENCHMARK_PREFIX(mzd_transpose)295 BENCHMARK_PREFIX(mzd_transpose)
296 {
297 mzd_t* const A = mzd_init(p->m, p->n);
298 mzd_t* const B = mzd_init(p->n, p->m);
299 mzd_randomize(A);
300 uint64_t const loop_count = p->count;
301
302 TIME(mzd_transpose(B, A));
303
304 mzd_free(A);
305 mzd_free(B);
306 }
307 BENCHMARK_POSTFIX
308
BENCHMARK_PREFIX(mzd_mul_naive)309 BENCHMARK_PREFIX(mzd_mul_naive)
310 {
311 mzd_t* const A = mzd_init(p->m, p->l);
312 mzd_t* const B = mzd_init(p->l, p->n);
313 mzd_t* const C = mzd_init(p->m, p->n);
314 mzd_randomize(A);
315 mzd_randomize(B);
316 uint64_t const loop_count = p->count;
317
318 TIME(mzd_mul_naive(C, A, B));
319
320 mzd_free(A);
321 mzd_free(B);
322 mzd_free(C);
323 }
324 BENCHMARK_POSTFIX
325
BENCHMARK_PREFIX(mzd_addmul_naive)326 BENCHMARK_PREFIX(mzd_addmul_naive)
327 {
328 mzd_t* const A = mzd_init(p->m, p->l);
329 mzd_t* const B = mzd_init(p->l, p->n);
330 mzd_t* const C = mzd_init(p->m, p->n);
331 mzd_randomize(A);
332 mzd_randomize(B);
333 uint64_t const loop_count = p->count;
334
335 TIME(mzd_addmul_naive(C, A, B));
336
337 mzd_free(A);
338 mzd_free(B);
339 mzd_free(C);
340 }
341 BENCHMARK_POSTFIX
342
BENCHMARK_PREFIX(_mzd_mul_naive)343 BENCHMARK_PREFIX(_mzd_mul_naive)
344 {
345 mzd_t* const A = mzd_init(p->m, p->l);
346 mzd_t* const B = mzd_init(p->n, p->l);
347 mzd_t* const C = mzd_init(p->m, p->n);
348 mzd_randomize(A);
349 mzd_randomize(B);
350 int const clear = p->boolean;
351 uint64_t const loop_count = p->count;
352
353 TIME(_mzd_mul_naive(C, A, B, clear));
354
355 mzd_free(A);
356 mzd_free(B);
357 mzd_free(C);
358 }
359 BENCHMARK_POSTFIX
360
BENCHMARK_PREFIX(_mzd_mul_va)361 BENCHMARK_PREFIX(_mzd_mul_va)
362 {
363 mzd_t* const A = mzd_init(p->m, p->n);
364 mzd_t* const V = mzd_init(1, p->m);
365 mzd_t* const C = mzd_init(1, p->n);
366 mzd_randomize(A);
367 mzd_randomize(V);
368 int const clear = p->boolean;
369 uint64_t const loop_count = p->count;
370
371 TIME(_mzd_mul_va(C, V, A, clear));
372
373 mzd_free(A);
374 mzd_free(V);
375 mzd_free(C);
376 }
377 BENCHMARK_POSTFIX
378
BENCHMARK_PREFIX(mzd_gauss_delayed)379 BENCHMARK_PREFIX(mzd_gauss_delayed)
380 {
381 mzd_t** A = malloc(sizeof(mzd_t) * (p->count + 1));
382 rci_t const cola = p->col[0];
383 int const full = p->boolean;
384 uint64_t const loop_count = p->count;
385 rci_t result;
386
387 for (int i = loop_count; i >= 0; --i) {
388 A[i] = mzd_init(p->m, p->n);
389 mzd_randomize(A[i]);
390 }
391
392 TIME_BEGIN(result = mzd_gauss_delayed(A[0], cola, full));
393 for (int i = loop_count; i > 0; --i) {
394 result = mzd_gauss_delayed(A[i], cola, full);
395 }
396 TIME_END;
397
398 for (int i = 0; i <= loop_count; ++i)
399 mzd_free(A[i]);
400 free(A);
401 }
402 BENCHMARK_POSTFIX
403
BENCHMARK_PREFIX(mzd_echelonize_naive)404 BENCHMARK_PREFIX(mzd_echelonize_naive)
405 {
406 mzd_t** A = malloc(sizeof(mzd_t) * (p->count + 1));
407 int const full = p->boolean;
408 uint64_t const loop_count = p->count;
409 rci_t result;
410
411 for (int i = loop_count; i >= 0; --i) {
412 A[i] = mzd_init(p->m, p->n);
413 mzd_randomize(A[i]);
414 }
415
416 TIME_BEGIN(result = mzd_echelonize_naive(A[0], full));
417 for (int i = loop_count; i > 0; --i) {
418 result = mzd_echelonize_naive(A[i], full);
419 }
420 TIME_END;
421
422 for (int i = 0; i <= loop_count; ++i)
423 mzd_free(A[i]);
424 free(A);
425 }
426 BENCHMARK_POSTFIX
427
BENCHMARK_PREFIX(mzd_equal)428 BENCHMARK_PREFIX(mzd_equal)
429 {
430 mzd_t* const A = mzd_init(p->m, p->n);
431 mzd_randomize(A);
432 mzd_t* const B = mzd_copy(NULL, A);
433 uint64_t const loop_count = p->count;
434 int volatile result;
435
436 TIME(result = mzd_equal(A, B));
437
438 mzd_free(A);
439 mzd_free(B);
440 }
441 BENCHMARK_POSTFIX
442
BENCHMARK_PREFIX(mzd_cmp)443 BENCHMARK_PREFIX(mzd_cmp)
444 {
445 mzd_t* const A = mzd_init(p->m, p->n);
446 mzd_randomize(A);
447 mzd_t* const B = mzd_copy(NULL, A);
448 uint64_t const loop_count = p->count;
449 int volatile result;
450
451 TIME(result = mzd_cmp(A, B));
452
453 mzd_free(A);
454 mzd_free(B);
455 }
456 BENCHMARK_POSTFIX
457
BENCHMARK_PREFIX(mzd_copy)458 BENCHMARK_PREFIX(mzd_copy)
459 {
460 mzd_t* const A = mzd_init(p->m, p->n);
461 mzd_t* const B = mzd_init(p->m, p->n);
462 mzd_randomize(A);
463 uint64_t const loop_count = p->count;
464
465 TIME(mzd_copy(B, A));
466
467 mzd_free(A);
468 mzd_free(B);
469 }
470 BENCHMARK_POSTFIX
471
BENCHMARK_PREFIX(mzd_concat)472 BENCHMARK_PREFIX(mzd_concat)
473 {
474 mzd_t* const A = mzd_init(p->m, p->k);
475 mzd_t* const B = mzd_init(p->m, p->l);
476 mzd_t* const C = mzd_init(p->m, p->k + p->l);
477 mzd_randomize(A);
478 mzd_randomize(B);
479 uint64_t const loop_count = p->count;
480
481 TIME(mzd_concat(C, A, B));
482
483 mzd_free(A);
484 mzd_free(B);
485 mzd_free(C);
486 }
487 BENCHMARK_POSTFIX
488
BENCHMARK_PREFIX(mzd_stack)489 BENCHMARK_PREFIX(mzd_stack)
490 {
491 mzd_t* const A = mzd_init(p->k, p->n);
492 mzd_t* const B = mzd_init(p->l, p->n);
493 mzd_t* const C = mzd_init(p->k + p->l, p->n);
494 mzd_randomize(A);
495 mzd_randomize(B);
496 uint64_t const loop_count = p->count;
497
498 TIME(mzd_stack(C, A, B));
499
500 mzd_free(A);
501 mzd_free(B);
502 mzd_free(C);
503 }
504 BENCHMARK_POSTFIX
505
BENCHMARK_PREFIX(mzd_submatrix)506 BENCHMARK_PREFIX(mzd_submatrix)
507 {
508 mzd_t* const A = mzd_init(p->m, p->n);
509 mzd_randomize(A);
510 rci_t const rowa = p->row[0];
511 rci_t const cola = p->col[0];
512 rci_t const rowb = p->row[1];
513 rci_t const colb = p->col[1];
514 mzd_t* const S = mzd_init(rowb - rowa, colb - cola);
515 uint64_t const loop_count = p->count;
516
517 TIME(mzd_submatrix(S, A, rowa, cola, rowb, colb));
518
519 mzd_free(A);
520 mzd_free(S);
521 }
522 BENCHMARK_POSTFIX
523
BENCHMARK_PREFIX(mzd_invert_naive)524 BENCHMARK_PREFIX(mzd_invert_naive)
525 {
526 mzd_t* const A = mzd_init(p->m, p->m);
527 mzd_t* const I = mzd_init(p->m, p->m);
528 mzd_t* const C = mzd_init(p->m, p->m);
529 mzd_randomize(A);
530 mzd_set_ui(I, 1);
531 uint64_t const loop_count = p->count;
532
533 TIME(mzd_invert_naive(C, A, I));
534
535 mzd_free(A);
536 mzd_free(I);
537 mzd_free(C);
538 }
539 BENCHMARK_POSTFIX
540
BENCHMARK_PREFIX(mzd_add)541 BENCHMARK_PREFIX(mzd_add)
542 {
543 mzd_t* const A = mzd_init(p->m, p->n);
544 mzd_t* const B = mzd_init(p->m, p->n);
545 mzd_t* const C = mzd_init(p->m, p->n);
546 mzd_randomize(A);
547 mzd_randomize(B);
548 uint64_t const loop_count = p->count;
549
550 TIME(mzd_add(C, A, B));
551
552 mzd_free(A);
553 mzd_free(B);
554 mzd_free(C);
555 }
556 BENCHMARK_POSTFIX
557
BENCHMARK_PREFIX(_mzd_add)558 BENCHMARK_PREFIX(_mzd_add)
559 {
560 mzd_t* const A = mzd_init(p->m, p->n);
561 mzd_t* const B = mzd_init(p->m, p->n);
562 mzd_t* const C = mzd_init(p->m, p->n);
563 mzd_randomize(A);
564 mzd_randomize(B);
565 uint64_t const loop_count = p->count;
566
567 TIME(_mzd_add(C, A, B));
568
569 mzd_free(A);
570 mzd_free(B);
571 mzd_free(C);
572 }
573 BENCHMARK_POSTFIX
574
BENCHMARK_PREFIX(mzd_combine)575 BENCHMARK_PREFIX(mzd_combine)
576 {
577 mzd_t* const A = mzd_init(p->m, p->n);
578 mzd_t* const B = mzd_init(p->m, p->n);
579 mzd_t* const C = mzd_init(p->m, p->n);
580 mzd_randomize(A);
581 mzd_randomize(B);
582 rci_t row1 = p->row[0];
583 rci_t row2 = p->row[1];
584 rci_t row3 = p->row[2];
585 wi_t startblock = p->wrd[0];
586 uint64_t const loop_count = p->count;
587
588 TIME(mzd_combine(C, row3, startblock, A, row1, startblock, B, row2, startblock));
589
590 mzd_free(A);
591 mzd_free(B);
592 mzd_free(C);
593 }
594 BENCHMARK_POSTFIX
595
BENCHMARK_PREFIX(mzd_read_bits)596 BENCHMARK_PREFIX(mzd_read_bits)
597 {
598 mzd_t* const A = mzd_init(p->m, p->n);
599 mzd_randomize(A);
600
601 vA = A;
602 vrowa = p->row[0];
603 vcola = p->col[0];
604 vn = p->integer;
605
606 uint64_t const loop_count = p->count;
607
608 TIME(vword = mzd_read_bits(vA, vrowa, vcola, vn));
609
610 mzd_free(A);
611 }
612 BENCHMARK_POSTFIX
613
BENCHMARK_PREFIX(mzd_read_bits_int)614 BENCHMARK_PREFIX(mzd_read_bits_int)
615 {
616 mzd_t* const A = mzd_init(p->m, p->n);
617 mzd_randomize(A);
618
619 vA = A;
620 vrowa = p->row[0];
621 vcola = p->col[0];
622 vn = p->integer;
623
624 uint64_t const loop_count = p->count;
625
626 TIME(vint = mzd_read_bits_int(vA, vrowa, vcola, vn));
627
628 mzd_free(A);
629 }
630 BENCHMARK_POSTFIX
631
BENCHMARK_PREFIX(mzd_xor_bits)632 BENCHMARK_PREFIX(mzd_xor_bits)
633 {
634 mzd_t* const A = mzd_init(p->m, p->n);
635 mzd_randomize(A);
636
637 vA = A;
638 vrowa = p->row[0];
639 vcola = p->col[0];
640 vn = p->integer;
641 vword = 0xffffffffffffffffULL;
642
643 uint64_t const loop_count = p->count;
644
645 TIME(mzd_xor_bits(vA, vrowa, vcola, vn, vword));
646
647 mzd_free(A);
648 }
649 BENCHMARK_POSTFIX
650
BENCHMARK_PREFIX(mzd_and_bits)651 BENCHMARK_PREFIX(mzd_and_bits)
652 {
653 mzd_t* const A = mzd_init(p->m, p->n);
654 mzd_randomize(A);
655
656 vA = A;
657 vrowa = p->row[0];
658 vcola = p->col[0];
659 vn = p->integer;
660 vword = 0xffffffffffffffffULL;
661
662 uint64_t const loop_count = p->count;
663
664 TIME(mzd_and_bits(vA, vrowa, vcola, vn, vword));
665
666 mzd_free(A);
667 }
668 BENCHMARK_POSTFIX
669
BENCHMARK_PREFIX(mzd_clear_bits)670 BENCHMARK_PREFIX(mzd_clear_bits)
671 {
672 mzd_t* volatile A = mzd_init(p->m, p->n);
673 mzd_randomize(A);
674
675 vA = A;
676 vrowa = p->row[0];
677 vcola = p->col[0];
678 vn = p->integer;
679
680 uint64_t const loop_count = p->count;
681
682 TIME(mzd_clear_bits(vA, vrowa, vcola, vn));
683
684 mzd_free(A);
685 }
686 BENCHMARK_POSTFIX
687
BENCHMARK_PREFIX(mzd_is_zero)688 BENCHMARK_PREFIX(mzd_is_zero)
689 {
690 mzd_t* const A = mzd_init(p->m, p->n);
691 uint64_t const loop_count = p->count;
692 int volatile result;
693
694 TIME(result = mzd_is_zero(A));
695
696 mzd_free(A);
697 }
698 BENCHMARK_POSTFIX
699
BENCHMARK_PREFIX(mzd_find_pivot)700 BENCHMARK_PREFIX(mzd_find_pivot)
701 {
702 mzd_t* const A = mzd_init(p->m, p->n);
703 mzd_randomize(A);
704 rci_t row = p->row[0];
705 rci_t col = p->col[0];
706 uint64_t const loop_count = p->count;
707 int volatile result;
708 rci_t row_out;
709 rci_t col_out;
710
711 TIME(result = mzd_find_pivot(A, row, col, &row_out, &col_out));
712
713 mzd_free(A);
714 }
715 BENCHMARK_POSTFIX
716
BENCHMARK_PREFIX(mzd_density)717 BENCHMARK_PREFIX(mzd_density)
718 {
719 mzd_t* const A = mzd_init(p->m, p->n);
720 mzd_randomize(A);
721 wi_t res = p->wrd[0];
722 uint64_t const loop_count = p->count;
723 double volatile result;
724
725 TIME(result = mzd_density(A, res));
726
727 mzd_free(A);
728 }
729 BENCHMARK_POSTFIX
730
BENCHMARK_PREFIX(_mzd_density)731 BENCHMARK_PREFIX(_mzd_density)
732 {
733 mzd_t* const A = mzd_init(p->m, p->n);
734 mzd_randomize(A);
735 rci_t row = p->row[0];
736 rci_t col = p->col[0];
737 wi_t res = p->wrd[0];
738 uint64_t const loop_count = p->count;
739 double volatile result;
740
741 TIME(result = _mzd_density(A, res, row, col));
742
743 mzd_free(A);
744 }
745 BENCHMARK_POSTFIX
746
BENCHMARK_PREFIX(mzd_first_zero_row)747 BENCHMARK_PREFIX(mzd_first_zero_row)
748 {
749 mzd_t* const A = mzd_init(p->m, p->m);
750 mzd_set_ui(A, 1);
751 uint64_t const loop_count = p->count;
752 rci_t volatile result;
753
754 TIME(result = mzd_first_zero_row(A));
755
756 mzd_free(A);
757 }
758 BENCHMARK_POSTFIX
759
760 // Returns a number proportional with the ideal number of
761 // mathematical operations for the given code.
complexity1(struct test_params * p,char code)762 double complexity1(struct test_params *p, char code)
763 {
764 switch(code)
765 {
766 case 'k':
767 return p->k; // Linear with size 'k' of a matrix.
768 case 'l':
769 return p->l; // Linear with size 'l' of a matrix.
770 case 'm':
771 return p->m; // Linear with the number of rows of the matrix.
772 case 'n':
773 return p->n; // Linear with the number of columns of the matrix.
774 case 'W':
775 assert(p->n > m4ri_radix * p->wrd[0]); // Linear with the number of processed columns.
776 return p->n - m4ri_radix * p->wrd[0];
777 case 'D':
778 assert(p->row[0] < p->row[1]);
779 return p->row[1] - p->row[0]; // Linear with the number of rows between start_row and stop_row.
780 case 'E':
781 assert(p->col[0] < p->col[1]);
782 return p->col[1] - p->col[0]; // Linear with the number of cols between start_col and stop_col.
783 case 'C':
784 assert(p->col[0] < p->n);
785 return p->n - p->col[0]; // Linear with the number of columns of column col and beyond.
786 }
787 return 0.0;
788 }
789
complexity1_human(struct test_params * p,char code)790 char const* complexity1_human(struct test_params *p, char code)
791 {
792 switch(code)
793 {
794 case 'k':
795 return "k";
796 case 'l':
797 return "l";
798 case 'm':
799 return "m";
800 case 'n':
801 return "n";
802 case 'W':
803 return "cols";
804 case 'D':
805 return "rows";
806 case 'E':
807 return "cols";
808 case 'C':
809 return "cols";
810 }
811 return "UNKNOWN";
812 }
813
complexity(struct test_params * p,char const * cp)814 double complexity(struct test_params *p, char const* cp)
815 {
816 double c = 1;
817 while (*cp)
818 {
819 c *= complexity1(p, *cp);
820 ++cp;
821 }
822 return c;
823 }
824
print_complexity_human(struct test_params * p,char const * cp)825 void print_complexity_human(struct test_params *p, char const* cp)
826 {
827 int first = 1;
828 char last_cp = 0;
829 int power = 0;
830 while (*cp)
831 {
832 if (*cp != last_cp)
833 {
834 if (power > 1)
835 printf("^%d", power);
836 if (!first && isupper(*cp))
837 printf("*");
838 printf("%s", complexity1_human(p, *cp));
839 power = 0;
840 last_cp = *cp;
841 }
842 ++power;
843 ++cp;
844 }
845 if (power > 1)
846 printf("^%d", power);
847 }
848
849 struct function_st {
850 char const* funcname;
851 run_type run_func;
852 char const* input_codes;
853 char const* complexity_code;
854 uint64_t count;
855 };
856
857 typedef struct function_st function_st;
858
859 static function_st function_mapper[] = {
860 { "_mzd_row_swap", run__mzd_row_swap, "Rmn,ri,ri,wi", "W", 1000000000 },
861 { "mzd_row_swap", run_mzd_row_swap, "Rmn,ri,ri", "n", 1000000000 },
862 { "mzd_copy_row", run_mzd_copy_row, "Omn,ri,R,ri", "n", 1000000000 },
863 { "mzd_col_swap", run_mzd_col_swap, "Rmn,ci,ci", "m", 10000000 },
864 { "mzd_col_swap_in_rows", run_mzd_col_swap_in_rows, "Rmn,ci,ci,ri,ri", "D", 10000000 },
865 { "mzd_read_bit", run_mzd_read_bit, "Rmn,ri,ci", "", 100000000 },
866 { "mzd_write_bit", run_mzd_write_bit, "Omn,ri,ci", "", 100000000 },
867 { "mzd_row_add_offset", run_mzd_row_add_offset, "Rmn,ri,ri,ci", "C", 100000000 },
868 { "mzd_row_add", run_mzd_row_add, "Rmn,ri,ri", "n", 100000000 },
869 { "mzd_transpose", run_mzd_transpose, "Onm,Rmn", "mn", 10000000 },
870 { "mzd_mul_naive", run_mzd_mul_naive, "Omn,Rml,Rln", "mnl",10000000 },
871 { "mzd_addmul_naive", run_mzd_addmul_naive, "Omn,Rml,Rln", "mnl",10000000 },
872 { "_mzd_mul_naive", run__mzd_mul_naive, "Omn,Rml,Rnl,b", "mnl",10000000 },
873 { "_mzd_mul_va", run__mzd_mul_va, "O1n,V1m,Amn,b", "mn", 1000000000 },
874 { "mzd_gauss_delayed", run_mzd_gauss_delayed, "Rmn,ci,b", "mC", 10000000 },
875 { "mzd_echelonize_naive", run_mzd_echelonize_naive, "Rmn,b", "mn", 10000000 },
876 { "mzd_equal", run_mzd_equal, "Rmn,Rmn", "mn", 1000000000 },
877 { "mzd_cmp", run_mzd_cmp, "Rmn,Rmn", "mn", 1000000000 },
878 { "mzd_copy", run_mzd_copy, "Omn,Rmn", "mn", 1000000000 },
879 { "mzd_concat", run_mzd_concat, "Omn,Rmk,Rml", "mn", 10000000 },
880 { "mzd_stack", run_mzd_stack, "Omn,Rkn,Rln", "mn", 1000000000 },
881 { "mzd_submatrix", run_mzd_submatrix, "O,Rmn,ri,ci,ri,ci", "DE", 10000000 },
882 { "mzd_invert_naive", run_mzd_invert_naive, "Omm,Rmm,Imm", "mmm",10000000 },
883 { "mzd_add", run_mzd_add, "Omn,Rmn,Rmn", "mn", 10000000 },
884 { "_mzd_add", run__mzd_add, "Omn,Rmn,Rmn", "mn", 10000000 },
885 { "mzd_combine", run_mzd_combine, "Omn,ri,wi,R,ri,R,ri", "W", 10000000 },
886 { "mzd_read_bits", run_mzd_read_bits, "Rmn,ri,ci,n", "", 10000000 },
887 { "mzd_read_bits_int", run_mzd_read_bits_int, "Rmn,ri,ci,n", "", 10000000 },
888 { "mzd_xor_bits", run_mzd_xor_bits, "Rmn,ri,ci,n,w", "", 10000000 },
889 { "mzd_and_bits", run_mzd_and_bits, "Rmn,ri,ci,n,w", "", 10000000 },
890 { "mzd_clear_bits", run_mzd_clear_bits, "Rmn,ri,ci,n", "", 10000000 },
891 { "mzd_is_zero", run_mzd_is_zero, "Rmn", "mn", 10000000 },
892 { "mzd_find_pivot", run_mzd_find_pivot, "Rmn,ri,ci", "", 1000000 },
893 { "mzd_density", run_mzd_density, "Rmn,wi", "", 10000000 },
894 { "_mzd_density", run__mzd_density, "Rmn,wi,ri,ci", "", 10000000 },
895 { "mzd_first_zero_row", run_mzd_first_zero_row, "Rmm", "m", 10000000000 },
896 { "nothing", run_bench_nothing, "", "", 1 }
897 };
898
decode_size(char var,struct test_params * params,int * argcp,char *** argvp)899 int decode_size(char var, struct test_params* params, int* argcp, char*** argvp)
900 {
901 if (*argcp < 1)
902 {
903 fprintf(stderr, "%s: Not enough arguments. Expected matrix size: %c\n", progname, var);
904 return 1;
905 }
906 --(*argcp);
907 switch(var)
908 {
909 case 'k':
910 params->k = atoi((*argvp)[0]);
911 break;
912 case 'l':
913 params->l = atoi((*argvp)[0]);
914 break;
915 case 'm':
916 params->m = atoi((*argvp)[0]);
917 break;
918 case 'n':
919 params->n = atoi((*argvp)[0]);
920 break;
921 }
922 ++(*argvp);
923 return 0;
924 }
925
decode_index(char idx,struct test_params * params,int * argcp,char *** argvp)926 int decode_index(char idx, struct test_params* params, int* argcp, char*** argvp)
927 {
928 if (*argcp < 1)
929 {
930 int count = 0;
931 switch(idx)
932 {
933 case 'r':
934 count = params->rows;
935 break;
936 case 'c':
937 count = params->cols;
938 break;
939 case 'w':
940 count = params->wrds;
941 break;
942 }
943 fprintf(stderr, "%s: Not enough arguments. Expected ", progname);
944 switch(idx)
945 {
946 case 'r':
947 fprintf(stderr, "row");
948 break;
949 case 'c':
950 fprintf(stderr, "column");
951 break;
952 case 'w':
953 fprintf(stderr, "word");
954 break;
955 }
956 fprintf(stderr, " index : %c%d\n", idx, count + 1);
957 return 1;
958 }
959 --(*argcp);
960 switch(idx)
961 {
962 case 'r':
963 params->row[params->rows++] = atoi((*argvp)[0]);
964 break;
965 case 'c':
966 params->col[params->cols++] = atoi((*argvp)[0]);
967 break;
968 case 'w':
969 params->wrd[params->wrds++] = atoi((*argvp)[0]);
970 break;
971 }
972 ++(*argvp);
973 return 0;
974 }
975
decode_code(char idx,struct test_params * params,int * argcp,char *** argvp)976 int decode_code(char idx, struct test_params* params, int* argcp, char*** argvp)
977 {
978 if (*argcp < 1)
979 {
980 fprintf(stderr, "%s: Not enough arguments. Expected ", progname);
981 switch(idx)
982 {
983 case 'b':
984 printf("boolean");
985 break;
986 case 'n':
987 printf("integer");
988 break;
989 default:
990 printf("%c", idx);
991 }
992 fprintf(stderr, ".\n");
993 return 1;
994 }
995 --(*argcp);
996 switch(idx)
997 {
998 case 'b':
999 params->boolean = atoi((*argvp)[0]);
1000 if (params->boolean != 0 && params->boolean != 1)
1001 {
1002 fprintf(stderr, "%s: Expected boolean: %s\n", progname, (*argvp)[0]);
1003 return 1;
1004 }
1005 break;
1006 case 'n':
1007 params->integer = atoi((*argvp)[0]);
1008 break;
1009 }
1010 ++(*argvp);
1011 return 0;
1012 }
1013
main(int argc,char ** argv)1014 int main(int argc, char** argv)
1015 {
1016 int opts = global_options(&argc, &argv);
1017
1018 struct test_params params;
1019 unsigned long long data[8];
1020 int data_len;
1021
1022 #ifdef HAVE_LIBPAPI
1023 int papi_counters = PAPI_num_counters();
1024 if (papi_counters < papi_array_len)
1025 {
1026 fprintf(stderr, "%s: Warning: there are only %d hardware counters available!\n", progname, papi_counters);
1027 papi_array_len = papi_counters;
1028 }
1029 int res = PAPI_start_counters((int*)papi_events, papi_array_len);
1030 switch(res)
1031 {
1032 case 0:
1033 {
1034 long long* tmp = (long long*)malloc(papi_array_len * sizeof(long long));
1035 PAPI_stop_counters(tmp, papi_array_len);
1036 free(tmp);
1037 break;
1038 }
1039 case PAPI_ECNFLCT:
1040 {
1041 fprintf(stderr, "%s: %s: Conflicting event: The underlying counter hardware cannot count the specified events simultaneously.\n", progname, papi_event_name(papi_events[papi_array_len - 1]));
1042 fprintf(stderr, "Run `papi_event_chooser PRESET");
1043 for (int nv = 0; nv < papi_array_len - 1; ++nv)
1044 fprintf(stderr, " %s", papi_event_name(papi_events[nv]));
1045 fprintf(stderr, "` to get a list of possible events that can be added.\n");
1046 break;
1047 }
1048 case PAPI_ENOEVNT:
1049 {
1050 for (int nv = 0; nv < papi_array_len; ++nv)
1051 if ((res = PAPI_query_event(papi_events[nv])) != PAPI_OK)
1052 {
1053 fprintf(stderr, "%s: PAPI_start_counters: %s: %s.\n", progname, papi_event_name(papi_events[nv]), PAPI_strerror(res));
1054 break;
1055 }
1056 break;
1057 }
1058 case PAPI_ESYS:
1059 fprintf(stderr, "%s: PAPI_start_counters: %s\n", progname, strerror(errno));
1060 break;
1061 default:
1062 fprintf(stderr, "%s: PAPI_start_counters: %s.\n", progname, PAPI_strerror(res));
1063 break;
1064 }
1065 if (res)
1066 return 1;
1067 for (int nv = 0; nv <= papi_array_len; ++nv)
1068 loop_calibration[nv] = 100000000;
1069 params.count = 1;
1070 data_len = papi_array_len + 1;
1071 for (int i = 0; i < 100; ++i)
1072 run_bench_nothing((void*)¶ms, data, &data_len);
1073 #endif
1074
1075 int f;
1076 int found = 0;
1077
1078 params.rows = 0;
1079 params.cols = 0;
1080 params.wrds = 0;
1081 params.cutoff = -1;
1082
1083 if (argc >= 2)
1084 {
1085 params.funcname = argv[1];
1086
1087 for (f = 0; f < sizeof(function_mapper) / sizeof(function_mapper[0]); ++f)
1088 {
1089 if (strcmp(params.funcname, function_mapper[f].funcname) == 0)
1090 {
1091 found = 1;
1092 break;
1093 }
1094 }
1095 }
1096 if (!found)
1097 {
1098 if (argc >= 2)
1099 fprintf(stderr, "%s: function name \"%s\" not found.\n", progname, params.funcname);
1100 else
1101 {
1102 fprintf(stderr, "Usage: %s [OPTIONS] <funcname> [ARGS]\n", progname);
1103 bench_print_global_options(stderr);
1104 }
1105 fprintf(stderr, "Possible values for <funcname>:\n");
1106 for (f = 0; f < sizeof(function_mapper) / sizeof(function_mapper[0]); ++f)
1107 {
1108 if (f != 0 && f % 4 == 0)
1109 fprintf(stderr, "\n");
1110 fprintf(stderr, "%-22s", function_mapper[f].funcname);
1111 }
1112 fprintf(stderr, "\n");
1113 return 1;
1114 }
1115
1116 argc -= 2; // argc >= 1 if more arguments.
1117 argv += 2; // Next argument in argv[0]
1118 char* input_codes = strdup(function_mapper[f].input_codes);
1119 char* input_code[10];
1120 char* p = input_codes;
1121 int codes = 0;
1122 while(*p)
1123 {
1124 input_code[codes++] = p++;
1125 while(*p && *p != ',')
1126 ++p;
1127 if (*p == ',')
1128 *p++ = '\0';
1129 }
1130 int saw_var[4];
1131 for (int var_index = 0; var_index < 4; ++var_index)
1132 saw_var[var_index] = 0;
1133 int saw_vars = 0;
1134 char usage[64];
1135 char* usage_ptr = usage;
1136 int error = 0;
1137 for (int c = 0; ; ++c)
1138 {
1139 if (c < codes)
1140 {
1141 p = input_code[c];
1142 if (isupper(*p))
1143 {
1144 while(*++p)
1145 {
1146 if (*p != '1')
1147 {
1148 int var_index = *p - 'k';
1149 assert(var_index >= 0 && var_index <= 3); // 'k', 'l', 'm' or 'n'.
1150 saw_var[var_index] = 1;
1151 saw_vars = 1;
1152 }
1153 }
1154 continue;
1155 }
1156 }
1157 if (saw_vars)
1158 {
1159 saw_vars = 0;
1160 for (int var_count = 2; var_count < 6; ++var_count)
1161 {
1162 int var_index = var_count % 4;
1163 if (saw_var[var_index] == 1)
1164 {
1165 *usage_ptr++ = ' ';
1166 *usage_ptr++ = 'k' + var_index;
1167 saw_var[var_index] = 2;
1168 if (!error && decode_size('k' + var_index, ¶ms, &argc, &argv))
1169 error = 1;
1170 }
1171 }
1172 }
1173 if (c == codes)
1174 break;
1175 if (p[1] == 'i')
1176 {
1177 *usage_ptr++ = ' ';
1178 *usage_ptr++ = *p;
1179 switch(*p)
1180 {
1181 case 'r':
1182 *usage_ptr++ = '1' + params.rows;
1183 if (error) ++params.rows;
1184 break;
1185 case 'c':
1186 *usage_ptr++ = '1' + params.cols;
1187 if (error) ++params.cols;
1188 break;
1189 case 'w':
1190 *usage_ptr++ = '1' + params.wrds;
1191 if (error) ++params.wrds;
1192 break;
1193 }
1194 if (!error && decode_index(*p, ¶ms, &argc, &argv))
1195 error = 1;
1196 }
1197 else
1198 {
1199 *usage_ptr++ = ' ';
1200 *usage_ptr++ = *p;
1201 if (!error && decode_code(*p, ¶ms, &argc, &argv))
1202 error = 1;
1203 }
1204 }
1205 *usage_ptr = '\0';
1206 if (argc != 0)
1207 error = 1;
1208 if (error)
1209 {
1210 if (argc != 0)
1211 fprintf(stderr, "%s %s: too many parameters.\n", progname, params.funcname);
1212 fprintf(stderr, "Usage: %s [OPTIONS] %s%s\n", progname, params.funcname, usage);
1213 if (opts <= 0)
1214 bench_print_global_options(stderr);
1215 return 1;
1216 }
1217
1218 double cost = complexity(¶ms, function_mapper[f].complexity_code);
1219 params.count = bench_count ? bench_count : function_mapper[f].count / cost;
1220 if (params.count < 1)
1221 params.count = 1;
1222 bench_count = params.count;
1223
1224 srandom(17);
1225
1226 data_len = run_bench(function_mapper[f].run_func, (void*)¶ms, data, sizeof(data) / sizeof(unsigned long long));
1227
1228 printf("function: %s, count: %" PRId64 ", ", params.funcname, params.count);
1229 if (saw_var[2])
1230 printf("m: %d, ", params.m);
1231 if (saw_var[3])
1232 printf("n: %d, ", params.n);
1233 if (saw_var[0])
1234 printf("k: %d, ", params.k);
1235 if (saw_var[1])
1236 printf("l: %d, ", params.l);
1237 for (int i = 0; i < 3; ++i)
1238 {
1239 if (i < params.rows)
1240 printf("row%c: %d, ", 'a' + i, params.row[i]);
1241 if (i < params.cols)
1242 printf("col%c: %d, ", 'a' + i, params.col[i]);
1243 if (i < params.wrds)
1244 printf("word%c: %d, ", 'a' + i, params.wrd[i]);
1245 }
1246 if (params.cutoff != -1)
1247 printf("cutoff: %d, ", params.cutoff);
1248 print_wall_time(data[0] / 1000000.0 / params.count);
1249 printf(", cpu cycles: %llu", (data[1] + params.count / 2) / params.count);
1250 #ifndef HAVE_LIBPAPI
1251 printf(", cc/");
1252 print_complexity_human(¶ms, function_mapper[f].complexity_code);
1253 printf(": %f\n", data[1] / (params.count * cost));
1254 #else
1255 printf("\n");
1256 for (int n = 1; n < data_len; ++n)
1257 {
1258 printf("%s (%f) per bit (divided by ", papi_event_name(papi_events[n - 1]), (double)data[n] / params.count);
1259 print_complexity_human(¶ms, function_mapper[f].complexity_code);
1260 printf("): %f\n", data[n] / (params.count * cost));
1261 }
1262 #endif
1263 }
1264