1 /*
2  * bench_packedmatrix.c
3  *
4  * Application to test functionality of packedmatrix.c.
5  *
6  * Copyright (C) 2011  Carlo Wood  <carlo@alinoe.com>
7  * RSA-1024 0x624ACAD5 1997-01-26                    Sign & Encrypt
8  * Fingerprint16 = 32 EC A7 B6 AC DB 65 A6  F6 F6 55 DD 1C DC FF 61
9  *
10  * This program is free software: you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation, either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
22  */
23 
24 #ifndef __STDC_FORMAT_MACROS
25 #define __STDC_FORMAT_MACROS 1
26 #endif
27 
28 #include <m4ri/config.h>
29 
30 #ifdef HAVE_LIBPAPI
31 #define _GNU_SOURCE
32 #include <sys/types.h>          // papi.h needs caddr_t
33 #include <papi.h>
34 #include <errno.h>
35 #endif
36 
37 #include <stdlib.h>
38 #include <ctype.h>
39 #include <inttypes.h>
40 
41 #include "cpucycles.h"
42 #include <m4ri/m4ri.h>
43 #include "benchmarking.h"
44 
45 struct test_params {
46   rci_t m;
47   rci_t n;
48   rci_t k;
49   rci_t l;
50   rci_t row[3];
51   int rows;
52   rci_t col[3];
53   int cols;
54   wi_t wrd[3];
55   int wrds;
56   uint64_t count;
57   int cutoff;
58   int boolean;
59   int integer;
60   char const* funcname;
61 };
62 
63 typedef int (*run_type)(void*, unsigned long long*, int*);
64 
65 static unsigned long long loop_calibration[32];
66 
67 #ifdef HAVE_LIBPAPI
68 
69 #define BENCHMARK_PREFIX(mzd_func) \
70   int run_##mzd_func(void *_p, unsigned long long *data, int *data_len) { \
71     *data_len = MIN(papi_array_len + 1, *data_len); \
72     struct test_params *p = (struct test_params *)_p; \
73     int papi_res; \
74     do
75 #define TIME_BEGIN(mzd_func_with_ARGS) \
76       do { \
77 	int array_len = *data_len - 1; \
78 	mzd_func_with_ARGS; \
79 	unsigned long long t0 = PAPI_get_virt_usec(); \
80 	papi_res = PAPI_start_counters((int*)papi_events, array_len)
81 #define TIME_END \
82 	PAPI_stop_counters((long long*)&data[1], array_len); \
83 	t0 = PAPI_get_virt_usec() - t0; \
84 	data[0] = t0; \
85 	for (int nv = 0; nv <= array_len; ++nv) \
86 	{ \
87 	  if (data[nv] < loop_calibration[nv]) \
88 	    loop_calibration[nv] = data[nv]; \
89 	  data[nv] -= loop_calibration[nv]; \
90 	} \
91       } while(0)
92 #define BENCHMARK_POSTFIX \
93     while(0); \
94     return papi_res; \
95   }
96 
97 #else // HAVE_LIBPAPI
98 
99 #define BENCHMARK_PREFIX(mzd_func) \
100   int run_##mzd_func(void *_p, unsigned long long *data, int *data_len) { \
101     *data_len = 2; \
102     struct test_params *p = (struct test_params *)_p; \
103     do
104 #define TIME_BEGIN(mzd_func_with_ARGS) \
105     do { \
106       mzd_func_with_ARGS; \
107       data[0] = walltime(0); \
108       data[1] = cpucycles()
109 #define TIME_END \
110       data[1] = cpucycles() - data[1]; \
111       data[0] = walltime(data[0]); \
112     } while(0)
113 #define BENCHMARK_POSTFIX \
114     while(0); \
115     return 0; \
116   }
117 
118 #endif // HAVE_LIBPAPI
119 
120 #define TIME(mzd_func_with_ARGS)					\
121     TIME_BEGIN(mzd_func_with_ARGS);					\
122     for (uint64_t i = 0; i < loop_count; ++i) { mzd_func_with_ARGS; }	\
123     TIME_END
124 
125 mzd_t* volatile vA;
126 rci_t volatile vrowa;
127 rci_t volatile vcola;
128 rci_t volatile vrowb;
129 rci_t volatile vcolb;
130 wi_t volatile vstartblock;
131 int volatile vn;
132 int volatile vint;
133 word volatile vword;
134 BIT volatile vbit;
135 
BENCHMARK_PREFIX(bench_nothing)136 BENCHMARK_PREFIX(bench_nothing)
137 {
138   mzd_t* const A = mzd_init(64, 64);
139   mzd_randomize(A);
140   uint64_t volatile loop_count = p->count;
141 
142   TIME();
143 
144   mzd_free(A);
145 }
146 BENCHMARK_POSTFIX
147 
BENCHMARK_PREFIX(_mzd_row_swap)148 BENCHMARK_PREFIX(_mzd_row_swap)
149 {
150   mzd_t* const A = mzd_init(p->m, p->n);
151   mzd_randomize(A);
152 
153   vA = A;
154   vrowa = p->row[0];
155   vrowb = p->row[1];
156   vstartblock = p->wrd[0];
157 
158   uint64_t const loop_count = p->count;
159 
160   TIME(_mzd_row_swap(vA, vrowa, vrowb, vstartblock));
161 
162   mzd_free(A);
163 }
164 BENCHMARK_POSTFIX
165 
BENCHMARK_PREFIX(mzd_row_swap)166 BENCHMARK_PREFIX(mzd_row_swap)
167 {
168   mzd_t* const A = mzd_init(p->m, p->n);
169   mzd_randomize(A);
170   rci_t const rowa = p->row[0];
171   rci_t const rowb = p->row[1];
172   uint64_t const loop_count = p->count;
173 
174   TIME(mzd_row_swap(A, rowa, rowb));
175 
176   mzd_free(A);
177 }
178 BENCHMARK_POSTFIX
179 
BENCHMARK_PREFIX(mzd_copy_row)180 BENCHMARK_PREFIX(mzd_copy_row)
181 {
182   mzd_t* const A = mzd_init(p->m, p->n);
183   mzd_t* const B = mzd_init(p->m, p->n);
184   mzd_randomize(A);
185   rci_t const rowa = p->row[0];
186   rci_t const rowb = p->row[1];
187   uint64_t const loop_count = p->count;
188 
189   TIME(mzd_copy_row(B, rowb, A, rowa));
190 
191   mzd_free(A);
192   mzd_free(B);
193 }
194 BENCHMARK_POSTFIX
195 
BENCHMARK_PREFIX(mzd_col_swap)196 BENCHMARK_PREFIX(mzd_col_swap)
197 {
198   mzd_t* const A = mzd_init(p->m, p->n);
199   mzd_randomize(A);
200   rci_t const cola = p->col[0];
201   rci_t const colb = p->col[1];
202   uint64_t const loop_count = p->count;
203 
204   TIME(mzd_col_swap(A, cola, colb));
205 
206   mzd_free(A);
207 }
208 BENCHMARK_POSTFIX
209 
BENCHMARK_PREFIX(mzd_col_swap_in_rows)210 BENCHMARK_PREFIX(mzd_col_swap_in_rows)
211 {
212   mzd_t* const A = mzd_init(p->m, p->n);
213   mzd_randomize(A);
214 
215   vA = A;
216   vcola = p->col[0];
217   vcolb = p->col[1];
218   vrowa = p->row[0];
219   vrowb = p->row[1];
220 
221   uint64_t const loop_count = p->count;
222 
223   TIME(mzd_col_swap_in_rows(vA, vcola, vcolb, vrowa, vrowb));
224 
225   mzd_free(A);
226 }
227 BENCHMARK_POSTFIX
228 
BENCHMARK_PREFIX(mzd_read_bit)229 BENCHMARK_PREFIX(mzd_read_bit)
230 {
231   mzd_t* const A = mzd_init(p->m, p->n);
232   mzd_randomize(A);
233 
234   vA = A;
235   vrowa = p->row[0];
236   vcola = p->col[0];
237 
238   uint64_t const loop_count = p->count;
239 
240   TIME(vbit = mzd_read_bit(vA, vrowa, vcola));
241 
242   mzd_free(A);
243 }
244 BENCHMARK_POSTFIX
245 
BENCHMARK_PREFIX(mzd_write_bit)246 BENCHMARK_PREFIX(mzd_write_bit)
247 {
248   mzd_t* const A = mzd_init(p->m, p->n);
249 
250   vA = A;
251   vrowa = p->row[0];
252   vcola = p->col[0];
253   vbit = 0;
254 
255   uint64_t const loop_count = p->count;
256 
257   TIME(mzd_write_bit(vA, vrowa, vcola, vbit); vbit = !vbit);
258 
259   mzd_free(A);
260 }
261 BENCHMARK_POSTFIX
262 
BENCHMARK_PREFIX(mzd_row_add_offset)263 BENCHMARK_PREFIX(mzd_row_add_offset)
264 {
265   mzd_t* const A = mzd_init(p->m, p->n);
266   mzd_randomize(A);
267 
268   vA = A;
269   vrowa = p->row[0];
270   vrowb = p->row[1];
271   vcola = p->col[0];
272 
273   uint64_t const loop_count = p->count;
274 
275   TIME(mzd_row_add_offset(vA, vrowa, vrowb, vcola));
276 
277   mzd_free(A);
278 }
279 BENCHMARK_POSTFIX
280 
BENCHMARK_PREFIX(mzd_row_add)281 BENCHMARK_PREFIX(mzd_row_add)
282 {
283   mzd_t* const A = mzd_init(p->m, p->n);
284   mzd_randomize(A);
285   rci_t const rowa = p->row[0];
286   rci_t const rowb = p->row[1];
287   uint64_t const loop_count = p->count;
288 
289   TIME(mzd_row_add(A, rowa, rowb));
290 
291   mzd_free(A);
292 }
293 BENCHMARK_POSTFIX
294 
BENCHMARK_PREFIX(mzd_transpose)295 BENCHMARK_PREFIX(mzd_transpose)
296 {
297   mzd_t* const A = mzd_init(p->m, p->n);
298   mzd_t* const B = mzd_init(p->n, p->m);
299   mzd_randomize(A);
300   uint64_t const loop_count = p->count;
301 
302   TIME(mzd_transpose(B, A));
303 
304   mzd_free(A);
305   mzd_free(B);
306 }
307 BENCHMARK_POSTFIX
308 
BENCHMARK_PREFIX(mzd_mul_naive)309 BENCHMARK_PREFIX(mzd_mul_naive)
310 {
311   mzd_t* const A = mzd_init(p->m, p->l);
312   mzd_t* const B = mzd_init(p->l, p->n);
313   mzd_t* const C = mzd_init(p->m, p->n);
314   mzd_randomize(A);
315   mzd_randomize(B);
316   uint64_t const loop_count = p->count;
317 
318   TIME(mzd_mul_naive(C, A, B));
319 
320   mzd_free(A);
321   mzd_free(B);
322   mzd_free(C);
323 }
324 BENCHMARK_POSTFIX
325 
BENCHMARK_PREFIX(mzd_addmul_naive)326 BENCHMARK_PREFIX(mzd_addmul_naive)
327 {
328   mzd_t* const A = mzd_init(p->m, p->l);
329   mzd_t* const B = mzd_init(p->l, p->n);
330   mzd_t* const C = mzd_init(p->m, p->n);
331   mzd_randomize(A);
332   mzd_randomize(B);
333   uint64_t const loop_count = p->count;
334 
335   TIME(mzd_addmul_naive(C, A, B));
336 
337   mzd_free(A);
338   mzd_free(B);
339   mzd_free(C);
340 }
341 BENCHMARK_POSTFIX
342 
BENCHMARK_PREFIX(_mzd_mul_naive)343 BENCHMARK_PREFIX(_mzd_mul_naive)
344 {
345   mzd_t* const A = mzd_init(p->m, p->l);
346   mzd_t* const B = mzd_init(p->n, p->l);
347   mzd_t* const C = mzd_init(p->m, p->n);
348   mzd_randomize(A);
349   mzd_randomize(B);
350   int const clear = p->boolean;
351   uint64_t const loop_count = p->count;
352 
353   TIME(_mzd_mul_naive(C, A, B, clear));
354 
355   mzd_free(A);
356   mzd_free(B);
357   mzd_free(C);
358 }
359 BENCHMARK_POSTFIX
360 
BENCHMARK_PREFIX(_mzd_mul_va)361 BENCHMARK_PREFIX(_mzd_mul_va)
362 {
363   mzd_t* const A = mzd_init(p->m, p->n);
364   mzd_t* const V = mzd_init(1, p->m);
365   mzd_t* const C = mzd_init(1, p->n);
366   mzd_randomize(A);
367   mzd_randomize(V);
368   int const clear = p->boolean;
369   uint64_t const loop_count = p->count;
370 
371   TIME(_mzd_mul_va(C, V, A, clear));
372 
373   mzd_free(A);
374   mzd_free(V);
375   mzd_free(C);
376 }
377 BENCHMARK_POSTFIX
378 
BENCHMARK_PREFIX(mzd_gauss_delayed)379 BENCHMARK_PREFIX(mzd_gauss_delayed)
380 {
381   mzd_t** A = malloc(sizeof(mzd_t) * (p->count + 1));
382   rci_t const cola = p->col[0];
383   int const full = p->boolean;
384   uint64_t const loop_count = p->count;
385   rci_t result;
386 
387   for (int i = loop_count; i >= 0; --i) {
388     A[i] = mzd_init(p->m, p->n);
389     mzd_randomize(A[i]);
390   }
391 
392   TIME_BEGIN(result = mzd_gauss_delayed(A[0], cola, full));
393   for (int i = loop_count; i > 0; --i) {
394     result = mzd_gauss_delayed(A[i], cola, full);
395   }
396   TIME_END;
397 
398   for (int i = 0; i <= loop_count; ++i)
399     mzd_free(A[i]);
400   free(A);
401 }
402 BENCHMARK_POSTFIX
403 
BENCHMARK_PREFIX(mzd_echelonize_naive)404 BENCHMARK_PREFIX(mzd_echelonize_naive)
405 {
406   mzd_t** A = malloc(sizeof(mzd_t) * (p->count + 1));
407   int const full = p->boolean;
408   uint64_t const loop_count = p->count;
409   rci_t result;
410 
411   for (int i = loop_count; i >= 0; --i) {
412     A[i] = mzd_init(p->m, p->n);
413     mzd_randomize(A[i]);
414   }
415 
416   TIME_BEGIN(result = mzd_echelonize_naive(A[0], full));
417   for (int i = loop_count; i > 0; --i) {
418     result = mzd_echelonize_naive(A[i], full);
419   }
420   TIME_END;
421 
422   for (int i = 0; i <= loop_count; ++i)
423     mzd_free(A[i]);
424   free(A);
425 }
426 BENCHMARK_POSTFIX
427 
BENCHMARK_PREFIX(mzd_equal)428 BENCHMARK_PREFIX(mzd_equal)
429 {
430   mzd_t* const A = mzd_init(p->m, p->n);
431   mzd_randomize(A);
432   mzd_t* const B = mzd_copy(NULL, A);
433   uint64_t const loop_count = p->count;
434   int volatile result;
435 
436   TIME(result = mzd_equal(A, B));
437 
438   mzd_free(A);
439   mzd_free(B);
440 }
441 BENCHMARK_POSTFIX
442 
BENCHMARK_PREFIX(mzd_cmp)443 BENCHMARK_PREFIX(mzd_cmp)
444 {
445   mzd_t* const A = mzd_init(p->m, p->n);
446   mzd_randomize(A);
447   mzd_t* const B = mzd_copy(NULL, A);
448   uint64_t const loop_count = p->count;
449   int volatile result;
450 
451   TIME(result = mzd_cmp(A, B));
452 
453   mzd_free(A);
454   mzd_free(B);
455 }
456 BENCHMARK_POSTFIX
457 
BENCHMARK_PREFIX(mzd_copy)458 BENCHMARK_PREFIX(mzd_copy)
459 {
460   mzd_t* const A = mzd_init(p->m, p->n);
461   mzd_t* const B = mzd_init(p->m, p->n);
462   mzd_randomize(A);
463   uint64_t const loop_count = p->count;
464 
465   TIME(mzd_copy(B, A));
466 
467   mzd_free(A);
468   mzd_free(B);
469 }
470 BENCHMARK_POSTFIX
471 
BENCHMARK_PREFIX(mzd_concat)472 BENCHMARK_PREFIX(mzd_concat)
473 {
474   mzd_t* const A = mzd_init(p->m, p->k);
475   mzd_t* const B = mzd_init(p->m, p->l);
476   mzd_t* const C = mzd_init(p->m, p->k + p->l);
477   mzd_randomize(A);
478   mzd_randomize(B);
479   uint64_t const loop_count = p->count;
480 
481   TIME(mzd_concat(C, A, B));
482 
483   mzd_free(A);
484   mzd_free(B);
485   mzd_free(C);
486 }
487 BENCHMARK_POSTFIX
488 
BENCHMARK_PREFIX(mzd_stack)489 BENCHMARK_PREFIX(mzd_stack)
490 {
491   mzd_t* const A = mzd_init(p->k, p->n);
492   mzd_t* const B = mzd_init(p->l, p->n);
493   mzd_t* const C = mzd_init(p->k + p->l, p->n);
494   mzd_randomize(A);
495   mzd_randomize(B);
496   uint64_t const loop_count = p->count;
497 
498   TIME(mzd_stack(C, A, B));
499 
500   mzd_free(A);
501   mzd_free(B);
502   mzd_free(C);
503 }
504 BENCHMARK_POSTFIX
505 
BENCHMARK_PREFIX(mzd_submatrix)506 BENCHMARK_PREFIX(mzd_submatrix)
507 {
508   mzd_t* const A = mzd_init(p->m, p->n);
509   mzd_randomize(A);
510   rci_t const rowa = p->row[0];
511   rci_t const cola = p->col[0];
512   rci_t const rowb = p->row[1];
513   rci_t const colb = p->col[1];
514   mzd_t* const S = mzd_init(rowb - rowa, colb - cola);
515   uint64_t const loop_count = p->count;
516 
517   TIME(mzd_submatrix(S, A, rowa, cola, rowb, colb));
518 
519   mzd_free(A);
520   mzd_free(S);
521 }
522 BENCHMARK_POSTFIX
523 
BENCHMARK_PREFIX(mzd_invert_naive)524 BENCHMARK_PREFIX(mzd_invert_naive)
525 {
526   mzd_t* const A = mzd_init(p->m, p->m);
527   mzd_t* const I = mzd_init(p->m, p->m);
528   mzd_t* const C = mzd_init(p->m, p->m);
529   mzd_randomize(A);
530   mzd_set_ui(I, 1);
531   uint64_t const loop_count = p->count;
532 
533   TIME(mzd_invert_naive(C, A, I));
534 
535   mzd_free(A);
536   mzd_free(I);
537   mzd_free(C);
538 }
539 BENCHMARK_POSTFIX
540 
BENCHMARK_PREFIX(mzd_add)541 BENCHMARK_PREFIX(mzd_add)
542 {
543   mzd_t* const A = mzd_init(p->m, p->n);
544   mzd_t* const B = mzd_init(p->m, p->n);
545   mzd_t* const C = mzd_init(p->m, p->n);
546   mzd_randomize(A);
547   mzd_randomize(B);
548   uint64_t const loop_count = p->count;
549 
550   TIME(mzd_add(C, A, B));
551 
552   mzd_free(A);
553   mzd_free(B);
554   mzd_free(C);
555 }
556 BENCHMARK_POSTFIX
557 
BENCHMARK_PREFIX(_mzd_add)558 BENCHMARK_PREFIX(_mzd_add)
559 {
560   mzd_t* const A = mzd_init(p->m, p->n);
561   mzd_t* const B = mzd_init(p->m, p->n);
562   mzd_t* const C = mzd_init(p->m, p->n);
563   mzd_randomize(A);
564   mzd_randomize(B);
565   uint64_t const loop_count = p->count;
566 
567   TIME(_mzd_add(C, A, B));
568 
569   mzd_free(A);
570   mzd_free(B);
571   mzd_free(C);
572 }
573 BENCHMARK_POSTFIX
574 
BENCHMARK_PREFIX(mzd_combine)575 BENCHMARK_PREFIX(mzd_combine)
576 {
577   mzd_t* const A = mzd_init(p->m, p->n);
578   mzd_t* const B = mzd_init(p->m, p->n);
579   mzd_t* const C = mzd_init(p->m, p->n);
580   mzd_randomize(A);
581   mzd_randomize(B);
582   rci_t row1 = p->row[0];
583   rci_t row2 = p->row[1];
584   rci_t row3 = p->row[2];
585   wi_t startblock = p->wrd[0];
586   uint64_t const loop_count = p->count;
587 
588   TIME(mzd_combine(C, row3, startblock, A, row1, startblock, B, row2, startblock));
589 
590   mzd_free(A);
591   mzd_free(B);
592   mzd_free(C);
593 }
594 BENCHMARK_POSTFIX
595 
BENCHMARK_PREFIX(mzd_read_bits)596 BENCHMARK_PREFIX(mzd_read_bits)
597 {
598   mzd_t* const A = mzd_init(p->m, p->n);
599   mzd_randomize(A);
600 
601   vA = A;
602   vrowa = p->row[0];
603   vcola = p->col[0];
604   vn = p->integer;
605 
606   uint64_t const loop_count = p->count;
607 
608   TIME(vword = mzd_read_bits(vA, vrowa, vcola, vn));
609 
610   mzd_free(A);
611 }
612 BENCHMARK_POSTFIX
613 
BENCHMARK_PREFIX(mzd_read_bits_int)614 BENCHMARK_PREFIX(mzd_read_bits_int)
615 {
616   mzd_t* const A = mzd_init(p->m, p->n);
617   mzd_randomize(A);
618 
619   vA = A;
620   vrowa = p->row[0];
621   vcola = p->col[0];
622   vn = p->integer;
623 
624   uint64_t const loop_count = p->count;
625 
626   TIME(vint = mzd_read_bits_int(vA, vrowa, vcola, vn));
627 
628   mzd_free(A);
629 }
630 BENCHMARK_POSTFIX
631 
BENCHMARK_PREFIX(mzd_xor_bits)632 BENCHMARK_PREFIX(mzd_xor_bits)
633 {
634   mzd_t* const A = mzd_init(p->m, p->n);
635   mzd_randomize(A);
636 
637   vA = A;
638   vrowa = p->row[0];
639   vcola = p->col[0];
640   vn = p->integer;
641   vword = 0xffffffffffffffffULL;
642 
643   uint64_t const loop_count = p->count;
644 
645   TIME(mzd_xor_bits(vA, vrowa, vcola, vn, vword));
646 
647   mzd_free(A);
648 }
649 BENCHMARK_POSTFIX
650 
BENCHMARK_PREFIX(mzd_and_bits)651 BENCHMARK_PREFIX(mzd_and_bits)
652 {
653   mzd_t* const A = mzd_init(p->m, p->n);
654   mzd_randomize(A);
655 
656   vA = A;
657   vrowa = p->row[0];
658   vcola = p->col[0];
659   vn = p->integer;
660   vword = 0xffffffffffffffffULL;
661 
662   uint64_t const loop_count = p->count;
663 
664   TIME(mzd_and_bits(vA, vrowa, vcola, vn, vword));
665 
666   mzd_free(A);
667 }
668 BENCHMARK_POSTFIX
669 
BENCHMARK_PREFIX(mzd_clear_bits)670 BENCHMARK_PREFIX(mzd_clear_bits)
671 {
672   mzd_t* volatile A = mzd_init(p->m, p->n);
673   mzd_randomize(A);
674 
675   vA = A;
676   vrowa = p->row[0];
677   vcola = p->col[0];
678   vn = p->integer;
679 
680   uint64_t const loop_count = p->count;
681 
682   TIME(mzd_clear_bits(vA, vrowa, vcola, vn));
683 
684   mzd_free(A);
685 }
686 BENCHMARK_POSTFIX
687 
BENCHMARK_PREFIX(mzd_is_zero)688 BENCHMARK_PREFIX(mzd_is_zero)
689 {
690   mzd_t* const A = mzd_init(p->m, p->n);
691   uint64_t const loop_count = p->count;
692   int volatile result;
693 
694   TIME(result = mzd_is_zero(A));
695 
696   mzd_free(A);
697 }
698 BENCHMARK_POSTFIX
699 
BENCHMARK_PREFIX(mzd_find_pivot)700 BENCHMARK_PREFIX(mzd_find_pivot)
701 {
702   mzd_t* const A = mzd_init(p->m, p->n);
703   mzd_randomize(A);
704   rci_t row = p->row[0];
705   rci_t col = p->col[0];
706   uint64_t const loop_count = p->count;
707   int volatile result;
708   rci_t row_out;
709   rci_t col_out;
710 
711   TIME(result = mzd_find_pivot(A, row, col, &row_out, &col_out));
712 
713   mzd_free(A);
714 }
715 BENCHMARK_POSTFIX
716 
BENCHMARK_PREFIX(mzd_density)717 BENCHMARK_PREFIX(mzd_density)
718 {
719   mzd_t* const A = mzd_init(p->m, p->n);
720   mzd_randomize(A);
721   wi_t res = p->wrd[0];
722   uint64_t const loop_count = p->count;
723   double volatile result;
724 
725   TIME(result = mzd_density(A, res));
726 
727   mzd_free(A);
728 }
729 BENCHMARK_POSTFIX
730 
BENCHMARK_PREFIX(_mzd_density)731 BENCHMARK_PREFIX(_mzd_density)
732 {
733   mzd_t* const A = mzd_init(p->m, p->n);
734   mzd_randomize(A);
735   rci_t row = p->row[0];
736   rci_t col = p->col[0];
737   wi_t res = p->wrd[0];
738   uint64_t const loop_count = p->count;
739   double volatile result;
740 
741   TIME(result = _mzd_density(A, res, row, col));
742 
743   mzd_free(A);
744 }
745 BENCHMARK_POSTFIX
746 
BENCHMARK_PREFIX(mzd_first_zero_row)747 BENCHMARK_PREFIX(mzd_first_zero_row)
748 {
749   mzd_t* const A = mzd_init(p->m, p->m);
750   mzd_set_ui(A, 1);
751   uint64_t const loop_count = p->count;
752   rci_t volatile result;
753 
754   TIME(result = mzd_first_zero_row(A));
755 
756   mzd_free(A);
757 }
758 BENCHMARK_POSTFIX
759 
760 // Returns a number proportional with the ideal number of
761 // mathematical operations for the given code.
complexity1(struct test_params * p,char code)762 double complexity1(struct test_params *p, char code)
763 {
764   switch(code)
765   {
766     case 'k':
767       return p->k;			// Linear with size 'k' of a matrix.
768     case 'l':
769       return p->l;			// Linear with size 'l' of a matrix.
770     case 'm':
771       return p->m;			// Linear with the number of rows of the matrix.
772     case 'n':
773       return p->n;			// Linear with the number of columns of the matrix.
774     case 'W':
775       assert(p->n > m4ri_radix * p->wrd[0]);	// Linear with the number of processed columns.
776       return p->n - m4ri_radix * p->wrd[0];
777     case 'D':
778       assert(p->row[0] < p->row[1]);
779       return p->row[1] - p->row[0];	// Linear with the number of rows between start_row and stop_row.
780     case 'E':
781       assert(p->col[0] < p->col[1]);
782       return p->col[1] - p->col[0];	// Linear with the number of cols between start_col and stop_col.
783     case 'C':
784       assert(p->col[0] < p->n);
785       return p->n - p->col[0];		// Linear with the number of columns of column col and beyond.
786   }
787   return 0.0;
788 }
789 
complexity1_human(struct test_params * p,char code)790 char const* complexity1_human(struct test_params *p, char code)
791 {
792   switch(code)
793   {
794     case 'k':
795       return "k";
796     case 'l':
797       return "l";
798     case 'm':
799       return "m";
800     case 'n':
801       return "n";
802     case 'W':
803       return "cols";
804     case 'D':
805       return "rows";
806     case 'E':
807       return "cols";
808     case 'C':
809       return "cols";
810   }
811   return "UNKNOWN";
812 }
813 
complexity(struct test_params * p,char const * cp)814 double complexity(struct test_params *p, char const* cp)
815 {
816   double c = 1;
817   while (*cp)
818   {
819     c *= complexity1(p, *cp);
820     ++cp;
821   }
822   return c;
823 }
824 
print_complexity_human(struct test_params * p,char const * cp)825 void print_complexity_human(struct test_params *p, char const* cp)
826 {
827   int first = 1;
828   char last_cp = 0;
829   int power = 0;
830   while (*cp)
831   {
832     if (*cp != last_cp)
833     {
834       if (power > 1)
835 	printf("^%d", power);
836       if (!first && isupper(*cp))
837 	printf("*");
838       printf("%s", complexity1_human(p, *cp));
839       power = 0;
840       last_cp = *cp;
841     }
842     ++power;
843     ++cp;
844   }
845   if (power > 1)
846     printf("^%d", power);
847 }
848 
849 struct function_st {
850   char const* funcname;
851   run_type run_func;
852   char const* input_codes;
853   char const* complexity_code;
854   uint64_t count;
855 };
856 
857 typedef struct function_st function_st;
858 
859 static function_st function_mapper[] = {
860   { "_mzd_row_swap",        run__mzd_row_swap,        "Rmn,ri,ri,wi",    "W",  1000000000 },
861   { "mzd_row_swap",         run_mzd_row_swap,         "Rmn,ri,ri",       "n",  1000000000 },
862   { "mzd_copy_row",         run_mzd_copy_row,         "Omn,ri,R,ri",     "n",  1000000000 },
863   { "mzd_col_swap",         run_mzd_col_swap,         "Rmn,ci,ci",       "m",  10000000 },
864   { "mzd_col_swap_in_rows", run_mzd_col_swap_in_rows, "Rmn,ci,ci,ri,ri", "D",  10000000 },
865   { "mzd_read_bit",         run_mzd_read_bit,         "Rmn,ri,ci",       "",   100000000 },
866   { "mzd_write_bit",        run_mzd_write_bit,        "Omn,ri,ci",       "",   100000000 },
867   { "mzd_row_add_offset",   run_mzd_row_add_offset,   "Rmn,ri,ri,ci",    "C",  100000000 },
868   { "mzd_row_add",          run_mzd_row_add,          "Rmn,ri,ri",       "n",  100000000 },
869   { "mzd_transpose",        run_mzd_transpose,        "Onm,Rmn",         "mn", 10000000 },
870   { "mzd_mul_naive",        run_mzd_mul_naive,        "Omn,Rml,Rln",     "mnl",10000000 },
871   { "mzd_addmul_naive",     run_mzd_addmul_naive,     "Omn,Rml,Rln",     "mnl",10000000 },
872   { "_mzd_mul_naive",       run__mzd_mul_naive,       "Omn,Rml,Rnl,b",   "mnl",10000000 },
873   { "_mzd_mul_va",          run__mzd_mul_va,          "O1n,V1m,Amn,b",   "mn", 1000000000 },
874   { "mzd_gauss_delayed",    run_mzd_gauss_delayed,    "Rmn,ci,b",        "mC", 10000000 },
875   { "mzd_echelonize_naive", run_mzd_echelonize_naive, "Rmn,b",           "mn", 10000000 },
876   { "mzd_equal",            run_mzd_equal,            "Rmn,Rmn",         "mn", 1000000000 },
877   { "mzd_cmp",              run_mzd_cmp,              "Rmn,Rmn",         "mn", 1000000000 },
878   { "mzd_copy",             run_mzd_copy,             "Omn,Rmn",         "mn", 1000000000 },
879   { "mzd_concat",           run_mzd_concat,           "Omn,Rmk,Rml",     "mn", 10000000 },
880   { "mzd_stack",            run_mzd_stack,            "Omn,Rkn,Rln",     "mn", 1000000000 },
881   { "mzd_submatrix",        run_mzd_submatrix,      "O,Rmn,ri,ci,ri,ci", "DE", 10000000 },
882   { "mzd_invert_naive",     run_mzd_invert_naive,     "Omm,Rmm,Imm",     "mmm",10000000 },
883   { "mzd_add",              run_mzd_add,              "Omn,Rmn,Rmn",     "mn", 10000000 },
884   { "_mzd_add",             run__mzd_add,             "Omn,Rmn,Rmn",     "mn", 10000000 },
885   { "mzd_combine",          run_mzd_combine,      "Omn,ri,wi,R,ri,R,ri", "W",  10000000 },
886   { "mzd_read_bits",        run_mzd_read_bits,        "Rmn,ri,ci,n",     "",   10000000 },
887   { "mzd_read_bits_int",    run_mzd_read_bits_int,    "Rmn,ri,ci,n",     "",   10000000 },
888   { "mzd_xor_bits",         run_mzd_xor_bits,         "Rmn,ri,ci,n,w",   "",   10000000 },
889   { "mzd_and_bits",         run_mzd_and_bits,         "Rmn,ri,ci,n,w",   "",   10000000 },
890   { "mzd_clear_bits",       run_mzd_clear_bits,       "Rmn,ri,ci,n",     "",   10000000 },
891   { "mzd_is_zero",          run_mzd_is_zero,          "Rmn",             "mn", 10000000 },
892   { "mzd_find_pivot",       run_mzd_find_pivot,       "Rmn,ri,ci",       "",   1000000 },
893   { "mzd_density",          run_mzd_density,          "Rmn,wi",          "",   10000000 },
894   { "_mzd_density",         run__mzd_density,         "Rmn,wi,ri,ci",    "",   10000000 },
895   { "mzd_first_zero_row",   run_mzd_first_zero_row,   "Rmm",             "m",  10000000000 },
896   { "nothing",              run_bench_nothing,        "",                "",   1 }
897 };
898 
decode_size(char var,struct test_params * params,int * argcp,char *** argvp)899 int decode_size(char var, struct test_params* params, int* argcp, char*** argvp)
900 {
901   if (*argcp < 1)
902   {
903     fprintf(stderr, "%s: Not enough arguments. Expected matrix size: %c\n", progname, var);
904     return 1;
905   }
906   --(*argcp);
907   switch(var)
908   {
909     case 'k':
910       params->k = atoi((*argvp)[0]);
911       break;
912     case 'l':
913       params->l = atoi((*argvp)[0]);
914       break;
915     case 'm':
916       params->m = atoi((*argvp)[0]);
917       break;
918     case 'n':
919       params->n = atoi((*argvp)[0]);
920       break;
921   }
922   ++(*argvp);
923   return 0;
924 }
925 
decode_index(char idx,struct test_params * params,int * argcp,char *** argvp)926 int decode_index(char idx, struct test_params* params, int* argcp, char*** argvp)
927 {
928   if (*argcp < 1)
929   {
930     int count = 0;
931     switch(idx)
932     {
933       case 'r':
934 	count = params->rows;
935 	break;
936       case 'c':
937 	count = params->cols;
938 	break;
939       case 'w':
940 	count = params->wrds;
941 	break;
942     }
943     fprintf(stderr, "%s: Not enough arguments. Expected ", progname);
944     switch(idx)
945     {
946       case 'r':
947 	fprintf(stderr, "row");
948 	break;
949       case 'c':
950 	fprintf(stderr, "column");
951 	break;
952       case 'w':
953 	fprintf(stderr, "word");
954 	break;
955     }
956     fprintf(stderr, " index : %c%d\n", idx, count + 1);
957     return 1;
958   }
959   --(*argcp);
960   switch(idx)
961   {
962     case 'r':
963       params->row[params->rows++] = atoi((*argvp)[0]);
964       break;
965     case 'c':
966       params->col[params->cols++] = atoi((*argvp)[0]);
967       break;
968     case 'w':
969       params->wrd[params->wrds++] = atoi((*argvp)[0]);
970       break;
971   }
972   ++(*argvp);
973   return 0;
974 }
975 
decode_code(char idx,struct test_params * params,int * argcp,char *** argvp)976 int decode_code(char idx, struct test_params* params, int* argcp, char*** argvp)
977 {
978   if (*argcp < 1)
979   {
980     fprintf(stderr, "%s: Not enough arguments. Expected ", progname);
981     switch(idx)
982     {
983       case 'b':
984 	printf("boolean");
985 	break;
986       case 'n':
987 	printf("integer");
988 	break;
989       default:
990 	printf("%c", idx);
991     }
992     fprintf(stderr, ".\n");
993     return 1;
994   }
995   --(*argcp);
996   switch(idx)
997   {
998     case 'b':
999       params->boolean = atoi((*argvp)[0]);
1000       if (params->boolean != 0 && params->boolean != 1)
1001       {
1002 	fprintf(stderr, "%s: Expected boolean: %s\n", progname, (*argvp)[0]);
1003 	return 1;
1004       }
1005       break;
1006     case 'n':
1007       params->integer = atoi((*argvp)[0]);
1008       break;
1009   }
1010   ++(*argvp);
1011   return 0;
1012 }
1013 
main(int argc,char ** argv)1014 int main(int argc, char** argv)
1015 {
1016   int opts = global_options(&argc, &argv);
1017 
1018   struct test_params params;
1019   unsigned long long data[8];
1020   int data_len;
1021 
1022 #ifdef HAVE_LIBPAPI
1023   int papi_counters = PAPI_num_counters();
1024   if (papi_counters < papi_array_len)
1025   {
1026     fprintf(stderr, "%s: Warning: there are only %d hardware counters available!\n", progname, papi_counters);
1027     papi_array_len = papi_counters;
1028   }
1029   int res = PAPI_start_counters((int*)papi_events, papi_array_len);
1030   switch(res)
1031   {
1032     case 0:
1033     {
1034       long long* tmp = (long long*)malloc(papi_array_len * sizeof(long long));
1035       PAPI_stop_counters(tmp, papi_array_len);
1036       free(tmp);
1037       break;
1038     }
1039     case PAPI_ECNFLCT:
1040     {
1041       fprintf(stderr, "%s: %s: Conflicting event: The underlying counter hardware cannot count the specified events simultaneously.\n", progname, papi_event_name(papi_events[papi_array_len - 1]));
1042       fprintf(stderr, "Run `papi_event_chooser PRESET");
1043       for (int nv = 0; nv < papi_array_len - 1; ++nv)
1044 	fprintf(stderr, " %s", papi_event_name(papi_events[nv]));
1045       fprintf(stderr, "` to get a list of possible events that can be added.\n");
1046       break;
1047     }
1048     case PAPI_ENOEVNT:
1049     {
1050       for (int nv = 0; nv < papi_array_len; ++nv)
1051 	if ((res = PAPI_query_event(papi_events[nv])) != PAPI_OK)
1052 	{
1053 	  fprintf(stderr, "%s: PAPI_start_counters: %s: %s.\n", progname, papi_event_name(papi_events[nv]), PAPI_strerror(res));
1054 	  break;
1055 	}
1056       break;
1057     }
1058     case PAPI_ESYS:
1059       fprintf(stderr, "%s: PAPI_start_counters: %s\n", progname, strerror(errno));
1060       break;
1061     default:
1062       fprintf(stderr, "%s: PAPI_start_counters: %s.\n", progname, PAPI_strerror(res));
1063       break;
1064   }
1065   if (res)
1066     return 1;
1067   for (int nv = 0; nv <= papi_array_len; ++nv)
1068     loop_calibration[nv] = 100000000;
1069   params.count = 1;
1070   data_len = papi_array_len + 1;
1071   for (int i = 0; i < 100; ++i)
1072     run_bench_nothing((void*)&params, data, &data_len);
1073 #endif
1074 
1075   int f;
1076   int found = 0;
1077 
1078   params.rows = 0;
1079   params.cols = 0;
1080   params.wrds = 0;
1081   params.cutoff = -1;
1082 
1083   if (argc >= 2)
1084   {
1085     params.funcname = argv[1];
1086 
1087     for (f = 0; f < sizeof(function_mapper) / sizeof(function_mapper[0]); ++f)
1088     {
1089       if (strcmp(params.funcname, function_mapper[f].funcname) == 0)
1090       {
1091 	found = 1;
1092 	break;
1093       }
1094     }
1095   }
1096   if (!found)
1097   {
1098     if (argc >= 2)
1099       fprintf(stderr, "%s: function name \"%s\" not found.\n", progname, params.funcname);
1100     else
1101     {
1102       fprintf(stderr, "Usage: %s [OPTIONS] <funcname> [ARGS]\n", progname);
1103       bench_print_global_options(stderr);
1104     }
1105     fprintf(stderr, "Possible values for <funcname>:\n");
1106     for (f = 0; f < sizeof(function_mapper) / sizeof(function_mapper[0]); ++f)
1107     {
1108       if (f != 0 && f % 4 == 0)
1109 	fprintf(stderr, "\n");
1110       fprintf(stderr, "%-22s", function_mapper[f].funcname);
1111     }
1112     fprintf(stderr, "\n");
1113     return 1;
1114   }
1115 
1116   argc -= 2;	// argc >= 1 if more arguments.
1117   argv += 2;	// Next argument in argv[0]
1118   char* input_codes = strdup(function_mapper[f].input_codes);
1119   char* input_code[10];
1120   char* p = input_codes;
1121   int codes = 0;
1122   while(*p)
1123   {
1124     input_code[codes++] = p++;
1125     while(*p && *p != ',')
1126       ++p;
1127     if (*p == ',')
1128       *p++ = '\0';
1129   }
1130   int saw_var[4];
1131   for (int var_index = 0; var_index < 4; ++var_index)
1132     saw_var[var_index] = 0;
1133   int saw_vars = 0;
1134   char usage[64];
1135   char* usage_ptr = usage;
1136   int error = 0;
1137   for (int c = 0; ; ++c)
1138   {
1139     if (c < codes)
1140     {
1141       p = input_code[c];
1142       if (isupper(*p))
1143       {
1144 	while(*++p)
1145 	{
1146 	  if (*p != '1')
1147 	  {
1148 	    int var_index = *p - 'k';
1149 	    assert(var_index >= 0 && var_index <= 3); // 'k', 'l', 'm' or 'n'.
1150 	    saw_var[var_index] = 1;
1151 	    saw_vars = 1;
1152 	  }
1153 	}
1154 	continue;
1155       }
1156     }
1157     if (saw_vars)
1158     {
1159       saw_vars = 0;
1160       for (int var_count = 2; var_count < 6; ++var_count)
1161       {
1162 	int var_index = var_count % 4;
1163 	if (saw_var[var_index] == 1)
1164 	{
1165 	  *usage_ptr++ = ' ';
1166 	  *usage_ptr++ = 'k' + var_index;
1167 	  saw_var[var_index] = 2;
1168 	  if (!error && decode_size('k' + var_index, &params, &argc, &argv))
1169 	    error = 1;
1170 	}
1171       }
1172     }
1173     if (c == codes)
1174       break;
1175     if (p[1] == 'i')
1176     {
1177       *usage_ptr++ = ' ';
1178       *usage_ptr++ = *p;
1179       switch(*p)
1180       {
1181 	case 'r':
1182 	  *usage_ptr++ = '1' + params.rows;
1183 	  if (error) ++params.rows;
1184 	  break;
1185 	case 'c':
1186 	  *usage_ptr++ = '1' + params.cols;
1187 	  if (error) ++params.cols;
1188 	  break;
1189 	case 'w':
1190 	  *usage_ptr++ = '1' + params.wrds;
1191 	  if (error) ++params.wrds;
1192 	  break;
1193       }
1194       if (!error && decode_index(*p, &params, &argc, &argv))
1195 	error = 1;
1196     }
1197     else
1198     {
1199       *usage_ptr++ = ' ';
1200       *usage_ptr++ = *p;
1201       if (!error && decode_code(*p, &params, &argc, &argv))
1202 	error = 1;
1203     }
1204   }
1205   *usage_ptr = '\0';
1206   if (argc != 0)
1207     error = 1;
1208   if (error)
1209   {
1210     if (argc != 0)
1211       fprintf(stderr, "%s %s: too many parameters.\n", progname, params.funcname);
1212     fprintf(stderr, "Usage: %s [OPTIONS] %s%s\n", progname, params.funcname, usage);
1213     if (opts <= 0)
1214       bench_print_global_options(stderr);
1215     return 1;
1216   }
1217 
1218   double cost = complexity(&params, function_mapper[f].complexity_code);
1219   params.count = bench_count ? bench_count : function_mapper[f].count / cost;
1220   if (params.count < 1)
1221     params.count = 1;
1222   bench_count = params.count;
1223 
1224   srandom(17);
1225 
1226   data_len = run_bench(function_mapper[f].run_func, (void*)&params, data, sizeof(data) / sizeof(unsigned long long));
1227 
1228   printf("function: %s, count: %" PRId64 ", ", params.funcname, params.count);
1229   if (saw_var[2])
1230     printf("m: %d, ", params.m);
1231   if (saw_var[3])
1232     printf("n: %d, ", params.n);
1233   if (saw_var[0])
1234     printf("k: %d, ", params.k);
1235   if (saw_var[1])
1236     printf("l: %d, ", params.l);
1237   for (int i = 0; i < 3; ++i)
1238   {
1239     if (i < params.rows)
1240       printf("row%c: %d, ", 'a' + i, params.row[i]);
1241     if (i < params.cols)
1242       printf("col%c: %d, ", 'a' + i, params.col[i]);
1243     if (i < params.wrds)
1244       printf("word%c: %d, ", 'a' + i, params.wrd[i]);
1245   }
1246   if (params.cutoff != -1)
1247     printf("cutoff: %d, ", params.cutoff);
1248   print_wall_time(data[0] / 1000000.0 / params.count);
1249   printf(", cpu cycles: %llu", (data[1] + params.count / 2) / params.count);
1250 #ifndef HAVE_LIBPAPI
1251   printf(", cc/");
1252   print_complexity_human(&params, function_mapper[f].complexity_code);
1253   printf(": %f\n", data[1] / (params.count * cost));
1254 #else
1255   printf("\n");
1256   for (int n = 1; n < data_len; ++n)
1257   {
1258     printf("%s (%f) per bit (divided by ", papi_event_name(papi_events[n - 1]), (double)data[n] / params.count);
1259     print_complexity_human(&params, function_mapper[f].complexity_code);
1260     printf("): %f\n", data[n] / (params.count * cost));
1261   }
1262 #endif
1263 }
1264