1 #include "cado.h" // IWYU pragma: keep
2
3 /* This program is the simplest interface to the bare matrix
4 * multiplication routine. It's meant to provide an easy way of benching,
5 * and comparing, different matrix product implementations.
6 *
7 * It must be used with the INNER matrix, not the .info one. So either use
8 * the vanilla cado format (before balance), or one of the .h<i>.v<j>
9 * matrices as output by balance.
10 */
11
12 #include <stdio.h>
13 #include <stdint.h> // for uint64_t
14 #include <stdlib.h>
15 #include <time.h>
16 #include <errno.h>
17 #include <limits.h>
18 #include <inttypes.h>
19 #include <string.h>
20 #include <sys/time.h>
21 #include <pthread.h> // for pthread_mutex_lock, pthread_mutex_unlock
22 #include <gmp.h>
23
24 #include "raw_matrix_u32.h" // for matrix_u32
25 #include "cheating_vec_init.h"
26 #include "crc.h" // cado_crc_lfsr
27 #include "macros.h"
28 #include "matmul-mf.h"
29 #include "matmul.h"
30 #include "mpfq/mpfq.h"
31 #include "mpfq/mpfq_vbase.h"
32 #include "params.h"
33 #include "portability.h" // asprintf // IWYU pragma: keep
34 #include "version_info.h" // cado_revision_string
35 #include "worker-threads.h"
36
/* Print the command-line synopsis on stderr and terminate with a
 * failure status. Called whenever argument parsing goes wrong. */
void usage()
{
    static const char synopsis[] =
        "Usage: ./bench [--impl <implementation>] [--tmax <time>] [--nmax <n_iter>] [--nchecks <number> | --nocheck] [-r|--rebuild] [-t|--transpose] [--nthreads <number>] [--cycles <frequency>] -- <file0> [<file1> ... ]\n";
    fputs(synopsis, stderr);
    exit(1);
}
43
44
/* Per-worker-thread state: each thread owns one matrix and a pair of
 * vectors sized for it (padded to the implementation's readahead, see
 * init_func). */
struct private_args {
    matmul_t mm;    /* matrix-multiplication engine for this thread's file */
    void * colvec;  /* input/output vector indexed by matrix columns */
    void * rowvec;  /* input/output vector indexed by matrix rows */
};
50
/* Global benchmark configuration, shared (read-only after setup in
 * main) by all worker threads. */
struct bench_args {
    mpfq_vbase xx;          /* arithmetic layer (field chosen from `prime`) */
    param_list pl;          /* parsed command-line parameters */
    const char * impl;      /* matmul implementation name (default "bucket") */
    int nthreads;           /* number of worker threads / matrix files */
    int transpose;          /* bench vector*matrix instead of matrix*vector */
    int nchecks;            /* number of consistency checks to run (0 = none) */
    int rebuild;            /* force rebuilding the matrix cache files */
    mpz_t prime;            /* field characteristic (2 = GF(2) default) */
    double freq;            /* clock frequency, for cycles-per-coeff output */
    char ** mfiles;         /* matrix file names, one per thread */
    const char * source_vec;/* optional input vector file (srcvec parameter) */
    struct private_args * p;        /* per-thread state array [nthreads] */
    struct worker_threads_group * tg; /* thread pool handle */
};
66
/* Worker-thread initialization for thread `tnum`: create the matmul
 * object for this thread's matrix file, reload its cache (or build and
 * save it when absent or --rebuild was given), then allocate zeroed
 * column/row vectors padded up to the implementation's readahead.
 *
 * tg->mu serializes the stderr progress messages across threads.
 *
 * Fix: the final "uses cache file" message passed the int `tnum` for a
 * "%u" conversion; a format/argument type mismatch is undefined
 * behavior, and every other message here uses "%d". */
void init_func(struct worker_threads_group * tg MAYBE_UNUSED, int tnum, struct bench_args * ba)/*{{{*/
{
    int t0 = time(NULL);
    struct private_args * p = ba->p + tnum;

    p->mm = matmul_init(ba->xx, 0, 0, ba->mfiles[tnum], ba->impl, ba->pl, !ba->transpose);

    fprintf(stderr, "Expect to find or create cache file %s\n", p->mm->cachefile_name);

    if (!ba->rebuild && matmul_reload_cache(p->mm)) {
        /* Cache hit: report the (wall-clock) reload time. */
        pthread_mutex_lock(&tg->mu);
        fprintf(stderr, "T%d Reusing cache file for %s\n",
                tnum, ba->mfiles[tnum]);
        fprintf(stderr, "T%d Cache load time %ds wct\n",
                tnum, (int) time(NULL) - t0);
        pthread_mutex_unlock(&tg->mu);
    } else {
        /* Cache miss (or forced rebuild): read the raw matrix, build
         * the in-memory structure, and save it for next time. */
        clock_t ct0 = clock();
        pthread_mutex_lock(&tg->mu);
        fprintf(stderr, "T%d Building cache file for %s\n",
                tnum, ba->mfiles[tnum]);
        pthread_mutex_unlock(&tg->mu);
        ASSERT_ALWAYS(p->mm->store_transposed == ba->transpose);
        matrix_u32 m;
        /* over GF(p) with p > 2 the matrix carries coefficients */
        int withcoeffs = mpz_cmp_ui(ba->prime, 2) > 0;
        mf_prepare_matrix_u32(p->mm, m, ba->mfiles[tnum], withcoeffs);
        matmul_build_cache(p->mm, m);
        pthread_mutex_lock(&tg->mu);
        fprintf(stderr, "T%d Cache build time %.2fs cpu\n",
                tnum, (double) (clock()-ct0) / CLOCKS_PER_SEC);
        fprintf(stderr, "T%d Saving cache file for %s\n",
                tnum, ba->mfiles[tnum]);
        pthread_mutex_unlock(&tg->mu);
        t0 = time(NULL);
        matmul_save_cache(p->mm);
        pthread_mutex_lock(&tg->mu);
        fprintf(stderr, "T%d Cache save time %ds wct\n",
                tnum, (int) time(NULL) - t0);
        pthread_mutex_unlock(&tg->mu);
        /* XXX ok, it's freakin ugly. We must really rethink this object. */
        if (m->mfile) free((void*) m->mfile);
    }

    pthread_mutex_lock(&tg->mu);
    fprintf(stderr, "T%d uses cache file %s\n",
            tnum,
            /* cache for mmt->locfile, */
            p->mm->cachefile_name);
    pthread_mutex_unlock(&tg->mu);

    /* Pad vector sizes up to what the implementation may read past the
     * nominal end (readahead), then allocate and zero both vectors. */
    unsigned int nr = p->mm->dim[0];
    unsigned int nc = p->mm->dim[1];
    matmul_aux(p->mm, MATMUL_AUX_GET_READAHEAD, &nr);
    matmul_aux(p->mm, MATMUL_AUX_GET_READAHEAD, &nc);

    cheating_vec_init(ba->xx, &p->colvec, nc);
    cheating_vec_init(ba->xx, &p->rowvec, nr);
    ba->xx->vec_set_zero(ba->xx, p->colvec, nc);
    ba->xx->vec_set_zero(ba->xx, p->rowvec, nr);
}/*}}}*/
127
/* Consistency check for thread tnum's matrix M: verifies the adjoint
 * identity  y . (M x) == (M^T y) . x  for the current (random) contents
 * of p->colvec (x) and p->rowvec (y). Both products go through
 * matmul_mul with opposite `direction` arguments, so a mismatch
 * indicates a bug in one of the two multiplication paths. Aborts the
 * whole process on failure. CRC32 digests of each vector are printed
 * so runs can be compared across implementations. */
void check_func(struct worker_threads_group * tg MAYBE_UNUSED, int tnum, struct bench_args * ba)/*{{{*/
{
    struct private_args * p = ba->p + tnum;
    mpfq_vbase_ptr A = ba->xx;

    /* nr0/nc0 are the true dimensions; nr/nc get padded up to the
     * implementation's readahead for allocation purposes. */
    unsigned int nr = p->mm->dim[0];
    unsigned int nc = p->mm->dim[1];
    unsigned int nr0 = nr;
    unsigned int nc0 = nc;
    matmul_aux(p->mm, MATMUL_AUX_GET_READAHEAD, &nr);
    matmul_aux(p->mm, MATMUL_AUX_GET_READAHEAD, &nc);

    /* Scratch copies of x and y; matmul_mul overwrites its destination,
     * so we keep the originals intact for the dot products below. */
    void * colvec_bis;
    void * rowvec_bis;
    cheating_vec_init(A, &colvec_bis, nc);
    cheating_vec_init(A, &rowvec_bis, nr);
    A->vec_set_zero(A, colvec_bis, nc);
    A->vec_set_zero(A, rowvec_bis, nr);

    /* check0/check1 hold one simd group each: the two sides of the
     * adjoint identity. */
    void * check0;
    void * check1;
    cheating_vec_init(A, &check0, A->simd_groupsize(A));
    cheating_vec_init(A, &check1, A->simd_groupsize(A));

    A->vec_set(A, colvec_bis, p->colvec, nc);
    A->vec_set(A, rowvec_bis, p->rowvec, nr);

    /* See the comment in matmul_mul about the direction argument and the
     * number of coordinates of source/destination vectors */

    printf("T%d colvec(%u): %08" PRIx32 "\n", tnum,
            nc, crc32((unsigned long*) p->colvec, A->vec_elt_stride(A, nc0)));
    /* rowvec <- M * colvec (direction 1) */
    matmul_mul(p->mm, p->rowvec, p->colvec, 1);
    printf("T%d rowvec(%u): %08" PRIx32 "\n", tnum,
            nr, crc32((unsigned long*) p->rowvec, A->vec_elt_stride(A, nr0)));

    /* check0 = (M x) . y, using the saved copy of y */
    A->vec_set_zero(A, check0, A->simd_groupsize(A));
    A->add_dotprod(A, check0, p->rowvec, rowvec_bis, nr0);

    printf("T%d rowvec_bis(%u): %08" PRIx32 "\n", tnum,
            nr, crc32((unsigned long*) rowvec_bis, A->vec_elt_stride(A, nr0)));
    /* colvec_bis <- M^T * rowvec_bis (direction 0) */
    matmul_mul(p->mm, colvec_bis, rowvec_bis, 0);
    printf("T%d colvec_bis(%u): %08" PRIx32 "\n", tnum,
            nc, crc32((unsigned long*) colvec_bis, A->vec_elt_stride(A, nc0)));

    /* check1 = x . (M^T y), using the saved copy of x */
    A->vec_set_zero(A, check1, A->simd_groupsize(A));
    A->add_dotprod(A, check1, p->colvec, colvec_bis, nc0);

    if (A->vec_cmp(A, check0, check1, A->simd_groupsize(A)) != 0) {
        pthread_mutex_lock(&tg->mu);
        fprintf(stderr, "T%d : Check failed\n", tnum);
        pthread_mutex_unlock(&tg->mu);
        abort();
    }

    cheating_vec_clear(A, &colvec_bis, nc);
    cheating_vec_clear(A, &rowvec_bis, nr);
    cheating_vec_clear(A, &check0, A->simd_groupsize(A));
    cheating_vec_clear(A, &check1, A->simd_groupsize(A));


    /* don't let the check multiplications pollute the benchmark stats */
    matmul_aux(p->mm, MATMUL_AUX_ZERO_STATS);
}/*}}}*/
191
/* One benchmark iteration for thread tnum: a single matrix-times-vector
 * product. With --transpose the roles of the row and column vectors are
 * swapped, and the direction flag passed to matmul_mul flips with them. */
void mul_func(struct worker_threads_group * tg MAYBE_UNUSED, int tnum, struct bench_args * ba)/*{{{*/
{
    struct private_args * p = ba->p + tnum;
    void * out;
    void * in;
    if (ba->transpose) {
        out = p->colvec;
        in = p->rowvec;
    } else {
        out = p->rowvec;
        in = p->colvec;
    }
    matmul_mul(p->mm, out, in, !ba->transpose);
}/*}}}*/
199
/* Worker-thread teardown: print the per-matrix timing report (stdout
 * access serialized by tg->mu), then release the matmul object and the
 * two vectors allocated in init_func. */
void clear_func(struct worker_threads_group * tg MAYBE_UNUSED, int tnum, struct bench_args * ba)/*{{{*/
{
    struct private_args * p = ba->p + tnum;
    mpfq_vbase_ptr A = ba->xx;
    /* grab the dimensions before matmul_clear() tears mm down */
    unsigned int nrows = p->mm->dim[0];
    unsigned int ncols = p->mm->dim[1];
    pthread_mutex_lock(&tg->mu);
    matmul_report(p->mm, ba->freq);
    printf("\n");
    pthread_mutex_unlock(&tg->mu);
    matmul_clear(p->mm);
    cheating_vec_clear(A, &p->colvec, ncols);
    cheating_vec_clear(A, &p->rowvec, nrows);
}/*}}}*/
214
banner(int argc,char * argv[])215 void banner(int argc, char * argv[])
216 {
217 /* print command line */
218 fprintf (stderr, "# (%s) %s", cado_revision_string, (argv)[0]);
219 for (int i = 1; i < (argc); i++)
220 fprintf (stderr, " %s", (argv)[i]);
221 fprintf (stderr, "\n");
222
223 #ifdef __GNUC__
224 fprintf(stderr, "# Compiled with gcc " __VERSION__ "\n");
225 #endif
226 fprintf(stderr, "# Compilation flags " CFLAGS "\n");
227 }
228
main(int argc,char * argv[])229 int main(int argc, char * argv[])
230 {
231 struct bench_args ba[1];
232
233 setbuf(stdout, NULL);
234 setbuf(stderr, NULL);
235
236 banner(argc, argv);
237
238 memset(ba, 0, sizeof(ba));
239
240 ba->impl = "bucket";
241 char * file = NULL;
242 int nocheck = 0;
243 ba->nchecks = 4;
244 ba->nthreads = 1;
245 ba->freq = 1;
246 mpz_init_set_ui(ba->prime, 2);
247
248 /* {{{ */
249 param_list_init(ba->pl);
250 argv++,argc--;
251 param_list_configure_switch(ba->pl, "--transpose", &ba->transpose);
252 param_list_configure_switch(ba->pl, "--rebuild", &ba->rebuild);
253 param_list_configure_switch(ba->pl, "--nocheck", &nocheck);
254 param_list_configure_alias(ba->pl, "--transpose", "-t");
255 param_list_configure_alias(ba->pl, "--rebuild", "-r");
256
257 int wild = 0;
258 for( ; argc ; ) {
259 if (param_list_update_cmdline(ba->pl, &argc, &argv)) { continue; }
260 if (strcmp(argv[0],"--") == 0) {
261 argv++, argc--;
262 break;
263 }
264 if (argv[0][0] != '-' && wild == 0) {
265 file = argv[0];
266 argv++,argc--;
267 wild++;
268 continue;
269 }
270 fprintf (stderr, "Unknown option: %s\n", argv[0]);
271 usage();
272 }
273
274 unsigned int nmax = UINT_MAX;
275 param_list_parse_mpz(ba->pl, "prime", ba->prime);
276 param_list_parse_uint(ba->pl, "nmax", &nmax);
277 param_list_parse_int(ba->pl, "nthreads", &ba->nthreads);
278 const char * unit;
279 if (param_list_parse_double(ba->pl, "cycles", &ba->freq)) {
280 unit = "cy/c";
281 } else {
282 unit = "ns/c";
283 }
284
285 if (ba->nthreads == 1) {
286 if (file) {
287 ba->mfiles = &file;
288 } else {
289 if (argc) {
290 ba->mfiles = argv;
291 } else {
292 fprintf(stderr, "No file !\n");
293 exit(1);
294 }
295 }
296 } else {
297 ba->mfiles = argv;
298 if (file) {
299 fprintf(stderr, "When using --nthreads, please specify files after a terminating --\n");
300 exit(1);
301 } else if (argc != ba->nthreads) {
302 fprintf(stderr, "%u threads requested, but %u files given on the command line.\n", ba->nthreads, argc);
303 exit(1);
304 }
305 }
306
307 param_list_lookup_string(ba->pl, "matmul_bucket_methods");
308 param_list_lookup_string(ba->pl, "l1_cache_size");
309 param_list_lookup_string(ba->pl, "l2_cache_size");
310
311 double tmax = 100.0;
312 param_list_parse_double(ba->pl, "tmax", &tmax);
313 param_list_parse_int(ba->pl, "nchecks", &ba->nchecks);
314 if (nocheck) ba->nchecks = 0;
315
316 const char * tmp;
317 if ((tmp = param_list_lookup_string(ba->pl, "impl")) != NULL) {
318 ba->impl = tmp;
319 }
320
321 unsigned int nbys = 64;
322
323 mpfq_vbase_ptr A = ba->xx;
324
325 /* may leave nbys at the default value ! */
326 param_list_parse_uint(ba->pl, "nbys", &nbys);
327
328 /* The api mandates that we set the desired value for nbys. Here,
329 * that's not really our intent, since we really want to bench
330 * the layer in its favorite working context. Most of the time,
331 * setting nbys is pointless.
332 */
333 mpfq_vbase_oo_field_init_byfeatures(A,
334 MPFQ_PRIME_MPZ, ba->prime,
335 MPFQ_SIMD_GROUPSIZE, nbys,
336 MPFQ_DONE);
337
338 param_list_lookup_string(ba->pl, "srcvec");
339 if (param_list_warn_unused(ba->pl)) {
340 usage();
341 }/*}}}*/
342
343 ba->p = malloc(ba->nthreads * sizeof(struct private_args));
344 ba->tg = worker_threads_init(ba->nthreads);
345 fprintf(stderr, "Using implementation \"%s\"\n", ba->impl);
346 worker_threads_do(ba->tg, (worker_func_t) &init_func, ba);
347
348 /* {{{ display some info */
349 uint64_t ncoeffs_total = 0;
350 for(int tnum = 0 ; tnum < ba->nthreads ; tnum++) {
351 struct private_args * p = ba->p + tnum;
352 fprintf (stderr, "T%d %s: %u rows %u cols %" PRIu64 " coeffs\n",
353 tnum, ba->mfiles[tnum],
354 p->mm->dim[0],
355 p->mm->dim[1],
356 p->mm->ncoeffs);
357 ncoeffs_total += p->mm->ncoeffs;
358 }
359 fprintf (stderr, "total %" PRIu64 " coeffs\n", ncoeffs_total);
360 /* }}} */
361
362 gmp_randstate_t rstate;
363 gmp_randinit_default(rstate);
364 gmp_randseed_ui(rstate, 1);
365
366 /* {{{ Do checks if requested */
367 for(int t = 0; t < ba->nchecks ; t++) {
368 /* create deterministic test values */
369 for(int tnum = 0 ; tnum < ba->nthreads ; tnum++) {
370 struct private_args * p = ba->p + tnum;
371 A->vec_random(A, p->colvec, p->mm->dim[1], rstate);
372 A->vec_random(A, p->rowvec, p->mm->dim[0], rstate);
373 /* If we want shared vectors, this is the way to go. */
374 /* Note that for such a test, the clear_func must be skipped
375 * or improved, since we don't really want to free() the same
376 * pointer twice */
377 /*
378 if (ba->transpose)
379 p->rowvec = ba->p[0].rowvec;
380 else
381 p->colvec = ba->p[0].colvec;
382 */
383 }
384 worker_threads_do(ba->tg, (worker_func_t) &check_func, ba);
385 fprintf(stderr, "Check %d ok\n", t);
386 }
387 if (ba->nchecks)
388 printf("All %d checks passed\n", ba->nchecks);
389 /* }}} */
390
391 for(int tnum = 0 ; tnum < ba->nthreads ; tnum++) {
392 struct private_args * p = ba->p + tnum;
393 A->vec_random(A, p->colvec, p->mm->dim[1], rstate);
394 A->vec_random(A, p->rowvec, p->mm->dim[0], rstate);
395 }
396
397 if ((tmp = param_list_lookup_string(ba->pl, "srcvec")) != NULL) {
398 struct private_args * p = ba->p;
399 if (ba->nthreads > 1) {
400 fprintf(stderr, "srcvec incompatible with multithread\n");
401 exit(1);
402 }
403 A->vec_set_zero(A, p->colvec, p->mm->dim[1]);
404 A->vec_set_zero(A, p->rowvec, p->mm->dim[0]);
405 FILE * f = fopen(tmp, "rb");
406 if (f == NULL) {
407 fprintf(stderr, "fopen(%s): %s\n", tmp, strerror(errno));
408 exit(1);
409 }
410 /* with ba->transpose, we're doing vector times matrix (rowvec
411 * times matrix -> colvec) */
412 void * srcvec = ba->transpose ? p->rowvec : p->colvec;
413 void * dstvec = ba->transpose ? p->colvec : p->rowvec;
414 int n = p->mm->dim[1 ^ ba->transpose];
415 fprintf(stderr, "reading %zu bytes from %s\n",
416 n * sizeof(uint64_t), tmp);
417 int nread = fread(srcvec, sizeof(uint64_t), n, f);
418 if (nread != n) {
419 fprintf(stderr, "short read (%d < %d)\n", nread, n);
420 exit(1);
421 }
422 fclose(f);
423 worker_threads_do(ba->tg, (worker_func_t) &mul_func, ba);
424 char * dstfile;
425 int rc = asprintf(&dstfile, "%s.dst", tmp);
426 ASSERT_ALWAYS(rc >= 0);
427 f = fopen(dstfile, "wb");
428 int nw = p->mm->dim[0 ^ ba->transpose];
429 fprintf(stderr, "writing %zu bytes to %s\n",
430 nw * sizeof(uint64_t), dstfile);
431 int nwritten = fwrite(dstvec, sizeof(uint64_t), nw, f);
432 if (nwritten != nw) {
433 fprintf(stderr, "short write (%d < %d)\n", nwritten, nw);
434 exit(1);
435 }
436 fclose(f);
437 fprintf(stderr, "Saved [%s]%s * [%s] to [%s]\n",
438 p->mm->cachefile_name,
439 ba->transpose ? "^T" : "",
440 tmp,
441 dstfile);
442 free(dstvec);
443 }
444 gmp_randclear(rstate);
445
446 #define NLAST 10
447 double last[10]={0,};
448 double sum_last = 0;
449
450 clock_t t0 = clock();
451 double next = 0.25 * CLOCKS_PER_SEC;
452 double t1 = 0;
453 double t, dt;
454 if (next > tmax * CLOCKS_PER_SEC) { next = tmax * CLOCKS_PER_SEC; }
455 unsigned int n;
456 printf("Note: timings in seconds per iterations are given in cpu seconds ;\n");
457 printf("Note: with %d threads, this means %.2f walltime seconds.\n", ba->nthreads, 1.0 / ba->nthreads);
458 for(n = 0 ; n < nmax ; ) {
459 worker_threads_do(ba->tg, (worker_func_t) &mul_func, ba);
460 t = clock();
461 dt = t - t0;
462 sum_last -= last[n % NLAST];
463 last[n % NLAST] = (t - t1) / CLOCKS_PER_SEC;
464 sum_last += (t - t1) / CLOCKS_PER_SEC;
465 t1 = t;
466 unsigned int nlast = NLAST;
467 n++;
468 if (n < nlast) nlast = n;
469 if (dt > next || n == nmax - 1) {
470 do { next += 0.25 * CLOCKS_PER_SEC; } while (dt > next);
471 if (next > tmax * CLOCKS_PER_SEC) { next = tmax * CLOCKS_PER_SEC; }
472 dt /= CLOCKS_PER_SEC;
473 printf("%d iters in %2.fs, %.3f/1, %.2f %s (last %u : %.3f/1, %.2f %s) \r",
474 n, dt, dt/n, ba->freq * 1.0e9 * dt/n/ncoeffs_total, unit,
475 nlast, sum_last/nlast, ba->freq * 1.0e9 *sum_last/nlast/ncoeffs_total, unit);
476 fflush(stdout);
477 if (dt > tmax)
478 break;
479 }
480 }
481 printf("\n");
482 // printf("Scanned %lu coeffs in total\n", n * ncoeffs_total);
483
484 worker_threads_do(ba->tg, (worker_func_t) &clear_func, ba);
485 worker_threads_clear(ba->tg);
486 param_list_clear(ba->pl);
487 free(ba->p);
488 mpz_clear(ba->prime);
489
490 return 0;
491 }
492