1 #include "cado.h" // IWYU pragma: keep
2 
3 /* This program is the simplest interface to the bare matrix
4  * multiplication routine. It's meant to provide an easy way of benching,
5  * and comparing, different matrix product implementations.
6  *
7  * It must be used with the INNER matrix, not the .info one. So either use
8  * the vanilla cado format (before balance), or one of the .h<i>.v<j>
9  * matrices as output by balance.
10  */
11 
12 #include <stdio.h>
13 #include <stdint.h>             // for uint64_t
14 #include <stdlib.h>
15 #include <time.h>
16 #include <errno.h>
17 #include <limits.h>
18 #include <inttypes.h>
19 #include <string.h>
20 #include <sys/time.h>
21 #include <pthread.h>            // for pthread_mutex_lock, pthread_mutex_unlock
22 #include <gmp.h>
23 
24 #include "raw_matrix_u32.h"     // for matrix_u32
25 #include "cheating_vec_init.h"
26 #include "crc.h"        // cado_crc_lfsr
27 #include "macros.h"
28 #include "matmul-mf.h"
29 #include "matmul.h"
30 #include "mpfq/mpfq.h"
31 #include "mpfq/mpfq_vbase.h"
32 #include "params.h"
33 #include "portability.h" // asprintf // IWYU pragma: keep
34 #include "version_info.h" // cado_revision_string
35 #include "worker-threads.h"
36 
usage()37 void usage()
38 {
39     fprintf(stderr,
40             "Usage: ./bench [--impl <implementation>] [--tmax <time>] [--nmax <n_iter>] [--nchecks <number> | --nocheck] [-r|--rebuild] [-t|--transpose] [--nthreads <number>] [--cycles <frequency>] -- <file0> [<file1> ... ]\n");
41     exit(1);
42 }
43 
44 
45 struct private_args {
46     matmul_t mm;
47     void * colvec;
48     void * rowvec;
49 };
50 
51 struct bench_args {
52     mpfq_vbase xx;
53     param_list pl;
54     const char * impl;
55     int nthreads;
56     int transpose;
57     int nchecks;
58     int rebuild;
59     mpz_t prime;
60     double freq;
61     char ** mfiles;
62     const char * source_vec;
63     struct private_args * p;
64     struct worker_threads_group * tg;
65 };
66 
init_func(struct worker_threads_group * tg MAYBE_UNUSED,int tnum,struct bench_args * ba)67 void init_func(struct worker_threads_group * tg MAYBE_UNUSED, int tnum, struct bench_args * ba)/*{{{*/
68 {
69     int t0 = time(NULL);
70     struct private_args * p = ba->p + tnum;
71 
72     p->mm = matmul_init(ba->xx, 0, 0, ba->mfiles[tnum], ba->impl, ba->pl, !ba->transpose);
73 
74     fprintf(stderr, "Expect to find or create cache file %s\n", p->mm->cachefile_name);
75 
76     if (!ba->rebuild && matmul_reload_cache(p->mm)) {
77         pthread_mutex_lock(&tg->mu);
78         fprintf(stderr, "T%d Reusing cache file for %s\n",
79                 tnum, ba->mfiles[tnum]);
80         fprintf(stderr, "T%d Cache load time %ds wct\n",
81                 tnum, (int) time(NULL) - t0);
82         pthread_mutex_unlock(&tg->mu);
83     } else {
84         clock_t ct0 = clock();
85         pthread_mutex_lock(&tg->mu);
86         fprintf(stderr, "T%d Building cache file for %s\n",
87                 tnum, ba->mfiles[tnum]);
88         pthread_mutex_unlock(&tg->mu);
89         ASSERT_ALWAYS(p->mm->store_transposed == ba->transpose);
90         matrix_u32 m;
91         int withcoeffs = mpz_cmp_ui(ba->prime, 2) > 0;
92         mf_prepare_matrix_u32(p->mm, m, ba->mfiles[tnum], withcoeffs);
93         matmul_build_cache(p->mm, m);
94         pthread_mutex_lock(&tg->mu);
95         fprintf(stderr, "T%d Cache build time %.2fs cpu\n",
96                 tnum, (double) (clock()-ct0) / CLOCKS_PER_SEC);
97         fprintf(stderr, "T%d Saving cache file for %s\n",
98                 tnum, ba->mfiles[tnum]);
99         pthread_mutex_unlock(&tg->mu);
100         t0 = time(NULL);
101         matmul_save_cache(p->mm);
102         pthread_mutex_lock(&tg->mu);
103         fprintf(stderr, "T%d Cache save time %ds wct\n",
104                 tnum, (int) time(NULL) - t0);
105         pthread_mutex_unlock(&tg->mu);
106         /* XXX ok, it's freakin ugly. We must really rethink this object. */
107         if (m->mfile) free((void*) m->mfile);
108     }
109 
110     pthread_mutex_lock(&tg->mu);
111     fprintf(stderr, "T%u uses cache file %s\n",
112             tnum,
113             /* cache for mmt->locfile, */
114             p->mm->cachefile_name);
115     pthread_mutex_unlock(&tg->mu);
116 
117     unsigned int nr = p->mm->dim[0];
118     unsigned int nc = p->mm->dim[1];
119     matmul_aux(p->mm, MATMUL_AUX_GET_READAHEAD, &nr);
120     matmul_aux(p->mm, MATMUL_AUX_GET_READAHEAD, &nc);
121 
122     cheating_vec_init(ba->xx, &p->colvec, nc);
123     cheating_vec_init(ba->xx, &p->rowvec, nr);
124     ba->xx->vec_set_zero(ba->xx, p->colvec, nc);
125     ba->xx->vec_set_zero(ba->xx, p->rowvec, nr);
126 }/*}}}*/
127 
check_func(struct worker_threads_group * tg MAYBE_UNUSED,int tnum,struct bench_args * ba)128 void check_func(struct worker_threads_group * tg MAYBE_UNUSED, int tnum, struct bench_args * ba)/*{{{*/
129 {
130     struct private_args * p = ba->p + tnum;
131     mpfq_vbase_ptr A = ba->xx;
132 
133     unsigned int nr = p->mm->dim[0];
134     unsigned int nc = p->mm->dim[1];
135     unsigned int nr0 = nr;
136     unsigned int nc0 = nc;
137     matmul_aux(p->mm, MATMUL_AUX_GET_READAHEAD, &nr);
138     matmul_aux(p->mm, MATMUL_AUX_GET_READAHEAD, &nc);
139 
140     void * colvec_bis;
141     void * rowvec_bis;
142     cheating_vec_init(A, &colvec_bis, nc);
143     cheating_vec_init(A, &rowvec_bis, nr);
144     A->vec_set_zero(A, colvec_bis, nc);
145     A->vec_set_zero(A, rowvec_bis, nr);
146 
147     void * check0;
148     void * check1;
149     cheating_vec_init(A, &check0, A->simd_groupsize(A));
150     cheating_vec_init(A, &check1, A->simd_groupsize(A));
151 
152     A->vec_set(A, colvec_bis, p->colvec, nc);
153     A->vec_set(A, rowvec_bis, p->rowvec, nr);
154 
155     /* See the comment in matmul_mul about the direction argument and the
156      * number of coordinates of source/destination vectors */
157 
158     printf("T%d colvec(%u): %08" PRIx32 "\n", tnum,
159             nc, crc32((unsigned long*) p->colvec, A->vec_elt_stride(A, nc0)));
160     matmul_mul(p->mm, p->rowvec, p->colvec, 1);
161     printf("T%d rowvec(%u): %08" PRIx32 "\n", tnum,
162             nr, crc32((unsigned long*) p->rowvec, A->vec_elt_stride(A, nr0)));
163 
164     A->vec_set_zero(A, check0, A->simd_groupsize(A));
165     A->add_dotprod(A, check0, p->rowvec, rowvec_bis, nr0);
166 
167     printf("T%d rowvec_bis(%u): %08" PRIx32 "\n", tnum,
168             nr, crc32((unsigned long*) rowvec_bis, A->vec_elt_stride(A, nr0)));
169     matmul_mul(p->mm, colvec_bis, rowvec_bis, 0);
170     printf("T%d colvec_bis(%u): %08" PRIx32 "\n", tnum,
171             nc, crc32((unsigned long*) colvec_bis, A->vec_elt_stride(A, nc0)));
172 
173     A->vec_set_zero(A, check1, A->simd_groupsize(A));
174     A->add_dotprod(A, check1, p->colvec, colvec_bis, nc0);
175 
176     if (A->vec_cmp(A, check0, check1, A->simd_groupsize(A)) != 0) {
177         pthread_mutex_lock(&tg->mu);
178         fprintf(stderr, "T%d : Check failed\n", tnum);
179         pthread_mutex_unlock(&tg->mu);
180         abort();
181     }
182 
183     cheating_vec_clear(A, &colvec_bis, nc);
184     cheating_vec_clear(A, &rowvec_bis, nr);
185     cheating_vec_clear(A, &check0, A->simd_groupsize(A));
186     cheating_vec_clear(A, &check1, A->simd_groupsize(A));
187 
188 
189     matmul_aux(p->mm, MATMUL_AUX_ZERO_STATS);
190 }/*}}}*/
191 
mul_func(struct worker_threads_group * tg MAYBE_UNUSED,int tnum,struct bench_args * ba)192 void mul_func(struct worker_threads_group * tg MAYBE_UNUSED, int tnum, struct bench_args * ba)/*{{{*/
193 {
194     struct private_args * p = ba->p + tnum;
195     void * dstvec = ba->transpose ? p->colvec : p->rowvec;
196     void * srcvec = ba->transpose ? p->rowvec : p->colvec;
197     matmul_mul(p->mm, dstvec, srcvec, !ba->transpose);
198 }/*}}}*/
199 
clear_func(struct worker_threads_group * tg MAYBE_UNUSED,int tnum,struct bench_args * ba)200 void clear_func(struct worker_threads_group * tg MAYBE_UNUSED, int tnum, struct bench_args * ba)/*{{{*/
201 {
202     struct private_args * p = ba->p + tnum;
203     mpfq_vbase_ptr A = ba->xx;
204     unsigned int nr = p->mm->dim[0];
205     unsigned int nc = p->mm->dim[1];
206     pthread_mutex_lock(&tg->mu);
207     matmul_report(p->mm, ba->freq);
208     printf("\n");
209     pthread_mutex_unlock(&tg->mu);
210     matmul_clear(p->mm);
211     cheating_vec_clear(A, &p->colvec, nc);
212     cheating_vec_clear(A, &p->rowvec, nr);
213 }/*}}}*/
214 
banner(int argc,char * argv[])215 void banner(int argc, char * argv[])
216 {
217     /* print command line */
218     fprintf (stderr, "# (%s) %s", cado_revision_string, (argv)[0]);
219     for (int i = 1; i < (argc); i++)
220         fprintf (stderr, " %s", (argv)[i]);
221     fprintf (stderr, "\n");
222 
223 #ifdef  __GNUC__
224     fprintf(stderr, "# Compiled with gcc " __VERSION__ "\n");
225 #endif
226     fprintf(stderr, "# Compilation flags " CFLAGS "\n");
227 }
228 
main(int argc,char * argv[])229 int main(int argc, char * argv[])
230 {
231     struct bench_args ba[1];
232 
233     setbuf(stdout, NULL);
234     setbuf(stderr, NULL);
235 
236     banner(argc, argv);
237 
238     memset(ba, 0, sizeof(ba));
239 
240     ba->impl = "bucket";
241     char * file = NULL;
242     int nocheck = 0;
243     ba->nchecks = 4;
244     ba->nthreads = 1;
245     ba->freq = 1;
246     mpz_init_set_ui(ba->prime, 2);
247 
248     /* {{{ */
249     param_list_init(ba->pl);
250     argv++,argc--;
251     param_list_configure_switch(ba->pl, "--transpose", &ba->transpose);
252     param_list_configure_switch(ba->pl, "--rebuild", &ba->rebuild);
253     param_list_configure_switch(ba->pl, "--nocheck", &nocheck);
254     param_list_configure_alias(ba->pl, "--transpose", "-t");
255     param_list_configure_alias(ba->pl, "--rebuild", "-r");
256 
257     int wild = 0;
258     for( ; argc ; ) {
259         if (param_list_update_cmdline(ba->pl, &argc, &argv)) { continue; }
260         if (strcmp(argv[0],"--") == 0) {
261             argv++, argc--;
262             break;
263         }
264         if (argv[0][0] != '-' && wild == 0) {
265             file = argv[0];
266             argv++,argc--;
267             wild++;
268             continue;
269         }
270         fprintf (stderr, "Unknown option: %s\n", argv[0]);
271         usage();
272     }
273 
274     unsigned int nmax = UINT_MAX;
275     param_list_parse_mpz(ba->pl, "prime", ba->prime);
276     param_list_parse_uint(ba->pl, "nmax", &nmax);
277     param_list_parse_int(ba->pl, "nthreads", &ba->nthreads);
278     const char * unit;
279     if (param_list_parse_double(ba->pl, "cycles", &ba->freq)) {
280         unit = "cy/c";
281     } else {
282         unit = "ns/c";
283     }
284 
285     if (ba->nthreads == 1) {
286         if (file) {
287             ba->mfiles = &file;
288         } else {
289             if (argc) {
290                 ba->mfiles = argv;
291             } else {
292                 fprintf(stderr, "No file !\n");
293                 exit(1);
294             }
295         }
296     } else {
297         ba->mfiles = argv;
298         if (file) {
299             fprintf(stderr, "When using --nthreads, please specify files after a terminating --\n");
300             exit(1);
301         } else if (argc != ba->nthreads) {
302             fprintf(stderr, "%u threads requested, but %u files given on the command line.\n", ba->nthreads, argc);
303             exit(1);
304         }
305     }
306 
307     param_list_lookup_string(ba->pl, "matmul_bucket_methods");
308     param_list_lookup_string(ba->pl, "l1_cache_size");
309     param_list_lookup_string(ba->pl, "l2_cache_size");
310 
311     double tmax = 100.0;
312     param_list_parse_double(ba->pl, "tmax", &tmax);
313     param_list_parse_int(ba->pl, "nchecks", &ba->nchecks);
314     if (nocheck) ba->nchecks = 0;
315 
316     const char * tmp;
317     if ((tmp = param_list_lookup_string(ba->pl, "impl")) != NULL) {
318         ba->impl = tmp;
319     }
320 
321     unsigned int nbys = 64;
322 
323     mpfq_vbase_ptr A = ba->xx;
324 
325     /* may leave nbys at the default value ! */
326     param_list_parse_uint(ba->pl, "nbys", &nbys);
327 
328     /* The api mandates that we set the desired value for nbys. Here,
329      * that's not really our intent, since we really want to bench
330      * the layer in its favorite working context. Most of the time,
331      * setting nbys is pointless.
332      */
333     mpfq_vbase_oo_field_init_byfeatures(A,
334                 MPFQ_PRIME_MPZ, ba->prime,
335                 MPFQ_SIMD_GROUPSIZE, nbys,
336                 MPFQ_DONE);
337 
338     param_list_lookup_string(ba->pl, "srcvec");
339     if (param_list_warn_unused(ba->pl)) {
340         usage();
341     }/*}}}*/
342 
343     ba->p = malloc(ba->nthreads * sizeof(struct private_args));
344     ba->tg = worker_threads_init(ba->nthreads);
345     fprintf(stderr, "Using implementation \"%s\"\n", ba->impl);
346     worker_threads_do(ba->tg, (worker_func_t) &init_func, ba);
347 
348     /* {{{ display some info */
349     uint64_t ncoeffs_total = 0;
350     for(int tnum = 0 ; tnum < ba->nthreads ; tnum++) {
351         struct private_args * p = ba->p + tnum;
352         fprintf (stderr, "T%d %s: %u rows %u cols %" PRIu64 " coeffs\n",
353                 tnum, ba->mfiles[tnum],
354                 p->mm->dim[0],
355                 p->mm->dim[1],
356                 p->mm->ncoeffs);
357         ncoeffs_total += p->mm->ncoeffs;
358     }
359     fprintf (stderr, "total %" PRIu64 " coeffs\n", ncoeffs_total);
360     /* }}} */
361 
362     gmp_randstate_t rstate;
363     gmp_randinit_default(rstate);
364     gmp_randseed_ui(rstate, 1);
365 
366     /* {{{ Do checks if requested */
367     for(int t = 0; t < ba->nchecks ; t++) {
368         /* create deterministic test values */
369         for(int tnum = 0 ; tnum < ba->nthreads ; tnum++) {
370             struct private_args * p = ba->p + tnum;
371             A->vec_random(A, p->colvec, p->mm->dim[1], rstate);
372             A->vec_random(A, p->rowvec, p->mm->dim[0], rstate);
373             /* If we want shared vectors, this is the way to go. */
374             /* Note that for such a test, the clear_func must be skipped
375              * or improved, since we don't really want to free() the same
376              * pointer twice */
377             /*
378             if (ba->transpose)
379                 p->rowvec = ba->p[0].rowvec;
380             else
381                 p->colvec = ba->p[0].colvec;
382                 */
383         }
384         worker_threads_do(ba->tg, (worker_func_t) &check_func, ba);
385         fprintf(stderr, "Check %d ok\n", t);
386     }
387     if (ba->nchecks)
388         printf("All %d checks passed\n", ba->nchecks);
389     /* }}} */
390 
391     for(int tnum = 0 ; tnum < ba->nthreads ; tnum++) {
392         struct private_args * p = ba->p + tnum;
393         A->vec_random(A, p->colvec, p->mm->dim[1], rstate);
394         A->vec_random(A, p->rowvec, p->mm->dim[0], rstate);
395     }
396 
397     if ((tmp = param_list_lookup_string(ba->pl, "srcvec")) != NULL) {
398         struct private_args * p = ba->p;
399         if (ba->nthreads > 1) {
400             fprintf(stderr, "srcvec incompatible with multithread\n");
401             exit(1);
402         }
403         A->vec_set_zero(A, p->colvec, p->mm->dim[1]);
404         A->vec_set_zero(A, p->rowvec, p->mm->dim[0]);
405         FILE * f = fopen(tmp, "rb");
406         if (f == NULL) {
407             fprintf(stderr, "fopen(%s): %s\n", tmp, strerror(errno));
408             exit(1);
409         }
410         /* with ba->transpose, we're doing vector times matrix (rowvec
411          * times matrix -> colvec) */
412         void * srcvec = ba->transpose ? p->rowvec : p->colvec;
413         void * dstvec = ba->transpose ? p->colvec : p->rowvec;
414         int n = p->mm->dim[1 ^ ba->transpose];
415         fprintf(stderr, "reading %zu bytes from %s\n",
416                 n * sizeof(uint64_t), tmp);
417         int nread = fread(srcvec, sizeof(uint64_t), n, f);
418         if (nread != n) {
419             fprintf(stderr, "short read (%d < %d)\n", nread, n);
420             exit(1);
421         }
422         fclose(f);
423         worker_threads_do(ba->tg, (worker_func_t) &mul_func, ba);
424         char * dstfile;
425         int rc = asprintf(&dstfile, "%s.dst", tmp);
426         ASSERT_ALWAYS(rc >= 0);
427         f = fopen(dstfile, "wb");
428         int nw = p->mm->dim[0 ^ ba->transpose];
429         fprintf(stderr, "writing %zu bytes to %s\n",
430                 nw * sizeof(uint64_t), dstfile);
431         int nwritten = fwrite(dstvec, sizeof(uint64_t), nw, f);
432         if (nwritten != nw) {
433             fprintf(stderr, "short write (%d < %d)\n", nwritten, nw);
434             exit(1);
435         }
436         fclose(f);
437         fprintf(stderr, "Saved [%s]%s * [%s] to [%s]\n",
438                 p->mm->cachefile_name,
439                 ba->transpose ? "^T" : "",
440                 tmp,
441                 dstfile);
442         free(dstvec);
443     }
444     gmp_randclear(rstate);
445 
446 #define NLAST   10
447     double last[10]={0,};
448     double sum_last = 0;
449 
450     clock_t t0 = clock();
451     double next = 0.25 * CLOCKS_PER_SEC;
452     double t1 = 0;
453     double t, dt;
454     if (next > tmax * CLOCKS_PER_SEC) { next = tmax * CLOCKS_PER_SEC; }
455     unsigned int n;
456     printf("Note: timings in seconds per iterations are given in cpu seconds ;\n");
457     printf("Note: with %d threads, this means %.2f walltime seconds.\n", ba->nthreads, 1.0 / ba->nthreads);
458     for(n = 0 ; n < nmax ; ) {
459         worker_threads_do(ba->tg, (worker_func_t) &mul_func, ba);
460         t = clock();
461         dt = t - t0;
462         sum_last -= last[n % NLAST];
463         last[n % NLAST] = (t - t1) / CLOCKS_PER_SEC;
464         sum_last += (t - t1) / CLOCKS_PER_SEC;
465         t1 = t;
466         unsigned int nlast = NLAST;
467         n++;
468         if (n < nlast) nlast = n;
469         if (dt > next || n == nmax - 1) {
470             do { next += 0.25 * CLOCKS_PER_SEC; } while (dt > next);
471             if (next > tmax * CLOCKS_PER_SEC) { next = tmax * CLOCKS_PER_SEC; }
472             dt /= CLOCKS_PER_SEC;
473             printf("%d iters in %2.fs, %.3f/1, %.2f %s (last %u : %.3f/1, %.2f %s)        \r",
474                     n, dt, dt/n, ba->freq * 1.0e9 * dt/n/ncoeffs_total, unit,
475                     nlast, sum_last/nlast, ba->freq * 1.0e9 *sum_last/nlast/ncoeffs_total, unit);
476             fflush(stdout);
477             if (dt > tmax)
478                 break;
479         }
480     }
481     printf("\n");
482     // printf("Scanned %lu coeffs in total\n", n * ncoeffs_total);
483 
484     worker_threads_do(ba->tg, (worker_func_t) &clear_func, ba);
485     worker_threads_clear(ba->tg);
486     param_list_clear(ba->pl);
487     free(ba->p);
488     mpz_clear(ba->prime);
489 
490     return 0;
491 }
492