#include "cado.h" // IWYU pragma: keep
#include <cstdio>
#include <cinttypes>
#include <cstdint>              // for uint32_t
#include <cstring>              // for memset
#include <ctime>                // for time
#include <cstdlib>
#include <gmp.h>
#include "balancing.h"           // for balancing_pre_shuffle
#include "parallelizing_info.h"
#include "matmul_top.h"
#include "select_mpi.h"
#include "bblas_gauss.h"
#include "params.h"
#include "xvectors.h"
#include "bw-common.h"
#include "mpfq/mpfq.h"
#include "mpfq/mpfq_vbase.h"
#include "cheating_vec_init.h"
#include "portability.h" // asprintf // IWYU pragma: keep
#include "macros.h"

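/* Early rank check: matmul_top_rank_upper_bound() returns an upper
 * bound r on the matrix rank, derived from the zero rows and columns
 * encountered while reading the matrix. The rank defect is then at
 * least n0[0]-r, and Block Wiedemann needs m to be at least that large
 * to stand a chance of finding solutions.
 */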

void bw_rank_check(matmul_top_data_ptr mmt, param_list_ptr pl)
{
    int tcan_print = bw->can_print && mmt->pi->m->trank == 0;
    unsigned int r = matmul_top_rank_upper_bound(mmt);
    if (tcan_print) {
        printf("Matrix rank is at most %u (based on zero columns and rows encountered)\n", r);
    }
    int skip=0;
    param_list_parse_int(pl, "skip_bw_early_rank_check", &skip);
    if (bw->m + r < mmt->n0[0]) {
        fprintf(stderr, "Based on the parameter m (=%u) and the rank defect of the matrix (>=%u), we can't expect to compute solutions reliably.\n",
                bw->m, mmt->n0[0]-r);
        if (skip) {
            fprintf(stderr, "Proceeding anyway as per skip_bw_early_rank_check=1\n");
        } else {
            fprintf(stderr, "Aborting. Use skip_bw_early_rank_check=1 to proceed nevertheless.\n");
            exit(EXIT_FAILURE);
        }
    }
}

void * prep_prog(parallelizing_info_ptr pi, param_list pl, void * arg MAYBE_UNUSED)
{
    /* Interleaving does not make sense for this program, so the second
     * block of threads just leaves immediately */
    ASSERT_ALWAYS(!pi->interleaved);

    // Doing the ``hello world'' test is a very good way of testing the
    // global mpi/pthreads setup. So despite its apparent irrelevance, I
    // suggest leaving it here as a cheap sanity check.
    pi_hello(pi);

    int tcan_print = bw->can_print && pi->m->trank == 0;
    matmul_top_data mmt;

    unsigned int nrhs = 0;
    int char2 = mpz_cmp_ui(bw->p, 2) == 0;
    int splitwidth = char2 ? 64 : 1;

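    /* Over GF(2) the mpfq layer processes vector entries 64 at a time
     * (a SIMD group of one machine word), while over GF(p) each entry
     * is a single field element. A_multiplex counts how many such
     * splitwidth-wide blocks make up the n vectors. */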
    mpfq_vbase A;
    unsigned int A_width = splitwidth;
    unsigned int A_multiplex = bw->n / A_width;
    mpfq_vbase_oo_field_init_byfeatures(A,
            MPFQ_PRIME_MPZ, bw->p,
            MPFQ_SIMD_GROUPSIZE, A_width,
            MPFQ_DONE);
    ASSERT_ALWAYS(A->simd_groupsize(A) * A_multiplex == (unsigned int) bw->n);

    matmul_top_init(mmt, A, pi, pl, bw->dir);

    bw_rank_check(mmt, pl);


    gmp_randstate_t rstate;
    gmp_randinit_default(rstate);

    if (pi->m->trank == 0 && !bw->seed) {
        /* note that bw is shared between threads, thus only thread 0 should
         * test and update it here.
         * at pi->m->jrank > 0, we don't care about the seed anyway
         */
        bw->seed = time(NULL);
        MPI_Bcast(&bw->seed, 1, MPI_INT, 0, pi->m->pals);
    }
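    /* The MPI broadcast above makes all jobs agree on the seed chosen
     * by job 0, so the random data generated below is identical
     * everywhere; the thread barrier that follows lets the other
     * threads of each job observe the updated bw->seed. */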
    serialize_threads(pi->m);

    gmp_randseed_ui(rstate, bw->seed);
    if (tcan_print) {
        printf("// Random generator seeded with %d\n", bw->seed);
    }

    const char * rhs_name = param_list_lookup_string(pl, "rhs");
    FILE * rhs = NULL;
    if (rhs_name) {
        rhs = fopen(rhs_name, "r");
        ASSERT_ALWAYS(rhs != NULL);
        get_rhs_file_header_stream(rhs, NULL, &nrhs, NULL);
        ASSERT_ALWAYS(nrhs <= mmt->n[!bw->dir]);
    }

    mmt_vec ymy[2];
    mmt_vec_ptr y = ymy[0];
    mmt_vec_ptr my = ymy[1];

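    /* y lives on the bw->dir side of the matrix and is shared among
     * the threads of a job, while my sits on the opposite side and
     * receives the result of the multiplication; matmul_top_mul() uses
     * the pair to iterate, in effect, y <- M*y. */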
    mmt_vec_init(mmt,0,0, y,   bw->dir, /* shared ! */ 1, mmt->n[bw->dir]);
    mmt_vec_init(mmt,0,0, my, !bw->dir,                0, mmt->n[!bw->dir]);

    unsigned int unpadded = MAX(mmt->n0[0], mmt->n0[1]);

    /* Number of copies of m by n matrices to use for trying to obtain a
     * matrix of rank m.
     *
     * Note that it must be at least m/n, otherwise we stand no chance !
     */
    unsigned int prep_lookahead_iterations = iceildiv(bw->m, bw->n) + 1;
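    /* For instance with m = n = 64 this gives 2 lookahead iterations,
     * i.e. we will look at the rank of the m x 2n matrix
     * [ x^T*M*y | x^T*M^2*y ]. */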

    unsigned int my_nx = 1;
    uint32_t * xvecs = (uint32_t*) malloc(my_nx * bw->m * sizeof(uint32_t));
    void * xymats;

    /* We're cheating on the generic init routines */
    cheating_vec_init(A, &xymats, bw->m * prep_lookahead_iterations * A_multiplex);
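    /* xymats is a dense m x (prep_lookahead_iterations * A_multiplex)
     * array of SIMD-wide coefficients; entry (r, k, j) lives at index
     * (r * prep_lookahead_iterations + k) * A_multiplex + j, matching
     * the indexing used when the dot products are accumulated below. */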

    for (unsigned ntri = 0;; ntri++) {
        if (nrhs == A_multiplex) {
            if (ntri) ++my_nx;
            if (ntri >= 4) {
                fprintf(stderr, "Cannot find a satisfactory initialization. "
                        "Maybe your RHS vectors are bad ?\n");
                exit(EXIT_FAILURE);
            }
        } else if (ntri >= my_nx * 10) {
            ++my_nx;
            if (tcan_print) {
                printf("// Getting bored. Trying %u x vectors\n", my_nx);
            }
        }
        /* keep the xvecs allocation in sync with my_nx, which may have
         * just grown in either branch above */
        xvecs = (uint32_t*) realloc(xvecs, my_nx * bw->m * sizeof(uint32_t));
        ASSERT_ALWAYS(xvecs != NULL);
        serialize_threads(pi->m);

        if (tcan_print) {
            printf("// Generating new x,y vector pair (trial # %u)\n", ntri);
        }

        // if we're looking for the right nullspace, then x is on the left.
        // Otherwise, it's on the right.

        // generate indices w.r.t *unpadded* dimensions !
        setup_x_random(xvecs, bw->m, my_nx, mmt->n0[bw->dir], pi, rstate);

        // we have indices mmt->wr[1]->i0..i1 available.
        A->vec_set_zero(A, xymats, bw->m * prep_lookahead_iterations * A_multiplex);

        ASSERT_ALWAYS(nrhs <= A_multiplex);

        for(unsigned int j = 0 ; j < A_multiplex ; j++) {
            /* Random generation + save is better done as writing random data
             * to a file followed by reading it: this way, seeding works
             * better.
             */
            if (j < nrhs) {
                /* create it as an extraction from the rhs file */
                ASSERT_ALWAYS(0);       /* implement me... See GF(p) below. */
            } else {
                mmt_vec_set_random_through_file(y, "V%u-%u.0", unpadded, rstate, j * A_width);
            }
            if (tcan_print) {
                printf("// generated V%u-%u.0 (trial # %u)\n",
                        j * A_width, (j + 1) * A_width, ntri);
            }

            // compute x^T M y, x^T M^2 y, and so on. Since we do that
            // piecewise for the different vectors, we first collect
            // everything in the xymats array, and compute the rank later on.
            // XXX Note that x^T y does not count here, because it does not
            // take part in the sequence computed by lingen !
            mmt_vec_twist(mmt, y);
            matmul_top_mul(mmt, ymy, NULL);
            mmt_vec_untwist(mmt, y);


            /* XXX it's really like x_dotprod, except that we're filling
             * the coefficients in another order (but why ?) */
            for(unsigned int k = 0 ; k < prep_lookahead_iterations ; k++) {
                for(int r = 0 ; r < bw->m ; r++) {
                    void * where = A->vec_subvec(A, xymats, (r * prep_lookahead_iterations + k) * A_multiplex + j);
                    for(unsigned int t = 0 ; t < my_nx ; t++) {
                        uint32_t row = xvecs[r*my_nx+t];
                        unsigned int vi0 = y->i0 + mmt_my_own_offset_in_items(y);
                        unsigned int vi1 = vi0 + mmt_my_own_size_in_items(y);
                        if (row < vi0 || row >= vi1)
                            continue;

                        void * coeff = y->abase->vec_subvec(y->abase, y->v, row - y->i0);
                        A->add(A, where, where, coeff);
                    }
                }
                mmt_vec_twist(mmt, y);
                matmul_top_mul(mmt, ymy, NULL);
                mmt_vec_untwist(mmt, y);
            }
        }

        /* Make sure computation is over for everyone ! */
        serialize_threads(pi->m);

        /* Now all threads and jobs must collectively reduce the zone
         * pointed to by xymats */
        pi_allreduce(NULL, xymats,
                bw->m * prep_lookahead_iterations * A_multiplex,
                mmt->pitype, BWC_PI_SUM, pi->m);

        /* OK -- now everybody has the same data */

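        /* The trial succeeds when the m x (lookahead * n) matrix
         * collected in xymats has full row rank m, i.e. when the
         * Gaussian elimination in kernel() finds a trivial kernel
         * (dimk == 0). Otherwise we draw new x,y pairs and retry. */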
        int dimk;

        /* the kernel() call is not reentrant */
        if (pi->m->trank == 0) {
            dimk = kernel((mp_limb_t *) xymats, NULL,
                    bw->m, prep_lookahead_iterations * A->simd_groupsize(A) * A_multiplex,
                    A->vec_elt_stride(A, prep_lookahead_iterations * A_multiplex)/sizeof(mp_limb_t),
                    0);
        }
        pi_thread_bcast((void *) &dimk, 1, BWC_PI_INT, 0, pi->m);

        if (tcan_print)
            printf("// Dimension of kernel: %d\n", dimk);

        if (dimk == 0) {
            if (tcan_print)
                printf("// Found good x,y vector pair after %u trials\n",
                        ntri+1);
            break;
        }
    }

    save_x(xvecs, bw->m, my_nx, pi);

    if (rhs_name)
        fclose(rhs);

    gmp_randclear(rstate);

    mmt_vec_clear(mmt, y);
    mmt_vec_clear(mmt, my);
    matmul_top_clear(mmt);

    /* clean up xy mats stuff */
    cheating_vec_clear(A, &xymats, bw->m * prep_lookahead_iterations * A_multiplex);

    A->oo_field_clear(A);

    free(xvecs);
    return NULL;
}

/* The GF(p) case is significantly different:
 *  - this is *NOT* a parallel program, although we're happy to use the
 *    same tools as everywhere else for some common operations, namely
 *    the matmul_top layer.
 *  - we do *NOT* perform the consistency check.
 *  - and we possibly inject the RHS in the data produced.
 */
void * prep_prog_gfp(parallelizing_info_ptr pi, param_list pl, void * arg MAYBE_UNUSED)
{
    /* Interleaving does not make sense for this program, so the second
     * block of threads just leaves immediately */
    ASSERT_ALWAYS(!pi->interleaved);

    // Doing the ``hello world'' test is a very good way of testing the
    // global mpi/pthreads setup. So despite its apparent irrelevance, I
    // suggest leaving it here as a cheap sanity check.
    pi_hello(pi);

    int tcan_print = bw->can_print && pi->m->trank == 0;
    matmul_top_data mmt;

    unsigned int nrhs = 0;
    int char2 = mpz_cmp_ui(bw->p, 2) == 0;
    int splitwidth = char2 ? 64 : 1;

    const char * rhs_name;
    FILE * rhs;

    mpfq_vbase A;
    mpfq_vbase_oo_field_init_byfeatures(A,
            MPFQ_PRIME_MPZ, bw->p,
            MPFQ_SIMD_GROUPSIZE, splitwidth,
            MPFQ_DONE);

    matmul_top_init(mmt, A, pi, pl, bw->dir);

    bw_rank_check(mmt, pl);

    if (pi->m->trank || pi->m->jrank) {
        /* as said above, this is *NOT* a parallel program.  */
        goto leave_prep_prog_gfp;
    }

    gmp_randstate_t rstate;
    gmp_randinit_default(rstate);

    if (pi->m->trank == 0 && !bw->seed) {
        /* note that bw is shared between threads, thus only thread 0 should
         * test and update it here.
         * at pi->m->jrank > 0, we don't care about the seed anyway
         */
        bw->seed = time(NULL);
        MPI_Bcast(&bw->seed, 1, MPI_INT, 0, pi->m->pals);
    }

    gmp_randseed_ui(rstate, bw->seed);

    if (tcan_print) {
        printf("// Random generator seeded with %d\n", bw->seed);
    }

    rhs_name = param_list_lookup_string(pl, "rhs");
    rhs = NULL;
    if (rhs_name) {
        rhs = fopen(rhs_name, "r");
        ASSERT_ALWAYS(rhs != NULL);
        get_rhs_file_header_stream(rhs, NULL, &nrhs, NULL);
        ASSERT_ALWAYS(nrhs <= mmt->n[!bw->dir]);
    }

    /* First create all RHS vectors -- these are just splits of the big
     * RHS block. Those files get created together. */
    if (nrhs) {
        char ** vec_names = (char**) malloc(nrhs * sizeof(char *));
        FILE ** vec_files = (FILE**) malloc(nrhs * sizeof(FILE *));
        for(unsigned int j = 0 ; j < nrhs ; j++) {
            int rc = asprintf(&vec_names[j], "V%u-%u.0", j, j+1);
            ASSERT_ALWAYS(rc >= 0);
            vec_files[j] = fopen(vec_names[j], "wb");
            ASSERT_ALWAYS(vec_files[j] != NULL);
            printf("// Creating %s (extraction from %s)\n", vec_names[j], rhs_name);
        }
        void * coeff;
        cheating_vec_init(A, &coeff, 1);
        mpz_t c;
        mpz_init(c);
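        /* The RHS file is read row by row: for each of the
         * mmt->n0[!bw->dir] rows we scan one decimal integer per RHS
         * column, and append its representation as a field element to
         * the corresponding V%u-%u.0 file. Column j of the RHS block
         * thus becomes the j-th starting vector. */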
        for(unsigned int i = 0 ; i < mmt->n0[!bw->dir] ; i++) {
            for(unsigned int j = 0 ; j < nrhs ; j++) {
                int rc;
                memset(coeff, 0, A->vec_elt_stride(A, 1));
                rc = gmp_fscanf(rhs, "%Zd", c);
                ASSERT_ALWAYS(rc == 1);
                A->set_mpz(A, A->vec_coeff_ptr(A, coeff, 0), c);
                rc = fwrite(coeff, A->vec_elt_stride(A,1), 1, vec_files[j]);
                ASSERT_ALWAYS(rc == 1);
            }
        }
        mpz_clear(c);
        cheating_vec_clear(A, &coeff, 1);
        for(unsigned int j = 0 ; j < nrhs ; j++) {
            fclose(vec_files[j]);
            free(vec_names[j]);
        }
        free(vec_files);
        free(vec_names);
    }
    /* Now create purely random vectors */
    for(int j = (int) nrhs ; j < bw->n ; j++) {
        void * vec;
        char * vec_name;
        FILE * vec_file;
        int rc = asprintf(&vec_name, "V%d-%d.0", j, j+1);
        ASSERT_ALWAYS(rc >= 0);
        vec_file = fopen(vec_name, "wb");
        ASSERT_ALWAYS(vec_file != NULL);
        printf("// Creating %s\n", vec_name);
        unsigned int unpadded = MAX(mmt->n0[0], mmt->n0[1]);
        cheating_vec_init(A, &vec, unpadded);
        A->vec_random(A, vec, unpadded, rstate);
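        /* Zero out the padding area: entries at indices n0[bw->dir]
         * and beyond do not correspond to actual matrix columns, and
         * must not carry random junk into the written vector. */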
        A->vec_set_zero(A, A->vec_subvec(A, vec, mmt->n0[bw->dir]), unpadded - mmt->n0[bw->dir]);
        rc = fwrite(vec, A->vec_elt_stride(A,1), unpadded, vec_file);
        ASSERT_ALWAYS(rc >= 0 && ((unsigned int) rc) == unpadded);
        cheating_vec_clear(A, &vec, unpadded);
        fclose(vec_file);
        free(vec_name);
    }


    gmp_randclear(rstate);

    {
        /* initialize x -- make it completely deterministic. */
        unsigned int my_nx = 4;
        uint32_t * xvecs = (uint32_t*) malloc(my_nx * bw->m * sizeof(uint32_t));
        /* with rhs, consider the strategy where the matrix is kept with
         * its full size, but the SM block is replaced with zeros. Here
         * we just force the x vectors to have data there, so as to avoid
         * the possibility that a solution vector found by BW still has
         * non-zero coordinates at these locations.
         */
        if (bw->m < (int) nrhs) {
            fprintf(stderr, "m < nrhs is not supported\n");
            exit(EXIT_FAILURE);
        }
        /* I am not sure that using balancing_pre_shuffle is right both
         * for bw->dir == 0 and bw->dir == 1. Let's make sure we're in
         * the case where this has been tested and seems to work
         * correctly.
         */
        ASSERT_ALWAYS(bw->dir == 1);
        ASSERT_ALWAYS(mmt->nmatrices == 1);
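        /* Each of the m x vectors gets my_nx coordinates. The first
         * coordinate is pinned: for i < nrhs it points at one of the
         * last nrhs columns (through the balancing permutation), and
         * for the remaining vectors it is simply the (i-nrhs)-th
         * canonical index. The other coordinates are filled with a
         * fixed stride (the multiplier 1009 is apparently arbitrary;
         * it just scatters the indices deterministically). */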
        for(unsigned int i = 0 ; i < nrhs ; i++) {
            xvecs[i * my_nx] = balancing_pre_shuffle(mmt->matrices[0]->bal, mmt->n0[!bw->dir]-nrhs+i);
            printf("Forced %u-th x vector to be the %" PRIu32 "-th canonical basis vector\n", i, xvecs[i * my_nx]);
            ASSERT_ALWAYS(xvecs[i * my_nx] >= (uint32_t) (bw->m - nrhs));
            for(unsigned int j = 1 ; j < my_nx ; j++) {
                xvecs[i * my_nx + j] = (1009 * (i * my_nx + j)) % mmt->n0[!bw->dir];
            }
        }
        for(int i = (int) nrhs ; i < bw->m ; i++) {
            xvecs[i * my_nx] = i - nrhs;
            for(unsigned int j = 1 ; j < my_nx ; j++) {
                xvecs[i * my_nx + j] = (1009 * (i * my_nx + j)) % mmt->n0[!bw->dir];
            }
        }
        /* save_x operates only on the leader thread */
        save_x(xvecs, bw->m, my_nx, pi);
        free(xvecs);
    }

leave_prep_prog_gfp:
    matmul_top_clear(mmt);

    A->oo_field_clear(A);
    return NULL;
}

// coverity[root_function]
int main(int argc, char * argv[])
{
    param_list pl;

    bw_common_init(bw, &argc, &argv);
    param_list_init(pl);
    parallelizing_info_init();

    bw_common_decl_usage(pl);
    parallelizing_info_decl_usage(pl);
    matmul_top_decl_usage(pl);
    /* declare local parameters and switches */
    param_list_decl_usage(pl, "rhs",
            "file with the right-hand side vectors for inhomogeneous systems mod p");

    bw_common_parse_cmdline(bw, pl, &argc, &argv);

    /* This program does not support interleaving */
    param_list_remove_key(pl, "interleaving");

    bw_common_interpret_parameters(bw, pl);
    parallelizing_info_lookup_parameters(pl);
    matmul_top_lookup_parameters(pl);
    /* interpret our parameters: none here (so far). */
    if (mpz_cmp_ui(bw->p, 2) != 0)
        param_list_lookup_string(pl, "rhs");
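    /* The lookup above marks "rhs" as used, presumably so that
     * param_list_warn_unused() below does not flag it when it is only
     * consumed later inside prep_prog_gfp(). */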

    ASSERT_ALWAYS(!param_list_lookup_string(pl, "ys"));
    ASSERT_ALWAYS(!param_list_lookup_string(pl, "solutions"));

    if (param_list_warn_unused(pl)) {
        int rank;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        if (!rank) param_list_print_usage(pl, bw->original_argv[0], stderr);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }

    if (mpz_cmp_ui(bw->p, 2) == 0) {
        pi_go(prep_prog, pl, 0);
    } else {
        pi_go(prep_prog_gfp, pl, 0);
    }

    parallelizing_info_finish();
    param_list_clear(pl);
    bw_common_clear(bw);

    return 0;
}