1 #include "cado.h" // IWYU pragma: keep
2 #include <cstdio>
3 #include <cinttypes>
4 #include <cstdint> // for uint32_t
5 #include <cstring> // for memset
6 #include <ctime> // for time
7 #include <cstdlib>
8 #include <gmp.h>
9 #include "balancing.h" // for balancing_pre_shuffle
10 #include "parallelizing_info.h"
11 #include "matmul_top.h"
12 #include "select_mpi.h"
13 #include "bblas_gauss.h"
14 #include "params.h"
15 #include "xvectors.h"
16 #include "bw-common.h"
17 #include "mpfq/mpfq.h"
18 #include "mpfq/mpfq_vbase.h"
19 #include "cheating_vec_init.h"
20 #include "portability.h" // asprintf // IWYU pragma: keep
21 #include "macros.h"
22
23
bw_rank_check(matmul_top_data_ptr mmt,param_list_ptr pl)24 void bw_rank_check(matmul_top_data_ptr mmt, param_list_ptr pl)
25 {
26 int tcan_print = bw->can_print && mmt->pi->m->trank == 0;
27 unsigned int r = matmul_top_rank_upper_bound(mmt);
28 if (tcan_print) {
29 printf("Matrix rank is at most %u (based on zero columns and rows encountered)\n", r);
30 }
31 int skip=0;
32 param_list_parse_int(pl, "skip_bw_early_rank_check", &skip);
33 if (bw->m + r < mmt->n0[0]) {
34 fprintf(stderr, "Based on the parameter m (=%u) and the rank defect of the matrix (>=%u), we can't expect to compute solutions reliably.\n",
35 bw->m, mmt->n0[0]-r);
36 if (skip) {
37 fprintf(stderr, "Proceeding anyway as per skip_bw_early_rank_check=1\n");
38 } else {
39 fprintf(stderr, "Aborting. Use skip_bw_early_rank_check=1 to proceed nevertheless.\n");
40 exit(EXIT_FAILURE);
41 }
42 }
43 }
44
prep_prog(parallelizing_info_ptr pi,param_list pl,void * arg MAYBE_UNUSED)45 void * prep_prog(parallelizing_info_ptr pi, param_list pl, void * arg MAYBE_UNUSED)
46 {
47 /* Interleaving does not make sense for this program. So the second
48 * block of threads just leave immediately */
49 ASSERT_ALWAYS(!pi->interleaved);
50
51 // Doing the ``hello world'' test is a very good way of testing the
52 // global mpi/pthreads setup. So despite its apparent irrelevance, I
53 // suggest leaving it here as a cheap sanity check.
54 pi_hello(pi);
55
56 int tcan_print = bw->can_print && pi->m->trank == 0;
57 matmul_top_data mmt;
58
59 unsigned int nrhs = 0;
60 int char2 = mpz_cmp_ui(bw->p, 2) == 0;
61 int splitwidth = char2 ? 64 : 1;
62
63 mpfq_vbase A;
64 unsigned int A_width = splitwidth;
65 unsigned int A_multiplex = bw->n / A_width;
66 mpfq_vbase_oo_field_init_byfeatures(A,
67 MPFQ_PRIME_MPZ, bw->p,
68 MPFQ_SIMD_GROUPSIZE, A_width,
69 MPFQ_DONE);
70 ASSERT_ALWAYS(A->simd_groupsize(A) * A_multiplex == (unsigned int) bw->n);
71
72 matmul_top_init(mmt, A, pi, pl, bw->dir);
73
74 bw_rank_check(mmt, pl);
75
76
77 gmp_randstate_t rstate;
78 gmp_randinit_default(rstate);
79
80 if (pi->m->trank == 0 && !bw->seed) {
81 /* note that bw is shared between threads, thus only thread 0 should
82 * test and update it here.
83 * at pi->m->jrank > 0, we don't care about the seed anyway
84 */
85 bw->seed = time(NULL);
86 MPI_Bcast(&bw->seed, 1, MPI_INT, 0, pi->m->pals);
87 }
88 serialize_threads(pi->m);
89
90 gmp_randseed_ui(rstate, bw->seed);
91 if (tcan_print) {
92 printf("// Random generator seeded with %d\n", bw->seed);
93 }
94
95 const char * rhs_name = param_list_lookup_string(pl, "rhs");
96 FILE * rhs = NULL;
97 if (rhs_name) {
98 rhs = fopen(rhs_name, "r");
99 get_rhs_file_header_stream(rhs, NULL, &nrhs, NULL);
100 ASSERT_ALWAYS(rhs != NULL);
101 ASSERT_ALWAYS(nrhs <= mmt->n[!bw->dir]);
102 }
103
104 mmt_vec ymy[2];
105 mmt_vec_ptr y = ymy[0];
106 mmt_vec_ptr my = ymy[1];
107
108 mmt_vec_init(mmt,0,0, y, bw->dir, /* shared ! */ 1, mmt->n[bw->dir]);
109 mmt_vec_init(mmt,0,0, my, !bw->dir, 0, mmt->n[!bw->dir]);
110
111 unsigned int unpadded = MAX(mmt->n0[0], mmt->n0[1]);
112
113 /* Number of copies of m by n matrices to use for trying to obtain a
114 * matrix of rank m.
115 *
116 * Note that it must be at least m/n, otherwise we stand no chance !
117 */
118 unsigned int prep_lookahead_iterations = iceildiv(bw->m, bw->n) + 1;
119
120 unsigned int my_nx = 1;
121 uint32_t * xvecs = (uint32_t*) malloc(my_nx * bw->m * sizeof(uint32_t));
122 void * xymats;
123
124 /* We're cheating on the generic init routines */
125 cheating_vec_init(A, &xymats, bw->m * prep_lookahead_iterations * A_multiplex);
126
127 for (unsigned ntri = 0;; ntri++) {
128 if (nrhs == A_multiplex) {
129 if (ntri) ++my_nx;
130 if (ntri >= 4) {
131 fprintf(stderr, "Cannot find a satisfactory initialization. "
132 "Maybe your RHS vectors are bad ?\n");
133 exit(EXIT_FAILURE);
134 }
135 } else if (ntri >= my_nx * 10) {
136 ++my_nx;
137 if (tcan_print) {
138 printf("// Getting bored. Trying %u x vectors\n", my_nx);
139 }
140 xvecs = (uint32_t*) realloc(xvecs, my_nx * bw->m * sizeof(uint32_t));
141 ASSERT_ALWAYS(xvecs != NULL);
142 }
143 serialize_threads(pi->m);
144
145 if (tcan_print) {
146 printf("// Generating new x,y vector pair (trial # %u)\n", ntri);
147 }
148
149 // if we're looking for the right nullspace, then x is on the left.
150 // Otherwise, it's on the right.
151
152 // generate indices w.r.t *unpadded* dimensions !
153 setup_x_random(xvecs, bw->m, my_nx, mmt->n0[bw->dir], pi, rstate);
154
155 // we have indices mmt->wr[1]->i0..i1 available.
156 A->vec_set_zero(A, xymats, bw->m * prep_lookahead_iterations * A_multiplex);
157
158 ASSERT_ALWAYS(nrhs <= A_multiplex);
159
160 for(unsigned int j = 0 ; j < A_multiplex ; j++) {
161 /* Random generation + save is better done as writing random data
162 * to a file followed by reading it: this way, seeding works
163 * better.
164 */
165 if (j < nrhs) {
166 /* create it as an extraction from the rhs file */
167 ASSERT_ALWAYS(0); /* implement me... See GF(p) below. */
168 } else {
169 mmt_vec_set_random_through_file(y, "V%u-%u.0", unpadded, rstate, j * A_width);
170 }
171 if (tcan_print) {
172 printf("// generated V%u-%u.0 (trial # %u)\n",
173 j * A_width, (j + 1) * A_width, ntri);
174 }
175
176 // compute x^T M y, x^T M^2 y, and so on. Since we do that
177 // piecewise for the different vectors, we first collect
178 // everything in the xymats array, and compute the rank later on.
179
180 // XXX Note that x^Ty does not count here, because it does not
181 // take part to the sequence computed by lingen !
182 mmt_vec_twist(mmt, y);
183 matmul_top_mul(mmt, ymy, NULL);
184 mmt_vec_untwist(mmt, y);
185
186
187 /* XXX it's really like x_dotprod, except that we're filling
188 * the coefficients in another order (but why ?) */
189 for(unsigned int k = 0 ; k < prep_lookahead_iterations ; k++) {
190 for(int r = 0 ; r < bw->m ; r++) {
191 void * where = A->vec_subvec(A, xymats, (r * prep_lookahead_iterations + k) * A_multiplex + j);
192 for(unsigned int t = 0 ; t < my_nx ; t++) {
193 uint32_t row = xvecs[r*my_nx+t];
194 unsigned int vi0 = y->i0 + mmt_my_own_offset_in_items(y);
195 unsigned int vi1 = vi0 + mmt_my_own_size_in_items(y);
196 if (row < vi0 || row >= vi1)
197 continue;
198
199 void * coeff = y->abase->vec_subvec(y->abase, y->v, row - y->i0);
200 A->add(A, where, where, coeff);
201 }
202 }
203 mmt_vec_twist(mmt, y);
204 matmul_top_mul(mmt, ymy, NULL);
205 mmt_vec_untwist(mmt, y);
206 }
207 }
208
209 /* Make sure computation is over for everyone ! */
210 serialize_threads(pi->m);
211
212 /* Now all threads and jobs must collectively reduce the zone
213 * pointed to by xymats */
214 pi_allreduce(NULL, xymats,
215 bw->m * prep_lookahead_iterations * A_multiplex,
216 mmt->pitype, BWC_PI_SUM, pi->m);
217
218 /* OK -- now everybody has the same data */
219
220 int dimk;
221
222 /* the kernel() call is not reentrant */
223 if (pi->m->trank == 0) {
224 dimk = kernel((mp_limb_t *) xymats, NULL,
225 bw->m, prep_lookahead_iterations * A->simd_groupsize(A) * A_multiplex,
226 A->vec_elt_stride(A, prep_lookahead_iterations * A_multiplex)/sizeof(mp_limb_t),
227 0);
228 }
229 pi_thread_bcast((void *) &dimk, 1, BWC_PI_INT, 0, pi->m);
230
231 if (tcan_print)
232 printf("// Dimension of kernel: %d\n", dimk);
233
234 if (dimk == 0) {
235 if (tcan_print)
236 printf("// Found good x,y vector pair after %u trials\n",
237 ntri+1);
238 break;
239 }
240 }
241
242 save_x(xvecs, bw->m, my_nx, pi);
243
244 if (rhs_name)
245 free(rhs);
246
247 gmp_randclear(rstate);
248
249 mmt_vec_clear(mmt, y);
250 mmt_vec_clear(mmt, my);
251 matmul_top_clear(mmt);
252
253 /* clean up xy mats stuff */
254 cheating_vec_clear(A, &xymats, bw->m * prep_lookahead_iterations * A_multiplex);
255
256 A->oo_field_clear(A);
257
258 free(xvecs);
259 return NULL;
260 }
261
262 /* The GF(p) case is significantly different:
263 * - this is *NOT* a parallel program, although we're happy to use the
264 * same tools as everywhere else for some common operations, namely
265 * the matmul_top layer.
266 * - we do *NOT* perform the * consistency check.
267 * - and we possibly inject the RHS in the data produced.
268 */
prep_prog_gfp(parallelizing_info_ptr pi,param_list pl,void * arg MAYBE_UNUSED)269 void * prep_prog_gfp(parallelizing_info_ptr pi, param_list pl, void * arg MAYBE_UNUSED)
270 {
271 /* Interleaving does not make sense for this program. So the second
272 * block of threads just leave immediately */
273 ASSERT_ALWAYS(!pi->interleaved);
274
275 // Doing the ``hello world'' test is a very good way of testing the
276 // global mpi/pthreads setup. So despite its apparent irrelevance, I
277 // suggest leaving it here as a cheap sanity check.
278 pi_hello(pi);
279
280 int tcan_print = bw->can_print && pi->m->trank == 0;
281 matmul_top_data mmt;
282
283 unsigned int nrhs = 0;
284 int char2 = mpz_cmp_ui(bw->p, 2) == 0;
285 int splitwidth = char2 ? 64 : 1;
286
287 const char * rhs_name;
288 FILE * rhs;
289
290 mpfq_vbase A;
291 mpfq_vbase_oo_field_init_byfeatures(A,
292 MPFQ_PRIME_MPZ, bw->p,
293 MPFQ_SIMD_GROUPSIZE, splitwidth,
294 MPFQ_DONE);
295
296 matmul_top_init(mmt, A, pi, pl, bw->dir);
297
298 bw_rank_check(mmt, pl);
299
300 if (pi->m->trank || pi->m->jrank) {
301 /* as said above, this is *NOT* a parallel program. */
302 goto leave_prep_prog_gfp;
303 }
304
305 gmp_randstate_t rstate;
306 gmp_randinit_default(rstate);
307
308 if (pi->m->trank == 0 && !bw->seed) {
309 /* note that bw is shared between threads, thus only thread 0 should
310 * test and update it here.
311 * at pi->m->jrank > 0, we don't care about the seed anyway
312 */
313 bw->seed = time(NULL);
314 MPI_Bcast(&bw->seed, 1, MPI_INT, 0, pi->m->pals);
315 }
316
317 gmp_randseed_ui(rstate, bw->seed);
318
319 if (tcan_print) {
320 printf("// Random generator seeded with %d\n", bw->seed);
321 }
322
323 rhs_name = param_list_lookup_string(pl, "rhs");
324 rhs = NULL;
325 if (rhs_name) {
326 rhs = fopen(rhs_name, "r");
327 get_rhs_file_header_stream(rhs, NULL, &nrhs, NULL);
328 ASSERT_ALWAYS(rhs != NULL);
329 ASSERT_ALWAYS(nrhs <= mmt->n[!bw->dir]);
330 }
331
332 /* First create all RHS vectors -- these are just splits of the big
333 * RHS block. Those files get created together. */
334 if (nrhs) {
335 char ** vec_names = (char**) malloc(nrhs * sizeof(char *));
336 FILE ** vec_files = (FILE**) malloc(nrhs * sizeof(FILE *));
337 for(unsigned int j = 0 ; j < nrhs ; j++) {
338 int rc = asprintf(&vec_names[j], "V%d-%d.0", j, j+1);
339 ASSERT_ALWAYS(rc >= 0);
340 vec_files[j] = fopen(vec_names[j], "wb");
341 ASSERT_ALWAYS(vec_files[j] != NULL);
342 printf("// Creating %s (extraction from %s)\n", vec_names[j], rhs_name);
343 }
344 void * coeff;
345 cheating_vec_init(A, &coeff, 1);
346 mpz_t c;
347 mpz_init(c);
348 for(unsigned int i = 0 ; i < mmt->n0[!bw->dir] ; i++) {
349 for(unsigned int j = 0 ; j < nrhs ; j++) {
350 int rc;
351 memset(coeff, 0, A->vec_elt_stride(A, 1));
352 rc = gmp_fscanf(rhs, "%Zd", c);
353 ASSERT_ALWAYS(rc == 1);
354 A->set_mpz(A, A->vec_coeff_ptr(A, coeff, 0), c);
355 rc = fwrite(coeff, A->vec_elt_stride(A,1), 1, vec_files[j]);
356 ASSERT_ALWAYS(rc == 1);
357 }
358 }
359 mpz_clear(c);
360 cheating_vec_clear(A, &coeff, 1);
361 for(unsigned int j = 0 ; j < nrhs ; j++) {
362 fclose(vec_files[j]);
363 free(vec_names[j]);
364 }
365 free(vec_files);
366 free(vec_names);
367 }
368 /* Now create purely random vectors */
369 for(int j = (int) nrhs ; j < bw->n ; j++) {
370 void * vec;
371 char * vec_name;
372 FILE * vec_file;
373 int rc = asprintf(&vec_name, "V%d-%d.0", j, j+1);
374 ASSERT_ALWAYS(rc >= 0);
375 vec_file = fopen(vec_name, "wb");
376 ASSERT_ALWAYS(vec_file != NULL);
377 printf("// Creating %s\n", vec_name);
378 unsigned int unpadded = MAX(mmt->n0[0], mmt->n0[1]);
379 cheating_vec_init(A, &vec, unpadded);
380 A->vec_random(A, vec, unpadded, rstate);
381 A->vec_set_zero(A, A->vec_subvec(A, vec, mmt->n0[bw->dir]), unpadded - mmt->n0[bw->dir]);
382 rc = fwrite(vec, A->vec_elt_stride(A,1), unpadded, vec_file);
383 ASSERT_ALWAYS(rc >= 0 && ((unsigned int) rc) == unpadded);
384 cheating_vec_clear(A, &vec, unpadded);
385 fclose(vec_file);
386 free(vec_name);
387 }
388
389
390 gmp_randclear(rstate);
391
392 {
393 /* initialize x -- make it completely deterministic. */
394 unsigned int my_nx = 4;
395 uint32_t * xvecs = (uint32_t*) malloc(my_nx * bw->m * sizeof(uint32_t));
396 /* with rhs, consider the strategy where the matrix is kept with
397 * its full size, but the SM block is replaced with zeros. Here
398 * we just force the x vectors to have data there, so as to avoid
399 * the possibility that a solution vector found by BW still has
400 * non-zero coordinates at these locations.
401 */
402 if (bw->m < (int) nrhs) {
403 fprintf(stderr, "m < nrhs is not supported\n");
404 exit(EXIT_FAILURE);
405 }
406 /* I am not sure that using balancing_pre_shuffle is right both
407 * for bw->dir == 0 and bw->dir == 1. Let's make sure we're in
408 * the case where this has been tested and seems to work
409 * correctly.
410 */
411 ASSERT_ALWAYS(bw->dir == 1);
412 ASSERT_ALWAYS(mmt->nmatrices == 1);
413 for(unsigned int i = 0 ; i < nrhs ; i++) {
414 xvecs[i * my_nx] = balancing_pre_shuffle(mmt->matrices[0]->bal, mmt->n0[!bw->dir]-nrhs+i);
415 printf("Forced %d-th x vector to be the %" PRIu32"-th canonical basis vector\n", i, xvecs[i * my_nx]);
416 ASSERT_ALWAYS(xvecs[i * my_nx] >= (uint32_t) (bw->m - nrhs));
417 for(unsigned int j = 1 ; j < my_nx ; j++) {
418 xvecs[i * my_nx + j] = (1009 * (i * my_nx + j)) % mmt->n0[!bw->dir];
419 }
420 }
421 for(int i = (int) nrhs ; i < bw->m ; i++) {
422 xvecs[i * my_nx] = i - nrhs;
423 for(unsigned int j = 1 ; j < my_nx ; j++) {
424 xvecs[i * my_nx + j] = (1009 * (i * my_nx + j)) % mmt->n0[!bw->dir];
425 }
426 }
427 /* save_x operates only on the leader thread */
428 save_x(xvecs, bw->m, my_nx, pi);
429 free(xvecs);
430 }
431
432 leave_prep_prog_gfp:
433 matmul_top_clear(mmt);
434
435 A->oo_field_clear(A);
436 return NULL;
437 }
438
439 // coverity[root_function]
main(int argc,char * argv[])440 int main(int argc, char * argv[])
441 {
442 param_list pl;
443
444 bw_common_init(bw, &argc, &argv);
445 param_list_init(pl);
446 parallelizing_info_init();
447
448 bw_common_decl_usage(pl);
449 parallelizing_info_decl_usage(pl);
450 matmul_top_decl_usage(pl);
451 /* declare local parameters and switches */
452 param_list_decl_usage(pl, "rhs",
453 "file with the right-hand side vectors for inhomogeneous systems mod p");
454
455 bw_common_parse_cmdline(bw, pl, &argc, &argv);
456
457 /* This program does not support interleaving */
458 param_list_remove_key(pl, "interleaving");
459
460 bw_common_interpret_parameters(bw, pl);
461 parallelizing_info_lookup_parameters(pl);
462 matmul_top_lookup_parameters(pl);
463 /* interpret our parameters: none here (so far). */
464 if (mpz_cmp_ui(bw->p, 2) != 0)
465 param_list_lookup_string(pl, "rhs");
466
467 ASSERT_ALWAYS(!param_list_lookup_string(pl, "ys"));
468 ASSERT_ALWAYS(!param_list_lookup_string(pl, "solutions"));
469
470 if (param_list_warn_unused(pl)) {
471 int rank;
472 MPI_Comm_rank(MPI_COMM_WORLD, &rank);
473 if (!rank) param_list_print_usage(pl, bw->original_argv[0], stderr);
474 MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
475 }
476
477 if (mpz_cmp_ui(bw->p, 2) == 0) {
478 pi_go(prep_prog, pl, 0);
479 } else {
480 pi_go(prep_prog_gfp, pl, 0);
481 }
482
483 parallelizing_info_finish();
484 param_list_clear(pl);
485 bw_common_clear(bw);
486
487 return 0;
488 }
489
490