1 #include "cado.h" // IWYU pragma: keep
2 // IWYU pragma: no_include <sys/param.h>
3 #include <cstdio> // for printf, fprintf, stderr
4 #include <cstdlib> // for exit, qsort, EXIT_FAILURE
5 #include <tuple> // for tie, tuple
6 #include <vector> // for vector
7 #include "cxx_mpz.hpp"
8 #include "lingen_abfield.hpp" // IWYU pragma: keep
9 #include "lingen_bmstatus.hpp" // for bmstatus
10 #include "lingen_bw_dimensions.hpp"
11 #include "lingen_call_companion.hpp" // for lingen_call_companion
12 #include "lingen_expected_pi_length.hpp"
13 #include "lingen_qcode_prime.hpp"
14 #include "macros.h" // for ASSERT_ALWAYS, ASSERT, icei...
15 #include "tree_stats.hpp" // for tree_stats, tree_stats::sen...
16
17 /* This destructively cancels the first len coefficients of E, and
18 * computes the appropriate matrix pi which achieves this. The
19 * elimination is done in accordance with the nominal degrees found in
20 * delta.
21 *
22 * The result is expected to have degree ceil(len*m/b) coefficients, so
23 * that E*pi is divisible by X^len.
24 */
25
26 /*{{{ basecase */
27
28 /* {{{ col sorting */
29 /* We sort only with respect to the global delta[] parameter. As it turns
30 * out, we also access the column index in the same aray and sort with
31 * respect to it, but this is only cosmetic.
32 *
33 * Note that unlike what is asserted in other coiped of the code, sorting
34 * w.r.t. the local delta[] value is completely useless. Code which
35 * relies on this should be fixed.
36 */
37
38 typedef int (*sortfunc_t) (const void*, const void*);
39
lexcmp2(const int x[2],const int y[2])40 static int lexcmp2(const int x[2], const int y[2])
41 {
42 for(int i = 0 ; i < 2 ; i++) {
43 int d = x[i] - y[i];
44 if (d) return d;
45 }
46 return 0;
47 }
48
49 /* }}} */
50
51 /*}}}*/
52
bw_lingen_basecase_raw(bmstatus & bm,matpoly const & E)53 matpoly bw_lingen_basecase_raw(bmstatus & bm, matpoly const & E) /*{{{*/
54 {
55 int generator_found = 0;
56
57 bw_dimensions & d = bm.d;
58 unsigned int m = d.m;
59 unsigned int n = d.n;
60 unsigned int b = m + n;
61 abdst_field ab = d.ab;
62 ASSERT(E.m == m);
63 ASSERT(E.n == b);
64
65 /* Allocate something large enough for the result. This will be
66 * soon freed anyway. Set it to identity. */
67 unsigned int mi, ma;
68 std::tie(mi, ma) = get_minmax_delta(bm.delta);
69
70 unsigned int pi_room_base = expected_pi_length(d, bm.delta, E.get_size());
71
72 matpoly pi(ab, b, b, pi_room_base);
73 pi.set_size(pi_room_base);
74
75 /* Also keep track of the
76 * number of coefficients for the columns of pi. Set pi to Id */
77
78 std::vector<unsigned int> pi_lengths(b, 1);
79 std::vector<unsigned int> pi_real_lengths(b, 1);
80 pi.set_constant_ui(1);
81
82 for(unsigned int i = 0 ; i < b ; i++) {
83 pi_lengths[i] = 1;
84 /* Fix for check 21744-bis. Our column priority will allow adding
85 * to a row if its delta is above the average. But then this
86 * might surprise us, because in truth we dont start with
87 * something of large degree here. So we're bumping pi_length a
88 * little bit to accomodate for this situation.
89 */
90 pi_lengths[i] += bm.delta[i] - mi;
91 }
92
93 /* Keep a list of columns which have been used as pivots at the
94 * previous iteration */
95 std::vector<unsigned int> pivots(m, 0);
96 std::vector<int> is_pivot(b, 0);
97
98 matpoly e(ab, m, b, 1);
99 e.set_size(1);
100
101 matpoly T(ab, b, b, 1);
102 int (*ctable)[2] = new int[b][2];
103
104 for (unsigned int t = 0; t < E.get_size() ; t++, bm.t++) {
105
106 /* {{{ Update the columns of e for degree t. Save computation
107 * time by not recomputing those which can easily be derived from
108 * previous iteration. Notice that the columns of e are exactly
109 * at the physical positions of the corresponding columns of pi.
110 */
111
112 std::vector<unsigned int> todo;
113 for(unsigned int j = 0 ; j < b ; j++) {
114 if (is_pivot[j]) continue;
115 /* We should never have to recompute from pi using discarded
116 * columns. Discarded columns should always correspond to
117 * pivots */
118 ASSERT_ALWAYS(bm.lucky[j] >= 0);
119 todo.push_back(j);
120 }
121 /* icc openmp doesn't grok todo.size() as being a constant
122 * loop bound */
123 unsigned int nj = todo.size();
124
125 #ifdef HAVE_OPENMP
126 #pragma omp parallel
127 #endif
128 {
129 abelt_ur tmp_ur;
130 abelt_ur_init(ab, &tmp_ur);
131
132 abelt_ur e_ur;
133 abelt_ur_init(ab, &e_ur);
134
135 #ifdef HAVE_OPENMP
136 #pragma omp for collapse(2)
137 #endif
138 for(unsigned int jl = 0 ; jl < nj ; ++jl) {
139 for(unsigned int i = 0 ; i < m ; ++i) {
140 unsigned int j = todo[jl];
141 unsigned int lj = MIN(pi_real_lengths[j], t + 1);
142 abelt_ur_set_zero(ab, e_ur);
143 for(unsigned int k = 0 ; k < b ; ++k) {
144 for(unsigned int s = 0 ; s < lj ; s++) {
145 abmul_ur(ab, tmp_ur,
146 E.coeff(i, k, t - s),
147 pi.coeff(k, j, s));
148 abelt_ur_add(ab, e_ur, e_ur, tmp_ur);
149 }
150 }
151 abreduce(ab, e.coeff(i, j, 0), e_ur);
152 }
153 }
154
155 abelt_ur_clear(ab, &tmp_ur);
156 abelt_ur_clear(ab, &e_ur);
157 }
158 /* }}} */
159
160 /* {{{ check for cancellations */
161
162 unsigned int newluck = 0;
163 for(unsigned int jl = 0 ; jl < todo.size() ; ++jl) {
164 unsigned int j = todo[jl];
165 unsigned int nz = 0;
166 for(unsigned int i = 0 ; i < m ; i++) {
167 nz += abcmp_ui(ab, e.coeff(i, j, 0), 0) == 0;
168 }
169 if (nz == m) {
170 newluck++, bm.lucky[j]++;
171 } else if (bm.lucky[j] > 0) {
172 bm.lucky[j] = 0;
173 }
174 }
175
176
177 if (newluck) {
178 /* If newluck == n, then we probably have a generator. We add an
179 * extra guarantee. newluck==n, for a total of k iterations in a
180 * row, means m*n*k coefficients cancelling magically. We would
181 * like this to be impossible by mere chance. Thus we want n*k >
182 * luck_mini, which can easily be checked */
183
184 int luck_mini = expected_pi_length(d);
185 unsigned int luck_sure = 0;
186
187 printf("t=%d, canceled columns:", bm.t);
188 for(unsigned int j = 0 ; j < b ; j++) {
189 if (bm.lucky[j] > 0) {
190 printf(" %u", j);
191 luck_sure += bm.lucky[j] >= luck_mini;
192 }
193 }
194
195 if (newluck == n && luck_sure == n) {
196 if (!generator_found) {
197 printf(", complete generator found, for sure");
198 }
199 generator_found = 1;
200 }
201 printf(".\n");
202 }
203 /* }}} */
204
205 if (generator_found) break;
206
207 /* {{{ Now see in which order I may look at the columns of pi, so
208 * as to keep the nominal degrees correct. In contrast with what
209 * we used to do before, we no longer apply the permutation to
210 * delta. So the delta[] array keeps referring to physical
211 * indices, and we'll tune this in the end. */
212 for(unsigned int j = 0; j < b; j++) {
213 ctable[j][0] = bm.delta[j];
214 ctable[j][1] = j;
215 }
216 qsort(ctable, b, 2 * sizeof(int), (sortfunc_t) & lexcmp2);
217 /* }}} */
218
219 /* {{{ Now do Gaussian elimination */
220
221 /*
222 * The matrix T is *not* used for actually storing the product of
223 * the transvections, just the *list* of transvections. Then,
224 * instead of applying them row-major, we apply them column-major
225 * (abiding by the ordering of pivots), so that we get a better
226 * opportunity to do lazy reductions.
227 */
228
229 T.set_constant_ui(1);
230
231 is_pivot.assign(b, 0);
232 unsigned int r = 0;
233
234 std::vector<unsigned int> pivot_columns;
235 /* Loop through logical indices */
236 for(unsigned int jl = 0; jl < b; jl++) {
237 unsigned int j = ctable[jl][1];
238 unsigned int u = 0;
239 /* {{{ Find the pivot */
240 for( ; u < m ; u++) {
241 if (abcmp_ui(ab, e.coeff(u, j, 0), 0) != 0)
242 break;
243 }
244 if (u == m) continue;
245 ASSERT(r < m);
246 /* }}} */
247 pivots[r++] = j;
248 is_pivot[j] = 1;
249 pivot_columns.push_back(j);
250 /* {{{ Cancel this coeff in all other columns. */
251 abelt inv;
252 abinit(ab, &inv);
253 int rc = abinv(ab, inv, e.coeff(u, j, 0));
254 if (!rc) {
255 fprintf(stderr, "Error, found a factor of the modulus: ");
256 abfprint(ab, stderr, inv);
257 fprintf(stderr, "\n");
258 exit(EXIT_FAILURE);
259 }
260 abneg(ab, inv, inv);
261 for (unsigned int kl = jl + 1; kl < b ; kl++) {
262 unsigned int k = ctable[kl][1];
263 if (abcmp_ui(ab, e.coeff(u, k, 0), 0) == 0)
264 continue;
265 // add lambda = e[u,k]*-e[u,j]^-1 times col j to col k.
266 abelt lambda;
267 abinit(ab, &lambda);
268 abmul(ab, lambda, inv, e.coeff(u, k, 0));
269 ASSERT(bm.delta[j] <= bm.delta[k]);
270 /* {{{ Apply on both e and pi */
271 abelt tmp;
272 abinit(ab, &tmp);
273 for(unsigned int i = 0 ; i < m ; i++) {
274 /* TODO: Would be better if mpfq had an addmul */
275 abmul(ab, tmp, lambda, e.coeff(i, j, 0));
276 abadd(ab,
277 e.coeff(i, k, 0),
278 e.coeff(i, k, 0),
279 tmp);
280 }
281 if (bm.lucky[k] < 0) {
282 /* This column is already discarded, don't bother */
283 continue;
284 }
285 if (bm.lucky[j] < 0) {
286 /* This column is discarded. This is going to
287 * invalidate another column of pi. Not a problem,
288 * unless it's been marked as lucky previously ! */
289 ASSERT_ALWAYS(bm.lucky[k] <= 0);
290 printf("Column %u discarded from now on (through addition from column %u)\n", k, j);
291 bm.lucky[k] = -1;
292 continue;
293 }
294 /* We do *NOT* really update T. T is only used as
295 * storage!
296 */
297 abset(ab, T.coeff(j, k, 0), lambda);
298 abclear(ab, &tmp);
299 /* }}} */
300 abclear(ab, &lambda);
301 }
302 abclear(ab, &inv); /* }}} */
303 }
304 /* }}} */
305
306 /* {{{ apply the transformations, using the transvection
307 * reordering trick */
308
309 /* non-pivot columns are only added to and never read, so it does
310 * not really matter where we put their computation, provided
311 * that the columns that we do read are done at this point.
312 */
313 for(unsigned int jl = 0; jl < b; jl++) {
314 unsigned int j = ctable[jl][1];
315 if (!is_pivot[j])
316 pivot_columns.push_back(j);
317 }
318
319 #ifdef HAVE_OPENMP
320 #pragma omp parallel
321 #endif
322 {
323 abelt_ur tmp_pi;
324 abelt_ur tmp;
325 abelt_ur_init(ab, &tmp);
326 abelt_ur_init(ab, &tmp_pi);
327
328 for(unsigned int jl = 0 ; jl < b ; ++jl) {
329 unsigned int j = pivot_columns[jl];
330 /* compute column j completely. We may put this interface in
331 * matpoly, but it's really special-purposed, to the point
332 * that it really makes little sense IMO
333 *
334 * Beware: operations on the different columns are *not*
335 * independent, here ! Operations on the different degrees,
336 * on the other hand, are. As well of course as the
337 * operations on the different entries in each column.
338 */
339
340 #ifndef NDEBUG
341 #ifdef HAVE_OPENMP
342 #pragma omp critical
343 #endif
344 {
345 for(unsigned int kl = m ; kl < b ; kl++) {
346 unsigned int k = pivot_columns[kl];
347 absrc_elt Tkj = T.coeff(k, j, 0);
348 ASSERT_ALWAYS(abcmp_ui(ab, Tkj, k==j) == 0);
349 }
350 for(unsigned int kl = 0 ; kl < MIN(m,jl) ; kl++) {
351 unsigned int k = pivot_columns[kl];
352 absrc_elt Tkj = T.coeff(k, j, 0);
353 if (abcmp_ui(ab, Tkj, 0) == 0) continue;
354 ASSERT_ALWAYS(pi_lengths[k] <= pi_lengths[j]);
355 pi_real_lengths[j] = std::max(pi_real_lengths[k], pi_real_lengths[j]);
356 }
357 }
358 #endif
359
360 /* Icc 2019 synthetizes a pragma omp single around
361 * accesses to pi_real_lengths. I don't think it makes
362 * sense in that particular case, it's fine enough here
363 * to read the data now after the critical section above.
364 */
365 unsigned int dummy = pi_real_lengths[j];
366
367 #ifdef HAVE_OPENMP
368 #pragma omp for collapse(2)
369 #endif
370 for(unsigned int i = 0 ; i < b ; i++) {
371 for(unsigned int s = 0 ; s < dummy ; s++) {
372 abdst_elt piijs = pi.coeff(i, j, s);
373
374 abelt_ur_set_elt(ab, tmp_pi, piijs);
375
376 for(unsigned int kl = 0 ; kl < MIN(m,jl) ; kl++) {
377 unsigned int k = pivot_columns[kl];
378 /* TODO: if column k was already a pivot on previous
379 * turn (which could happen, depending on m and n),
380 * then the corresponding entry is probably zero
381 * (exact condition needs to be written more
382 * accurately).
383 */
384
385 absrc_elt Tkj = T.coeff(k, j, 0);
386 if (abcmp_ui(ab, Tkj, 0) == 0) continue;
387 /* pi[i,k] has length pi_lengths[k]. Multiply
388 * that by T[k,j], which is a constant. Add
389 * to the unreduced thing. We don't have an
390 * mpfq api call for that operation.
391 */
392 absrc_elt piiks = pi.coeff(i, k, s);
393 abmul_ur(ab, tmp, piiks, Tkj);
394 abelt_ur_add(ab, tmp_pi, tmp_pi, tmp);
395 }
396 abreduce(ab, piijs, tmp_pi);
397 }
398 }
399 }
400 abelt_ur_clear(ab, &tmp);
401 abelt_ur_clear(ab, &tmp_pi);
402 }
403 /* }}} */
404
405 ASSERT_ALWAYS(r == m);
406
407 /* {{{ Now for all pivots, multiply column in pi by x */
408 for (unsigned int j = 0; j < b ; j++) {
409 if (!is_pivot[j]) continue;
410 if (pi_real_lengths[j] >= pi.capacity()) {
411 if (!generator_found) {
412 pi.realloc(pi.capacity() + MAX(pi.capacity() / (m+n), 1));
413 printf("t=%u, expanding allocation for pi (now %zu%%) ; lengths: ",
414 bm.t,
415 100 * pi.capacity() / pi_room_base);
416 for(unsigned int j = 0; j < b; j++)
417 printf(" %u", pi_real_lengths[j]);
418 printf("\n");
419 } else {
420 ASSERT_ALWAYS(bm.lucky[j] <= 0);
421 if (bm.lucky[j] == 0)
422 printf("t=%u, column %u discarded from now on\n",
423 bm.t, j);
424 bm.lucky[j] = -1;
425 pi_lengths[j]++;
426 pi_real_lengths[j]++;
427 bm.delta[j]++;
428 continue;
429 }
430 }
431 pi.multiply_column_by_x(j, pi_real_lengths[j]);
432 pi_real_lengths[j]++;
433 pi_lengths[j]++;
434 bm.delta[j]++;
435 }
436 /* }}} */
437 }
438
439 unsigned int pisize = 0;
440 for(unsigned int j = 0; j < b; j++) {
441 if (pi_real_lengths[j] > pisize)
442 pisize = pi_real_lengths[j];
443 }
444 /* Given the structure of the computation, there's no reason for the
445 * initial estimate to go wrong.
446 */
447 ASSERT_ALWAYS(pisize <= pi.capacity());
448 pi.set_size(pisize);
449
450 for(unsigned int j = 0; j < b; j++) {
451 for(unsigned int k = pi_real_lengths[j] ; k < pi.get_size() ; k++) {
452 for(unsigned int i = 0 ; i < b ; i++) {
453 ASSERT_ALWAYS(abis_zero(ab, pi.coeff(i, j, k)));
454 }
455 }
456 }
457
458 bm.done = generator_found;
459 return pi;
460 }
461 /* }}} */
462
463 /* wrap this up */
464 matpoly
bw_lingen_basecase(bmstatus & bm,matpoly & E)465 bw_lingen_basecase(bmstatus & bm, matpoly & E)
466 {
467 lingen_call_companion const & C = bm.companion(bm.depth(), E.get_size());
468 tree_stats::sentinel dummy(bm.stats, "basecase", E.get_size(), C.total_ncalls, true);
469 bm.stats.plan_smallstep("basecase", C.ttb);
470 bm.stats.begin_smallstep("basecase");
471 matpoly pi = bw_lingen_basecase_raw(bm, E);
472 bm.stats.end_smallstep();
473 E = matpoly();
474 return pi;
475 }
476
test_basecase(abdst_field ab,unsigned int m,unsigned int n,size_t L,gmp_randstate_t rstate)477 void test_basecase(abdst_field ab, unsigned int m, unsigned int n, size_t L, gmp_randstate_t rstate)/*{{{*/
478 {
479 /* used by testing code */
480 bmstatus bm(m,n);
481 cxx_mpz p;
482 abfield_characteristic(ab, p);
483 abfield_specify(bm.d.ab, MPFQ_PRIME_MPZ, (mpz_srcptr) p);
484 unsigned int t0 = iceildiv(m,n);
485 bm.set_t0(t0);
486 matpoly E(ab, m, m+n, L);
487 E.zero_pad(L);
488 E.fill_random(0, L, rstate);
489 bw_lingen_basecase_raw(bm, E);
490 }/*}}}*/
491