1 #include "cado.h" // IWYU pragma: keep
2 // IWYU pragma: no_include <sys/param.h>
3 #include <cstdio>                        // for printf, fprintf, stderr
4 #include <cstdlib>                       // for exit, qsort, EXIT_FAILURE
5 #include <tuple>                          // for tie, tuple
6 #include <vector>                         // for vector
7 #include "cxx_mpz.hpp"
8 #include "lingen_abfield.hpp" // IWYU pragma: keep
9 #include "lingen_bmstatus.hpp"            // for bmstatus
10 #include "lingen_bw_dimensions.hpp"
11 #include "lingen_call_companion.hpp"      // for lingen_call_companion
12 #include "lingen_expected_pi_length.hpp"
13 #include "lingen_qcode_prime.hpp"
14 #include "macros.h"                       // for ASSERT_ALWAYS, ASSERT, icei...
15 #include "tree_stats.hpp"                 // for tree_stats, tree_stats::sen...
16 
17 /* This destructively cancels the first len coefficients of E, and
18  * computes the appropriate matrix pi which achieves this. The
19  * elimination is done in accordance with the nominal degrees found in
20  * delta.
21  *
22  * The result is expected to have degree ceil(len*m/b) coefficients, so
23  * that E*pi is divisible by X^len.
24  */
25 
26 /*{{{ basecase */
27 
28 /* {{{ col sorting */
29 /* We sort only with respect to the global delta[] parameter. As it turns
30  * out, we also access the column index in the same aray and sort with
31  * respect to it, but this is only cosmetic.
32  *
33  * Note that unlike what is asserted in other coiped of the code, sorting
34  * w.r.t. the local delta[] value is completely useless. Code which
35  * relies on this should be fixed.
36  */
37 
38 typedef int (*sortfunc_t) (const void*, const void*);
39 
lexcmp2(const int x[2],const int y[2])40 static int lexcmp2(const int x[2], const int y[2])
41 {
42     for(int i = 0 ; i < 2 ; i++) {
43         int d = x[i] - y[i];
44         if (d) return d;
45     }
46     return 0;
47 }
48 
49 /* }}} */
50 
51 /*}}}*/
52 
bw_lingen_basecase_raw(bmstatus & bm,matpoly const & E)53 matpoly bw_lingen_basecase_raw(bmstatus & bm, matpoly const & E) /*{{{*/
54 {
55     int generator_found = 0;
56 
57     bw_dimensions & d = bm.d;
58     unsigned int m = d.m;
59     unsigned int n = d.n;
60     unsigned int b = m + n;
61     abdst_field ab = d.ab;
62     ASSERT(E.m == m);
63     ASSERT(E.n == b);
64 
65     /* Allocate something large enough for the result. This will be
66      * soon freed anyway. Set it to identity. */
67     unsigned int mi, ma;
68     std::tie(mi, ma) = get_minmax_delta(bm.delta);
69 
70     unsigned int pi_room_base = expected_pi_length(d, bm.delta, E.get_size());
71 
72     matpoly pi(ab, b, b, pi_room_base);
73     pi.set_size(pi_room_base);
74 
75     /* Also keep track of the
76      * number of coefficients for the columns of pi. Set pi to Id */
77 
78     std::vector<unsigned int> pi_lengths(b, 1);
79     std::vector<unsigned int> pi_real_lengths(b, 1);
80     pi.set_constant_ui(1);
81 
82     for(unsigned int i = 0 ; i < b ; i++) {
83         pi_lengths[i] = 1;
84         /* Fix for check 21744-bis. Our column priority will allow adding
85          * to a row if its delta is above the average. But then this
86          * might surprise us, because in truth we dont start with
87          * something of large degree here. So we're bumping pi_length a
88          * little bit to accomodate for this situation.
89          */
90         pi_lengths[i] += bm.delta[i] - mi;
91     }
92 
93     /* Keep a list of columns which have been used as pivots at the
94      * previous iteration */
95     std::vector<unsigned int> pivots(m, 0);
96     std::vector<int> is_pivot(b, 0);
97 
98     matpoly e(ab, m, b, 1);
99     e.set_size(1);
100 
101     matpoly T(ab, b, b, 1);
102     int (*ctable)[2] = new int[b][2];
103 
104     for (unsigned int t = 0; t < E.get_size() ; t++, bm.t++) {
105 
106         /* {{{ Update the columns of e for degree t. Save computation
107          * time by not recomputing those which can easily be derived from
108          * previous iteration. Notice that the columns of e are exactly
109          * at the physical positions of the corresponding columns of pi.
110          */
111 
112         std::vector<unsigned int> todo;
113         for(unsigned int j = 0 ; j < b ; j++) {
114             if (is_pivot[j]) continue;
115             /* We should never have to recompute from pi using discarded
116              * columns. Discarded columns should always correspond to
117              * pivots */
118             ASSERT_ALWAYS(bm.lucky[j] >= 0);
119             todo.push_back(j);
120         }
121         /* icc openmp doesn't grok todo.size() as being a constant
122          * loop bound */
123         unsigned int nj = todo.size();
124 
125 #ifdef HAVE_OPENMP
126 #pragma omp parallel
127 #endif
128         {
129             abelt_ur tmp_ur;
130             abelt_ur_init(ab, &tmp_ur);
131 
132             abelt_ur e_ur;
133             abelt_ur_init(ab, &e_ur);
134 
135 #ifdef HAVE_OPENMP
136 #pragma omp for collapse(2)
137 #endif
138             for(unsigned int jl = 0 ; jl < nj ; ++jl) {
139                 for(unsigned int i = 0 ; i < m ; ++i) {
140                     unsigned int j = todo[jl];
141                     unsigned int lj = MIN(pi_real_lengths[j], t + 1);
142                     abelt_ur_set_zero(ab, e_ur);
143                     for(unsigned int k = 0 ; k < b ; ++k) {
144                         for(unsigned int s = 0 ; s < lj ; s++) {
145                             abmul_ur(ab, tmp_ur,
146                                     E.coeff(i, k, t - s),
147                                     pi.coeff(k, j, s));
148                             abelt_ur_add(ab, e_ur, e_ur, tmp_ur);
149                         }
150                     }
151                     abreduce(ab, e.coeff(i, j, 0), e_ur);
152                 }
153             }
154 
155             abelt_ur_clear(ab, &tmp_ur);
156             abelt_ur_clear(ab, &e_ur);
157         }
158         /* }}} */
159 
160         /* {{{ check for cancellations */
161 
162         unsigned int newluck = 0;
163         for(unsigned int jl = 0 ; jl < todo.size() ; ++jl) {
164             unsigned int j = todo[jl];
165             unsigned int nz = 0;
166             for(unsigned int i = 0 ; i < m ; i++) {
167                 nz += abcmp_ui(ab, e.coeff(i, j, 0), 0) == 0;
168             }
169             if (nz == m) {
170                 newluck++, bm.lucky[j]++;
171             } else if (bm.lucky[j] > 0) {
172                 bm.lucky[j] = 0;
173             }
174         }
175 
176 
177         if (newluck) {
178             /* If newluck == n, then we probably have a generator. We add an
179              * extra guarantee. newluck==n, for a total of k iterations in a
180              * row, means m*n*k coefficients cancelling magically. We would
181              * like this to be impossible by mere chance. Thus we want n*k >
182              * luck_mini, which can easily be checked */
183 
184             int luck_mini = expected_pi_length(d);
185             unsigned int luck_sure = 0;
186 
187             printf("t=%d, canceled columns:", bm.t);
188             for(unsigned int j = 0 ; j < b ; j++) {
189                 if (bm.lucky[j] > 0) {
190                     printf(" %u", j);
191                     luck_sure += bm.lucky[j] >= luck_mini;
192                 }
193             }
194 
195             if (newluck == n && luck_sure == n) {
196                 if (!generator_found) {
197                     printf(", complete generator found, for sure");
198                 }
199                 generator_found = 1;
200             }
201             printf(".\n");
202         }
203         /* }}} */
204 
205         if (generator_found) break;
206 
207         /* {{{ Now see in which order I may look at the columns of pi, so
208          * as to keep the nominal degrees correct. In contrast with what
209          * we used to do before, we no longer apply the permutation to
210          * delta. So the delta[] array keeps referring to physical
211          * indices, and we'll tune this in the end. */
212         for(unsigned int j = 0; j < b; j++) {
213             ctable[j][0] = bm.delta[j];
214             ctable[j][1] = j;
215         }
216         qsort(ctable, b, 2 * sizeof(int), (sortfunc_t) & lexcmp2);
217         /* }}} */
218 
219         /* {{{ Now do Gaussian elimination */
220 
221         /*
222          * The matrix T is *not* used for actually storing the product of
223          * the transvections, just the *list* of transvections. Then,
224          * instead of applying them row-major, we apply them column-major
225          * (abiding by the ordering of pivots), so that we get a better
226          * opportunity to do lazy reductions.
227          */
228 
229         T.set_constant_ui(1);
230 
231         is_pivot.assign(b, 0);
232         unsigned int r = 0;
233 
234         std::vector<unsigned int> pivot_columns;
235         /* Loop through logical indices */
236         for(unsigned int jl = 0; jl < b; jl++) {
237             unsigned int j = ctable[jl][1];
238             unsigned int u = 0;
239             /* {{{ Find the pivot */
240             for( ; u < m ; u++) {
241                 if (abcmp_ui(ab, e.coeff(u, j, 0), 0) != 0)
242                     break;
243             }
244             if (u == m) continue;
245             ASSERT(r < m);
246             /* }}} */
247             pivots[r++] = j;
248             is_pivot[j] = 1;
249             pivot_columns.push_back(j);
250             /* {{{ Cancel this coeff in all other columns. */
251             abelt inv;
252             abinit(ab, &inv);
253             int rc = abinv(ab, inv, e.coeff(u, j, 0));
254             if (!rc) {
255                 fprintf(stderr, "Error, found a factor of the modulus: ");
256                 abfprint(ab, stderr, inv);
257                 fprintf(stderr, "\n");
258                 exit(EXIT_FAILURE);
259             }
260             abneg(ab, inv, inv);
261             for (unsigned int kl = jl + 1; kl < b ; kl++) {
262                 unsigned int k = ctable[kl][1];
263                 if (abcmp_ui(ab, e.coeff(u, k, 0), 0) == 0)
264                     continue;
265                 // add lambda = e[u,k]*-e[u,j]^-1 times col j to col k.
266                 abelt lambda;
267                 abinit(ab, &lambda);
268                 abmul(ab, lambda, inv, e.coeff(u, k, 0));
269                 ASSERT(bm.delta[j] <= bm.delta[k]);
270                 /* {{{ Apply on both e and pi */
271                 abelt tmp;
272                 abinit(ab, &tmp);
273                 for(unsigned int i = 0 ; i < m ; i++) {
274                     /* TODO: Would be better if mpfq had an addmul */
275                     abmul(ab, tmp, lambda, e.coeff(i, j, 0));
276                     abadd(ab,
277                             e.coeff(i, k, 0),
278                             e.coeff(i, k, 0),
279                             tmp);
280                 }
281                 if (bm.lucky[k] < 0) {
282                     /* This column is already discarded, don't bother */
283                     continue;
284                 }
285                 if (bm.lucky[j] < 0) {
286                     /* This column is discarded. This is going to
287                      * invalidate another column of pi. Not a problem,
288                      * unless it's been marked as lucky previously ! */
289                     ASSERT_ALWAYS(bm.lucky[k] <= 0);
290                     printf("Column %u discarded from now on (through addition from column %u)\n", k, j);
291                     bm.lucky[k] = -1;
292                     continue;
293                 }
294                 /* We do *NOT* really update T. T is only used as
295                  * storage!
296                  */
297                 abset(ab, T.coeff(j, k, 0), lambda);
298                 abclear(ab, &tmp);
299                 /* }}} */
300                 abclear(ab, &lambda);
301             }
302             abclear(ab, &inv); /* }}} */
303         }
304         /* }}} */
305 
306         /* {{{ apply the transformations, using the transvection
307          * reordering trick */
308 
309         /* non-pivot columns are only added to and never read, so it does
310          * not really matter where we put their computation, provided
311          * that the columns that we do read are done at this point.
312          */
313         for(unsigned int jl = 0; jl < b; jl++) {
314             unsigned int j = ctable[jl][1];
315             if (!is_pivot[j])
316                 pivot_columns.push_back(j);
317         }
318 
319 #ifdef HAVE_OPENMP
320 #pragma omp parallel
321 #endif
322         {
323             abelt_ur tmp_pi;
324             abelt_ur tmp;
325             abelt_ur_init(ab, &tmp);
326             abelt_ur_init(ab, &tmp_pi);
327 
328             for(unsigned int jl = 0 ; jl < b ; ++jl) {
329                 unsigned int j = pivot_columns[jl];
330                 /* compute column j completely. We may put this interface in
331                  * matpoly, but it's really special-purposed, to the point
332                  * that it really makes little sense IMO
333                  *
334                  * Beware: operations on the different columns are *not*
335                  * independent, here ! Operations on the different degrees,
336                  * on the other hand, are. As well of course as the
337                  * operations on the different entries in each column.
338                  */
339 
340 #ifndef NDEBUG
341 #ifdef HAVE_OPENMP
342 #pragma omp critical
343 #endif
344                 {
345                     for(unsigned int kl = m ; kl < b ; kl++) {
346                         unsigned int k = pivot_columns[kl];
347                         absrc_elt Tkj = T.coeff(k, j, 0);
348                         ASSERT_ALWAYS(abcmp_ui(ab, Tkj, k==j) == 0);
349                     }
350                     for(unsigned int kl = 0 ; kl < MIN(m,jl) ; kl++) {
351                         unsigned int k = pivot_columns[kl];
352                         absrc_elt Tkj = T.coeff(k, j, 0);
353                         if (abcmp_ui(ab, Tkj, 0) == 0) continue;
354                         ASSERT_ALWAYS(pi_lengths[k] <= pi_lengths[j]);
355                         pi_real_lengths[j] = std::max(pi_real_lengths[k], pi_real_lengths[j]);
356                     }
357                 }
358 #endif
359 
360                 /* Icc 2019 synthetizes a pragma omp single around
361                  * accesses to pi_real_lengths. I don't think it makes
362                  * sense in that particular case, it's fine enough here
363                  * to read the data now after the critical section above.
364                  */
365                 unsigned int dummy = pi_real_lengths[j];
366 
367 #ifdef HAVE_OPENMP
368 #pragma omp for collapse(2)
369 #endif
370                 for(unsigned int i = 0 ; i < b ; i++) {
371                     for(unsigned int s = 0 ; s < dummy ; s++) {
372                         abdst_elt piijs = pi.coeff(i, j, s);
373 
374                         abelt_ur_set_elt(ab, tmp_pi, piijs);
375 
376                         for(unsigned int kl = 0 ; kl < MIN(m,jl) ; kl++) {
377                             unsigned int k = pivot_columns[kl];
378                             /* TODO: if column k was already a pivot on previous
379                              * turn (which could happen, depending on m and n),
380                              * then the corresponding entry is probably zero
381                              * (exact condition needs to be written more
382                              * accurately).
383                              */
384 
385                             absrc_elt Tkj = T.coeff(k, j, 0);
386                             if (abcmp_ui(ab, Tkj, 0) == 0) continue;
387                             /* pi[i,k] has length pi_lengths[k]. Multiply
388                              * that by T[k,j], which is a constant. Add
389                              * to the unreduced thing. We don't have an
390                              * mpfq api call for that operation.
391                              */
392                             absrc_elt piiks = pi.coeff(i, k, s);
393                             abmul_ur(ab, tmp, piiks, Tkj);
394                             abelt_ur_add(ab, tmp_pi, tmp_pi, tmp);
395                         }
396                         abreduce(ab, piijs, tmp_pi);
397                     }
398                 }
399             }
400             abelt_ur_clear(ab, &tmp);
401             abelt_ur_clear(ab, &tmp_pi);
402         }
403         /* }}} */
404 
405         ASSERT_ALWAYS(r == m);
406 
407         /* {{{ Now for all pivots, multiply column in pi by x */
408         for (unsigned int j = 0; j < b ; j++) {
409             if (!is_pivot[j]) continue;
410             if (pi_real_lengths[j] >= pi.capacity()) {
411                 if (!generator_found) {
412                     pi.realloc(pi.capacity() + MAX(pi.capacity() / (m+n), 1));
413                     printf("t=%u, expanding allocation for pi (now %zu%%) ; lengths: ",
414                             bm.t,
415                             100 * pi.capacity() / pi_room_base);
416                     for(unsigned int j = 0; j < b; j++)
417                         printf(" %u", pi_real_lengths[j]);
418                     printf("\n");
419                 } else {
420                     ASSERT_ALWAYS(bm.lucky[j] <= 0);
421                     if (bm.lucky[j] == 0)
422                         printf("t=%u, column %u discarded from now on\n",
423                                 bm.t, j);
424                     bm.lucky[j] = -1;
425                     pi_lengths[j]++;
426                     pi_real_lengths[j]++;
427                     bm.delta[j]++;
428                     continue;
429                 }
430             }
431             pi.multiply_column_by_x(j, pi_real_lengths[j]);
432             pi_real_lengths[j]++;
433             pi_lengths[j]++;
434             bm.delta[j]++;
435         }
436         /* }}} */
437     }
438 
439     unsigned int pisize = 0;
440     for(unsigned int j = 0; j < b; j++) {
441         if (pi_real_lengths[j] > pisize)
442             pisize = pi_real_lengths[j];
443     }
444     /* Given the structure of the computation, there's no reason for the
445      * initial estimate to go wrong.
446      */
447     ASSERT_ALWAYS(pisize <= pi.capacity());
448     pi.set_size(pisize);
449 
450     for(unsigned int j = 0; j < b; j++) {
451         for(unsigned int k = pi_real_lengths[j] ; k < pi.get_size() ; k++) {
452             for(unsigned int i = 0 ; i < b ; i++) {
453                 ASSERT_ALWAYS(abis_zero(ab, pi.coeff(i, j, k)));
454             }
455         }
456     }
457 
458     bm.done = generator_found;
459     return pi;
460 }
461 /* }}} */
462 
463 /* wrap this up */
464 matpoly
bw_lingen_basecase(bmstatus & bm,matpoly & E)465 bw_lingen_basecase(bmstatus & bm, matpoly & E)
466 {
467     lingen_call_companion const & C = bm.companion(bm.depth(), E.get_size());
468     tree_stats::sentinel dummy(bm.stats, "basecase", E.get_size(), C.total_ncalls, true);
469     bm.stats.plan_smallstep("basecase", C.ttb);
470     bm.stats.begin_smallstep("basecase");
471     matpoly pi = bw_lingen_basecase_raw(bm, E);
472     bm.stats.end_smallstep();
473     E = matpoly();
474     return pi;
475 }
476 
test_basecase(abdst_field ab,unsigned int m,unsigned int n,size_t L,gmp_randstate_t rstate)477 void test_basecase(abdst_field ab, unsigned int m, unsigned int n, size_t L, gmp_randstate_t rstate)/*{{{*/
478 {
479     /* used by testing code */
480     bmstatus bm(m,n);
481     cxx_mpz p;
482     abfield_characteristic(ab, p);
483     abfield_specify(bm.d.ab, MPFQ_PRIME_MPZ, (mpz_srcptr) p);
484     unsigned int t0 = iceildiv(m,n);
485     bm.set_t0(t0);
486     matpoly E(ab, m, m+n, L);
487     E.zero_pad(L);
488     E.fill_random(0, L, rstate);
489     bw_lingen_basecase_raw(bm, E);
490 }/*}}}*/
491