1 
2 #include <NTL/mat_lzz_p.h>
3 #include <NTL/vec_long.h>
4 
5 
6 #include <NTL/BasicThreadPool.h>
7 
8 
9 
10 #ifdef NTL_HAVE_AVX
11 #include <immintrin.h>
12 #endif
13 
14 NTL_START_IMPL
15 
16 
17 #define PAR_THRESH_SQ (200)
18 #define PAR_THRESH (40000.0)
19 
20 
21 // *******************************************************
22 //
23 // Matrix Window data structure: perhaps some day this
24 // will be made public.
25 //
26 // *******************************************************
27 
28 struct mat_window_zz_p {
29    mat_zz_p &A;
30    long r_offset;
31    long c_offset;
32    long nrows;
33    long ncols;
34 
mat_window_zz_pmat_window_zz_p35    mat_window_zz_p(mat_zz_p& _A) :
36    A(_A), r_offset(0), c_offset(0), nrows(A.NumRows()), ncols(A.NumCols()) { }
37 
mat_window_zz_pmat_window_zz_p38    mat_window_zz_p(const mat_window_zz_p& w, long r1, long c1, long r2, long c2) :
39    A(w.A)
40    {
41       if (r1 < 0 || c1 < 0 || r2 < r1 || c2 < c1 || r2-r1 > w.nrows || c2-c1 > w.ncols)
42          LogicError("mat_window_zz_p: bad args");
43 
44       r_offset = w.r_offset + r1;
45       c_offset = w.c_offset + c1;
46       nrows = r2-r1;
47       ncols = c2-c1;
48    }
49 
operator []mat_window_zz_p50    zz_p * operator[](long i) const { return A[i+r_offset].elts() + c_offset; }
51 
NumRowsmat_window_zz_p52    long NumRows() const { return nrows; }
NumColsmat_window_zz_p53    long NumCols() const { return ncols; }
54 
55 };
56 
57 
58 struct const_mat_window_zz_p {
59    const mat_zz_p &A;
60    long r_offset;
61    long c_offset;
62    long nrows;
63    long ncols;
64 
const_mat_window_zz_pconst_mat_window_zz_p65    const_mat_window_zz_p(const mat_zz_p& _A) :
66    A(_A), r_offset(0), c_offset(0), nrows(A.NumRows()), ncols(A.NumCols()) { }
67 
const_mat_window_zz_pconst_mat_window_zz_p68    const_mat_window_zz_p(const mat_window_zz_p& w) :
69    A(w.A), r_offset(w.r_offset), c_offset(w.c_offset), nrows(w.nrows), ncols(w.ncols) { }
70 
const_mat_window_zz_pconst_mat_window_zz_p71    const_mat_window_zz_p(const const_mat_window_zz_p& w, long r1, long c1, long r2, long c2) :
72    A(w.A)
73    {
74       if (r1 < 0 || c1 < 0 || r2 < r1 || c2 < c1 || r2-r1 > w.nrows || c2-c1 > w.ncols)
75          LogicError("const_mat_window_zz_p: bad args");
76 
77       r_offset = w.r_offset + r1;
78       c_offset = w.c_offset + c1;
79       nrows = r2-r1;
80       ncols = c2-c1;
81    }
82 
operator []const_mat_window_zz_p83    const zz_p * operator[](long i) const { return A[i+r_offset].elts() + c_offset; }
84 
NumRowsconst_mat_window_zz_p85    long NumRows() const { return nrows; }
NumColsconst_mat_window_zz_p86    long NumCols() const { return ncols; }
87 
88 };
89 
add(const mat_window_zz_p & X,const const_mat_window_zz_p & A,const const_mat_window_zz_p & B)90 void add(const mat_window_zz_p& X,
91          const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
92 {
93    long n = A.NumRows();
94    long m = A.NumCols();
95 
96    if (B.NumRows() != n || B.NumCols() != m)
97       LogicError("matrix add: dimension mismatch");
98 
99    if (X.NumRows() != n || X.NumCols() != m)
100       LogicError("matrix add: dimension mismatch");
101 
102    long p = zz_p::modulus();
103 
104    for (long i = 0; i < n; i++) {
105       zz_p *x = X[i];
106       const zz_p *a = A[i];
107       const zz_p *b = B[i];
108       for (long j = 0; j < m; j++) {
109          x[j].LoopHole() = AddMod(rep(a[j]), rep(b[j]), p);
110       }
111    }
112 }
113 
sub(const mat_window_zz_p & X,const const_mat_window_zz_p & A,const const_mat_window_zz_p & B)114 void sub(const mat_window_zz_p& X,
115          const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
116 {
117    long n = A.NumRows();
118    long m = A.NumCols();
119 
120    if (B.NumRows() != n || B.NumCols() != m)
121       LogicError("matrix sub: dimension mismatch");
122 
123    if (X.NumRows() != n || X.NumCols() != m)
124       LogicError("matrix sub: dimension mismatch");
125 
126    long p = zz_p::modulus();
127 
128    for (long i = 0; i < n; i++) {
129       zz_p *x = X[i];
130       const zz_p *a = A[i];
131       const zz_p *b = B[i];
132       for (long j = 0; j < m; j++) {
133          x[j].LoopHole() = SubMod(rep(a[j]), rep(b[j]), p);
134       }
135    }
136 }
137 
138 
clear(const mat_window_zz_p & X)139 void clear(const mat_window_zz_p& X)
140 {
141    long n = X.NumRows();
142    long m = X.NumCols();
143 
144    for (long i = 0; i < n; i++)
145       for (long j = 0; j < m; j++)
146          clear(X[i][j]);
147 }
148 
149 
150 
151 // ***********************************************************
152 
153 
154 
155 
156 
157 
add(mat_zz_p & X,const mat_zz_p & A,const mat_zz_p & B)158 void add(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
159 {
160    long n = A.NumRows();
161    long m = A.NumCols();
162 
163    if (B.NumRows() != n || B.NumCols() != m)
164       LogicError("matrix add: dimension mismatch");
165 
166    X.SetDims(n, m);
167 
168    long p = zz_p::modulus();
169 
170    for (long i = 0; i < n; i++) {
171       zz_p *x = X[i].elts();
172       const zz_p *a = A[i].elts();
173       const zz_p *b = B[i].elts();
174       for (long j = 0; j < m; j++) {
175          x[j].LoopHole() = AddMod(rep(a[j]), rep(b[j]), p);
176       }
177    }
178 }
179 
sub(mat_zz_p & X,const mat_zz_p & A,const mat_zz_p & B)180 void sub(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
181 {
182    long n = A.NumRows();
183    long m = A.NumCols();
184 
185    if (B.NumRows() != n || B.NumCols() != m)
186       LogicError("matrix sub: dimension mismatch");
187 
188    X.SetDims(n, m);
189 
190    long p = zz_p::modulus();
191 
192    for (long i = 0; i < n; i++) {
193       zz_p *x = X[i].elts();
194       const zz_p *a = A[i].elts();
195       const zz_p *b = B[i].elts();
196       for (long j = 0; j < m; j++) {
197          x[j].LoopHole() = SubMod(rep(a[j]), rep(b[j]), p);
198       }
199    }
200 
201 }
202 
203 
204 
205 
206 
diag(mat_zz_p & X,long n,zz_p d)207 void diag(mat_zz_p& X, long n, zz_p d)
208 {
209    X.SetDims(n, n);
210    long i, j;
211 
212    for (i = 1; i <= n; i++)
213       for (j = 1; j <= n; j++)
214          if (i == j)
215             X(i, j) = d;
216          else
217             clear(X(i, j));
218 }
219 
IsDiag(const mat_zz_p & A,long n,zz_p d)220 long IsDiag(const mat_zz_p& A, long n, zz_p d)
221 {
222    if (A.NumRows() != n || A.NumCols() != n)
223       return 0;
224 
225    long i, j;
226 
227    for (i = 1; i <= n; i++)
228       for (j = 1; j <= n; j++)
229          if (i != j) {
230             if (!IsZero(A(i, j))) return 0;
231          }
232          else {
233             if (A(i, j) != d) return 0;
234          }
235 
236    return 1;
237 }
238 
negate(mat_zz_p & X,const mat_zz_p & A)239 void negate(mat_zz_p& X, const mat_zz_p& A)
240 {
241    long n = A.NumRows();
242    long m = A.NumCols();
243 
244 
245    X.SetDims(n, m);
246 
247    long p = zz_p::modulus();
248 
249    for (long i = 0; i < n; i++) {
250       zz_p *x = X[i].elts();
251       const zz_p *a = A[i].elts();
252       for (long j = 0; j < m; j++) {
253          x[j].LoopHole() = NegateMod(rep(a[j]), p);
254       }
255    }
256 }
257 
IsZero(const mat_zz_p & a)258 long IsZero(const mat_zz_p& a)
259 {
260    long n = a.NumRows();
261    long i;
262 
263    for (i = 0; i < n; i++)
264       if (!IsZero(a[i]))
265          return 0;
266 
267    return 1;
268 }
269 
clear(mat_zz_p & x)270 void clear(mat_zz_p& x)
271 {
272    long n = x.NumRows();
273    long i;
274    for (i = 0; i < n; i++)
275       clear(x[i]);
276 }
277 
278 
ident(mat_zz_p & X,long n)279 void ident(mat_zz_p& X, long n)
280 {
281    X.SetDims(n, n);
282    long i, j;
283 
284    for (i = 1; i <= n; i++)
285       for (j = 1; j <= n; j++)
286          if (i == j)
287             set(X(i, j));
288          else
289             clear(X(i, j));
290 }
291 
292 
IsIdent(const mat_zz_p & A,long n)293 long IsIdent(const mat_zz_p& A, long n)
294 {
295    if (A.NumRows() != n || A.NumCols() != n)
296       return 0;
297 
298    long i, j;
299 
300    for (i = 1; i <= n; i++)
301       for (j = 1; j <= n; j++)
302          if (i != j) {
303             if (!IsZero(A(i, j))) return 0;
304          }
305          else {
306             if (!IsOne(A(i, j))) return 0;
307          }
308 
309    return 1;
310 }
311 
312 
transpose(mat_zz_p & X,const mat_zz_p & A)313 void transpose(mat_zz_p& X, const mat_zz_p& A)
314 {
315    long n = A.NumRows();
316    long m = A.NumCols();
317 
318    long i, j;
319 
320    if (&X == & A) {
321       if (n == m)
322          for (i = 1; i <= n; i++)
323             for (j = i+1; j <= n; j++)
324                swap(X(i, j), X(j, i));
325       else {
326          mat_zz_p tmp;
327          tmp.SetDims(m, n);
328          for (i = 1; i <= n; i++)
329             for (j = 1; j <= m; j++)
330                tmp(j, i) = A(i, j);
331          X.kill();
332          X = tmp;
333       }
334    }
335    else {
336       X.SetDims(m, n);
337       for (i = 1; i <= n; i++)
338          for (j = 1; j <= m; j++)
339             X(j, i) = A(i, j);
340    }
341 }
342 
343 
344 
345 
relaxed_power(mat_zz_p & X,const mat_zz_p & A,const ZZ & e,bool relax)346 void relaxed_power(mat_zz_p& X, const mat_zz_p& A, const ZZ& e, bool relax)
347 {
348    if (A.NumRows() != A.NumCols()) LogicError("power: non-square matrix");
349 
350    if (e == 0) {
351       ident(X, A.NumRows());
352       return;
353    }
354 
355    mat_zz_p T1, T2;
356    long i, k;
357 
358    k = NumBits(e);
359    T1 = A;
360 
361    for (i = k-2; i >= 0; i--) {
362       sqr(T2, T1);
363       if (bit(e, i))
364          mul(T1, T2, A);
365       else
366          T1 = T2;
367    }
368 
369    if (e < 0)
370       relaxed_inv(X, T1, relax);
371    else
372       X = T1;
373 }
374 
375 
376 
377 // ******************************************************************
378 //
379 // matrix-vector multiplication code
380 //
381 // ******************************************************************
382 
383 
384 
385 
386 
387 
// x = a * B: multiply row vector a (length l) by the l x m matrix B,
// so x[j] = sum_k a[k]*B[k][j] (mod p).  Three regimes: m == 0
// (trivial), m == 1 (a single inner product), and m > 1
// (preconditioned scalar multiplies, optionally parallelized
// across columns via NTL_GEXEC_RANGE).
void mul(vec_zz_p& x, const vec_zz_p& a, const mat_zz_p& B)
{
   long l = a.length();
   long m = B.NumCols();

   if (l != B.NumRows())
      LogicError("matrix mul: dimension mismatch");

   if (m == 0) {

      x.SetLength(0);

   }
   else if (m == 1) {

      long p = zz_p::modulus();
      mulmod_t pinv = zz_p::ModulusInverse();

      long acc, tmp;
      long k;

      // one inner product: acc = sum_k a[k]*B[k][0] mod p
      acc = 0;
      for(k = 1; k <= l; k++) {
         tmp = MulMod(rep(a(k)), rep(B(k,1)), p, pinv);
         acc = AddMod(acc, tmp, p);
      }

      x.SetLength(1);
      x(1).LoopHole()  = acc;

   }
   else {  // m > 1.  precondition and EXEC_RANGE


      long p = zz_p::modulus();
      mulmod_t pinv = zz_p::ModulusInverse();

      // thread-local scratch of length m, reused across calls; the
      // Watcher restores/releases it appropriately on scope exit
      NTL_TLS_LOCAL(vec_long, mul_aux_vec);
      vec_long::Watcher watch_mul_aux_vec(mul_aux_vec);
      mul_aux_vec.SetLength(m);
      long *acc = mul_aux_vec.elts();

      const zz_p* ap = a.elts();

      for (long j = 0; j < m; j++) acc[j] = 0;

      // run sequentially when the work is too small to be worth threads
      const bool seq = double(l)*double(m) < PAR_THRESH;

      // each thread owns the column range [first, last)
      NTL_GEXEC_RANGE(seq, m, first, last) {

         for (long k = 0;  k < l; k++) {
            long aa = rep(ap[k]);
            if (aa != 0) {   // skip zero coefficients of a entirely
               const zz_p* bp = B[k].elts();
               long T1;
               // precondition multiplication by the fixed scalar aa
               mulmod_precon_t aapinv = PrepMulModPrecon(aa, p, pinv);

               for (long j = first; j < last; j++) {
                  T1 = MulModPrecon(rep(bp[j]), aa, p, aapinv);
                  acc[j] = AddMod(acc[j], T1, p);
               }
            }
         }

      } NTL_GEXEC_RANGE_END

      // move the long accumulators into the zz_p result vector
      x.SetLength(m);
      zz_p *xp = x.elts();
      for (long j = 0; j < m; j++)
         xp[j].LoopHole() = acc[j];
   }
}
460 
461 
// Core of x = A * b (A is n x l, b has length l); callers must ensure
// x does not alias b or a row of A (the public mul() wrapper does).
// With NTL_HAVE_LL_TYPE, rows are reduced with the specialized
// InnerProd_L / InnerProd_LL kernels; otherwise a portable
// MulMod/MulModPrecon loop is used.  Rows are optionally split across
// threads via NTL_GEXEC_RANGE.
void mul_aux(vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b)
{
   long n = A.NumRows();
   long l = A.NumCols();

   if (l != b.length())
      LogicError("matrix mul: dimension mismatch");

   x.SetLength(n);
   zz_p* xp = x.elts();

   long p = zz_p::modulus();
   const zz_p* bp = b.elts();

   // run sequentially when the work is too small to be worth threads
   const bool seq = double(n)*double(l) < PAR_THRESH;


#ifdef NTL_HAVE_LL_TYPE

   // fast path: fixed-size inner-product kernels, chosen by whether
   // the "L" variant's accumulation bound applies for this (l, p)
   if (InnerProd_L_viable(l, p)) {

      sp_reduce_struct red_struct = zz_p::red_struct();
      long bound = InnerProd_L_bound(p);

      NTL_GEXEC_RANGE(seq, n, first, last) {

         for (long i = first; i < last; i++) {
            xp[i].LoopHole() = InnerProd_L(A[i].elts(), bp, l, p, red_struct, bound);
         }

      } NTL_GEXEC_RANGE_END
   }
   else {
      sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();

      NTL_GEXEC_RANGE(seq, n, first, last) {

         for (long i = first; i < last; i++) {
            xp[i].LoopHole() = InnerProd_LL(A[i].elts(), bp, l, p, ll_red_struct);
         }

      } NTL_GEXEC_RANGE_END

   }

#else

   // portable path (no long-long support)
   mulmod_t pinv = zz_p::ModulusInverse();

   if (n <= 1) {

      // too few rows to amortize preconditioning b: plain MulMod loop
      for (long i = 0; i < n; i++) {
         long acc = 0;
         const zz_p* ap = A[i].elts();

         for (long k = 0; k < l; k++) {
            long tmp = MulMod(rep(ap[k]), rep(bp[k]), p, pinv);
            acc = AddMod(acc, tmp, p);
         }

         xp[i].LoopHole() = acc;
      }

   }
   else {

      // precondition each entry of b once (thread-local scratch,
      // reused across calls), then reuse across all n rows
      NTL_TLS_LOCAL(Vec<mulmod_precon_t>, precon_vec);
      Vec<mulmod_precon_t>::Watcher watch_precon_vec(precon_vec);
      precon_vec.SetLength(l);
      mulmod_precon_t *bpinv = precon_vec.elts();

      for (long k = 0; k < l; k++)
         bpinv[k] = PrepMulModPrecon(rep(bp[k]), p, pinv);


      NTL_GEXEC_RANGE(seq, n, first, last) {
         for (long i = first; i < last; i++) {
            long acc = 0;
            const zz_p* ap = A[i].elts();

            for (long k = 0; k < l; k++) {
               long tmp = MulModPrecon(rep(ap[k]), rep(bp[k]), p, bpinv[k]);
               acc = AddMod(acc, tmp, p);
            }

            xp[i].LoopHole() = acc;
         }
      } NTL_GEXEC_RANGE_END

   }

#endif
}
555 
mul(vec_zz_p & x,const mat_zz_p & A,const vec_zz_p & b)556 void mul(vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b)
557 {
558    if (&b == &x || A.alias(x)) {
559       vec_zz_p tmp;
560       mul_aux(tmp, A, b);
561       x = tmp;
562    }
563    else
564       mul_aux(x, A, b);
565 
566 }
567 
568 
// X = A * b (scalar multiplication mod p).  X is resized to match A
// and may alias A.  Tiny matrices use the generic mul(zz_p&,...)
// entry-wise; larger ones precondition the scalar once and optionally
// split rows across threads.
void mul(mat_zz_p& X, const mat_zz_p& A, zz_p b)
{
   long n = A.NumRows();
   long m = A.NumCols();

   X.SetDims(n, m);


   if (n == 0 || m == 0 || (n == 1 && m == 1)) {
      // degenerate sizes: not worth setting up preconditioning
      long i, j;

      for (i = 0; i < n; i++)
         for (j = 0; j < m; j++)
            mul(X[i][j], A[i][j], b);

   }
   else {

      long p = zz_p::modulus();
      mulmod_t pinv = zz_p::ModulusInverse();
      long bb = rep(b);
      // precondition multiplication by the fixed scalar bb
      mulmod_precon_t bpinv = PrepMulModPrecon(bb, p, pinv);

      // run sequentially when the work is too small to be worth threads
      const bool seq = double(n)*double(m) < PAR_THRESH;

      // each thread owns the row range [first, last)
      NTL_GEXEC_RANGE(seq, n, first, last)
      long i, j;
      for (i = first; i < last; i++) {
         const zz_p *ap = A[i].elts();
         zz_p *xp = X[i].elts();

         for (j = 0; j < m; j++)
            xp[j].LoopHole() = MulModPrecon(rep(ap[j]), bb, p, bpinv);
      }
      NTL_GEXEC_RANGE_END


   }
}
608 
mul(mat_zz_p & X,const mat_zz_p & A,long b_in)609 void mul(mat_zz_p& X, const mat_zz_p& A, long b_in)
610 {
611    zz_p b;
612    b = b_in;
613    mul(X, A, b);
614 }
615 
616 
617 // ******************************************************************
618 //
619 // Code shared by block-matrix code
620 //
621 // ******************************************************************
622 
623 //#undef NTL_HAVE_AVX
624 //#undef NTL_HAVE_FMA
625 //#undef NTL_HAVE_AVX512F
626 // for testing purposes
627 
628 #if (defined(NTL_HAVE_AVX512F) && defined(NTL_AVOID_AVX512))
629 #undef NTL_HAVE_AVX512F
630 #endif
631 
632 #define MAT_BLK_SZ (32)
633 
634 
635 #ifdef NTL_HAVE_LL_TYPE
636 
637 #ifdef NTL_HAVE_AVX
638 
639 #define MAX_DBL_INT ((1L << NTL_DOUBLE_PRECISION)-1)
640 // max int representable exactly as a double
641 // this assumes NTL_DBL_PRECISION <= NTL_BITS_PER_LONG-2, which is
642 // checked in the code that tests for HAVE_AVX, but we check it here as
643 // well
644 
645 #if (NTL_DBL_PRECISION > NTL_BITS_PER_LONG-2)
646 #error "NTL_DBL_PRECISION > NTL_BITS_PER_LONG-2"
647 #endif
648 
649 
650 // MUL_ADD(a, b, c): a += b*c
651 #ifdef NTL_HAVE_FMA
652 #define MUL_ADD(a, b, c) a = _mm256_fmadd_pd(b, c, a)
653 #else
654 #define MUL_ADD(a, b, c) a = _mm256_add_pd(a, _mm256_mul_pd(b, c))
655 #endif
656 
657 
658 #ifdef NTL_HAVE_AVX512F
659 #define MUL_ADD512(a, b, c) a = _mm512_fmadd_pd(b, c, a)
660 #endif
661 
662 
663 
664 #ifdef NTL_HAVE_AVX512F
665 
// AVX-512 kernel: x += a * b, where x is a 1 x 32 row, a is 1 x n,
// and b is n x 32, all packed as doubles in MAT_BLK_SZ(=32)-wide
// row-major blocks.  One output row = 4 zmm accumulators of 8 doubles.
// Pointers must be 64-byte aligned (aligned load/store intrinsics).
static
void muladd1_by_32(double *x, const double *a, const double *b, long n)
{
   __m512d avec0, bvec;

   __m512d acc00, acc01, acc02, acc03;

   // load the current contents of the output row
   acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
   acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);
   acc02=_mm512_load_pd(x + 2*8 + 0*MAT_BLK_SZ);
   acc03=_mm512_load_pd(x + 3*8 + 0*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      // broadcast a[i] and accumulate a[i] * (row i of b)
      avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
      MUL_ADD512(acc00, avec0, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
      MUL_ADD512(acc01, avec0, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+2*8]);
      MUL_ADD512(acc02, avec0, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+3*8]);
      MUL_ADD512(acc03, avec0, bvec);
   }

   // write the accumulators back
   _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
   _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);
   _mm512_store_pd(x + 2*8 + 0*MAT_BLK_SZ, acc02);
   _mm512_store_pd(x + 3*8 + 0*MAT_BLK_SZ, acc03);

}
701 
// AVX-512 kernel: like muladd1_by_32, but processes TWO output rows of
// x (and two rows of a) per pass over b, sharing each load of b across
// both rows.  x is 2 x 32, a is 2 x n (rows MAT_BLK_SZ apart), b is n x 32.
static
void muladd2_by_32(double *x, const double *a, const double *b, long n)
{
   __m512d avec0, avec1, bvec;

   __m512d acc00, acc01, acc02, acc03;
   __m512d acc10, acc11, acc12, acc13;



   // load both output rows (4 zmm accumulators each)
   acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
   acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);
   acc02=_mm512_load_pd(x + 2*8 + 0*MAT_BLK_SZ);
   acc03=_mm512_load_pd(x + 3*8 + 0*MAT_BLK_SZ);

   acc10=_mm512_load_pd(x + 0*8 + 1*MAT_BLK_SZ);
   acc11=_mm512_load_pd(x + 1*8 + 1*MAT_BLK_SZ);
   acc12=_mm512_load_pd(x + 2*8 + 1*MAT_BLK_SZ);
   acc13=_mm512_load_pd(x + 3*8 + 1*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      // broadcast column i of both rows of a
      avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
      avec1 = _mm512_set1_pd(a[i+1*MAT_BLK_SZ]);

      // each b load feeds both output rows
      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
      MUL_ADD512(acc00, avec0, bvec); MUL_ADD512(acc10, avec1, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
      MUL_ADD512(acc01, avec0, bvec); MUL_ADD512(acc11, avec1, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+2*8]);
      MUL_ADD512(acc02, avec0, bvec); MUL_ADD512(acc12, avec1, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+3*8]);
      MUL_ADD512(acc03, avec0, bvec); MUL_ADD512(acc13, avec1, bvec);
   }

   // write both output rows back
   _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
   _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);
   _mm512_store_pd(x + 2*8 + 0*MAT_BLK_SZ, acc02);
   _mm512_store_pd(x + 3*8 + 0*MAT_BLK_SZ, acc03);

   _mm512_store_pd(x + 0*8 + 1*MAT_BLK_SZ, acc10);
   _mm512_store_pd(x + 1*8 + 1*MAT_BLK_SZ, acc11);
   _mm512_store_pd(x + 2*8 + 1*MAT_BLK_SZ, acc12);
   _mm512_store_pd(x + 3*8 + 1*MAT_BLK_SZ, acc13);

}
751 
752 
// AVX-512 kernel: like muladd1_by_32, but processes THREE output rows
// of x (and three rows of a) per pass over b, sharing each load of b
// across all three rows.  x is 3 x 32, a is 3 x n, b is n x 32.
static
void muladd3_by_32(double *x, const double *a, const double *b, long n)
{
   __m512d avec0, avec1, avec2, bvec;

   __m512d acc00, acc01, acc02, acc03;
   __m512d acc10, acc11, acc12, acc13;
   __m512d acc20, acc21, acc22, acc23;



   // load all three output rows (4 zmm accumulators each)
   acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
   acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);
   acc02=_mm512_load_pd(x + 2*8 + 0*MAT_BLK_SZ);
   acc03=_mm512_load_pd(x + 3*8 + 0*MAT_BLK_SZ);

   acc10=_mm512_load_pd(x + 0*8 + 1*MAT_BLK_SZ);
   acc11=_mm512_load_pd(x + 1*8 + 1*MAT_BLK_SZ);
   acc12=_mm512_load_pd(x + 2*8 + 1*MAT_BLK_SZ);
   acc13=_mm512_load_pd(x + 3*8 + 1*MAT_BLK_SZ);

   acc20=_mm512_load_pd(x + 0*8 + 2*MAT_BLK_SZ);
   acc21=_mm512_load_pd(x + 1*8 + 2*MAT_BLK_SZ);
   acc22=_mm512_load_pd(x + 2*8 + 2*MAT_BLK_SZ);
   acc23=_mm512_load_pd(x + 3*8 + 2*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      // broadcast column i of all three rows of a
      avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
      avec1 = _mm512_set1_pd(a[i+1*MAT_BLK_SZ]);
      avec2 = _mm512_set1_pd(a[i+2*MAT_BLK_SZ]);

      // each b load feeds all three output rows
      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
      MUL_ADD512(acc00, avec0, bvec); MUL_ADD512(acc10, avec1, bvec);
      MUL_ADD512(acc20, avec2, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
      MUL_ADD512(acc01, avec0, bvec); MUL_ADD512(acc11, avec1, bvec);
      MUL_ADD512(acc21, avec2, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+2*8]);
      MUL_ADD512(acc02, avec0, bvec); MUL_ADD512(acc12, avec1, bvec);
      MUL_ADD512(acc22, avec2, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+3*8]);
      MUL_ADD512(acc03, avec0, bvec); MUL_ADD512(acc13, avec1, bvec);
      MUL_ADD512(acc23, avec2, bvec);
   }

   // write all three output rows back
   _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
   _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);
   _mm512_store_pd(x + 2*8 + 0*MAT_BLK_SZ, acc02);
   _mm512_store_pd(x + 3*8 + 0*MAT_BLK_SZ, acc03);

   _mm512_store_pd(x + 0*8 + 1*MAT_BLK_SZ, acc10);
   _mm512_store_pd(x + 1*8 + 1*MAT_BLK_SZ, acc11);
   _mm512_store_pd(x + 2*8 + 1*MAT_BLK_SZ, acc12);
   _mm512_store_pd(x + 3*8 + 1*MAT_BLK_SZ, acc13);

   _mm512_store_pd(x + 0*8 + 2*MAT_BLK_SZ, acc20);
   _mm512_store_pd(x + 1*8 + 2*MAT_BLK_SZ, acc21);
   _mm512_store_pd(x + 2*8 + 2*MAT_BLK_SZ, acc22);
   _mm512_store_pd(x + 3*8 + 2*MAT_BLK_SZ, acc23);


}
818 
819 
// AVX-512 kernel: x += a * b for a HALF-width output row: x is 1 x 16
// (first half of a MAT_BLK_SZ row), a is 1 x n, b is n x 32 with only
// the first 16 doubles of each row used.  Two zmm accumulators.
static
void muladd1_by_16(double *x, const double *a, const double *b, long n)
{
   __m512d avec0, bvec;

   __m512d acc00, acc01;



   // load the current contents of the 16-wide output row
   acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
   acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      // broadcast a[i] and accumulate a[i] * (first 16 of row i of b)
      avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
      MUL_ADD512(acc00, avec0, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
      MUL_ADD512(acc01, avec0, bvec);
   }

   // write the accumulators back
   _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
   _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);

}
847 
// AVX-512 kernel: half-width variant of muladd2_by_32 — two 16-wide
// output rows of x, two rows of a, sharing each load of b.
static
void muladd2_by_16(double *x, const double *a, const double *b, long n)
{
   __m512d avec0, avec1, bvec;

   __m512d acc00, acc01;
   __m512d acc10, acc11;



   // load both 16-wide output rows (2 zmm accumulators each)
   acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
   acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);

   acc10=_mm512_load_pd(x + 0*8 + 1*MAT_BLK_SZ);
   acc11=_mm512_load_pd(x + 1*8 + 1*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      // broadcast column i of both rows of a
      avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
      avec1 = _mm512_set1_pd(a[i+1*MAT_BLK_SZ]);

      // each b load feeds both output rows
      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
      MUL_ADD512(acc00, avec0, bvec); MUL_ADD512(acc10, avec1, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
      MUL_ADD512(acc01, avec0, bvec); MUL_ADD512(acc11, avec1, bvec);
   }

   // write both output rows back
   _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
   _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);

   _mm512_store_pd(x + 0*8 + 1*MAT_BLK_SZ, acc10);
   _mm512_store_pd(x + 1*8 + 1*MAT_BLK_SZ, acc11);
}
882 
883 
// AVX-512 kernel: half-width variant of muladd3_by_32 — three 16-wide
// output rows of x, three rows of a, sharing each load of b.
static
void muladd3_by_16(double *x, const double *a, const double *b, long n)
{
   __m512d avec0, avec1, avec2, bvec;

   __m512d acc00, acc01;
   __m512d acc10, acc11;
   __m512d acc20, acc21;



   // load all three 16-wide output rows (2 zmm accumulators each)
   acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
   acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);

   acc10=_mm512_load_pd(x + 0*8 + 1*MAT_BLK_SZ);
   acc11=_mm512_load_pd(x + 1*8 + 1*MAT_BLK_SZ);

   acc20=_mm512_load_pd(x + 0*8 + 2*MAT_BLK_SZ);
   acc21=_mm512_load_pd(x + 1*8 + 2*MAT_BLK_SZ);


   for (long i = 0; i < n; i++) {
      // broadcast column i of all three rows of a
      avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
      avec1 = _mm512_set1_pd(a[i+1*MAT_BLK_SZ]);
      avec2 = _mm512_set1_pd(a[i+2*MAT_BLK_SZ]);

      // each b load feeds all three output rows
      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
      MUL_ADD512(acc00, avec0, bvec); MUL_ADD512(acc10, avec1, bvec);
      MUL_ADD512(acc20, avec2, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
      MUL_ADD512(acc01, avec0, bvec); MUL_ADD512(acc11, avec1, bvec);
      MUL_ADD512(acc21, avec2, bvec);
   }

   // write all three output rows back
   _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
   _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);

   _mm512_store_pd(x + 0*8 + 1*MAT_BLK_SZ, acc10);
   _mm512_store_pd(x + 1*8 + 1*MAT_BLK_SZ, acc11);

   _mm512_store_pd(x + 0*8 + 2*MAT_BLK_SZ, acc20);
   _mm512_store_pd(x + 1*8 + 2*MAT_BLK_SZ, acc21);

}
930 
931 
932 
933 #else
934 
// AVX(256-bit) kernel: x += a * b, where x is 1 x 32, a is 1 x n, and
// b is n x 32 (row-major doubles).  One output row = 8 ymm
// accumulators of 4 doubles.  Unlike the AVX-512 versions, this one
// walks a and b with moving pointers.  Pointers must be 32-byte
// aligned (aligned load/store intrinsics).
static
void muladd1_by_32(double *x, const double *a, const double *b, long n)
{
   __m256d avec, bvec;

   // load the current contents of the output row
   __m256d acc0=_mm256_load_pd(x + 0*4);
   __m256d acc1=_mm256_load_pd(x + 1*4);
   __m256d acc2=_mm256_load_pd(x + 2*4);
   __m256d acc3=_mm256_load_pd(x + 3*4);
   __m256d acc4=_mm256_load_pd(x + 4*4);
   __m256d acc5=_mm256_load_pd(x + 5*4);
   __m256d acc6=_mm256_load_pd(x + 6*4);
   __m256d acc7=_mm256_load_pd(x + 7*4);


   for (long i = 0; i < n; i++) {
      // broadcast the next coefficient of a
      avec = _mm256_broadcast_sd(a); a++;

      // accumulate avec * (row i of b), advancing b through the row
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec, bvec);
   }

   // write the accumulators back
   _mm256_store_pd(x + 0*4, acc0);
   _mm256_store_pd(x + 1*4, acc1);
   _mm256_store_pd(x + 2*4, acc2);
   _mm256_store_pd(x + 3*4, acc3);
   _mm256_store_pd(x + 4*4, acc4);
   _mm256_store_pd(x + 5*4, acc5);
   _mm256_store_pd(x + 6*4, acc6);
   _mm256_store_pd(x + 7*4, acc7);
}
975 
// AVX(256-bit) kernel: x += a * b for TWO output rows at once (x is
// 2 x 32, a is 2 x n with rows MAT_BLK_SZ apart, b is n x 32).  With
// only 16 ymm registers, the 32-wide rows are processed in two
// "rounds" of 16 columns each, re-reading a in each round.
static
void muladd2_by_32(double *x, const double *a, const double *b, long n)
{
   __m256d avec0, avec1, bvec;
   __m256d acc00, acc01, acc02, acc03;
   __m256d acc10, acc11, acc12, acc13;


   // round 0: columns 0..15 of both output rows

   acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
   acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
   acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
   acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);

   acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
   acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
   acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
   acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      // broadcast column i of both rows of a
      avec0 = _mm256_broadcast_sd(&a[i]);
      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);

      // each b load feeds both output rows
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
   }


   _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
   _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
   _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
   _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);

   _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
   _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
   _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
   _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);

   // round 1: columns 16..31 (note the MAT_BLK_SZ/2 offset into b)

   acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
   acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
   acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
   acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);

   acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
   acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
   acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
   acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      avec0 = _mm256_broadcast_sd(&a[i]);
      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);

      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
   }


   _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
   _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
   _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
   _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);

   _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
   _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
   _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
   _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);

}
1051 
1052 // NOTE: this makes things slower on an AVX1 platform --- not enough registers
1053 // it could be faster on AVX2/FMA, where there should be enough registers
static
void muladd3_by_32(double *x, const double *a, const double *b, long n)
{
   // x += a*b for THREE rows of a/x at once, over a full 32-column block
   // (the column offsets 0*4 .. 7*4 below cover 8 vectors of 4 doubles,
   // i.e. this assumes MAT_BLK_SZ == 32).
   //   x: 3 rows of partial sums, row stride MAT_BLK_SZ doubles
   //   a: 3 rows of multipliers, row stride MAT_BLK_SZ; entries 0..n-1 used
   //   b: n rows of MAT_BLK_SZ doubles
   // The 32 columns are split into two rounds of 16 so the 12 accumulators
   // plus avec*/bvec fit in the 16 YMM registers (see the note above about
   // AVX1 being register-starved for this shape).
   // Aligned load/store intrinsics: x and b must be 32-byte aligned.
   __m256d avec0, avec1, avec2, bvec;
   __m256d acc00, acc01, acc02, acc03;
   __m256d acc10, acc11, acc12, acc13;
   __m256d acc20, acc21, acc22, acc23;


   // round 0: columns 0..15

   // accXY holds columns [Y*4, Y*4+4) of output row X
   acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
   acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
   acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
   acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);

   acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
   acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
   acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
   acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);

   acc20=_mm256_load_pd(x + 0*4 + 2*MAT_BLK_SZ);
   acc21=_mm256_load_pd(x + 1*4 + 2*MAT_BLK_SZ);
   acc22=_mm256_load_pd(x + 2*4 + 2*MAT_BLK_SZ);
   acc23=_mm256_load_pd(x + 3*4 + 2*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      // broadcast a[i] from each of the three rows of a
      avec0 = _mm256_broadcast_sd(&a[i]);
      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
      avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);

      // each load from b is reused against all three output rows
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); MUL_ADD(acc20, avec2, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); MUL_ADD(acc21, avec2, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); MUL_ADD(acc22, avec2, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); MUL_ADD(acc23, avec2, bvec);
   }


   _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
   _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
   _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
   _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);

   _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
   _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
   _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
   _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);

   _mm256_store_pd(x + 0*4 + 2*MAT_BLK_SZ, acc20);
   _mm256_store_pd(x + 1*4 + 2*MAT_BLK_SZ, acc21);
   _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22);
   _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23);

   // round 1: columns 16..31 (note the MAT_BLK_SZ/2 offset into each b row)

   acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
   acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
   acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
   acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);

   acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
   acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
   acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
   acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);

   acc20=_mm256_load_pd(x + 4*4 + 2*MAT_BLK_SZ);
   acc21=_mm256_load_pd(x + 5*4 + 2*MAT_BLK_SZ);
   acc22=_mm256_load_pd(x + 6*4 + 2*MAT_BLK_SZ);
   acc23=_mm256_load_pd(x + 7*4 + 2*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      avec0 = _mm256_broadcast_sd(&a[i]);
      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
      avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);

      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); MUL_ADD(acc20, avec2, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); MUL_ADD(acc21, avec2, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); MUL_ADD(acc22, avec2, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); MUL_ADD(acc23, avec2, bvec);
   }


   _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
   _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
   _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
   _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);

   _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
   _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
   _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
   _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);

   _mm256_store_pd(x + 4*4 + 2*MAT_BLK_SZ, acc20);
   _mm256_store_pd(x + 5*4 + 2*MAT_BLK_SZ, acc21);
   _mm256_store_pd(x + 6*4 + 2*MAT_BLK_SZ, acc22);
   _mm256_store_pd(x + 7*4 + 2*MAT_BLK_SZ, acc23);

}
1152 
static
void muladd1_by_16(double *x, const double *a, const double *b, long n)
{
   // x += a*b for ONE row, over only the FIRST 16 columns of the block:
   //   x: 16 doubles of partial sums
   //   a: n multipliers
   //   b: n rows; only the first 16 doubles of each row are used
   // Selected by muladd_all_by_32_width when the output is at most
   // MAT_BLK_SZ/2 columns wide.  x and b must be 32-byte aligned.
   __m256d avec, bvec;


   __m256d acc0=_mm256_load_pd(x + 0*4);
   __m256d acc1=_mm256_load_pd(x + 1*4);
   __m256d acc2=_mm256_load_pd(x + 2*4);
   __m256d acc3=_mm256_load_pd(x + 3*4);


   for (long i = 0; i < n; i++) {
      avec = _mm256_broadcast_sd(a); a++;


      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec, bvec);
      b += 16;  // skip the unused second half of this row of b
                // (assumes row stride MAT_BLK_SZ == 32)
   }


   _mm256_store_pd(x + 0*4, acc0);
   _mm256_store_pd(x + 1*4, acc1);
   _mm256_store_pd(x + 2*4, acc2);
   _mm256_store_pd(x + 3*4, acc3);
}
1182 
1183 
1184 
static
void muladd2_by_16(double *x, const double *a, const double *b, long n)
{
   // x += a*b for TWO rows at once, over only the first 16 columns
   // of the block (row stride MAT_BLK_SZ for x, a, and b).
   // One round of 16 columns: 8 accumulators plus avec0/avec1/bvec
   // fit comfortably in the YMM register file.
   __m256d avec0, avec1, bvec;
   __m256d acc00, acc01, acc02, acc03;
   __m256d acc10, acc11, acc12, acc13;


   // round 0 (the only round: columns 0..15)

   acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
   acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
   acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
   acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);

   acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
   acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
   acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
   acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      // broadcast a[i] from each of the two rows of a
      avec0 = _mm256_broadcast_sd(&a[i]);
      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);

      // each load from b feeds both output rows
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
   }


   _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
   _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
   _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
   _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);

   _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
   _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
   _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
   _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);

}
1227 
1228 
static
void muladd3_by_16(double *x, const double *a, const double *b, long n)
{
   // x += a*b for THREE rows at once, over only the first 16 columns of
   // the block (row stride MAT_BLK_SZ for x, a, and b).  Single round:
   // 12 accumulators plus avec0..2/bvec; like muladd3_by_32 this wants
   // the larger effective register budget of FMA/AVX512 builds.
   __m256d avec0, avec1, avec2, bvec;
   __m256d acc00, acc01, acc02, acc03;
   __m256d acc10, acc11, acc12, acc13;
   __m256d acc20, acc21, acc22, acc23;


   // round 0 (the only round: columns 0..15)

   acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
   acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
   acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
   acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);

   acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
   acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
   acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
   acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);

   acc20=_mm256_load_pd(x + 0*4 + 2*MAT_BLK_SZ);
   acc21=_mm256_load_pd(x + 1*4 + 2*MAT_BLK_SZ);
   acc22=_mm256_load_pd(x + 2*4 + 2*MAT_BLK_SZ);
   acc23=_mm256_load_pd(x + 3*4 + 2*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      // broadcast a[i] from each of the three rows of a
      avec0 = _mm256_broadcast_sd(&a[i]);
      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
      avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);

      // each load from b feeds all three output rows
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); MUL_ADD(acc20, avec2, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); MUL_ADD(acc21, avec2, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); MUL_ADD(acc22, avec2, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); MUL_ADD(acc23, avec2, bvec);
   }


   _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
   _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
   _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
   _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);

   _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
   _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
   _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
   _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);

   _mm256_store_pd(x + 0*4 + 2*MAT_BLK_SZ, acc20);
   _mm256_store_pd(x + 1*4 + 2*MAT_BLK_SZ, acc21);
   _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22);
   _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23);

}
1283 
1284 
1285 
1286 
1287 #endif
1288 
1289 
1290 
1291 
static inline
void muladd_all_by_32(long first, long last, double *x, const double *a, const double *b, long n)
{
   // Apply the 32-column AVX kernel to rows [first, last) of x/a, sharing
   // the same b block.  Rows are grouped so each b load is reused across
   // several output rows: FMA/AVX512 builds have room for 3 rows at a
   // time; plain AVX only for 2.
   long i = first;
#if (defined(NTL_HAVE_FMA) || defined(NTL_HAVE_AVX512F))
   // process three rows at a time
   for (; i <= last-3; i+=3)
      muladd3_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
   // at most 2 leftover rows, one at a time
   for (; i < last; i++)
      muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
#else
   // process only two rows at a time: not enough registers :-(
   for (; i <= last-2; i+=2)
      muladd2_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
   // at most 1 leftover row
   for (; i < last; i++)
      muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
#endif
}
1310 
1311 
static inline
void muladd_all_by_16(long first, long last, double *x, const double *a, const double *b, long n)
{
   // Same row-blocking strategy as muladd_all_by_32, but using the
   // 16-column kernels (only the first 16 columns of each block row
   // are touched).
   long i = first;
#if (defined(NTL_HAVE_FMA) || defined(NTL_HAVE_AVX512F))
   // processing three rows at a time is faster
   for (; i <= last-3; i+=3)
      muladd3_by_16(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
   // at most 2 leftover rows, one at a time
   for (; i < last; i++)
      muladd1_by_16(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
#else
   // process only two rows at a time: not enough registers :-(
   for (; i <= last-2; i+=2)
      muladd2_by_16(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
   // at most 1 leftover row
   for (; i < last; i++)
      muladd1_by_16(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
#endif
}
1330 
1331 static inline
muladd_all_by_32_width(long first,long last,double * x,const double * a,const double * b,long n,long width)1332 void muladd_all_by_32_width(long first, long last, double *x, const double *a, const double *b, long n, long width)
1333 {
1334    if (width > MAT_BLK_SZ/2)
1335       muladd_all_by_32(first, last, x, a, b, n);
1336    else
1337       muladd_all_by_16(first, last, x, a, b, n);
1338 }
1339 
1340 // muladd_interval1 used in alt_inv_DD and alt_tri_DD
1341 // muladd_interval used in blk_inv_DD and blk_tri_DD, with an
1342 //   argument of MAT_BLK_SZ
1343 
1344 
1345 // this assumes n is a multiple of 16
static inline
void muladd_interval(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
{
   // x[i] += y[i]*c for i in [0, n), 16 doubles per iteration.
   // Caller guarantees n is a multiple of 16 (see comment above); the
   // aligned load/store intrinsics require x and y 32-byte aligned.
   __m256d xvec0, xvec1, xvec2, xvec3;
   __m256d yvec0, yvec1, yvec2, yvec3;

   // the scalar c, replicated across all four lanes
   __m256d cvec = _mm256_broadcast_sd(&c);

   for (long i = 0; i < n; i += 16, x += 16, y += 16) {
      xvec0 = _mm256_load_pd(x+0*4);
      xvec1 = _mm256_load_pd(x+1*4);
      xvec2 = _mm256_load_pd(x+2*4);
      xvec3 = _mm256_load_pd(x+3*4);

      yvec0 = _mm256_load_pd(y+0*4);
      yvec1 = _mm256_load_pd(y+1*4);
      yvec2 = _mm256_load_pd(y+2*4);
      yvec3 = _mm256_load_pd(y+3*4);

      MUL_ADD(xvec0, yvec0, cvec);
      MUL_ADD(xvec1, yvec1, cvec);
      MUL_ADD(xvec2, yvec2, cvec);
      MUL_ADD(xvec3, yvec3, cvec);

      _mm256_store_pd(x + 0*4, xvec0);
      _mm256_store_pd(x + 1*4, xvec1);
      _mm256_store_pd(x + 2*4, xvec2);
      _mm256_store_pd(x + 3*4, xvec3);
   }
}
1376 
1377 // this one is more general: does not assume that n is a
1378 // multiple of 16
static inline
void muladd_interval1(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
{
   // x[i] += y[i]*c for i in [0, n); like muladd_interval, but n need
   // not be a multiple of 16.  Processes 16-wide chunks, then 4-wide
   // chunks, then a scalar tail.  x and y must be 32-byte aligned for
   // the vector portions.

   __m256d xvec0, xvec1, xvec2, xvec3;
   __m256d yvec0, yvec1, yvec2, yvec3;
   __m256d cvec;

   // cvec is consumed only by the vector loops, which run only when
   // n >= 4 -- so it is safe to leave it uninitialized otherwise
   if (n >= 4)
      cvec = _mm256_broadcast_sd(&c);

   long i=0;
   // 16 doubles at a time
   for (; i <= n-16; i += 16, x += 16, y += 16) {
      xvec0 = _mm256_load_pd(x+0*4);
      xvec1 = _mm256_load_pd(x+1*4);
      xvec2 = _mm256_load_pd(x+2*4);
      xvec3 = _mm256_load_pd(x+3*4);

      yvec0 = _mm256_load_pd(y+0*4);
      yvec1 = _mm256_load_pd(y+1*4);
      yvec2 = _mm256_load_pd(y+2*4);
      yvec3 = _mm256_load_pd(y+3*4);

      MUL_ADD(xvec0, yvec0, cvec);
      MUL_ADD(xvec1, yvec1, cvec);
      MUL_ADD(xvec2, yvec2, cvec);
      MUL_ADD(xvec3, yvec3, cvec);

      _mm256_store_pd(x + 0*4, xvec0);
      _mm256_store_pd(x + 1*4, xvec1);
      _mm256_store_pd(x + 2*4, xvec2);
      _mm256_store_pd(x + 3*4, xvec3);
   }

   // 4 doubles at a time
   for (; i <= n-4; i += 4, x += 4, y += 4) {
      xvec0 = _mm256_load_pd(x+0*4);
      yvec0 = _mm256_load_pd(y+0*4);
      MUL_ADD(xvec0, yvec0, cvec);
      _mm256_store_pd(x + 0*4, xvec0);
   }

   // scalar tail (fewer than 4 remaining)
   for (; i < n; i++, x++, y++) {
      *x += (*y)*c;
   }
}
1424 
1425 
1426 #endif
1427 
1428 
1429 //#define DO_MUL(a, b) ((unsigned long) (long(a)*long(b)))
1430 
static inline
unsigned long
DO_MUL(unsigned long a, unsigned long b)
{
   // Full-width unsigned multiply: wraps mod 2^w (w = bits per long),
   // which is exactly the truncated-product semantics the block
   // kernels below rely on.
   return a * b;
}
1435 
1436 
1437 static
muladd_interval(unsigned long * NTL_RESTRICT x,unsigned long * NTL_RESTRICT y,unsigned long c,long n)1438 inline void muladd_interval(unsigned long * NTL_RESTRICT x, unsigned long * NTL_RESTRICT y,
1439                      unsigned long c, long n)
1440 {
1441    for (long i = 0; i < n; i++)
1442       x[i] += DO_MUL(y[i], c);
1443 }
1444 
1445 static
muladd1_by_32(unsigned long * x,const unsigned long * a,const unsigned long * b,long n)1446 void muladd1_by_32(unsigned long *x, const unsigned long *a, const unsigned long *b,
1447                    long n)
1448 {
1449    for (long j = 0; j < MAT_BLK_SZ; j++) {
1450       unsigned long sum = x[j];
1451       long i = 0;
1452 
1453       for (; i <= n-4; i += 4) {
1454          sum += DO_MUL(a[i+0], b[i+0]);
1455          sum += DO_MUL(a[i+1], b[i+1]);
1456          sum += DO_MUL(a[i+2], b[i+2]);
1457          sum += DO_MUL(a[i+3], b[i+3]);
1458       }
1459 
1460       for (; i < n; i++)
1461          sum += DO_MUL(a[i], b[i]);
1462 
1463       x[j] = sum;
1464       b += MAT_BLK_SZ;
1465    }
1466 }
1467 
1468 static
muladd1_by_32_width(unsigned long * x,const unsigned long * a,const unsigned long * b,long n,long width)1469 void muladd1_by_32_width(unsigned long *x, const unsigned long *a, const unsigned long *b,
1470                    long n, long width)
1471 {
1472    for (long j = 0; j < width; j++) {
1473       unsigned long sum = x[j];
1474       long i = 0;
1475 
1476       for (; i <= n-4; i += 4) {
1477          sum += DO_MUL(a[i+0], b[i+0]);
1478          sum += DO_MUL(a[i+1], b[i+1]);
1479          sum += DO_MUL(a[i+2], b[i+2]);
1480          sum += DO_MUL(a[i+3], b[i+3]);
1481       }
1482 
1483       for (; i < n; i++)
1484          sum += DO_MUL(a[i], b[i]);
1485 
1486       x[j] = sum;
1487       b += MAT_BLK_SZ;
1488    }
1489 }
1490 
1491 // experiment with shorter int's
1492 static
muladd1_by_32(unsigned long * x,const unsigned int * a,const unsigned int * b,long n)1493 void muladd1_by_32(unsigned long *x, const unsigned int *a, const unsigned int *b,
1494                    long n)
1495 {
1496    for (long j = 0; j < MAT_BLK_SZ; j++) {
1497       unsigned long sum = x[j];
1498       long i = 0;
1499 
1500       for (; i <= n-4; i += 4) {
1501          sum += DO_MUL(a[i+0], b[i+0]);
1502          sum += DO_MUL(a[i+1], b[i+1]);
1503          sum += DO_MUL(a[i+2], b[i+2]);
1504          sum += DO_MUL(a[i+3], b[i+3]);
1505       }
1506 
1507       for (; i < n; i++)
1508          sum += DO_MUL(a[i], b[i]);
1509 
1510       x[j] = sum;
1511       b += MAT_BLK_SZ;
1512    }
1513 }
1514 
1515 static
muladd1_by_32_width(unsigned long * x,const unsigned int * a,const unsigned int * b,long n,long width)1516 void muladd1_by_32_width(unsigned long *x, const unsigned int *a, const unsigned int *b,
1517                    long n, long width)
1518 {
1519    for (long j = 0; j < width; j++) {
1520       unsigned long sum = x[j];
1521       long i = 0;
1522 
1523       for (; i <= n-4; i += 4) {
1524          sum += DO_MUL(a[i+0], b[i+0]);
1525          sum += DO_MUL(a[i+1], b[i+1]);
1526          sum += DO_MUL(a[i+2], b[i+2]);
1527          sum += DO_MUL(a[i+3], b[i+3]);
1528       }
1529 
1530       for (; i < n; i++)
1531          sum += DO_MUL(a[i], b[i]);
1532 
1533       x[j] = sum;
1534       b += MAT_BLK_SZ;
1535    }
1536 }
1537 
1538 #if 0
static
void muladd1_by_32_full(unsigned long *x, const unsigned long *a, const unsigned long *b)
{
   // Reference implementation (disabled by the surrounding #if 0): one
   // output at a time, with the length-MAT_BLK_SZ dot product fully
   // unrolled.  Kept for comparison with the 4-outputs-per-pass version
   // in the #else branch.
   for (long j = 0; j < MAT_BLK_SZ; j++) {
      unsigned long sum = x[j];
      long i = 0;  // kept at 0 so the unrolled lines read a[i+k]

      sum += DO_MUL(a[i+0], b[i+0]);
      sum += DO_MUL(a[i+1], b[i+1]);
      sum += DO_MUL(a[i+2], b[i+2]);
      sum += DO_MUL(a[i+3], b[i+3]);
      sum += DO_MUL(a[i+4], b[i+4]);
      sum += DO_MUL(a[i+5], b[i+5]);
      sum += DO_MUL(a[i+6], b[i+6]);
      sum += DO_MUL(a[i+7], b[i+7]);
      sum += DO_MUL(a[i+8], b[i+8]);
      sum += DO_MUL(a[i+9], b[i+9]);
      sum += DO_MUL(a[i+10], b[i+10]);
      sum += DO_MUL(a[i+11], b[i+11]);
      sum += DO_MUL(a[i+12], b[i+12]);
      sum += DO_MUL(a[i+13], b[i+13]);
      sum += DO_MUL(a[i+14], b[i+14]);
      sum += DO_MUL(a[i+15], b[i+15]);
      sum += DO_MUL(a[i+16], b[i+16]);
      sum += DO_MUL(a[i+17], b[i+17]);
      sum += DO_MUL(a[i+18], b[i+18]);
      sum += DO_MUL(a[i+19], b[i+19]);
      sum += DO_MUL(a[i+20], b[i+20]);
      sum += DO_MUL(a[i+21], b[i+21]);
      sum += DO_MUL(a[i+22], b[i+22]);
      sum += DO_MUL(a[i+23], b[i+23]);
      sum += DO_MUL(a[i+24], b[i+24]);
      sum += DO_MUL(a[i+25], b[i+25]);
      sum += DO_MUL(a[i+26], b[i+26]);
      sum += DO_MUL(a[i+27], b[i+27]);
      sum += DO_MUL(a[i+28], b[i+28]);
      sum += DO_MUL(a[i+29], b[i+29]);
      sum += DO_MUL(a[i+30], b[i+30]);
      sum += DO_MUL(a[i+31], b[i+31]);

      x[j] = sum;
      b += MAT_BLK_SZ;  // next row of b
   }
}
1583 #else
1584 
1585 // this version is faster (by about 25%) on a Sandybridge machine
1586 
// One inner-product step shared by the muladd1_by_32_full* routines:
// folds a[i] times the i-th entry of four adjacent rows of b
// (b, b_1, b_2, b_3) into the four running sums.  Relies on the caller
// having declared sum, sum_1..sum_3 and b_1..b_3 in scope.
#define ONE_STEP_L(i) \
  sum += DO_MUL(a[i],b[i]);\
  sum_1 += DO_MUL(a[i],b_1[i]);\
  sum_2 += DO_MUL(a[i],b_2[i]);\
  sum_3 += DO_MUL(a[i],b_3[i])\
1592 
1593 
static
void muladd1_by_32_full(unsigned long *x, const unsigned long *a, const unsigned long *b)
{
   // Full-block kernel: the inner dimension is exactly MAT_BLK_SZ, so
   // the dot products are completely unrolled via ONE_STEP_L.  Four
   // outputs x[j..j+3] are computed per pass so each a[i] load is reused
   // against four rows of b; per the comment above, this layout is ~25%
   // faster on Sandybridge than one output at a time.
   // Assumes MAT_BLK_SZ is a multiple of 4 (loop steps by 4) and that
   // the ONE_STEP_L(0..31) unrolling matches MAT_BLK_SZ == 32.
   for (long j = 0; j < MAT_BLK_SZ; j+=4) {

      unsigned long sum = x[j];
      unsigned long sum_1 = x[j+1];
      unsigned long sum_2 = x[j+2];
      unsigned long sum_3 = x[j+3];

      // rows j+1..j+3 of b
      const unsigned long *b_1 = b+MAT_BLK_SZ;
      const unsigned long *b_2 = b+2*MAT_BLK_SZ;
      const unsigned long *b_3 = b+3*MAT_BLK_SZ;

      ONE_STEP_L(0);
      ONE_STEP_L(1);
      ONE_STEP_L(2);
      ONE_STEP_L(3);
      ONE_STEP_L(4);
      ONE_STEP_L(5);
      ONE_STEP_L(6);
      ONE_STEP_L(7);
      ONE_STEP_L(8);
      ONE_STEP_L(9);
      ONE_STEP_L(10);
      ONE_STEP_L(11);
      ONE_STEP_L(12);
      ONE_STEP_L(13);
      ONE_STEP_L(14);
      ONE_STEP_L(15);
      ONE_STEP_L(16);
      ONE_STEP_L(17);
      ONE_STEP_L(18);
      ONE_STEP_L(19);
      ONE_STEP_L(20);
      ONE_STEP_L(21);
      ONE_STEP_L(22);
      ONE_STEP_L(23);
      ONE_STEP_L(24);
      ONE_STEP_L(25);
      ONE_STEP_L(26);
      ONE_STEP_L(27);
      ONE_STEP_L(28);
      ONE_STEP_L(29);
      ONE_STEP_L(30);
      ONE_STEP_L(31);

      x[j]   = sum;
      x[j+1] = sum_1;
      x[j+2] = sum_2;
      x[j+3] = sum_3;

      b += 4*MAT_BLK_SZ;  // advance past the four rows just consumed
   }
}
1649 
static
void muladd1_by_32_full_width(unsigned long *x, const unsigned long *a, const unsigned long *b, long width)
{
   // Like muladd1_by_32_full (inner dimension exactly MAT_BLK_SZ, fully
   // unrolled), but only the first `width` outputs are written.  Outputs
   // are produced 4 at a time; any remaining 1-3 are handled by the
   // single-output tail loop below.
   long j = 0;
   for (; j <= width-4; j+=4) {

      unsigned long sum = x[j];
      unsigned long sum_1 = x[j+1];
      unsigned long sum_2 = x[j+2];
      unsigned long sum_3 = x[j+3];

      // rows j+1..j+3 of b
      const unsigned long *b_1 = b+MAT_BLK_SZ;
      const unsigned long *b_2 = b+2*MAT_BLK_SZ;
      const unsigned long *b_3 = b+3*MAT_BLK_SZ;

      ONE_STEP_L(0);
      ONE_STEP_L(1);
      ONE_STEP_L(2);
      ONE_STEP_L(3);
      ONE_STEP_L(4);
      ONE_STEP_L(5);
      ONE_STEP_L(6);
      ONE_STEP_L(7);
      ONE_STEP_L(8);
      ONE_STEP_L(9);
      ONE_STEP_L(10);
      ONE_STEP_L(11);
      ONE_STEP_L(12);
      ONE_STEP_L(13);
      ONE_STEP_L(14);
      ONE_STEP_L(15);
      ONE_STEP_L(16);
      ONE_STEP_L(17);
      ONE_STEP_L(18);
      ONE_STEP_L(19);
      ONE_STEP_L(20);
      ONE_STEP_L(21);
      ONE_STEP_L(22);
      ONE_STEP_L(23);
      ONE_STEP_L(24);
      ONE_STEP_L(25);
      ONE_STEP_L(26);
      ONE_STEP_L(27);
      ONE_STEP_L(28);
      ONE_STEP_L(29);
      ONE_STEP_L(30);
      ONE_STEP_L(31);

      x[j]   = sum;
      x[j+1] = sum_1;
      x[j+2] = sum_2;
      x[j+3] = sum_3;

      b += 4*MAT_BLK_SZ;
   }

   // tail: at most 3 remaining outputs, one fully-unrolled dot product each
   for (; j < width; j++) {
      unsigned long sum = x[j];
      long i = 0;  // kept at 0 so the unrolled lines read a[i+k]

      sum += DO_MUL(a[i+0], b[i+0]);
      sum += DO_MUL(a[i+1], b[i+1]);
      sum += DO_MUL(a[i+2], b[i+2]);
      sum += DO_MUL(a[i+3], b[i+3]);
      sum += DO_MUL(a[i+4], b[i+4]);
      sum += DO_MUL(a[i+5], b[i+5]);
      sum += DO_MUL(a[i+6], b[i+6]);
      sum += DO_MUL(a[i+7], b[i+7]);
      sum += DO_MUL(a[i+8], b[i+8]);
      sum += DO_MUL(a[i+9], b[i+9]);
      sum += DO_MUL(a[i+10], b[i+10]);
      sum += DO_MUL(a[i+11], b[i+11]);
      sum += DO_MUL(a[i+12], b[i+12]);
      sum += DO_MUL(a[i+13], b[i+13]);
      sum += DO_MUL(a[i+14], b[i+14]);
      sum += DO_MUL(a[i+15], b[i+15]);
      sum += DO_MUL(a[i+16], b[i+16]);
      sum += DO_MUL(a[i+17], b[i+17]);
      sum += DO_MUL(a[i+18], b[i+18]);
      sum += DO_MUL(a[i+19], b[i+19]);
      sum += DO_MUL(a[i+20], b[i+20]);
      sum += DO_MUL(a[i+21], b[i+21]);
      sum += DO_MUL(a[i+22], b[i+22]);
      sum += DO_MUL(a[i+23], b[i+23]);
      sum += DO_MUL(a[i+24], b[i+24]);
      sum += DO_MUL(a[i+25], b[i+25]);
      sum += DO_MUL(a[i+26], b[i+26]);
      sum += DO_MUL(a[i+27], b[i+27]);
      sum += DO_MUL(a[i+28], b[i+28]);
      sum += DO_MUL(a[i+29], b[i+29]);
      sum += DO_MUL(a[i+30], b[i+30]);
      sum += DO_MUL(a[i+31], b[i+31]);

      x[j] = sum;
      b += MAT_BLK_SZ;
   }
}
1747 
1748 
1749 // experiment with shorter int's
// experiment with shorter int's
static
void muladd1_by_32_full(unsigned long *x, const unsigned int *a, const unsigned int *b)
{
   // 32-bit-input variant of the full-block kernel above: identical
   // structure (4 outputs per pass, ONE_STEP_L fully unrolled), with the
   // operands widening to unsigned long inside DO_MUL.
   for (long j = 0; j < MAT_BLK_SZ; j+=4) {

      unsigned long sum = x[j];
      unsigned long sum_1 = x[j+1];
      unsigned long sum_2 = x[j+2];
      unsigned long sum_3 = x[j+3];

      // rows j+1..j+3 of b
      const unsigned int *b_1 = b+MAT_BLK_SZ;
      const unsigned int *b_2 = b+2*MAT_BLK_SZ;
      const unsigned int *b_3 = b+3*MAT_BLK_SZ;

      ONE_STEP_L(0);
      ONE_STEP_L(1);
      ONE_STEP_L(2);
      ONE_STEP_L(3);
      ONE_STEP_L(4);
      ONE_STEP_L(5);
      ONE_STEP_L(6);
      ONE_STEP_L(7);
      ONE_STEP_L(8);
      ONE_STEP_L(9);
      ONE_STEP_L(10);
      ONE_STEP_L(11);
      ONE_STEP_L(12);
      ONE_STEP_L(13);
      ONE_STEP_L(14);
      ONE_STEP_L(15);
      ONE_STEP_L(16);
      ONE_STEP_L(17);
      ONE_STEP_L(18);
      ONE_STEP_L(19);
      ONE_STEP_L(20);
      ONE_STEP_L(21);
      ONE_STEP_L(22);
      ONE_STEP_L(23);
      ONE_STEP_L(24);
      ONE_STEP_L(25);
      ONE_STEP_L(26);
      ONE_STEP_L(27);
      ONE_STEP_L(28);
      ONE_STEP_L(29);
      ONE_STEP_L(30);
      ONE_STEP_L(31);

      x[j]   = sum;
      x[j+1] = sum_1;
      x[j+2] = sum_2;
      x[j+3] = sum_3;

      b += 4*MAT_BLK_SZ;  // advance past the four rows just consumed
   }
}
1805 
static
void muladd1_by_32_full_width(unsigned long *x, const unsigned int *a, const unsigned int *b, long width)
{
   // 32-bit-input variant of muladd1_by_32_full_width: inner dimension
   // is exactly MAT_BLK_SZ, only the first `width` outputs are written.
   // Outputs are produced 4 at a time; any remaining 1-3 are handled by
   // the single-output tail loop below.
   long j = 0;
   for (; j <= width-4; j+=4) {

      unsigned long sum = x[j];
      unsigned long sum_1 = x[j+1];
      unsigned long sum_2 = x[j+2];
      unsigned long sum_3 = x[j+3];

      // rows j+1..j+3 of b
      const unsigned int *b_1 = b+MAT_BLK_SZ;
      const unsigned int *b_2 = b+2*MAT_BLK_SZ;
      const unsigned int *b_3 = b+3*MAT_BLK_SZ;

      ONE_STEP_L(0);
      ONE_STEP_L(1);
      ONE_STEP_L(2);
      ONE_STEP_L(3);
      ONE_STEP_L(4);
      ONE_STEP_L(5);
      ONE_STEP_L(6);
      ONE_STEP_L(7);
      ONE_STEP_L(8);
      ONE_STEP_L(9);
      ONE_STEP_L(10);
      ONE_STEP_L(11);
      ONE_STEP_L(12);
      ONE_STEP_L(13);
      ONE_STEP_L(14);
      ONE_STEP_L(15);
      ONE_STEP_L(16);
      ONE_STEP_L(17);
      ONE_STEP_L(18);
      ONE_STEP_L(19);
      ONE_STEP_L(20);
      ONE_STEP_L(21);
      ONE_STEP_L(22);
      ONE_STEP_L(23);
      ONE_STEP_L(24);
      ONE_STEP_L(25);
      ONE_STEP_L(26);
      ONE_STEP_L(27);
      ONE_STEP_L(28);
      ONE_STEP_L(29);
      ONE_STEP_L(30);
      ONE_STEP_L(31);

      x[j]   = sum;
      x[j+1] = sum_1;
      x[j+2] = sum_2;
      x[j+3] = sum_3;

      b += 4*MAT_BLK_SZ;
   }

   // tail: at most 3 remaining outputs, one fully-unrolled dot product each
   for (; j < width; j++) {
      unsigned long sum = x[j];
      long i = 0;  // kept at 0 so the unrolled lines read a[i+k]

      sum += DO_MUL(a[i+0], b[i+0]);
      sum += DO_MUL(a[i+1], b[i+1]);
      sum += DO_MUL(a[i+2], b[i+2]);
      sum += DO_MUL(a[i+3], b[i+3]);
      sum += DO_MUL(a[i+4], b[i+4]);
      sum += DO_MUL(a[i+5], b[i+5]);
      sum += DO_MUL(a[i+6], b[i+6]);
      sum += DO_MUL(a[i+7], b[i+7]);
      sum += DO_MUL(a[i+8], b[i+8]);
      sum += DO_MUL(a[i+9], b[i+9]);
      sum += DO_MUL(a[i+10], b[i+10]);
      sum += DO_MUL(a[i+11], b[i+11]);
      sum += DO_MUL(a[i+12], b[i+12]);
      sum += DO_MUL(a[i+13], b[i+13]);
      sum += DO_MUL(a[i+14], b[i+14]);
      sum += DO_MUL(a[i+15], b[i+15]);
      sum += DO_MUL(a[i+16], b[i+16]);
      sum += DO_MUL(a[i+17], b[i+17]);
      sum += DO_MUL(a[i+18], b[i+18]);
      sum += DO_MUL(a[i+19], b[i+19]);
      sum += DO_MUL(a[i+20], b[i+20]);
      sum += DO_MUL(a[i+21], b[i+21]);
      sum += DO_MUL(a[i+22], b[i+22]);
      sum += DO_MUL(a[i+23], b[i+23]);
      sum += DO_MUL(a[i+24], b[i+24]);
      sum += DO_MUL(a[i+25], b[i+25]);
      sum += DO_MUL(a[i+26], b[i+26]);
      sum += DO_MUL(a[i+27], b[i+27]);
      sum += DO_MUL(a[i+28], b[i+28]);
      sum += DO_MUL(a[i+29], b[i+29]);
      sum += DO_MUL(a[i+30], b[i+30]);
      sum += DO_MUL(a[i+31], b[i+31]);

      x[j] = sum;
      b += MAT_BLK_SZ;
   }
}
1903 
1904 #endif
1905 
1906 static inline
muladd_all_by_32(long first,long last,unsigned long * x,const unsigned int * a,const unsigned int * b,long n)1907 void muladd_all_by_32(long first, long last, unsigned long *x, const unsigned int *a, const unsigned int *b, long n)
1908 {
1909    if (n == MAT_BLK_SZ) {
1910       for (long i = first; i < last; i++)
1911          muladd1_by_32_full(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b);
1912    }
1913    else {
1914       for (long i = first; i < last; i++)
1915          muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1916    }
1917 }
1918 
1919 static inline
muladd_all_by_32_width(long first,long last,unsigned long * x,const unsigned long * a,const unsigned long * b,long n,long width)1920 void muladd_all_by_32_width(long first, long last, unsigned long *x, const unsigned long *a, const unsigned long *b, long n, long width)
1921 {
1922    if (width == MAT_BLK_SZ) {
1923       if (n == MAT_BLK_SZ) {
1924 	 for (long i = first; i < last; i++)
1925 	    muladd1_by_32_full(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b);
1926       }
1927       else {
1928 	 for (long i = first; i < last; i++)
1929 	    muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1930       }
1931    }
1932    else {
1933       if (n == MAT_BLK_SZ) {
1934 	 for (long i = first; i < last; i++)
1935 	    muladd1_by_32_full_width(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, width);
1936       }
1937       else {
1938 	 for (long i = first; i < last; i++)
1939 	    muladd1_by_32_width(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n, width);
1940       }
1941    }
1942 }
1943 
1944 static inline
muladd_all_by_32(long first,long last,unsigned long * x,const unsigned long * a,const unsigned long * b,long n)1945 void muladd_all_by_32(long first, long last, unsigned long *x, const unsigned long *a, const unsigned long *b, long n)
1946 {
1947    if (n == MAT_BLK_SZ) {
1948       for (long i = first; i < last; i++)
1949          muladd1_by_32_full(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b);
1950    }
1951    else {
1952       for (long i = first; i < last; i++)
1953          muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1954    }
1955 }
1956 
1957 static inline
muladd_all_by_32_width(long first,long last,unsigned long * x,const unsigned int * a,const unsigned int * b,long n,long width)1958 void muladd_all_by_32_width(long first, long last, unsigned long *x, const unsigned int *a, const unsigned int *b, long n, long width)
1959 {
1960    if (width == MAT_BLK_SZ) {
1961       if (n == MAT_BLK_SZ) {
1962 	 for (long i = first; i < last; i++)
1963 	    muladd1_by_32_full(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b);
1964       }
1965       else {
1966 	 for (long i = first; i < last; i++)
1967 	    muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1968       }
1969    }
1970    else {
1971       if (n == MAT_BLK_SZ) {
1972 	 for (long i = first; i < last; i++)
1973 	    muladd1_by_32_full_width(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, width);
1974       }
1975       else {
1976 	 for (long i = first; i < last; i++)
1977 	    muladd1_by_32_width(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n, width);
1978       }
1979    }
1980 }
1981 
// uhlong: unsigned type used to hold half-word (<= NTL_BITS_PER_LONG/2 bit)
// values.  Prefer unsigned int when it is wide enough, except under ICC.
#if (!defined(__INTEL_COMPILER) && (NTL_BITS_PER_INT >= NTL_BITS_PER_LONG/2))
// Something goes wrong with the Intel ICC (version 16.0.3) compiler
// in this case.
// It goes away with -O1, so I suspect it is a compiler bug.

typedef unsigned int uhlong;

#else

typedef unsigned long uhlong;

#endif
1994 
1995 
1996 
1997 
1998 // NOTE: the following code is hardcoded for MAT_BLK_SZ == 32.
1999 // Also, we special case NTL_BITS_PER_LONG-NTL_SP_NBITS > 2, which
2000 // allows us to accumulate all 32 products without additional carries.
2001 
2002 #if (NTL_BITS_PER_LONG-NTL_SP_NBITS > 2)
2003 
// For each of the MAT_BLK_SZ output columns j:
//    x[j] = (x[j] + sum_{i < n} a[i]*b_j[i]) mod p
// where column strip b_j is stored contiguously with stride MAT_BLK_SZ.
// This branch has NTL_BITS_PER_LONG-NTL_SP_NBITS > 2 (see guard above),
// so all 32 products fit in one double-word accumulator with no
// intermediate carry handling.
static
void muladd1_by_32(long *x, const long *a, const long *b,
                   long n, long p, sp_ll_reduce_struct ll_red_struct)
{
   for (long j = 0; j < MAT_BLK_SZ; j++) {

       ll_type sum;
       ll_init(sum, x[j]);
#if 0
      for (long i = 0; i < n; i++)
         ll_imul_add(sum, a[i], b[i]);
#else
      // hand-unrolled by 8; remainder handled by the scalar loop below
      long i=0;
      for(; i <= n-8; i+= 8) {
         ll_imul_add(sum, a[i+0], b[i+0]);
         ll_imul_add(sum, a[i+1], b[i+1]);
         ll_imul_add(sum, a[i+2], b[i+2]);
         ll_imul_add(sum, a[i+3], b[i+3]);

         ll_imul_add(sum, a[i+4], b[i+4]);
         ll_imul_add(sum, a[i+5], b[i+5]);
         ll_imul_add(sum, a[i+6], b[i+6]);
         ll_imul_add(sum, a[i+7], b[i+7]);
      }

      for (; i < n; i++)
         ll_imul_add(sum, a[i], b[i]);

#endif

      unsigned long sum0 = ll_get_lo(sum);
      unsigned long sum1 = ll_get_hi(sum);

      long res;

      // cheaper normalized reduction when p has the full NTL_SP_NBITS bits
      if (ll_red_struct.nbits == NTL_SP_NBITS)
         res = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
      else
         res =  sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);


      x[j] = res;
      b += MAT_BLK_SZ;   // advance to the next column strip
   }
}
2049 
// Same as muladd1_by_32, but only the first `width` (< MAT_BLK_SZ)
// output columns are computed; the stride between column strips of b
// remains MAT_BLK_SZ.
static
void muladd1_by_32_width(long *x, const long *a, const long *b,
                   long n, long p, sp_ll_reduce_struct ll_red_struct, long width)
{
   for (long j = 0; j < width; j++) {

       ll_type sum;
       ll_init(sum, x[j]);
#if 0
      for (long i = 0; i < n; i++)
         ll_imul_add(sum, a[i], b[i]);
#else
      // hand-unrolled by 8; remainder handled by the scalar loop below
      long i=0;
      for(; i <= n-8; i+= 8) {
         ll_imul_add(sum, a[i+0], b[i+0]);
         ll_imul_add(sum, a[i+1], b[i+1]);
         ll_imul_add(sum, a[i+2], b[i+2]);
         ll_imul_add(sum, a[i+3], b[i+3]);

         ll_imul_add(sum, a[i+4], b[i+4]);
         ll_imul_add(sum, a[i+5], b[i+5]);
         ll_imul_add(sum, a[i+6], b[i+6]);
         ll_imul_add(sum, a[i+7], b[i+7]);
      }

      for (; i < n; i++)
         ll_imul_add(sum, a[i], b[i]);

#endif

      unsigned long sum0 = ll_get_lo(sum);
      unsigned long sum1 = ll_get_hi(sum);

      long res;

      // cheaper normalized reduction when p has the full NTL_SP_NBITS bits
      if (ll_red_struct.nbits == NTL_SP_NBITS)
         res = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
      else
         res =  sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);


      x[j] = res;
      b += MAT_BLK_SZ;   // advance to the next column strip
   }
}
2095 
2096 #if 0
// NOTE: this is inside an '#if 0' block — a straightforward reference
// version of the full (n == MAT_BLK_SZ) kernel, kept for comparison and
// not compiled.  The '#elif 1' version below is used instead.
static
void muladd1_by_32_full(long *x, const long *a, const long *b,
                        long p, sp_ll_reduce_struct ll_red_struct)
{
   for (long j = 0; j < MAT_BLK_SZ; j++) {

      ll_type sum;
      ll_init(sum, x[j]);

      // fully unrolled dot product of length MAT_BLK_SZ (== 32)
      ll_imul_add(sum, a[0], b[0]);
      ll_imul_add(sum, a[1], b[1]);
      ll_imul_add(sum, a[2], b[2]);
      ll_imul_add(sum, a[3], b[3]);
      ll_imul_add(sum, a[4], b[4]);
      ll_imul_add(sum, a[5], b[5]);
      ll_imul_add(sum, a[6], b[6]);
      ll_imul_add(sum, a[7], b[7]);
      ll_imul_add(sum, a[8], b[8]);
      ll_imul_add(sum, a[9], b[9]);
      ll_imul_add(sum, a[10], b[10]);
      ll_imul_add(sum, a[11], b[11]);
      ll_imul_add(sum, a[12], b[12]);
      ll_imul_add(sum, a[13], b[13]);
      ll_imul_add(sum, a[14], b[14]);
      ll_imul_add(sum, a[15], b[15]);
      ll_imul_add(sum, a[16], b[16]);
      ll_imul_add(sum, a[17], b[17]);
      ll_imul_add(sum, a[18], b[18]);
      ll_imul_add(sum, a[19], b[19]);
      ll_imul_add(sum, a[20], b[20]);
      ll_imul_add(sum, a[21], b[21]);
      ll_imul_add(sum, a[22], b[22]);
      ll_imul_add(sum, a[23], b[23]);
      ll_imul_add(sum, a[24], b[24]);
      ll_imul_add(sum, a[25], b[25]);
      ll_imul_add(sum, a[26], b[26]);
      ll_imul_add(sum, a[27], b[27]);
      ll_imul_add(sum, a[28], b[28]);
      ll_imul_add(sum, a[29], b[29]);
      ll_imul_add(sum, a[30], b[30]);
      ll_imul_add(sum, a[31], b[31]);

      unsigned long sum0 = ll_get_lo(sum);
      unsigned long sum1 = ll_get_hi(sum);

      long res;

      if (ll_red_struct.nbits == NTL_SP_NBITS)
         res = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
      else
         res =  sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);


      x[j] = res;
      b += MAT_BLK_SZ;
   }
}
2154 
// NOTE: inside the same '#if 0' block — reference version of the full
// kernel restricted to the first `width` output columns; not compiled.
static
void muladd1_by_32_full_width(long *x, const long *a, const long *b,
                        long p, sp_ll_reduce_struct ll_red_struct, long width)
{
   for (long j = 0; j < width; j++) {

      ll_type sum;
      ll_init(sum, x[j]);

      // fully unrolled dot product of length MAT_BLK_SZ (== 32)
      ll_imul_add(sum, a[0], b[0]);
      ll_imul_add(sum, a[1], b[1]);
      ll_imul_add(sum, a[2], b[2]);
      ll_imul_add(sum, a[3], b[3]);
      ll_imul_add(sum, a[4], b[4]);
      ll_imul_add(sum, a[5], b[5]);
      ll_imul_add(sum, a[6], b[6]);
      ll_imul_add(sum, a[7], b[7]);
      ll_imul_add(sum, a[8], b[8]);
      ll_imul_add(sum, a[9], b[9]);
      ll_imul_add(sum, a[10], b[10]);
      ll_imul_add(sum, a[11], b[11]);
      ll_imul_add(sum, a[12], b[12]);
      ll_imul_add(sum, a[13], b[13]);
      ll_imul_add(sum, a[14], b[14]);
      ll_imul_add(sum, a[15], b[15]);
      ll_imul_add(sum, a[16], b[16]);
      ll_imul_add(sum, a[17], b[17]);
      ll_imul_add(sum, a[18], b[18]);
      ll_imul_add(sum, a[19], b[19]);
      ll_imul_add(sum, a[20], b[20]);
      ll_imul_add(sum, a[21], b[21]);
      ll_imul_add(sum, a[22], b[22]);
      ll_imul_add(sum, a[23], b[23]);
      ll_imul_add(sum, a[24], b[24]);
      ll_imul_add(sum, a[25], b[25]);
      ll_imul_add(sum, a[26], b[26]);
      ll_imul_add(sum, a[27], b[27]);
      ll_imul_add(sum, a[28], b[28]);
      ll_imul_add(sum, a[29], b[29]);
      ll_imul_add(sum, a[30], b[30]);
      ll_imul_add(sum, a[31], b[31]);

      unsigned long sum0 = ll_get_lo(sum);
      unsigned long sum1 = ll_get_hi(sum);

      long res;

      if (ll_red_struct.nbits == NTL_SP_NBITS)
         res = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
      else
         res =  sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);


      x[j] = res;
      b += MAT_BLK_SZ;
   }
}
2212 
2213 #elif 1
2214 // This version is consistently fastest on tests on Sandybridge and Haswell
2215 
2216 
2217 
// ONE_STEP(i): accumulate a[i] against four adjacent column strips of b
// (b, b_1, b_2, b_3) into the four running double-word sums, so each
// load of a[i] is reused four times.
#define ONE_STEP(i) \
  ll_imul_add(sum, a[i], b[i]);\
  ll_imul_add(sum_1, a[i], b_1[i]);\
  ll_imul_add(sum_2, a[i], b_2[i]);\
  ll_imul_add(sum_3, a[i], b_3[i]);\
2223 
2224 
muladd1_by_32_full(long * x,const long * a,const long * b,long p,sp_ll_reduce_struct ll_red_struct)2225 void muladd1_by_32_full(long *x, const long *a, const long *b,
2226                         long p, sp_ll_reduce_struct ll_red_struct)
2227 {
2228    for (long j = 0; j < MAT_BLK_SZ; j+=4) {
2229 
2230       ll_type sum, sum_1, sum_2, sum_3;
2231       ll_init(sum, x[j]);
2232       ll_init(sum_1, x[j+1]);
2233       ll_init(sum_2, x[j+2]);
2234       ll_init(sum_3, x[j+3]);
2235 
2236       const long *b_1 = b+MAT_BLK_SZ;
2237       const long *b_2 = b+2*MAT_BLK_SZ;
2238       const long *b_3 = b+3*MAT_BLK_SZ;
2239 
2240       ONE_STEP(0);
2241       ONE_STEP(1);
2242       ONE_STEP(2);
2243       ONE_STEP(3);
2244       ONE_STEP(4);
2245       ONE_STEP(5);
2246       ONE_STEP(6);
2247       ONE_STEP(7);
2248       ONE_STEP(8);
2249       ONE_STEP(9);
2250       ONE_STEP(10);
2251       ONE_STEP(11);
2252       ONE_STEP(12);
2253       ONE_STEP(13);
2254       ONE_STEP(14);
2255       ONE_STEP(15);
2256       ONE_STEP(16);
2257       ONE_STEP(17);
2258       ONE_STEP(18);
2259       ONE_STEP(19);
2260       ONE_STEP(20);
2261       ONE_STEP(21);
2262       ONE_STEP(22);
2263       ONE_STEP(23);
2264       ONE_STEP(24);
2265       ONE_STEP(25);
2266       ONE_STEP(26);
2267       ONE_STEP(27);
2268       ONE_STEP(28);
2269       ONE_STEP(29);
2270       ONE_STEP(30);
2271       ONE_STEP(31);
2272 
2273       unsigned long sum0 = ll_get_lo(sum);
2274       unsigned long sum1 = ll_get_hi(sum);
2275 
2276       unsigned long sum0_1 = ll_get_lo(sum_1);
2277       unsigned long sum1_1 = ll_get_hi(sum_1);
2278 
2279       unsigned long sum0_2 = ll_get_lo(sum_2);
2280       unsigned long sum1_2 = ll_get_hi(sum_2);
2281 
2282       unsigned long sum0_3 = ll_get_lo(sum_3);
2283       unsigned long sum1_3 = ll_get_hi(sum_3);
2284 
2285       if (ll_red_struct.nbits == NTL_SP_NBITS) {
2286          x[j] = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
2287          x[j+1] = sp_ll_red_31_normalized(0, sum1_1, sum0_1, p, ll_red_struct);
2288          x[j+2] = sp_ll_red_31_normalized(0, sum1_2, sum0_2, p, ll_red_struct);
2289          x[j+3] = sp_ll_red_31_normalized(0, sum1_3, sum0_3, p, ll_red_struct);
2290       }
2291       else {
2292          x[j] =  sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);
2293          x[j+1] =  sp_ll_red_31(0, sum1_1, sum0_1, p, ll_red_struct);
2294          x[j+2] =  sp_ll_red_31(0, sum1_2, sum0_2, p, ll_red_struct);
2295          x[j+3] =  sp_ll_red_31(0, sum1_3, sum0_3, p, ll_red_struct);
2296       }
2297 
2298 
2299       b += 4*MAT_BLK_SZ;
2300    }
2301 }
2302 
muladd1_by_32_full_width(long * x,const long * a,const long * b,long p,sp_ll_reduce_struct ll_red_struct,long width)2303 void muladd1_by_32_full_width(long *x, const long *a, const long *b,
2304                         long p, sp_ll_reduce_struct ll_red_struct, long width)
2305 {
2306    long j = 0;
2307    for (; j <= width-4; j+=4) {
2308 
2309       ll_type sum, sum_1, sum_2, sum_3;
2310       ll_init(sum, x[j]);
2311       ll_init(sum_1, x[j+1]);
2312       ll_init(sum_2, x[j+2]);
2313       ll_init(sum_3, x[j+3]);
2314 
2315       const long *b_1 = b+MAT_BLK_SZ;
2316       const long *b_2 = b+2*MAT_BLK_SZ;
2317       const long *b_3 = b+3*MAT_BLK_SZ;
2318 
2319       ONE_STEP(0);
2320       ONE_STEP(1);
2321       ONE_STEP(2);
2322       ONE_STEP(3);
2323       ONE_STEP(4);
2324       ONE_STEP(5);
2325       ONE_STEP(6);
2326       ONE_STEP(7);
2327       ONE_STEP(8);
2328       ONE_STEP(9);
2329       ONE_STEP(10);
2330       ONE_STEP(11);
2331       ONE_STEP(12);
2332       ONE_STEP(13);
2333       ONE_STEP(14);
2334       ONE_STEP(15);
2335       ONE_STEP(16);
2336       ONE_STEP(17);
2337       ONE_STEP(18);
2338       ONE_STEP(19);
2339       ONE_STEP(20);
2340       ONE_STEP(21);
2341       ONE_STEP(22);
2342       ONE_STEP(23);
2343       ONE_STEP(24);
2344       ONE_STEP(25);
2345       ONE_STEP(26);
2346       ONE_STEP(27);
2347       ONE_STEP(28);
2348       ONE_STEP(29);
2349       ONE_STEP(30);
2350       ONE_STEP(31);
2351 
2352       unsigned long sum0 = ll_get_lo(sum);
2353       unsigned long sum1 = ll_get_hi(sum);
2354 
2355       unsigned long sum0_1 = ll_get_lo(sum_1);
2356       unsigned long sum1_1 = ll_get_hi(sum_1);
2357 
2358       unsigned long sum0_2 = ll_get_lo(sum_2);
2359       unsigned long sum1_2 = ll_get_hi(sum_2);
2360 
2361       unsigned long sum0_3 = ll_get_lo(sum_3);
2362       unsigned long sum1_3 = ll_get_hi(sum_3);
2363 
2364       if (ll_red_struct.nbits == NTL_SP_NBITS) {
2365          x[j] = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
2366          x[j+1] = sp_ll_red_31_normalized(0, sum1_1, sum0_1, p, ll_red_struct);
2367          x[j+2] = sp_ll_red_31_normalized(0, sum1_2, sum0_2, p, ll_red_struct);
2368          x[j+3] = sp_ll_red_31_normalized(0, sum1_3, sum0_3, p, ll_red_struct);
2369       }
2370       else {
2371          x[j] =  sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);
2372          x[j+1] =  sp_ll_red_31(0, sum1_1, sum0_1, p, ll_red_struct);
2373          x[j+2] =  sp_ll_red_31(0, sum1_2, sum0_2, p, ll_red_struct);
2374          x[j+3] =  sp_ll_red_31(0, sum1_3, sum0_3, p, ll_red_struct);
2375       }
2376 
2377 
2378       b += 4*MAT_BLK_SZ;
2379    }
2380 
2381    for (; j < width; j++) {
2382 
2383       ll_type sum;
2384       ll_init(sum, x[j]);
2385 
2386       ll_imul_add(sum, a[0], b[0]);
2387       ll_imul_add(sum, a[1], b[1]);
2388       ll_imul_add(sum, a[2], b[2]);
2389       ll_imul_add(sum, a[3], b[3]);
2390       ll_imul_add(sum, a[4], b[4]);
2391       ll_imul_add(sum, a[5], b[5]);
2392       ll_imul_add(sum, a[6], b[6]);
2393       ll_imul_add(sum, a[7], b[7]);
2394       ll_imul_add(sum, a[8], b[8]);
2395       ll_imul_add(sum, a[9], b[9]);
2396       ll_imul_add(sum, a[10], b[10]);
2397       ll_imul_add(sum, a[11], b[11]);
2398       ll_imul_add(sum, a[12], b[12]);
2399       ll_imul_add(sum, a[13], b[13]);
2400       ll_imul_add(sum, a[14], b[14]);
2401       ll_imul_add(sum, a[15], b[15]);
2402       ll_imul_add(sum, a[16], b[16]);
2403       ll_imul_add(sum, a[17], b[17]);
2404       ll_imul_add(sum, a[18], b[18]);
2405       ll_imul_add(sum, a[19], b[19]);
2406       ll_imul_add(sum, a[20], b[20]);
2407       ll_imul_add(sum, a[21], b[21]);
2408       ll_imul_add(sum, a[22], b[22]);
2409       ll_imul_add(sum, a[23], b[23]);
2410       ll_imul_add(sum, a[24], b[24]);
2411       ll_imul_add(sum, a[25], b[25]);
2412       ll_imul_add(sum, a[26], b[26]);
2413       ll_imul_add(sum, a[27], b[27]);
2414       ll_imul_add(sum, a[28], b[28]);
2415       ll_imul_add(sum, a[29], b[29]);
2416       ll_imul_add(sum, a[30], b[30]);
2417       ll_imul_add(sum, a[31], b[31]);
2418 
2419       unsigned long sum0 = ll_get_lo(sum);
2420       unsigned long sum1 = ll_get_hi(sum);
2421 
2422       long res;
2423 
2424       if (ll_red_struct.nbits == NTL_SP_NBITS)
2425          res = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
2426       else
2427          res =  sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);
2428 
2429 
2430       x[j] = res;
2431       b += MAT_BLK_SZ;
2432    }
2433 }
2434 
2435 
2436 #endif
2437 
2438 
2439 #else
2440 
2441 
// '#else' branch: NTL_BITS_PER_LONG-NTL_SP_NBITS <= 2 (see the guard at
// the top of this section), so the double-word accumulator can absorb
// only a limited number of products.  The first n-16 products go into
// `sum`; its high word is then spilled into acc21 before the final 16
// products, and both pieces feed a 3-word reduction.
static
void muladd1_by_32(long *x, const long *a, const long *b,
                   long n, long p, sp_ll_reduce_struct ll_red_struct)
{
   for (long j = 0; j < MAT_BLK_SZ; j++) {

      ll_type sum;
      ll_init(sum, x[j]);

      long i = 0;
      for (; i < n-16; i++)
         ll_imul_add(sum, a[i], b[i]);

      // spill the high word into acc21 and restart sum from its low word
      ll_type acc21;
      ll_init(acc21, ll_get_hi(sum));
      unsigned long acc0 = ll_get_lo(sum);
      ll_init(sum, acc0);

      for (; i < n; i++)
         ll_imul_add(sum, a[i], b[i]);

      acc0 = ll_get_lo(sum);
      ll_add(acc21, ll_get_hi(sum));

      long res;

      // 3-word (acc21:acc0) reduction mod p
      if (ll_red_struct.nbits == NTL_SP_NBITS)
         res = sp_ll_red_31_normalized(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);
      else
         res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);

      x[j] = res;
      b += MAT_BLK_SZ;   // advance to the next column strip
   }
}
2477 
// Width-limited variant of the '#else'-branch muladd1_by_32: identical
// mid-stream high-word spill, but only the first `width` output columns
// are produced.
static
void muladd1_by_32_width(long *x, const long *a, const long *b,
                   long n, long p, sp_ll_reduce_struct ll_red_struct, long width)
{
   for (long j = 0; j < width; j++) {

      ll_type sum;
      ll_init(sum, x[j]);

      long i = 0;
      for (; i < n-16; i++)
         ll_imul_add(sum, a[i], b[i]);

      // spill the high word into acc21 and restart sum from its low word
      ll_type acc21;
      ll_init(acc21, ll_get_hi(sum));
      unsigned long acc0 = ll_get_lo(sum);
      ll_init(sum, acc0);

      for (; i < n; i++)
         ll_imul_add(sum, a[i], b[i]);

      acc0 = ll_get_lo(sum);
      ll_add(acc21, ll_get_hi(sum));

      long res;

      // 3-word (acc21:acc0) reduction mod p
      if (ll_red_struct.nbits == NTL_SP_NBITS)
         res = sp_ll_red_31_normalized(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);
      else
         res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);

      x[j] = res;
      b += MAT_BLK_SZ;   // advance to the next column strip
   }
}
2513 
// Full (n == MAT_BLK_SZ) kernel for the '#else' branch: 16 unrolled
// products, a high-word spill into acc21, 16 more products, then a
// 3-word reduction — matching the carry budget of <= 2 spare bits.
static
void muladd1_by_32_full(long *x, const long *a, const long *b,
                        long p, sp_ll_reduce_struct ll_red_struct)
{
   for (long j = 0; j < MAT_BLK_SZ; j++) {

      ll_type sum;
      ll_init(sum, x[j]);

      // first half: products 0..15
      ll_imul_add(sum, a[0], b[0]);
      ll_imul_add(sum, a[1], b[1]);
      ll_imul_add(sum, a[2], b[2]);
      ll_imul_add(sum, a[3], b[3]);
      ll_imul_add(sum, a[4], b[4]);
      ll_imul_add(sum, a[5], b[5]);
      ll_imul_add(sum, a[6], b[6]);
      ll_imul_add(sum, a[7], b[7]);
      ll_imul_add(sum, a[8], b[8]);
      ll_imul_add(sum, a[9], b[9]);
      ll_imul_add(sum, a[10], b[10]);
      ll_imul_add(sum, a[11], b[11]);
      ll_imul_add(sum, a[12], b[12]);
      ll_imul_add(sum, a[13], b[13]);
      ll_imul_add(sum, a[14], b[14]);
      ll_imul_add(sum, a[15], b[15]);

      // spill the high word into acc21 and restart sum from its low word
      ll_type acc21;
      ll_init(acc21, ll_get_hi(sum));
      unsigned long acc0 = ll_get_lo(sum);
      ll_init(sum, acc0);

      // second half: products 16..31
      ll_imul_add(sum, a[16], b[16]);
      ll_imul_add(sum, a[17], b[17]);
      ll_imul_add(sum, a[18], b[18]);
      ll_imul_add(sum, a[19], b[19]);
      ll_imul_add(sum, a[20], b[20]);
      ll_imul_add(sum, a[21], b[21]);
      ll_imul_add(sum, a[22], b[22]);
      ll_imul_add(sum, a[23], b[23]);
      ll_imul_add(sum, a[24], b[24]);
      ll_imul_add(sum, a[25], b[25]);
      ll_imul_add(sum, a[26], b[26]);
      ll_imul_add(sum, a[27], b[27]);
      ll_imul_add(sum, a[28], b[28]);
      ll_imul_add(sum, a[29], b[29]);
      ll_imul_add(sum, a[30], b[30]);
      ll_imul_add(sum, a[31], b[31]);

      acc0 = ll_get_lo(sum);
      ll_add(acc21, ll_get_hi(sum));

      long res;

      // 3-word (acc21:acc0) reduction mod p
      if (ll_red_struct.nbits == NTL_SP_NBITS)
         res = sp_ll_red_31_normalized(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);
      else
         res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);

      x[j] = res;
      b += MAT_BLK_SZ;   // advance to the next column strip
   }
}
2576 
// Width-limited variant of the '#else'-branch full kernel: identical
// 16 + spill + 16 structure, producing only the first `width` columns.
static
void muladd1_by_32_full_width(long *x, const long *a, const long *b,
                        long p, sp_ll_reduce_struct ll_red_struct, long width)
{
   for (long j = 0; j < width; j++) {

      ll_type sum;
      ll_init(sum, x[j]);

      // first half: products 0..15
      ll_imul_add(sum, a[0], b[0]);
      ll_imul_add(sum, a[1], b[1]);
      ll_imul_add(sum, a[2], b[2]);
      ll_imul_add(sum, a[3], b[3]);
      ll_imul_add(sum, a[4], b[4]);
      ll_imul_add(sum, a[5], b[5]);
      ll_imul_add(sum, a[6], b[6]);
      ll_imul_add(sum, a[7], b[7]);
      ll_imul_add(sum, a[8], b[8]);
      ll_imul_add(sum, a[9], b[9]);
      ll_imul_add(sum, a[10], b[10]);
      ll_imul_add(sum, a[11], b[11]);
      ll_imul_add(sum, a[12], b[12]);
      ll_imul_add(sum, a[13], b[13]);
      ll_imul_add(sum, a[14], b[14]);
      ll_imul_add(sum, a[15], b[15]);

      // spill the high word into acc21 and restart sum from its low word
      ll_type acc21;
      ll_init(acc21, ll_get_hi(sum));
      unsigned long acc0 = ll_get_lo(sum);
      ll_init(sum, acc0);

      // second half: products 16..31
      ll_imul_add(sum, a[16], b[16]);
      ll_imul_add(sum, a[17], b[17]);
      ll_imul_add(sum, a[18], b[18]);
      ll_imul_add(sum, a[19], b[19]);
      ll_imul_add(sum, a[20], b[20]);
      ll_imul_add(sum, a[21], b[21]);
      ll_imul_add(sum, a[22], b[22]);
      ll_imul_add(sum, a[23], b[23]);
      ll_imul_add(sum, a[24], b[24]);
      ll_imul_add(sum, a[25], b[25]);
      ll_imul_add(sum, a[26], b[26]);
      ll_imul_add(sum, a[27], b[27]);
      ll_imul_add(sum, a[28], b[28]);
      ll_imul_add(sum, a[29], b[29]);
      ll_imul_add(sum, a[30], b[30]);
      ll_imul_add(sum, a[31], b[31]);

      acc0 = ll_get_lo(sum);
      ll_add(acc21, ll_get_hi(sum));

      long res;

      // 3-word (acc21:acc0) reduction mod p
      if (ll_red_struct.nbits == NTL_SP_NBITS)
         res = sp_ll_red_31_normalized(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);
      else
         res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);

      x[j] = res;
      b += MAT_BLK_SZ;   // advance to the next column strip
   }
}
2639 
2640 
2641 #endif
2642 
2643 
// Half-word-modulus kernel (selected when p-1 fits in a half word, see
// the dispatch in muladd_all_by_32): each chunk of up to 16 single-word
// products is summed in one unsigned long, and at most two such chunks
// (sum[0], sum[1]) are needed since n <= MAT_BLK_SZ == 32.  Each chunk
// is reduced with the cheaper 2-word reduction and the results combined
// with AddMod.
static
void muladd1_by_32_half2(long *x, const long *a, const long *b,
                        long n, long p, sp_ll_reduce_struct ll_red_struct)
{
   for (long j = 0; j < MAT_BLK_SZ; j++) {

      unsigned long sum[2];
      sum[0] = x[j];
      sum[1] = 0;

      long k=0;   // index of the chunk currently being filled
      long i=0;
      for(; i <= n-16; i+= 16) {
         unsigned long lsum = a[i+0]*b[i+0];
         lsum += a[i+1]*b[i+1];
         lsum += a[i+2]*b[i+2];
         lsum += a[i+3]*b[i+3];
         lsum += a[i+4]*b[i+4];
         lsum += a[i+5]*b[i+5];
         lsum += a[i+6]*b[i+6];
         lsum += a[i+7]*b[i+7];
         lsum += a[i+8]*b[i+8];
         lsum += a[i+9]*b[i+9];
         lsum += a[i+10]*b[i+10];
         lsum += a[i+11]*b[i+11];
         lsum += a[i+12]*b[i+12];
         lsum += a[i+13]*b[i+13];
         lsum += a[i+14]*b[i+14];
         lsum += a[i+15]*b[i+15];
         sum[k++] += lsum;
      }

      // remainder chunk (fewer than 16 products)
      if (i < n) {
         unsigned long lsum = a[i]*b[i];
         for (i++; i < n; i++)
            lsum += a[i]*b[i];
         sum[k++] += lsum;
      }


      long t0 = sp_ll_red_21(0, sum[0], p, ll_red_struct);
      long t1 = sp_ll_red_21(0, sum[1], p, ll_red_struct);
      x[j] = AddMod(t0, t1, p);

      b += MAT_BLK_SZ;   // advance to the next column strip
   }
}
2691 
// Width-limited variant of muladd1_by_32_half2: same two-chunk
// accumulation, producing only the first `width` output columns.
static
void muladd1_by_32_half2_width(long *x, const long *a, const long *b,
                        long n, long p, sp_ll_reduce_struct ll_red_struct, long width)
{
   for (long j = 0; j < width; j++) {

      unsigned long sum[2];
      sum[0] = x[j];
      sum[1] = 0;

      long k=0;   // index of the chunk currently being filled
      long i=0;
      for(; i <= n-16; i+= 16) {
         unsigned long lsum = a[i+0]*b[i+0];
         lsum += a[i+1]*b[i+1];
         lsum += a[i+2]*b[i+2];
         lsum += a[i+3]*b[i+3];
         lsum += a[i+4]*b[i+4];
         lsum += a[i+5]*b[i+5];
         lsum += a[i+6]*b[i+6];
         lsum += a[i+7]*b[i+7];
         lsum += a[i+8]*b[i+8];
         lsum += a[i+9]*b[i+9];
         lsum += a[i+10]*b[i+10];
         lsum += a[i+11]*b[i+11];
         lsum += a[i+12]*b[i+12];
         lsum += a[i+13]*b[i+13];
         lsum += a[i+14]*b[i+14];
         lsum += a[i+15]*b[i+15];
         sum[k++] += lsum;
      }

      // remainder chunk (fewer than 16 products)
      if (i < n) {
         unsigned long lsum = a[i]*b[i];
         for (i++; i < n; i++)
            lsum += a[i]*b[i];
         sum[k++] += lsum;
      }


      long t0 = sp_ll_red_21(0, sum[0], p, ll_red_struct);
      long t1 = sp_ll_red_21(0, sum[1], p, ll_red_struct);
      x[j] = AddMod(t0, t1, p);

      b += MAT_BLK_SZ;   // advance to the next column strip
   }
}
2739 
2740 
2741 
2742 // NOTE: oddly, this is slightly faster than the half2 routine, which
2743 // I would have thought would be faster
2744 // DIRT: this assumes MAT_BLK_SZ < (1L << NTL_BITS_PER_LONG/2),
2745 // which will hold unconditionally for MAT_BLK_SZ < 2^16.
2746 
// Half-word-modulus kernel, variant 1: groups of 4 single-word products
// are summed into an unsigned long, then folded into a double-word
// accumulator reduced via the 2-word sp_ll_red_21.  Per the comment
// above, this is (surprisingly) slightly faster than the half2 routine.
static
void muladd1_by_32_half1(long *x, const long *a, const long *b,
                        long n, long p, sp_ll_reduce_struct ll_red_struct)
{
   for (long j = 0; j < MAT_BLK_SZ; j++) {

      ll_type sum;
      ll_init(sum, x[j]);

      long i=0;
      for(; i <= n-4; i+= 4) {
         unsigned long lsum = a[i+0]*b[i+0];
         lsum += a[i+1]*b[i+1];
         lsum += a[i+2]*b[i+2];
         lsum += a[i+3]*b[i+3];
         ll_add(sum, lsum);
      }

      // remainder group (fewer than 4 products)
      if (i < n) {
         unsigned long lsum = a[i]*b[i];
         for (i++; i < n; i++)
            lsum += a[i]*b[i];
         ll_add(sum, lsum);
      }

      unsigned long sum0 = ll_get_lo(sum);
      unsigned long sum1 = ll_get_hi(sum);
      x[j] = sp_ll_red_21(sum1, sum0, p, ll_red_struct);

      b += MAT_BLK_SZ;   // advance to the next column strip
   }
}
2779 
// Width-limited variant of muladd1_by_32_half1: same groups-of-4
// accumulation, producing only the first `width` output columns.
static
void muladd1_by_32_half1_width(long *x, const long *a, const long *b,
                        long n, long p, sp_ll_reduce_struct ll_red_struct, long width)
{
   for (long j = 0; j < width; j++) {

      ll_type sum;
      ll_init(sum, x[j]);

      long i=0;
      for(; i <= n-4; i+= 4) {
         unsigned long lsum = a[i+0]*b[i+0];
         lsum += a[i+1]*b[i+1];
         lsum += a[i+2]*b[i+2];
         lsum += a[i+3]*b[i+3];
         ll_add(sum, lsum);
      }

      // remainder group (fewer than 4 products)
      if (i < n) {
         unsigned long lsum = a[i]*b[i];
         for (i++; i < n; i++)
            lsum += a[i]*b[i];
         ll_add(sum, lsum);
      }

      unsigned long sum0 = ll_get_lo(sum);
      unsigned long sum1 = ll_get_hi(sum);
      x[j] = sp_ll_red_21(sum1, sum0, p, ll_red_struct);

      b += MAT_BLK_SZ;   // advance to the next column strip
   }
}
2812 
2813 static inline
muladd_all_by_32(long first,long last,long * x,const long * a,const long * b,long n,long p,sp_ll_reduce_struct ll_red_struct)2814 void muladd_all_by_32(long first, long last, long *x, const long *a, const long *b, long n,
2815                       long p, sp_ll_reduce_struct ll_red_struct)
2816 {
2817    if ((p-1) >= (1L << ((NTL_BITS_PER_LONG/2)-1))) {
2818       if (n == MAT_BLK_SZ) {
2819 	 for (long i = first; i < last; i++)
2820 	    muladd1_by_32_full(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, p, ll_red_struct);
2821       }
2822       else {
2823 	 for (long i = first; i < last; i++)
2824 	    muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n, p, ll_red_struct);
2825       }
2826    }
2827    else {
2828       for (long i = first; i < last; i++)
2829 	 muladd1_by_32_half1(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n, p, ll_red_struct);
2830    }
2831 }
2832 
2833 static inline
muladd_all_by_32_width(long first,long last,long * x,const long * a,const long * b,long n,long p,sp_ll_reduce_struct ll_red_struct,long width)2834 void muladd_all_by_32_width(long first, long last, long *x, const long *a, const long *b, long n,
2835                       long p, sp_ll_reduce_struct ll_red_struct, long width)
2836 {
2837    if (width == MAT_BLK_SZ) {
2838       if ((p-1) >= (1L << ((NTL_BITS_PER_LONG/2)-1))) {
2839 	 if (n == MAT_BLK_SZ) {
2840 	    for (long i = first; i < last; i++)
2841 	       muladd1_by_32_full(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, p, ll_red_struct);
2842 	 }
2843 	 else {
2844 	    for (long i = first; i < last; i++)
2845 	       muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n, p, ll_red_struct);
2846 	 }
2847       }
2848       else {
2849 	 for (long i = first; i < last; i++)
2850 	    muladd1_by_32_half1(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n, p, ll_red_struct);
2851       }
2852    }
2853    else {
2854       if ((p-1) >= (1L << ((NTL_BITS_PER_LONG/2)-1))) {
2855 	 if (n == MAT_BLK_SZ) {
2856 	    for (long i = first; i < last; i++)
2857 	       muladd1_by_32_full_width(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, p, ll_red_struct, width);
2858 	 }
2859 	 else {
2860 	    for (long i = first; i < last; i++)
2861 	       muladd1_by_32_width(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n, p, ll_red_struct, width);
2862 	 }
2863       }
2864       else {
2865 	 for (long i = first; i < last; i++)
2866 	    muladd1_by_32_half1_width(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n, p, ll_red_struct, width);
2867       }
2868    }
2869 }
2870 
2871 
2872 #endif
2873 
2874 
2875 
2876 static
muladd_interval(long * NTL_RESTRICT x,long * NTL_RESTRICT y,long c,long n,long p,mulmod_t pinv)2877 inline void muladd_interval(long * NTL_RESTRICT x, long * NTL_RESTRICT y,
2878                      long c, long n, long p, mulmod_t pinv)
2879 {
2880    mulmod_precon_t cpinv = PrepMulModPrecon(c, p, pinv);
2881    for (long i = 0; i < n; i++) {
2882       long t = MulModPrecon(y[i], c, p, cpinv);
2883       x[i] = AddMod(x[i], t, p);
2884    }
2885 }
2886 
2887 
2888 // ******************************************************************
2889 //
2890 // General matrix multiplication code
2891 //
2892 // ******************************************************************
2893 
2894 
2895 
2896 
2897 
static
// School-book product X = A*B over zz_p, computed row by row.
// Rows of X are independent, so NTL_GEXEC_RANGE splits the row range
// across threads; problems below PAR_THRESH scalar operations run
// sequentially.
void basic_mul(const mat_window_zz_p& X,
               const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
   long n = A.NumRows();
   long l = A.NumCols();
   long m = B.NumCols();

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();

   const bool seq = double(n)*double(l)*double(m) < PAR_THRESH;

   NTL_GEXEC_RANGE(seq, n, first, last) {

      for (long i = first; i < last; i++) {
         long j, k;
         const zz_p* ap = &A[i][0];

         // clear row i of X; LoopHole() writes the raw representative directly
         zz_p *xp = &X[i][0];
         for (j = 0; j < m; j++) xp[j].LoopHole() = 0;

         // accumulate A[i][k] * (row k of B), skipping zero coefficients of A
         for (k = 0;  k < l; k++) {
            long aa = rep(ap[k]);
            if (aa != 0) {
               const zz_p* bp = &B[k][0];
               long T1;
               // precondition the multiplier once per (i,k) pair
               mulmod_precon_t aapinv = PrepMulModPrecon(aa, p, pinv);

               for (j = 0; j < m; j++) {
                  T1 = MulModPrecon(rep(bp[j]), aa, p, aapinv);
                  xp[j].LoopHole() = AddMod(rep(xp[j]), T1, p);
               }
            }
         }
      }

   } NTL_GEXEC_RANGE_END
}
2937 
2938 
2939 
2940 
2941 #ifdef NTL_HAVE_LL_TYPE
2942 
static
// X = A*B computed column-by-column: each column of B is gathered into a
// contiguous buffer and combined with every row of A via InnerProd_L
// (single-word accumulation; `bound` limits how far the sum may grow
// before reduction).  Parallelized over the columns of B; each thread
// owns its own column buffer.
void alt_mul_L(const mat_window_zz_p& X,
               const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
   long n = A.NumRows();
   long l = A.NumCols();
   long m = B.NumCols();

   long p = zz_p::modulus();
   sp_reduce_struct red_struct = zz_p::red_struct();
   long bound = InnerProd_L_bound(p);

   const bool seq = double(n)*double(l)*double(m) < PAR_THRESH;

   NTL_GEXEC_RANGE(seq, m, first, last) {

      // thread-local copy of one column of B
      Vec<long> B_col;
      B_col.SetLength(l);
      long *bp = B_col.elts();

      long i, j, k;

      for (j = first; j < last; j++) {
         for (k = 0; k < l; k++) bp[k] = rep(B[k][j]);

         for (i = 0; i < n; i++) {
            const zz_p *ap = &A[i][0];
            X[i][j].LoopHole() = InnerProd_L(bp, ap, l, p, red_struct, bound);
         }
      }

   } NTL_GEXEC_RANGE_END
}
2976 
2977 
static
// Same column-gathering strategy as alt_mul_L, but uses InnerProd_LL,
// which accumulates in a double-word and so needs no explicit bound:
// it works for any single-precision modulus p.
void alt_mul_LL(const mat_window_zz_p& X,
                const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
   long n = A.NumRows();
   long l = A.NumCols();
   long m = B.NumCols();

   long p = zz_p::modulus();
   sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();

   const bool seq = double(n)*double(l)*double(m) < PAR_THRESH;

   NTL_GEXEC_RANGE(seq, m, first, last) {

      // thread-local copy of one column of B
      Vec<long> B_col;
      B_col.SetLength(l);
      long *bp = B_col.elts();

      long i, j, k;

      for (j = first; j < last; j++) {
         for (k = 0; k < l; k++) bp[k] = rep(B[k][j]);

         for (i = 0; i < n; i++) {
            const zz_p *ap = &A[i][0];
            X[i][j].LoopHole() = InnerProd_LL(bp, ap, l, p, ll_red_struct);
         }
      }

   } NTL_GEXEC_RANGE_END
}
3010 
3011 
3012 #ifdef NTL_HAVE_AVX
3013 
static
// Blocked X = A*B using double-precision (AVX) accumulation.
// A is packed once into aligned MAT_BLK_SZ-wide panels of doubles
// (zero-padded on the right).  Work is then parallelized over
// MAT_BLK_SZ-wide column panels of X: each thread copies the matching
// block of B into an aligned buffer and accumulates into a local
// double-precision X panel with the AVX muladd kernels.
// Accumulators are reduced mod p whenever enough products have piled up
// that another block of MAT_BLK_SZ additions could exceed MAX_DBL_INT.
void blk_mul_DD(const mat_window_zz_p& X,
                const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
   long n = A.NumRows();
   long l = A.NumCols();
   long m = B.NumCols();

   long p = zz_p::modulus();
   sp_reduce_struct red_struct = zz_p::red_struct();

   // pack A into aligned double panels, one panel per MAT_BLK_SZ columns
   UniqueArray< AlignedArray<double> > A_buf;
   long npanels = (l+MAT_BLK_SZ-1)/MAT_BLK_SZ;
   A_buf.SetLength(npanels);

   for (long kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
      long k_max = min(kk+MAT_BLK_SZ, l);

      A_buf[panel].SetLength(n * MAT_BLK_SZ);
      double *abp = &A_buf[panel][0];

      for (long i = 0; i < n; i++, abp += MAT_BLK_SZ) {
         const zz_p *ap1 = &A[i][0];
         for (long k = kk; k < k_max; k++) {
            abp[k-kk] = rep(ap1[k]);
         }
         // zero-pad a partial last panel so the kernels can run full width
         for (long k = k_max; k < kk+MAT_BLK_SZ; k++) {
            abp[k-kk] = 0;
         }
      }
   }

   long nxpanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;

   const bool seq = double(n)*double(l)*double(m) < PAR_THRESH;

   NTL_GEXEC_RANGE(seq, nxpanels, first, last)
   NTL_IMPORT(n)
   NTL_IMPORT(l)
   NTL_IMPORT(m)
   NTL_IMPORT(p)
   NTL_IMPORT(red_struct)

   // thread-local block of B and column panel of X
   AlignedArray<double> B_rec;
   B_rec.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
   double *brec = B_rec.get();

   AlignedArray<double> X_buf;
   X_buf.SetLength(n*MAT_BLK_SZ);
   double *xbp = X_buf.get();

   long jj, kk;
   long i, j, k;
   long panel;
   long xpanel;

   for (xpanel = first, jj = first*MAT_BLK_SZ; xpanel < last;
        xpanel++, jj += MAT_BLK_SZ) {

      long j_max = min(jj+MAT_BLK_SZ, m);

      for (i = 0; i < n*MAT_BLK_SZ; i++) xbp[i] = 0;

      // how many single products an accumulator can absorb before a
      // further (p-1)^2 addition could overflow the exact-double range
      long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
      long red_count = red_trigger;

      for (kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
         long k_max = min(kk+MAT_BLK_SZ, l);

         // copy the current block of B (row-major), zero-padding partial edges
         for (k = kk; k < k_max; k++) {
            const zz_p *bp = &B[k][0];
            for (j = jj; j < j_max; j++)
               brec[(k-kk)*MAT_BLK_SZ+(j-jj)] = rep(bp[j]);
            for (j = j_max; j < jj+MAT_BLK_SZ; j++)
               brec[(k-kk)*MAT_BLK_SZ+(j-jj)] = 0;
         }

         // lazy reduction: reduce the whole panel only when the budget
         // cannot cover another MAT_BLK_SZ-deep block of products
         if (red_count-MAT_BLK_SZ < 0) {
            red_count = red_trigger;
            for (i = 0; i < n*MAT_BLK_SZ; i++)
               xbp[i] = rem((unsigned long)(long)xbp[i], p, red_struct);
         }

         red_count = red_count-MAT_BLK_SZ;

         const double *abp = &A_buf[panel][0];

         muladd_all_by_32_width(0, n, xbp, abp, brec, k_max-kk, j_max-jj);
      }

      // final reduction and write-back into X
      for (i = 0; i < n; i++) {
         zz_p *xp = &X[i][0];
         for (j = jj; j < j_max; j++)
            xp[j].LoopHole() =
              rem((unsigned long)(long)xbp[i*MAT_BLK_SZ + (j-jj)], p, red_struct);
      }
   }

   NTL_GEXEC_RANGE_END
}
3116 
3117 #endif
3118 
3119 
static
// Blocked X = A*B using the double-word (LL) scalar kernels.
// A is packed once into MAT_BLK_SZ-wide panels of longs (zero-padded);
// work is parallelized over MAT_BLK_SZ-wide column panels of X.  The
// block of B is stored TRANSPOSED in brec, which is the layout the
// muladd1_by_32* long kernels consume.  No intermediate reduction pass
// is needed here: the kernels reduce via ll_red_struct.
void blk_mul_LL(const mat_window_zz_p& X,
                const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
   long n = A.NumRows();
   long l = A.NumCols();
   long m = B.NumCols();

   long p = zz_p::modulus();
   sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();

   // pack A into panels; abufp caches the raw pointers for the hot loop
   Vec< Vec<long> > A_buf;
   Vec<long *> abufp;
   long npanels = (l+MAT_BLK_SZ-1)/MAT_BLK_SZ;
   A_buf.SetLength(npanels);
   abufp.SetLength(npanels);

   for (long kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
      long k_max = min(kk+MAT_BLK_SZ, l);

      A_buf[panel].SetLength(n * MAT_BLK_SZ);
      long *abp = A_buf[panel].elts();
      abufp[panel] = abp;

      for (long i = 0; i < n; i++, abp += MAT_BLK_SZ) {
         const zz_p *ap1 = &A[i][0];
         for (long k = kk; k < k_max; k++) {
            abp[k-kk] = rep(ap1[k]);
         }
         // zero-pad a partial last panel
         for (long k = k_max; k < kk+MAT_BLK_SZ; k++) {
            abp[k-kk] = 0;
         }
      }
   }

   long nxpanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;

   const bool seq = double(n)*double(l)*double(m) < PAR_THRESH;

   NTL_GEXEC_RANGE(seq, nxpanels, first, last)
   NTL_IMPORT(n)
   NTL_IMPORT(l)
   NTL_IMPORT(m)
   NTL_IMPORT(p)
   NTL_IMPORT(ll_red_struct)

   // thread-local transposed block of B and column panel of X
   UniqueArray<long> B_rec;
   B_rec.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
   long *brec = B_rec.get();

   UniqueArray<long> X_buf;
   X_buf.SetLength(n*MAT_BLK_SZ);
   long *xbp = X_buf.get();

   long jj, kk;
   long i, j, k;
   long panel;
   long xpanel;

   for (xpanel = first, jj = first*MAT_BLK_SZ; xpanel < last;
        xpanel++, jj += MAT_BLK_SZ) {

      long j_max = min(jj+MAT_BLK_SZ, m);

      for (i = 0; i < n*MAT_BLK_SZ; i++) xbp[i] = 0;

      for (kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
         long k_max = min(kk+MAT_BLK_SZ, l);

         // fill brec, transposed (zero-padding partial edges)

         for (k = kk; k < k_max; k++) {
            const zz_p *bp = &B[k][0];
            for (j = jj; j < j_max; j++)
               brec[(k-kk)+(j-jj)*MAT_BLK_SZ] = rep(bp[j]);
            for (j = j_max; j < jj+MAT_BLK_SZ; j++)
               brec[(k-kk)+(j-jj)*MAT_BLK_SZ] = 0;
         }

         const long *abp = abufp[panel];

         muladd_all_by_32_width(0, n, xbp, abp, brec, k_max-kk, p, ll_red_struct, j_max-jj);
      }

      // results in xbp are already fully reduced; just write them back
      for (i = 0; i < n; i++) {
         zz_p *xp = &X[i][0];
         for (j = jj; j < j_max; j++)
            xp[j].LoopHole() =  xbp[i*MAT_BLK_SZ + (j-jj)];
      }
   }

   NTL_GEXEC_RANGE_END
}
3214 
3215 
static
// Blocked X = A*B for small moduli: entries of A and B fit in a half
// word (uhlong), so products fit in an unsigned long and many of them
// can be summed before reduction.  A is packed once into half-word
// panels; B blocks are stored transposed; accumulation happens in
// unsigned long with a lazy-reduction budget (red_trigger/red_count),
// exactly mirroring blk_mul_DD's scheme but in integer arithmetic.
void blk_mul_L(const mat_window_zz_p& X,
               const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
   long n = A.NumRows();
   long l = A.NumCols();
   long m = B.NumCols();

   long p = zz_p::modulus();
   sp_reduce_struct red_struct = zz_p::red_struct();

   // pack A into half-word panels; abufp caches the raw pointers
   Vec< Vec<uhlong> > A_buf;
   Vec<uhlong*> abufp;
   long npanels = (l+MAT_BLK_SZ-1)/MAT_BLK_SZ;
   A_buf.SetLength(npanels);
   abufp.SetLength(npanels);

   for (long kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
      long k_max = min(kk+MAT_BLK_SZ, l);

      A_buf[panel].SetLength(n * MAT_BLK_SZ);
      uhlong *abp = A_buf[panel].elts();
      abufp[panel] = abp;

      for (long i = 0; i < n; i++, abp += MAT_BLK_SZ) {
         const zz_p *ap1 = &A[i][0];
         for (long k = kk; k < k_max; k++) {
            abp[k-kk] = rep(ap1[k]);
         }
         // zero-pad a partial last panel
         for (long k = k_max; k < kk+MAT_BLK_SZ; k++) {
            abp[k-kk] = 0;
         }
      }
   }

   long nxpanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;

   const bool seq = double(n)*double(l)*double(m) < PAR_THRESH;

   NTL_GEXEC_RANGE(seq, nxpanels, first, last)
   NTL_IMPORT(n)
   NTL_IMPORT(l)
   NTL_IMPORT(m)
   NTL_IMPORT(p)
   NTL_IMPORT(red_struct)

   // thread-local transposed block of B and column panel of X
   UniqueArray<uhlong> B_rec;
   B_rec.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
   uhlong *brec = B_rec.get();

   UniqueArray<unsigned long> X_buf;
   X_buf.SetLength(n*MAT_BLK_SZ);
   unsigned long *xbp = X_buf.get();

   long jj, kk;
   long i, j, k;
   long panel;
   long xpanel;

   for (xpanel = first, jj = first*MAT_BLK_SZ; xpanel < last;
        xpanel++, jj += MAT_BLK_SZ) {

      long j_max = min(jj+MAT_BLK_SZ, m);

      for (i = 0; i < n*MAT_BLK_SZ; i++) xbp[i] = 0;

      // how many (p-1)^2 products an unsigned long can absorb before
      // another addition could wrap around
      unsigned long ured_trigger =
         (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
      // NOTE: corner case at p == 2: need unsigned long to prevent overflow

      long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);

      long red_count = red_trigger;

      for (kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
         long k_max = min(kk+MAT_BLK_SZ, l);

         // fill brec, transposed (zero-padding partial edges)

         for (k = kk; k < k_max; k++) {
            const zz_p *bp = &B[k][0];
            for (j = jj; j < j_max; j++)
               brec[(k-kk)+(j-jj)*MAT_BLK_SZ] = rep(bp[j]);
            for (j = j_max; j < jj+MAT_BLK_SZ; j++)
               brec[(k-kk)+(j-jj)*MAT_BLK_SZ] = 0;
         }

         // lazy reduction: reduce only when the budget can no longer
         // cover another MAT_BLK_SZ-deep block of products
         if (red_count-MAT_BLK_SZ < 0) {
            red_count = red_trigger;
            for (i = 0; i < n*MAT_BLK_SZ; i++)
               xbp[i] = rem(xbp[i], p, red_struct);
         }

         red_count = red_count-MAT_BLK_SZ;

         const uhlong *abp = abufp[panel];

         muladd_all_by_32_width(0, n, xbp, abp, brec, k_max-kk, j_max-jj);
      }

      // final reduction and write-back into X
      for (i = 0; i < n; i++) {
         zz_p *xp = &X[i][0];
         for (j = jj; j < j_max; j++)
            xp[j].LoopHole() =
              rem(xbp[i*MAT_BLK_SZ + (j-jj)], p, red_struct);
      }
   }

   NTL_GEXEC_RANGE_END
}
3327 
3328 
3329 #endif
3330 
3331 
static
// Strategy dispatcher for the non-Strassen base case of X = A*B.
// Chooses among the kernels above based on build configuration
// (NTL_HAVE_LL_TYPE, NTL_HAVE_AVX), matrix dimensions, and how large
// the modulus p is relative to the accumulator widths the kernels need.
void mul_base (const mat_window_zz_p& X,
               const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
   long n = A.NumRows();
   long l = A.NumCols();
   long m = B.NumCols();

   // empty product: result is identically zero
   if (n == 0 || l == 0 || m == 0) {
      clear(X);
      return;
   }

#ifndef NTL_HAVE_LL_TYPE

   // no double-word type available: only the school-book kernel applies
   basic_mul(X, A, B);

#else

   long p = zz_p::modulus();
   long V = MAT_BLK_SZ*4;

#ifdef NTL_HAVE_AVX

   // experimentally, blk_mul_DD beats all the alternatives
   // if each dimension is at least 16
   // (the two V-based tests check that the double accumulators in
   // blk_mul_DD cannot exceed MAX_DBL_INT between reductions)

   if (n >= 16 && l >= 16 && m >= 16 &&
       p-1 <= MAX_DBL_INT &&
       V <= (MAX_DBL_INT-(p-1))/(p-1) &&
       V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1))
   {
      if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("number too big");
      if (NTL_OVERFLOW(l, MAT_BLK_SZ, 0)) ResourceError("number too big");
      if (NTL_OVERFLOW(m, MAT_BLK_SZ, 0)) ResourceError("number too big");

      //cerr << "blk_mul_DD\n";
      blk_mul_DD(X, A, B);
      return;
   }
#endif


   if (n < 32 || l < 32 || m < 32) {

      // small matrices: the inner-product kernels win

      if (InnerProd_L_viable(l, p)) {
         //cerr << "alt_mul_L\n";
         alt_mul_L(X, A, B);
      }
      else {
         //cerr << "alt_mul_LL\n";
         alt_mul_LL(X, A, B);
      }

   }
   else {

      // Experimentally, the block versions are better when all dimensions
      // are at least 32

      if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("number too big");
      if (NTL_OVERFLOW(l, MAT_BLK_SZ, 0)) ResourceError("number too big");
      if (NTL_OVERFLOW(m, MAT_BLK_SZ, 0)) ResourceError("number too big");

      // can unsigned long accumulators in blk_mul_L hold V products
      // (and V products scaled by p-1) without wrapping?
      if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
          cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1))  {
         //cerr << "blk_mul_L\n";
         blk_mul_L(X, A, B);

      }
      else {
         //cerr << "blk_mul_LL\n";
         blk_mul_LL(X, A, B);
      }

   }

#endif


}
3415 
3416 
3417 
3418 // The following implementation of Strassen is derived directly
3419 // from the implementation in FLINT (see http://www.flintlib.org),
3420 // although a number of details have changed.
3421 // The following copyright notice appears in the relevant
3422 // file, which can be obtained at
3423 // https://github.com/fredrik-johansson/flint2/blob/trunk/nmod_mat/mul_strassen.c
3424 // committed on April 26, 2016.
3425 
3426 /*
3427     Copyright (C) 2008, Martin Albrecht
3428     Copyright (C) 2008, 2009 William Hart.
3429     Copyright (C) 2010, Fredrik Johansson
3430     This file is part of FLINT.
3431     FLINT is free software: you can redistribute it and/or modify it under
3432     the terms of the GNU Lesser General Public License (LGPL) as published
3433     by the Free Software Foundation; either version 2.1 of the License, or
3434     (at your option) any later version.  See <http://www.gnu.org/licenses/>.
3435 */
3436 
3437 
// Strassen-Winograd multiplication C = A*B, recursing on even halves of
// each dimension and patching up odd leftovers with three extra products
// at the end.  Falls back to mul_base below a crossover size.
void mul_strassen(const mat_window_zz_p& C,
                  const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
    long a, b, c;
    long anr, anc, bnr, bnc;


    a = A.NumRows();
    b = A.NumCols();
    c = B.NumCols();


    bool use_DD = false;
    // this code determines if mul_base triggers blk_mul_DD,
    // in which case a higher crossover is used

#if (defined(NTL_HAVE_LL_TYPE) && defined(NTL_HAVE_AVX))
    {
       long V = MAT_BLK_SZ*4;
       long p = zz_p::modulus();

       if (p-1 <= MAX_DBL_INT &&
           V <= (MAX_DBL_INT-(p-1))/(p-1) &&
           V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1))
       {
          use_DD = true;
       }
    }
#endif

    long nt = AvailableThreads();

    long xover;
    // now we set the crossover -- it is kind of a heuristic
    // mess based on nt and use_DD...I've run some tests to
    // make sure these settings are reasonable, but a more
    // rational approach would be preferable

    if (nt > 1) {
       if (use_DD || nt > 8192/(2*MAT_BLK_SZ))
          xover = 8192;
       else
          xover = max(800, nt*2*MAT_BLK_SZ);
    }
    else {
       if (use_DD)
          xover = 800;
       else
          xover = 448;
    }

    if (a <= xover  || b <= xover || c <= xover)
    {
        mul_base(C, A, B);
        return;
    }

    // split each dimension in half (rounding down; odd remainders are
    // handled after the recursive part)
    anr = a / 2;
    anc = b / 2;
    bnr = anc;
    bnc = c / 2;

    const_mat_window_zz_p A11(A, 0, 0, anr, anc);
    const_mat_window_zz_p A12(A, 0, anc, anr, 2*anc);
    const_mat_window_zz_p A21(A, anr, 0, 2*anr, anc);
    const_mat_window_zz_p A22(A, anr, anc, 2*anr, 2*anc);

    const_mat_window_zz_p B11(B, 0, 0, bnr, bnc);
    const_mat_window_zz_p B12(B, 0, bnc, bnr, 2*bnc);
    const_mat_window_zz_p B21(B, bnr, 0, 2*bnr, bnc);
    const_mat_window_zz_p B22(B, bnr, bnc, 2*bnr, 2*bnc);

    mat_window_zz_p C11(C, 0, 0, anr, bnc);
    mat_window_zz_p C12(C, 0, bnc, anr, 2*bnc);
    mat_window_zz_p C21(C, anr, 0, 2*anr, bnc);
    mat_window_zz_p C22(C, anr, bnc, 2*anr, 2*bnc);

    // X1_store doubles as two differently-shaped scratch windows
    mat_zz_p X1_store;
    X1_store.SetDims(anr, max(bnc, anc));

    mat_window_zz_p X1a(X1_store, 0, 0, anr, anc);
    mat_window_zz_p X1b(X1_store, 0, 0, anr, bnc);

    mat_zz_p X2;
    X2.SetDims(anc, bnc);

    /*
        See Jean-Guillaume Dumas, Clement Pernet, Wei Zhou; "Memory
        efficient scheduling of Strassen-Winograd's matrix multiplication
        algorithm"; http://arxiv.org/pdf/0707.2347v3 for reference on the
        used operation scheduling.
    */

    // NOTE: the statement order below is exactly the schedule from the
    // paper above -- each step reuses scratch left by the previous one,
    // so do not reorder.

    sub(X1a, A11, A21);
    sub(X2, B22, B12);
    mul_strassen(C21, X1a, X2);

    add(X1a, A21, A22);
    sub(X2, B12, B11);
    mul_strassen(C22, X1a, X2);

    sub(X1a, X1a, A11);
    sub(X2, B22, X2);
    mul_strassen(C12, X1a, X2);

    sub(X1a, A12, X1a);
    mul_strassen(C11, X1a, B22);


    mul_strassen(X1b, A11, B11);

    add(C12, X1b, C12);
    add(C21, C12, C21);
    add(C12, C12, C22);
    add(C22, C21, C22);
    add(C12, C12, C11);
    sub(X2, X2, B21);
    mul_strassen(C11, A22, X2);

    X2.kill();

    sub(C21, C21, C11);
    mul_strassen(C11, A12, B21);

    add(C11, X1b, C11);

    X1_store.kill();

    if (c > 2*bnc) /* A by last col of B -> last col of C */
    {
        const_mat_window_zz_p Bc(B, 0, 2*bnc, b, c);
        mat_window_zz_p Cc(C, 0, 2*bnc, a, c);

        mul_strassen(Cc, A, Bc);
    }

    if (a > 2*anr) /* last row of A by B -> last row of C */
    {
        const_mat_window_zz_p Ar(A, 2*anr, 0, a, b);
        mat_window_zz_p Cr(C, 2*anr, 0, a, c);
        mul_strassen(Cr, Ar, B);
    }

    if (b > 2*anc) /* last col of A by last row of B -> C */
    {
        const_mat_window_zz_p Ac(A, 0, 2*anc, 2*anr, b);
        const_mat_window_zz_p Br(B, 2*bnr, 0, b, 2*bnc);
        mat_window_zz_p Cb(C, 0, 0, 2*anr, 2*bnc);

        // Cb += Ac*Br
        mat_zz_p tmp;
        tmp.SetDims(Cb.NumRows(), Cb.NumCols());
        mul_strassen(tmp, Ac, Br);
        add(Cb, Cb, tmp);
    }
}
3594 
3595 
3596 
3597 
3598 
3599 
3600 
3601 static
mul_aux(mat_zz_p & X,const mat_zz_p & A,const mat_zz_p & B)3602 void mul_aux(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
3603 {
3604    long n = A.NumRows();
3605    long l = A.NumCols();
3606    long m = B.NumCols();
3607 
3608    if (l != B.NumRows())
3609       LogicError("matrix mul: dimension mismatch");
3610 
3611    X.SetDims(n, m);
3612 
3613    if (n == 0 || l == 0 || m == 0) {
3614       clear(X);
3615       return;
3616    }
3617 
3618    mul_strassen(X, A, B);
3619 }
3620 
3621 
mul(mat_zz_p & X,const mat_zz_p & A,const mat_zz_p & B)3622 void mul(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
3623 {
3624    if (&X == &A || &X == &B) {
3625       mat_zz_p tmp;
3626       mul_aux(tmp, A, B);
3627       X = tmp;
3628    }
3629    else
3630       mul_aux(X, A, B);
3631 }
3632 
3633 
3634 // ******************************************************************
3635 //
3636 // Matrix inversion code
3637 //
3638 // ******************************************************************
3639 
3640 static
relaxed_InvModStatus(long & x,long a,long n,bool relax)3641 long relaxed_InvModStatus(long& x, long a, long n, bool relax)
3642 {
3643    if (relax) {
3644       return InvModStatus(x, a, n);
3645    }
3646    else {
3647       x = InvMod(a, n);
3648       return 0;
3649    }
3650 }
3651 
static
// In-place Gauss-Jordan inversion: d <- det(A), X <- A^{-1} (when it
// exists; otherwise d <- 0 and X is left unspecified).  Works on a long
// copy M of A; row elimination is parallelized across threads.  Using
// relaxed_InvModStatus for the pivot test lets the same code work
// modulo prime powers when relax == true.
void basic_inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("inv: nonsquare matrix");

   if (n == 0) {
      set(d);
      X.SetDims(0, 0);
      return;
   }


   Mat<long> M;
   conv(M, A);
   // scratch space

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations

   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();

   bool seq = n < PAR_THRESH_SQ;

   bool pivoting = false;

   for (long k = 0; k < n; k++) {
      // find a row at or below k whose k-th entry is invertible
      long pos = -1;
      long pivot_inv;
      for (long i = k; i < n; i++) {
         // NOTE: by using InvModStatus, this code will work
         // for prime-powers as well as primes
         long pivot = M[i][k];
         if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
            pos = i;
            break;
         }
      }

      if (pos != -1) {
         if (k != pos) {
            swap(M[pos], M[k]);
            det = NegateMod(det, p);   // row swap flips the determinant sign
            P[k] = pos;
            pivoting = true;
         }

         det = MulMod(det, M[k][k], p);

         {
            // multiply row k by pivot_inv
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
            long *y = &M[k][0];
            for (long j = 0; j < n; j++)
               y[j] = MulModPrecon(y[j], t1, p, t1pinv);

            // diagonal entry becomes pivot_inv: the inverse is being
            // built in place of M (Gauss-Jordan with implicit identity)
            y[k] = pivot_inv;
         }



         // eliminate column k from every other row, in parallel
         NTL_GEXEC_RANGE(seq, n, first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(n)
         NTL_IMPORT(k)
         long *y = &M[k][0];
         for (long i = first; i < last; i++) {
            if (i == k) continue; // skip row k

            long *x = &M[i][0];
            long t1 = x[k];
            t1 = NegateMod(t1, p);
            x[k] = 0;
            if (t1 == 0) continue;

            // add t1 * row k to row i
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);

            for (long j = 0; j < n; j++) {
               long t2 = MulModPrecon(y[j], t1, p, t1pinv);
               x[j] = AddMod(x[j], t2, p);
            }
         }
         NTL_GEXEC_RANGE_END
      }
      else {
         // no invertible pivot in column k: A is singular
         clear(d);
         return;
      }
   }

   if (pivoting) {
      // pivot columns, using reverse swap sequence

      for (long i = 0; i < n; i++) {
         long *x = &M[i][0];

         for (long k = n-1; k >= 0; k--) {
            long pos = P[k];
            if (pos != k) _ntl_swap(x[pos], x[k]);
         }
      }
   }

   X.SetDims(n, n);
   for (long i = 0; i < n; i++)
      for (long j = 0; j < n; j++)
         X[i][j].LoopHole() = M[i][j];

   d.LoopHole() = det;
}
3772 
3773 
3774 
3775 #ifdef NTL_HAVE_LL_TYPE
3776 
3777 
3778 
static
// Gauss-Jordan inversion (same contract as basic_inv: d <- det(A),
// X <- A^{-1}, or d <- 0 if singular) with lazy reduction: M holds
// unreduced unsigned long entries, and the elimination step adds raw
// products x[j] += y[j]*t1 without reducing.  A budget
// (red_trigger/red_count) tracks how many such additions an unsigned
// long can absorb before the whole matrix must be reduced mod p.
void alt_inv_L(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("inv: nonsquare matrix");

   if (n == 0) {
      set(d);
      X.SetDims(0, 0);
      return;
   }


   Mat<unsigned long> M;
   conv(M, A);
   // scratch space

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations

   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_reduce_struct red_struct = zz_p::red_struct();



   bool seq = n < PAR_THRESH_SQ;

   bool pivoting = false;

   // how many (p-1)^2 products an unsigned long can absorb before
   // another addition could wrap around
   unsigned long ured_trigger =
      (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
   // NOTE: corner case at p == 2: need unsigned long to prevent overflow

   long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);

   long red_count = red_trigger;


   for (long k = 0; k < n; k++) {
      // when the budget is spent, schedule a full reduction pass for
      // this elimination round
      bool cleanup = false;

      if (red_count-1 < 0) {
         red_count = red_trigger;
         cleanup = true;
      }

      red_count = red_count-1;

      long pos = -1;
      long pivot;
      long pivot_inv;

      // find a row at or below k whose (reduced) k-th entry is invertible
      for (long i = k; i < n; i++) {
         // NOTE: by using InvModStatus, this code will work
         // for prime-powers as well as primes
         pivot = rem(M[i][k], p, red_struct);
         if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
            pos = i;
            break;
         }
      }

      if (pos != -1) {
         if (k != pos) {
            swap(M[pos], M[k]);
            det = NegateMod(det, p);   // row swap flips the determinant sign
            P[k] = pos;
            pivoting = true;
         }

         det = MulMod(det, pivot, p);

         {
            // multiply row k by pivot_inv (reducing each entry first)
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
            unsigned long *y = &M[k][0];
            for (long j = 0; j < n; j++) {
               long t2 = rem(y[j], p, red_struct);
               y[j] = MulModPrecon(t2, t1, p, t1pinv);
            }

            // diagonal entry becomes pivot_inv: the inverse is being
            // built in place of M (Gauss-Jordan with implicit identity)
            y[k] = pivot_inv;
         }


         // eliminate column k from every other row, in parallel
         NTL_GEXEC_RANGE(seq, n, first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(n)
         NTL_IMPORT(k)
         NTL_IMPORT(red_struct)
         unsigned long *y = &M[k][0];
         if (cleanup) {
            for (long i = first; i < last; i++) {
               if (i == k) continue;
               // skip row k: the data won't change, but it
               // technically is a race condition in a multi-threaded
               // execution

               unsigned long *x = &M[i][0];
               for (long j = 0; j < n; j++) {
                  x[j] = rem(x[j], p, red_struct);
               }
            }
         }


         for (long i = first; i < last; i++) {
            if (i == k) continue; // skip row k

            unsigned long *x = &M[i][0];
            long t1 = rem(x[k], p, red_struct);
            t1 = NegateMod(t1, p);
            x[k] = 0;
            if (t1 == 0) continue;

            // add t1 * row k to row i, unreduced, 4-way unrolled
            // (DO_MUL: single-word product -- macro defined earlier in
            // this file; presumably a plain word multiply)
            unsigned long ut1 = t1;
            long j;
            for (j = 0; j <= n-4; j+=4) {
               unsigned long xj0 = x[j+0] + DO_MUL(y[j+0], ut1);
               unsigned long xj1 = x[j+1] + DO_MUL(y[j+1], ut1);
               unsigned long xj2 = x[j+2] + DO_MUL(y[j+2], ut1);
               unsigned long xj3 = x[j+3] + DO_MUL(y[j+3], ut1);
               x[j+0] = xj0;
               x[j+1] = xj1;
               x[j+2] = xj2;
               x[j+3] = xj3;
            }
            for (; j < n; j++) {
               x[j] += DO_MUL(y[j], ut1);
            }
         }
         NTL_GEXEC_RANGE_END
      }
      else {
         // no invertible pivot in column k: A is singular
         clear(d);
         return;
      }
   }

   if (pivoting) {
      // pivot columns, using reverse swap sequence

      for (long i = 0; i < n; i++) {
         unsigned long *x = &M[i][0];

         for (long k = n-1; k >= 0; k--) {
            long pos = P[k];
            if (pos != k) _ntl_swap(x[pos], x[k]);
         }
      }
   }

   // final reduction while writing back into X
   X.SetDims(n, n);
   for (long i = 0; i < n; i++)
      for (long j = 0; j < n; j++)
         X[i][j].LoopHole() = rem(M[i][j], p, red_struct);

   d.LoopHole() = det;
}
3948 
3949 
3950 
3951 
3952 
3953 #ifdef NTL_HAVE_AVX
3954 
// alt_inv_DD: non-blocked matrix inverse over zz_p, storing the working
// matrix as doubles (AVX code path).  Computes d = det(A) and, when A is
// invertible, X = A^{-1} via Gauss-Jordan elimination with partial
// pivoting.  If no invertible pivot is found (A singular, or a zero
// divisor when relax is false), sets d = 0 and returns with X unchanged.
// Caller (relaxed_inv) guarantees p is small enough for the lazy
// double-precision accumulation used here.
static
void alt_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("inv: nonsquare matrix");

   if (n == 0) {
      set(d);
      X.SetDims(0, 0);
      return;
   }

   // working copy of A: one aligned row of doubles per matrix row
   Vec< AlignedArray<double> > M;
   M.SetLength(n);
   for (long i = 0; i < n; i++) M[i].SetLength(n);

   for (long i = 0; i < n; i++) {
      for (long j = 0; j < n; j++)
         M[i][j] = rep(A[i][j]);
   }


   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations

   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_reduce_struct red_struct = zz_p::red_struct();



   // run sequentially for small matrices (thread dispatch not worth it)
   bool seq = n < PAR_THRESH_SQ;

   bool pivoting = false;

   // Lazy reduction: rows accumulate up to red_trigger multiply-adds
   // before entries must be reduced mod p, so that values stay exactly
   // representable in a double (bounded by MAX_DBL_INT).
   long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
   long red_count = red_trigger;

   for (long k = 0; k < n; k++) {
      bool cleanup = false;

      // budget exhausted: schedule a full reduction pass this iteration
      if (red_count-1 < 0) {
         red_count = red_trigger;
         cleanup = true;
      }

      red_count = red_count-1;

      long pos = -1;
      long pivot;
      long pivot_inv;



      // search column k (rows k..n-1) for an invertible pivot
      for (long i = k; i < n; i++) {
         // NOTE: by using InvModStatus, this code will work
         // for prime-powers as well as primes
         pivot = rem((unsigned long)(long)M[i][k], p, red_struct);
         if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
            pos = i;
            break;
         }
      }

      if (pos != -1) {
         if (k != pos) {
            swap(M[pos], M[k]);
            det = NegateMod(det, p);
            P[k] = pos;
            pivoting = true;
         }

         det = MulMod(det, pivot, p);

         {
            // multiply row k by pivot_inv
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
            double *y = &M[k][0];
            for (long j = 0; j < n; j++) {
               long t2 = rem((unsigned long)(long)y[j], p, red_struct);
               y[j] = MulModPrecon(t2, t1, p, t1pinv);
            }

            // column k of the inverse-in-progress gets pivot_inv here
            y[k] = pivot_inv;
         }


         // eliminate column k from all other rows, parallelized over rows
         NTL_GEXEC_RANGE(seq, n, first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(n)
         NTL_IMPORT(k)
         NTL_IMPORT(red_struct)
         double *y = &M[k][0];
         if (cleanup) {
            for (long i = first; i < last; i++) {
               if (i == k) continue;
               // skip row k: the data won't change, but it
               // technically is a race condition in a multi-threaded
               // execution

               double *x = &M[i][0];
               for (long j = 0; j < n; j++) {
                  x[j] = rem((unsigned long)(long)x[j], p, red_struct);
               }
            }
         }


         for (long i = first; i < last; i++) {
            if (i == k) continue; // skip row k

            double *x = &M[i][0];
            long t1 = rem((unsigned long)(long)x[k], p, red_struct);
            t1 = NegateMod(t1, p);
            x[k] = 0;
            if (t1 == 0) continue;

            // add t1 * row k to row i
            double ut1 = t1;
            muladd_interval1(x, y, ut1, n);
         }
         NTL_GEXEC_RANGE_END
      }
      else {
         // no invertible pivot: A is not invertible; signal via d = 0
         clear(d);
         return;
      }
   }

   // undo the row swaps as column swaps, in reverse order, to obtain A^{-1}
   if (pivoting) {
      // pivot columns, using reverse swap sequence

      for (long i = 0; i < n; i++) {
         double *x = &M[i][0];

         for (long k = n-1; k >= 0; k--) {
            long pos = P[k];
            if (pos != k) _ntl_swap(x[pos], x[k]);
         }
      }
   }

   // final reduction while copying the result into X
   X.SetDims(n, n);
   for (long i = 0; i < n; i++)
      for (long j = 0; j < n; j++)
         X[i][j].LoopHole() = rem((unsigned long)(long)M[i][j], p, red_struct);

   d.LoopHole() = det;
}
4114 
4115 #endif
4116 
4117 
4118 
4119 
4120 
4121 #ifdef NTL_HAVE_AVX
4122 
// blk_inv_DD: blocked (panel-based) matrix inverse over zz_p using
// double-precision panels (AVX code path).  The matrix is stored as
// npanels vertical panels of MAT_BLK_SZ columns each, so the inner
// update kernels (muladd_interval / muladd_all_by_32) work on
// contiguous aligned memory.  Computes d = det(A) and X = A^{-1};
// if no invertible pivot is found, sets d = 0 and returns with X
// unchanged.  Caller (relaxed_inv) guarantees p is small enough for
// the lazy double-precision accumulation used here.
static
void blk_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("inv: nonsquare matrix");

   if (n == 0) {
      set(d);
      X.SetDims(0, 0);
      return;
   }

   if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");

   long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;


   // allocate panels (n rows of MAT_BLK_SZ doubles each), zero-filled so
   // the ragged last panel's padding columns are well defined
   Vec< AlignedArray<double> > M;
   M.SetLength(npanels);
   for (long panel = 0; panel < npanels; panel++) {
      M[panel].SetLength(n*MAT_BLK_SZ);
      double *panelp = &M[panel][0];

      for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
   }

   // copy A into panels
   for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
      long j_max = min(jj+MAT_BLK_SZ, n);
      double *panelp = &M[panel][0];

      for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
         const zz_p *ap = A[i].elts() + jj;

         for (long j = jj; j < j_max; j++)
            panelp[j-jj] = rep(ap[j-jj]);
      }
   }

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations


   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_reduce_struct red_struct = zz_p::red_struct();


   // run sequentially when the total work is too small to parallelize
   bool seq = double(n)*double(n)*double(MAT_BLK_SZ) < PAR_THRESH;

   bool pivoting = false;

   // Lazy reduction: panels accumulate up to red_trigger multiply-adds
   // before entries must be reduced mod p, keeping values exactly
   // representable in a double (bounded by MAX_DBL_INT).
   long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
   long red_count = red_trigger;

   for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
      long k_max = min(kk+MAT_BLK_SZ, n);

      bool cleanup = false;

      // one panel step consumes up to MAT_BLK_SZ accumulations
      if (red_count-MAT_BLK_SZ < 0) {
         red_count = red_trigger;
         cleanup = true;
      }

      red_count = red_count-MAT_BLK_SZ;
      double *kpanelp = &M[kpanel][0];

      if (cleanup) {
         for (long r = 0; r < n*MAT_BLK_SZ; r++)
            kpanelp[r] = rem((unsigned long)(long)kpanelp[r], p, red_struct);
      }

      // Gauss-Jordan within the pivot panel, one column at a time
      for (long k = kk; k < k_max; k++) {

         long pos = -1;
         long pivot;
         long pivot_inv;

         for (long i = k; i < n; i++) {
            // NOTE: by using InvModStatus, this code will work
            // for prime-powers as well as primes
            pivot = rem((unsigned long)(long)kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
            if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
               pos = i;
               break;
            }
         }

         // no invertible pivot: A is not invertible; signal via d = 0
         if (pos == -1) {
            clear(d);
            return;
         }

         double *y = &kpanelp[k*MAT_BLK_SZ];
         if (k != pos) {
            // swap rows pos and k
            double *x = &kpanelp[pos*MAT_BLK_SZ];
            for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);

            det = NegateMod(det, p);
            P[k] = pos;
            pivoting = true;
         }

         det = MulMod(det, pivot, p);

         {
            // multiply row k by pivot_inv
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
            for (long j = 0; j < MAT_BLK_SZ; j++) {
               long t2 = rem((unsigned long)(long)y[j], p, red_struct);
               y[j] = MulModPrecon(t2, t1, p, t1pinv);
            }

            // column k (within this panel) of the inverse gets pivot_inv
            y[k-kk] = pivot_inv;
         }

         for (long i = 0; i < n; i++) {
            if (i == k) continue; // skip row k

            double *x = &kpanelp[i*MAT_BLK_SZ];
            long t1 = rem((unsigned long)(long)x[k-kk], p, red_struct);
            t1 = NegateMod(t1, p);
            x[k-kk] = 0;
            if (t1 == 0) continue;

            // add t1 * row k to row i
            double ut1 = t1;
            muladd_interval(x, y, ut1, MAT_BLK_SZ);
         }
      }


      // finished processing current kpanel
      // next, reduce and apply to all other kpanels

      for (long r = 0; r < n*MAT_BLK_SZ; r++)
         kpanelp[r] = rem((unsigned long)(long)kpanelp[r], p, red_struct);

      // special processing: subtract 1 off of diagonal
      // (turns the update below into "jpanel += kpanel*buf" with the
      // identity contribution removed; the 1 is restored afterwards)

      for (long k = kk; k < k_max; k++)
         kpanelp[k*MAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);


      // apply this panel's elimination to every other panel, in parallel
      NTL_GEXEC_RANGE(seq, npanels, first, last)
      NTL_IMPORT(p)
      NTL_IMPORT(n)
      NTL_IMPORT(red_struct)
      NTL_IMPORT(kpanel)
      NTL_IMPORT(kpanelp)
      NTL_IMPORT(kk)
      NTL_IMPORT(k_max)


      // per-thread scratch block
      AlignedArray<double> buf_store;
      buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
      double *buf = &buf_store[0];

      for (long jpanel = first; jpanel < last; jpanel++) {
         if (jpanel == kpanel) continue;

         double *jpanelp = &M[jpanel][0];

         if (cleanup) {
            for (long r = 0; r < n*MAT_BLK_SZ; r++)
               jpanelp[r] = rem((unsigned long)(long)jpanelp[r], p, red_struct);
         }

         // perform swaps
         for (long k = kk; k < k_max; k++) {
            long pos = P[k];
            if (pos != k) {
               // swap rows pos and k
               double *pos_p = &jpanelp[pos*MAT_BLK_SZ];
               double *k_p = &jpanelp[k*MAT_BLK_SZ];
               for (long j = 0; j < MAT_BLK_SZ; j++)
                  _ntl_swap(pos_p[j], k_p[j]);
            }
         }

         // copy block number kpanel (the one on the diagonal)  into buf
         // (reduced; NOTE: not transposed here, unlike the unsigned-long
         // variant blk_inv_L — the two muladd_all_by_32 overloads take
         // the block in different layouts)

         for (long i = 0; i < (k_max-kk)*MAT_BLK_SZ; i++)
            buf[i] = rem((unsigned long)(long)jpanelp[kk*MAT_BLK_SZ+i], p, red_struct);

         // jpanel += kpanel*buf

         muladd_all_by_32(0, n, jpanelp, kpanelp, buf, k_max-kk);
      }

      NTL_GEXEC_RANGE_END

      // special processing: add 1 back to the diagonal

      for (long k = kk; k < k_max; k++)
         kpanelp[k*MAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);

   }

   // undo the row swaps as column swaps, in reverse order, to obtain A^{-1}
   if (pivoting) {
      // pivot columns, using reverse swap sequence

      for (long k = n-1; k >= 0; k--) {
         long pos = P[k];
         if (pos != k) {
            // swap columns pos and k

            double *x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
            double *y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
            for (long i = 0; i < n; i++) {
               _ntl_swap(x[i*MAT_BLK_SZ], y[i*MAT_BLK_SZ]);
            }
         }
      }
   }


   // copy panels into X
   X.SetDims(n, n);
   for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
      long j_max = min(jj+MAT_BLK_SZ, n);
      double *panelp = &M[panel][0];

      for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
         zz_p *xp = X[i].elts() + jj;

         for (long j = jj; j < j_max; j++)
            xp[j-jj].LoopHole() = rem((unsigned long)(long)panelp[j-jj], p, red_struct);
      }
   }

   d.LoopHole() = det;

}
4367 
4368 #endif
4369 
4370 
4371 
// blk_inv_L: blocked (panel-based) matrix inverse over zz_p using
// unsigned long panels with lazy reduction.  Same panel layout and
// algorithm as blk_inv_DD, but entries accumulate in unsigned long
// (wrapping mod 2^w) instead of double.  Computes d = det(A) and
// X = A^{-1}; if no invertible pivot is found, sets d = 0 and returns
// with X unchanged.  Caller (relaxed_inv) guarantees p is small enough
// for this accumulation strategy.
static
void blk_inv_L(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("inv: nonsquare matrix");

   if (n == 0) {
      set(d);
      X.SetDims(0, 0);
      return;
   }

   if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");

   long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;

   // allocate panels (n rows of MAT_BLK_SZ words each), zero-filled so
   // the ragged last panel's padding columns are well defined
   Vec< UniqueArray<unsigned long> > M;
   M.SetLength(npanels);
   for (long panel = 0; panel < npanels; panel++) {
      M[panel].SetLength(n*MAT_BLK_SZ);
      unsigned long *panelp = &M[panel][0];

      for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
   }

   // copy A into panels
   for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
      long j_max = min(jj+MAT_BLK_SZ, n);
      unsigned long *panelp = &M[panel][0];

      for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
         const zz_p *ap = A[i].elts() + jj;

         for (long j = jj; j < j_max; j++)
            panelp[j-jj] = rep(ap[j-jj]);
      }
   }

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations


   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_reduce_struct red_struct = zz_p::red_struct();


   // run sequentially when the total work is too small to parallelize
   bool seq = double(n)*double(n)*double(MAT_BLK_SZ) < PAR_THRESH;

   bool pivoting = false;

   // Lazy reduction: how many multiply-adds fit in an unsigned long
   // before entries must be reduced mod p.
   unsigned long ured_trigger =
      (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
   // NOTE: corner case at p == 2: need unsigned long to prevent overflow

   long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);

   long red_count = red_trigger;

   for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
      long k_max = min(kk+MAT_BLK_SZ, n);

      bool cleanup = false;

      // one panel step consumes up to MAT_BLK_SZ accumulations
      if (red_count-MAT_BLK_SZ < 0) {
         red_count = red_trigger;
         cleanup = true;
      }

      red_count = red_count-MAT_BLK_SZ;
      unsigned long *kpanelp = &M[kpanel][0];

      if (cleanup) {
         for (long r = 0; r < n*MAT_BLK_SZ; r++)
            kpanelp[r] = rem(kpanelp[r], p, red_struct);
      }

      // Gauss-Jordan within the pivot panel, one column at a time
      for (long k = kk; k < k_max; k++) {

         long pos = -1;
         long pivot;
         long pivot_inv;

         for (long i = k; i < n; i++) {
            // NOTE: by using InvModStatus, this code will work
            // for prime-powers as well as primes
            pivot = rem(kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
            if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
               pos = i;
               break;
            }
         }

         // no invertible pivot: A is not invertible; signal via d = 0
         if (pos == -1) {
            clear(d);
            return;
         }

         unsigned long *y = &kpanelp[k*MAT_BLK_SZ];
         if (k != pos) {
            // swap rows pos and k
            unsigned long *x = &kpanelp[pos*MAT_BLK_SZ];
            for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);

            det = NegateMod(det, p);
            P[k] = pos;
            pivoting = true;
         }

         det = MulMod(det, pivot, p);

         {
            // multiply row k by pivot_inv
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
            for (long j = 0; j < MAT_BLK_SZ; j++) {
               long t2 = rem(y[j], p, red_struct);
               y[j] = MulModPrecon(t2, t1, p, t1pinv);
            }

            // column k (within this panel) of the inverse gets pivot_inv
            y[k-kk] = pivot_inv;
         }

         for (long i = 0; i < n; i++) {
            if (i == k) continue; // skip row k

            unsigned long *x = &kpanelp[i*MAT_BLK_SZ];
            long t1 = rem(x[k-kk], p, red_struct);
            t1 = NegateMod(t1, p);
            x[k-kk] = 0;
            if (t1 == 0) continue;

            // add t1 * row k to row i
            unsigned long ut1 = t1;
            muladd_interval(x, y, ut1, MAT_BLK_SZ);
         }
      }


      // finished processing current kpanel
      // next, reduce and apply to all other kpanels

      for (long r = 0; r < n*MAT_BLK_SZ; r++)
         kpanelp[r] = rem(kpanelp[r], p, red_struct);

      // special processing: subtract 1 off of diagonal
      // (turns the update below into "jpanel += kpanel*buf" with the
      // identity contribution removed; the 1 is restored afterwards)

      for (long k = kk; k < k_max; k++)
         kpanelp[k*MAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);


      // apply this panel's elimination to every other panel, in parallel
      NTL_GEXEC_RANGE(seq, npanels, first, last)
      NTL_IMPORT(p)
      NTL_IMPORT(n)
      NTL_IMPORT(red_struct)
      NTL_IMPORT(kpanel)
      NTL_IMPORT(kpanelp)
      NTL_IMPORT(kk)
      NTL_IMPORT(k_max)


      // per-thread scratch block
      UniqueArray<unsigned long> buf_store;
      buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
      unsigned long *buf = &buf_store[0];

      for (long jpanel = first; jpanel < last; jpanel++) {
         if (jpanel == kpanel) continue;

         unsigned long *jpanelp = &M[jpanel][0];

         if (cleanup) {
            for (long r = 0; r < n*MAT_BLK_SZ; r++)
               jpanelp[r] = rem(jpanelp[r], p, red_struct);
         }

         // perform swaps
         for (long k = kk; k < k_max; k++) {
            long pos = P[k];
            if (pos != k) {
               // swap rows pos and k
               unsigned long *pos_p = &jpanelp[pos*MAT_BLK_SZ];
               unsigned long *k_p = &jpanelp[k*MAT_BLK_SZ];
               for (long j = 0; j < MAT_BLK_SZ; j++)
                  _ntl_swap(pos_p[j], k_p[j]);
            }
         }

         // copy block number kpanel (the one on the diagonal)  into buf
         // here, we transpose it

         for (long k = kk; k < k_max; k++)
            for (long j = 0; j < MAT_BLK_SZ; j++)
               buf[j*MAT_BLK_SZ + (k-kk)] =
                  rem(jpanelp[k*MAT_BLK_SZ+j], p, red_struct);

         // jpanel += kpanel*buf

         muladd_all_by_32(0, n, jpanelp, kpanelp, buf, k_max-kk);
      }

      NTL_GEXEC_RANGE_END

      // special processing: add 1 back to the diagonal

      for (long k = kk; k < k_max; k++)
         kpanelp[k*MAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);

   }

   // undo the row swaps as column swaps, in reverse order, to obtain A^{-1}
   if (pivoting) {
      // pivot columns, using reverse swap sequence

      for (long k = n-1; k >= 0; k--) {
         long pos = P[k];
         if (pos != k) {
            // swap columns pos and k

            unsigned long *x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
            unsigned long *y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
            for (long i = 0; i < n; i++) {
               _ntl_swap(x[i*MAT_BLK_SZ], y[i*MAT_BLK_SZ]);
            }
         }
      }
   }

   // copy panels into X (with final reduction)
   X.SetDims(n, n);
   for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
      long j_max = min(jj+MAT_BLK_SZ, n);
      unsigned long *panelp = &M[panel][0];

      for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
         zz_p *xp = X[i].elts() + jj;

         for (long j = jj; j < j_max; j++)
            xp[j-jj].LoopHole() = rem(panelp[j-jj], p, red_struct);
      }
   }

   d.LoopHole() = det;

}
4622 
4623 
4624 
4625 
4626 
4627 
4628 
4629 
4630 static
blk_inv_LL(zz_p & d,mat_zz_p & X,const mat_zz_p & A,bool relax)4631 void blk_inv_LL(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
4632 {
4633    long n = A.NumRows();
4634 
4635    if (A.NumCols() != n)
4636       LogicError("inv: nonsquare matrix");
4637 
4638    if (n == 0) {
4639       set(d);
4640       X.SetDims(0, 0);
4641       return;
4642    }
4643 
4644    if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too big");
4645 
4646    long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
4647 
4648    Vec< UniqueArray<long> > M;
4649    M.SetLength(npanels);
4650    for (long panel = 0; panel < npanels; panel++) {
4651       M[panel].SetLength(n*MAT_BLK_SZ);
4652       long *panelp = &M[panel][0];
4653 
4654       for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
4655    }
4656 
4657 
4658    // copy A into panels
4659    for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
4660       long j_max = min(jj+MAT_BLK_SZ, n);
4661       long *panelp = &M[panel][0];
4662 
4663       for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
4664          const zz_p *ap = A[i].elts() + jj;
4665 
4666          for (long j = jj; j < j_max; j++)
4667             panelp[j-jj] = rep(ap[j-jj]);
4668       }
4669    }
4670 
4671    Vec<long> P;
4672    P.SetLength(n);
4673    for (long k = 0; k < n; k++) P[k] = k;
4674    // records swap operations
4675 
4676 
4677    long det;
4678    det = 1;
4679 
4680    long p = zz_p::modulus();
4681    mulmod_t pinv = zz_p::ModulusInverse();
4682    sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();
4683 
4684 
4685    bool seq = double(n)*double(n)*double(MAT_BLK_SZ) < PAR_THRESH;
4686 
4687    bool pivoting = false;
4688 
4689    for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
4690       long k_max = min(kk+MAT_BLK_SZ, n);
4691 
4692       long *kpanelp = &M[kpanel][0];
4693 
4694 
4695       for (long k = kk; k < k_max; k++) {
4696 
4697          long pos = -1;
4698          long pivot;
4699          long pivot_inv;
4700 
4701          for (long i = k; i < n; i++) {
4702             // NOTE: by using InvModStatus, this code will work
4703             // for prime-powers as well as primes
4704             pivot = kpanelp[i*MAT_BLK_SZ+(k-kk)];
4705             if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
4706                pos = i;
4707                break;
4708             }
4709          }
4710 
4711          if (pos == -1) {
4712             clear(d);
4713             return;
4714          }
4715 
4716          long *y = &kpanelp[k*MAT_BLK_SZ];
4717          if (k != pos) {
4718             // swap rows pos and k
4719             long *x = &kpanelp[pos*MAT_BLK_SZ];
4720             for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
4721 
4722             det = NegateMod(det, p);
4723             P[k] = pos;
4724             pivoting = true;
4725          }
4726 
4727          det = MulMod(det, pivot, p);
4728 
4729          {
4730             // multiply row k by pivot_inv
4731             long t1 = pivot_inv;
4732             mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
4733             for (long j = 0; j < MAT_BLK_SZ; j++) {
4734                y[j] = MulModPrecon(y[j], t1, p, t1pinv);
4735             }
4736 
4737             y[k-kk] = pivot_inv;
4738          }
4739 
4740          for (long i = 0; i < n; i++) {
4741             if (i == k) continue; // skip row k
4742 
4743             long *x = &kpanelp[i*MAT_BLK_SZ];
4744             long t1 = x[k-kk];
4745             t1 = NegateMod(t1, p);
4746             x[k-kk] = 0;
4747             if (t1 == 0) continue;
4748 
4749             // add t1 * row k to row i
4750             long ut1 = t1;
4751             muladd_interval(x, y, ut1, MAT_BLK_SZ, p, pinv);
4752          }
4753       }
4754 
4755 
4756       // finished processing current kpanel
4757       // next, reduce and apply to all other kpanels
4758 
4759       // special processing: subtract 1 off of diangonal
4760 
4761       for (long k = kk; k < k_max; k++)
4762          kpanelp[k*MAT_BLK_SZ+(k-kk)] = SubMod(kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);
4763 
4764 
4765       NTL_GEXEC_RANGE(seq, npanels, first, last)
4766       NTL_IMPORT(p)
4767       NTL_IMPORT(n)
4768       NTL_IMPORT(ll_red_struct)
4769       NTL_IMPORT(kpanel)
4770       NTL_IMPORT(kpanelp)
4771       NTL_IMPORT(kk)
4772       NTL_IMPORT(k_max)
4773 
4774 
4775       UniqueArray<long> buf_store;
4776       buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
4777       long *buf = &buf_store[0];
4778 
4779       for (long jpanel = first; jpanel < last; jpanel++) {
4780          if (jpanel == kpanel) continue;
4781 
4782          long *jpanelp = &M[jpanel][0];
4783 
4784          // perform swaps
4785          for (long k = kk; k < k_max; k++) {
4786             long pos = P[k];
4787             if (pos != k) {
4788                // swap rows pos and k
4789                long *pos_p = &jpanelp[pos*MAT_BLK_SZ];
4790                long *k_p = &jpanelp[k*MAT_BLK_SZ];
4791                for (long j = 0; j < MAT_BLK_SZ; j++)
4792                   _ntl_swap(pos_p[j], k_p[j]);
4793             }
4794          }
4795 
4796          // copy block number kpanel (the one on the diagonal)  into buf
4797          // here, we transpose it
4798 
4799          for (long k = kk; k < k_max; k++)
4800             for (long j = 0; j < MAT_BLK_SZ; j++)
4801                buf[j*MAT_BLK_SZ + (k-kk)] =
4802                   jpanelp[k*MAT_BLK_SZ+j];
4803 
4804 
4805          // jpanel += kpanel*buf
4806 
4807          muladd_all_by_32(0, n, jpanelp, kpanelp, buf, k_max-kk, p, ll_red_struct);
4808       }
4809 
4810       NTL_GEXEC_RANGE_END
4811 
4812       // special processing: add 1 back to the diangonal
4813 
4814       for (long k = kk; k < k_max; k++)
4815          kpanelp[k*MAT_BLK_SZ+(k-kk)] = AddMod(kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);
4816 
4817    }
4818 
4819    if (pivoting) {
4820       // pivot colums, using reverse swap sequence
4821 
4822       for (long k = n-1; k >= 0; k--) {
4823          long pos = P[k];
4824          if (pos != k) {
4825             // swap columns pos and k
4826 
4827             long *x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
4828             long *y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
4829             for (long i = 0; i < n; i++) {
4830                _ntl_swap(x[i*MAT_BLK_SZ], y[i*MAT_BLK_SZ]);
4831             }
4832          }
4833       }
4834    }
4835 
4836    // copy panels into X
4837    X.SetDims(n, n);
4838    for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
4839       long j_max = min(jj+MAT_BLK_SZ, n);
4840       long *panelp = &M[panel][0];
4841 
4842       for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
4843          zz_p *xp = X[i].elts() + jj;
4844 
4845          for (long j = jj; j < j_max; j++)
4846             xp[j-jj].LoopHole() = panelp[j-jj];
4847       }
4848    }
4849 
4850    d.LoopHole() = det;
4851 
4852 }
4853 
4854 
4855 
4856 #endif
4857 
4858 
4859 
// relaxed_inv: top-level dispatcher for matrix inverse over zz_p.
// Computes d = det(A) and, when an inverse exists, X = A^{-1}; the
// underlying routines set d = 0 on failure.  When relax is false,
// pivots must be invertible mod p, so the computation also works for
// prime-power moduli.
//
// Without NTL_HAVE_LL_TYPE, only basic_inv is available.  Otherwise the
// strategy is chosen by matrix size (tiny -> basic_inv; small -> the
// non-blocked alt_inv_* routines; large -> the blocked blk_inv_*
// routines) and, within a size class, by how many multiply-adds (V, the
// bound used in the guards below) can be accumulated without overflow
// for the current modulus p: doubles (AVX, *_DD), then unsigned longs
// (*_L), then the fully-reduced long-long fallback (blk_inv_LL).
void relaxed_inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("inv: nonsquare matrix");

#ifndef NTL_HAVE_LL_TYPE

   basic_inv(d, X, A, relax);

#else

   long p = zz_p::modulus();

   if (n < 16) {
      //cerr << "basic_inv\n";
      basic_inv(d, X, A, relax);
   }
   else if (n/MAT_BLK_SZ < 4) {
      // non-blocked variants; V bounds the accumulated multiply-adds
      long V = 64;

#ifdef NTL_HAVE_AVX
      // can every entry (and V accumulated products) fit exactly in a double?
      if (p-1 <= MAX_DBL_INT &&
          V <= (MAX_DBL_INT-(p-1))/(p-1) &&
          V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {

         //cerr << "alt_inv_DD\n";
         alt_inv_DD(d, X, A, relax);
      }
      else
#endif
           // same test against the unsigned long range
           if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
               cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1))  {

         //cerr << "alt_inv_L\n";
         alt_inv_L(d, X, A, relax);

      }
      else {

         //cerr << "basic_inv\n";
         basic_inv(d, X, A, relax);
      }
   }
   else {
      // blocked variants; V bounds the accumulated multiply-adds
      long V = 4*MAT_BLK_SZ;

#ifdef NTL_HAVE_AVX
      // can every entry (and V accumulated products) fit exactly in a double?
      if (p-1 <= MAX_DBL_INT &&
          V <= (MAX_DBL_INT-(p-1))/(p-1) &&
          V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {

         //cerr << "blk_inv_DD\n";
         blk_inv_DD(d, X, A, relax);
      }
      else
#endif
           // same test against the unsigned long range
           if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
               cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1))  {

         //cerr << "blk_inv_L\n";
         blk_inv_L(d, X, A, relax);

      }
      else {

         //cerr << "blk_inv_LL\n";
         blk_inv_LL(d, X, A, relax);
      }

   }

#endif



}
4938 
4939 
4940 
4941 // ******************************************************************
4942 //
4943 // Triangularizing square matrices, with applications
4944 // to solving linear systems and computing determinants.
4945 // Should be about 3x faster than the matrix inverse
4946 // algorithms.
4947 //
4948 // ******************************************************************
4949 
4950 
// Triangularize a scratch copy of A via Gaussian elimination with
// partial pivoting, setting d = det(A) (d == 0 if no usable pivot is
// found, i.e. A is singular).  If bp is non-null, also solves
// A*x = *bp (or transpose(A)*x = *bp when trans is true) by back
// substitution, storing the solution in *xp.  relax is passed through
// to relaxed_InvModStatus, which makes the code work for prime-power
// as well as prime moduli.
// The comments marked "adjust" flag lines that differ from the
// corresponding matrix-inverse routine (basic_inv).
static
void basic_tri(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
               vec_zz_p *xp, bool trans, bool relax)
{
   long n = A.NumRows();

   // adjust
   if (A.NumCols() != n)
      LogicError("tri: nonsquare matrix");

   // adjust: right-hand side (if supplied) must have length n
   if (bp && bp->length() != n)
      LogicError("tri: dimension mismatch");

   // adjust: a right-hand side requires somewhere to put the solution
   if (bp && !xp)
      LogicError("tri: bad args");

   if (n == 0) {
      set(d);   // determinant of the empty matrix is 1
      // adjust
      if (xp) xp->SetLength(0);
      return;
   }

   // adjust (several lines)
   // scratch space: copy A (or its transpose) into a plain long matrix
   Mat<long> M;
   if (!trans) {
      conv(M, A);
   }
   else {
      M.SetDims(n, n);
      for (long i = 0; i < n; i++)
         for (long j = 0; j < n; j++)
            M[i][j] = rep(A[j][i]);
   }

   Vec<long> bv;
   if (bp) conv(bv, *bp);
   // end adjust


   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations
   // (recorded for consistency with the inverse routines; the swaps are
   // applied to bv directly below, so P is not consulted again here)

   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();


   bool pivoting = false;

   for (long k = 0; k < n; k++) {
      // find a row i >= k whose entry in column k is an invertible pivot
      long pos = -1;
      long pivot_inv;
      for (long i = k; i < n; i++) {
         // NOTE: by using InvModStatus, this code will work
         // for prime-powers as well as primes
         long pivot = M[i][k];
         if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
            pos = i;
            break;
         }
      }

      if (pos != -1) {
         if (k != pos) {
            swap(M[pos], M[k]);
            det = NegateMod(det, p);   // a row swap flips the sign of det
            P[k] = pos;
            pivoting = true;

            // adjust: keep the right-hand side in step with the row swap
            if (bp) _ntl_swap(bv[pos], bv[k]);
         }

         det = MulMod(det, M[k][k], p);

         {
            // multiply row k by pivot_inv
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
            long *y = &M[k][0];
            // adjust: only columns > k of row k are needed later
            for (long j = k+1; j < n; j++)
               y[j] = MulModPrecon(y[j], t1, p, t1pinv);

            // adjust // y[k] = pivot_inv;

            // adjust: scale the RHS entry by the same factor
            if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
         }



         // adjust: eliminate column k from rows k+1..n-1, possibly in
         // parallel (sequential when the remaining block is small)
         bool seq = n-(k+1) < PAR_THRESH_SQ;
         NTL_GEXEC_RANGE(seq, n-(k+1), first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(n)
         NTL_IMPORT(k)
         long *y = &M[k][0];

         // adjust
         for (long ii = first; ii < last; ii++) {
            long i = ii + k+1;

            long *x = &M[i][0];
            long t1 = x[k];
            t1 = NegateMod(t1, p);
            // adjust // x[k] = 0;
            if (t1 == 0) continue;

            // add t1 * row k to row i
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);

            // adjust
            for (long j = k+1; j < n; j++) {
               long t2 = MulModPrecon(y[j], t1, p, t1pinv);
               x[j] = AddMod(x[j], t2, p);
            }

            // adjust: apply the same elimination step to the RHS
            if (bp)
            {
               long t2 = MulModPrecon(bv[k], t1, p, t1pinv);
               bv[i] = AddMod(bv[i], t2, p);
            }
         }
         NTL_GEXEC_RANGE_END
      }
      else {
         clear(d);   // no invertible pivot in column k: singular
         return;
      }
   }


   // adjust: back substitution through the upper triangular system
   // left in M (the diagonal is implicitly 1, since each pivot row was
   // scaled by its pivot's inverse)
   if (bp) {
      xp->SetLength(n);
      zz_p *X = xp->elts();

      for (long i = n-1; i >= 0; i--) {
         long t1 = 0;
         for (long j = i+1; j < n; j++) {
            long t2 = MulMod(rep(X[j]), M[i][j], p);
            t1 = AddMod(t1, t2, p);
         }
         X[i].LoopHole() = SubMod(bv[i], t1, p);
      }
   }

   d.LoopHole() = det;
}
5111 
5112 
5113 
5114 
5115 #ifdef NTL_HAVE_LL_TYPE
5116 
5117 
5118 
// Variant of basic_tri (same contract: d = det(A), optional solve of
// A*x = *bp, trans/relax as in basic_tri) that stores the scratch
// matrix as unsigned longs and reduces mod p lazily: products are
// accumulated without reduction, and a full "cleanup" reduction pass
// runs only every red_trigger elimination steps, just before unsigned
// overflow could occur.  Requires NTL_HAVE_LL_TYPE (DO_MUL).
static
void alt_tri_L(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
               vec_zz_p *xp, bool trans, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("tri: nonsquare matrix");

   // adjust
   if (bp && bp->length() != n)
      LogicError("tri: dimension mismatch");

   // adjust
   if (bp && !xp)
      LogicError("tri: bad args");

   if (n == 0) {
      set(d);
      if (xp) xp->SetLength(0);
      return;
   }


   // scratch space: copy A (or its transpose) as raw unsigned residues
   Mat<unsigned long> M;
   if (!trans) {
      conv(M, A);
   }
   else {
      M.SetDims(n, n);
      for (long i = 0; i < n; i++)
         for (long j = 0; j < n; j++)
            M[i][j] = rep(A[j][i]);
   }

   Vec<long> bv;
   if (bp) conv(bv, *bp);

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations
   // (not consulted again here; bv is swapped directly)

   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_reduce_struct red_struct = zz_p::red_struct();



   bool pivoting = false;

   // how many unreduced elimination steps an entry can absorb before an
   // unsigned long could overflow: each step adds at most (p-1)^2, and
   // an entry starts out at most (p-1)
   unsigned long ured_trigger =
      (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
   // NOTE: corner case at p == 2: need unsigned long to prevent overflow

   long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);

   long red_count = red_trigger;


   for (long k = 0; k < n; k++) {
      bool cleanup = false;

      // schedule a full reduction pass once the overflow budget is used up
      if (red_count-1 < 0) {
         red_count = red_trigger;
         cleanup = true;
      }

      red_count = red_count-1;

      long pos = -1;
      long pivot;
      long pivot_inv;

      // find an invertible pivot in column k (entries must be reduced
      // on the fly, since M holds lazily-reduced values)
      for (long i = k; i < n; i++) {
         // NOTE: by using InvModStatus, this code will work
         // for prime-powers as well as primes
         pivot = rem(M[i][k], p, red_struct);
         if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
            pos = i;
            break;
         }
      }

      if (pos != -1) {
         if (k != pos) {
            swap(M[pos], M[k]);
            det = NegateMod(det, p);
            P[k] = pos;
            pivoting = true;

            if (bp) _ntl_swap(bv[pos], bv[k]);
         }

         det = MulMod(det, pivot, p);

         {
            // multiply row k by pivot_inv
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
            unsigned long *y = &M[k][0];
            for (long j = k+1; j < n; j++) {
               long t2 = rem(y[j], p, red_struct);   // reduce before scaling
               y[j] = MulModPrecon(t2, t1, p, t1pinv);
            }

            if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
         }



         // eliminate column k from rows k+1..n-1 (parallel when large)
         bool seq = n-(k+1) < PAR_THRESH_SQ;
         NTL_GEXEC_RANGE(seq, n-(k+1), first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(n)
         NTL_IMPORT(k)
         NTL_IMPORT(red_struct)
         unsigned long *y = &M[k][0];
         if (cleanup) {
            // periodic reduction pass: bring all pending entries back
            // below p so further accumulation cannot overflow
            for (long ii = first; ii < last; ii++) {
               long i = ii + k+1;

               unsigned long *x = &M[i][0];
               for (long j = k+1; j < n; j++) {
                  x[j] = rem(x[j], p, red_struct);
               }
            }
         }


         for (long ii = first; ii < last; ii++) {
            long i = ii + k+1;

            unsigned long *x = &M[i][0];
            long t1 = rem(x[k], p, red_struct);
            t1 = NegateMod(t1, p);
            if (t1 == 0) continue;

            // add t1 * row k to row i
            // (4-way unrolled; sums are accumulated without reduction)
            unsigned long ut1 = t1;
            long j;
            for (j = k+1; j <= n-4; j+=4) {
               unsigned long xj0 = x[j+0] + DO_MUL(y[j+0], ut1);
               unsigned long xj1 = x[j+1] + DO_MUL(y[j+1], ut1);
               unsigned long xj2 = x[j+2] + DO_MUL(y[j+2], ut1);
               unsigned long xj3 = x[j+3] + DO_MUL(y[j+3], ut1);
               x[j+0] = xj0;
               x[j+1] = xj1;
               x[j+2] = xj2;
               x[j+3] = xj3;
            }
            for (; j < n; j++) {
               x[j] += DO_MUL(y[j], ut1);
            }

            // apply the same elimination step to the RHS (kept reduced)
            if (bp)
            {
               long t2 = MulMod(bv[k], t1, p);
               bv[i] = AddMod(bv[i], t2, p);
            }
         }
         NTL_GEXEC_RANGE_END
      }
      else {
         clear(d);   // singular
         return;
      }
   }



   // back substitution; M entries must be reduced as they are read
   if (bp) {
      xp->SetLength(n);
      zz_p *X = xp->elts();

      for (long i = n-1; i >= 0; i--) {
         long t1 = 0;
         for (long j = i+1; j < n; j++) {
            long t0 = rem(M[i][j], p, red_struct);
            long t2 = MulMod(rep(X[j]), t0, p);
            t1 = AddMod(t1, t2, p);
         }
         X[i].LoopHole() = SubMod(bv[i], t1, p);
      }
   }

   d.LoopHole() = det;
}
5311 
5312 
5313 
5314 
5315 #ifdef NTL_HAVE_AVX
5316 
// AVX variant of alt_tri_L (same contract as basic_tri): the scratch
// matrix is stored as doubles in aligned rows so the inner elimination
// loop can use vectorized muladd_interval1.  Entries hold exact small
// integers in double form; lazy reduction is bounded by MAX_DBL_INT
// instead of the unsigned long range.  Only compiled under NTL_HAVE_AVX.
static
void alt_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
               vec_zz_p *xp, bool trans, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("tri: nonsquare matrix");

   // adjust
   if (bp && bp->length() != n)
      LogicError("tri: dimension mismatch");

   // adjust
   if (bp && !xp)
      LogicError("tri: bad args");

   if (n == 0) {
      set(d);
      if (xp) xp->SetLength(0);
      return;
   }


   // scratch space: one aligned double array per row (alignment is
   // required by the AVX kernels)

   Vec< AlignedArray<double> > M;
   M.SetLength(n);
   for (long i = 0; i < n; i++) M[i].SetLength(n);
   if (!trans) {
      for (long i = 0; i < n; i++)
         for (long j = 0; j < n; j++)
            M[i][j] = rep(A[i][j]);
   }
   else {
      for (long i = 0; i < n; i++)
         for (long j = 0; j < n; j++)
            M[i][j] = rep(A[j][i]);
   }

   Vec<long> bv;
   if (bp) conv(bv, *bp);

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations
   // (not consulted again here; bv is swapped directly)

   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_reduce_struct red_struct = zz_p::red_struct();



   bool pivoting = false;

   // overflow budget: number of unreduced elimination steps before a
   // double entry could exceed MAX_DBL_INT (exact-integer range)
   long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
   long red_count = red_trigger;

   for (long k = 0; k < n; k++) {
      bool cleanup = false;

      if (red_count-1 < 0) {
         red_count = red_trigger;
         cleanup = true;
      }

      red_count = red_count-1;

      long pos = -1;
      long pivot;
      long pivot_inv;

      for (long i = k; i < n; i++) {
         // NOTE: by using InvModStatus, this code will work
         // for prime-powers as well as primes
         // (the double -> long -> unsigned long cast chain recovers the
         // exact integer stored in the double entry)
         pivot = rem((unsigned long)(long)M[i][k], p, red_struct);
         if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
            pos = i;
            break;
         }
      }

      if (pos != -1) {
         if (k != pos) {
            swap(M[pos], M[k]);
            det = NegateMod(det, p);
            P[k] = pos;
            pivoting = true;

            if (bp) _ntl_swap(bv[pos], bv[k]);
         }

         det = MulMod(det, pivot, p);

         {
            // multiply row k by pivot_inv
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
            double *y = &M[k][0];
            for (long j = k+1; j < n; j++) {
               long t2 = rem((unsigned long)(long)y[j], p, red_struct);
               y[j] = MulModPrecon(t2, t1, p, t1pinv);
            }

            if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
         }



         // eliminate column k from rows k+1..n-1 (parallel when large)
         bool seq = n-(k+1) < PAR_THRESH_SQ;
         NTL_GEXEC_RANGE(seq, n-(k+1), first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(n)
         NTL_IMPORT(k)
         NTL_IMPORT(red_struct)
         double *y = &M[k][0];
         if (cleanup) {
            // periodic reduction pass (see red_trigger above)
            for (long ii = first; ii < last; ii++) {
               long i = ii + k+1;

               double *x = &M[i][0];
               for (long j = k+1; j < n; j++) {
                  x[j] = rem((unsigned long)(long)x[j], p, red_struct);
               }
            }
         }

         // first column index >= k+1 that is AVX-aligned; the scalar
         // loop below covers [k+1, align_boundary), the vector kernel
         // the rest
         long align_boundary =
            min((((k+1)+(NTL_AVX_DBL_ALIGN-1))/NTL_AVX_DBL_ALIGN)*NTL_AVX_DBL_ALIGN, n);


         for (long ii = first; ii < last; ii++) {
            long i = ii + k+1;

            double *x = &M[i][0];
            long t1 = rem((unsigned long)(long)x[k], p, red_struct);
            t1 = NegateMod(t1, p);
            if (t1 == 0) continue;

            // add t1 * row k to row i
            double ut1 = t1;
            for (long j = k+1; j < align_boundary; j++) x[j] += y[j]*ut1;
            muladd_interval1(x+align_boundary, y+align_boundary, ut1, n-align_boundary);

            // apply the same elimination step to the RHS (kept reduced)
            if (bp)
            {
               long t2 = MulMod(bv[k], t1, p);
               bv[i] = AddMod(bv[i], t2, p);
            }
         }
         NTL_GEXEC_RANGE_END
      }
      else {
         clear(d);   // singular
         return;
      }
   }



   // back substitution; entries are reduced as they are read
   if (bp) {
      xp->SetLength(n);
      zz_p *X = xp->elts();

      for (long i = n-1; i >= 0; i--) {
         long t1 = 0;
         for (long j = i+1; j < n; j++) {
            long t0 = rem((unsigned long)(long)M[i][j], p, red_struct);
            long t2 = MulMod(rep(X[j]), t0, p);
            t1 = AddMod(t1, t2, p);
         }
         X[i].LoopHole() = SubMod(bv[i], t1, p);
      }
   }

   d.LoopHole() = det;
}
5498 
5499 
5500 #endif
5501 
5502 
5503 
5504 
5505 #ifdef NTL_HAVE_AVX
5506 
// Blocked (panel-based) AVX variant of the triangularization routine
// (same contract as basic_tri).  The matrix is stored column-panel by
// column-panel, each panel n x MAT_BLK_SZ of aligned doubles; the
// current panel is fully eliminated, then applied to all panels to its
// right via the cache-blocked kernel muladd_all_by_32.  Lazy reduction
// is bounded by MAX_DBL_INT as in alt_tri_DD.  Only compiled under
// NTL_HAVE_AVX.
static
void blk_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
               vec_zz_p *xp, bool trans, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("tri: nonsquare matrix");

   if (bp && bp->length() != n)
      LogicError("tri: dimension mismatch");

   if (bp && !xp)
      LogicError("tri: bad args");

   if (n == 0) {
      set(d);
      if (xp) xp->SetLength(0);
      return;
   }

   if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");

   long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;

   // allocate the panels, zero-filled (so a partial last panel has
   // well-defined padding columns)
   Vec< AlignedArray<double> > M;
   M.SetLength(npanels);
   for (long panel = 0; panel < npanels; panel++) {
      M[panel].SetLength(n*MAT_BLK_SZ);
      double *panelp = &M[panel][0];

      for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
   }

   if (trans) {
      // copy A transposed into panels
      for (long i = 0; i < n; i++) {
         const zz_p *row = &A[i][0];
         double *col = &M[i/MAT_BLK_SZ][i%MAT_BLK_SZ];
         for (long j = 0; j < n; j++)
            col[j*MAT_BLK_SZ] = rep(row[j]);
      }
   }
   else {
      // copy A into panels
      for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
         long j_max = min(jj+MAT_BLK_SZ, n);
         double *panelp = &M[panel][0];

         for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
            const zz_p *ap = A[i].elts() + jj;

            for (long j = jj; j < j_max; j++)
               panelp[j-jj] = rep(ap[j-jj]);
         }
      }
   }

   Vec<long> bv;
   if (bp) conv(bv, *bp);

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations
   // (replayed below to apply the same row swaps to the other panels)


   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_reduce_struct red_struct = zz_p::red_struct();


   bool pivoting = false;

   // overflow budget: unreduced elimination steps before a double entry
   // could leave the exact-integer range
   long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
   long red_count = red_trigger;

   for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
      long k_max = min(kk+MAT_BLK_SZ, n);

      bool cleanup = false;

      // a whole panel consumes up to MAT_BLK_SZ steps of the budget
      if (red_count-MAT_BLK_SZ < 0) {
         red_count = red_trigger;
         cleanup = true;
      }

      red_count = red_count-MAT_BLK_SZ;
      double *kpanelp = &M[kpanel][0];

      if (cleanup) {
         for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
            kpanelp[r] = rem((unsigned long)(long)kpanelp[r], p, red_struct);
      }

      // eliminate within the current panel, one column at a time
      for (long k = kk; k < k_max; k++) {

         long pos = -1;
         long pivot;
         long pivot_inv;

         for (long i = k; i < n; i++) {
            // NOTE: by using InvModStatus, this code will work
            // for prime-powers as well as primes
            pivot = rem((unsigned long)(long)kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
            if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
               pos = i;
               break;
            }
         }

         if (pos == -1) {
            clear(d);   // singular
            return;
         }

         double *y = &kpanelp[k*MAT_BLK_SZ];
         if (k != pos) {
            // swap rows pos and k
            double *x = &kpanelp[pos*MAT_BLK_SZ];
            for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);

            det = NegateMod(det, p);
            P[k] = pos;
            pivoting = true;

            if (bp) _ntl_swap(bv[pos], bv[k]);
         }

         det = MulMod(det, pivot, p);

         {
            // multiply row k by pivot_inv
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
            for (long j = 0; j < MAT_BLK_SZ; j++) {
               long t2 = rem((unsigned long)(long)y[j], p, red_struct);
               y[j] = MulModPrecon(t2, t1, p, t1pinv);
            }

            // diagonal slot carries pivot_inv (used by the panel-update
            // kernel), overriding the scaled value written above
            y[k-kk] = pivot_inv;

            if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
         }

         // eliminate column k from every other row of the panel
         for (long i = kk; i < n; i++) {
            if (i == k) continue; // skip row k

            double *x = &kpanelp[i*MAT_BLK_SZ];
            long t1 = rem((unsigned long)(long)x[k-kk], p, red_struct);
            t1 = NegateMod(t1, p);
            x[k-kk] = 0;
            if (t1 == 0) continue;

            // add t1 * row k to row i
            double ut1 = t1;
            muladd_interval(x, y, ut1, MAT_BLK_SZ);
            if (bp)
            {
               long t2 = MulMod(bv[k], t1, p);
               bv[i] = AddMod(bv[i], t2, p);
            }
         }
      }


      // finished processing current kpanel
      // next, reduce and apply to all other kpanels

      for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
         kpanelp[r] = rem((unsigned long)(long)kpanelp[r], p, red_struct);

      // special processing: subtract 1 off of diagonal
      // (the update kernel expects the transform-minus-identity form;
      // the 1's are restored after the panel updates below)

      for (long k = kk; k < k_max; k++)
         kpanelp[k*MAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);


      // apply the panel transform to all panels to the right, in
      // parallel when the remaining work is large enough
      bool seq = double(npanels-(kpanel+1))*double(n)*double(MAT_BLK_SZ)*double(MAT_BLK_SZ) < PAR_THRESH;

      NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
      NTL_IMPORT(p)
      NTL_IMPORT(n)
      NTL_IMPORT(red_struct)
      NTL_IMPORT(kpanel)
      NTL_IMPORT(kpanelp)
      NTL_IMPORT(kk)
      NTL_IMPORT(k_max)


      // per-thread scratch block
      AlignedArray<double> buf_store;
      buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
      double *buf = &buf_store[0];

      for (long index = first; index < last; index++) {
         long jpanel = index + kpanel+1;

         double *jpanelp = &M[jpanel][0];

         if (cleanup) {
            for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
               jpanelp[r] = rem((unsigned long)(long)jpanelp[r], p, red_struct);
         }

         // perform swaps
         for (long k = kk; k < k_max; k++) {
            long pos = P[k];
            if (pos != k) {
               // swap rows pos and k
               double *pos_p = &jpanelp[pos*MAT_BLK_SZ];
               double *k_p = &jpanelp[k*MAT_BLK_SZ];
               for (long j = 0; j < MAT_BLK_SZ; j++)
                  _ntl_swap(pos_p[j], k_p[j]);
            }
         }

         // copy block number kpanel (the one on the diagonal)  into buf

         for (long i = 0; i < (k_max-kk)*MAT_BLK_SZ; i++)
            buf[i] = rem((unsigned long)(long)jpanelp[kk*MAT_BLK_SZ+i], p, red_struct);

         // jpanel += kpanel*buf

         muladd_all_by_32(kk, n, jpanelp, kpanelp, buf, k_max-kk);
      }

      NTL_GEXEC_RANGE_END

      // special processing: add 1 back to the diagonal

      for (long k = kk; k < k_max; k++)
         kpanelp[k*MAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);

   }

   // back substitution through the panels: only panels strictly to the
   // right of column i contribute (the diagonal is implicitly 1)
   if (bp) {
      xp->SetLength(n);
      zz_p *X = xp->elts();

      for (long i = n-1; i >= 0; i--) {
         long t1 = 0;
         long start_panel = ((i+1)+MAT_BLK_SZ-1)/MAT_BLK_SZ;
         for (long jj = MAT_BLK_SZ*start_panel, panel = start_panel;
             jj < n; jj += MAT_BLK_SZ, panel++) {
            long j_max = min(jj+MAT_BLK_SZ, n);
            double *row = &M[panel][i*MAT_BLK_SZ];
            for (long j = jj; j < j_max; j++) {
               long t0 = rem((unsigned long)(long)row[j-jj], p, red_struct);
               long t2 = MulMod(rep(X[j]), t0, p);
               t1 = AddMod(t1, t2, p);
            }
         }
         X[i].LoopHole() = SubMod(bv[i], t1, p);
      }
   }

   d.LoopHole() = det;

}
5769 
5770 #endif
5771 
5772 
// Blocked (panel-based) variant of the triangularization routine using
// unsigned long panels (same contract as basic_tri).  Structure mirrors
// blk_tri_DD, but with the unsigned-long lazy-reduction bound of
// alt_tri_L, plain UniqueArray storage instead of AVX-aligned doubles,
// and a transposed copy of the diagonal block (the long-based
// muladd_all_by_32 kernel expects the block transposed).
static
void blk_tri_L(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
               vec_zz_p *xp, bool trans, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("tri: nonsquare matrix");

   if (bp && bp->length() != n)
      LogicError("tri: dimension mismatch");

   if (bp && !xp)
      LogicError("tri: bad args");

   if (n == 0) {
      set(d);
      if (xp) xp->SetLength(0);
      return;
   }

   if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");

   long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;

   // allocate the panels, zero-filled (well-defined padding columns in
   // a partial last panel)
   Vec< UniqueArray<unsigned long> > M;
   M.SetLength(npanels);
   for (long panel = 0; panel < npanels; panel++) {
      M[panel].SetLength(n*MAT_BLK_SZ);
      unsigned long *panelp = &M[panel][0];

      for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
   }

   if (trans) {
      // copy A transposed into panels
      for (long i = 0; i < n; i++) {
         const zz_p *row = &A[i][0];
         unsigned long *col = &M[i/MAT_BLK_SZ][i%MAT_BLK_SZ];
         for (long j = 0; j < n; j++)
            col[j*MAT_BLK_SZ] = rep(row[j]);
      }
   }
   else {
      // copy A into panels
      for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
         long j_max = min(jj+MAT_BLK_SZ, n);
         unsigned long *panelp = &M[panel][0];

         for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
            const zz_p *ap = A[i].elts() + jj;

            for (long j = jj; j < j_max; j++)
               panelp[j-jj] = rep(ap[j-jj]);
         }
      }
   }

   Vec<long> bv;
   if (bp) conv(bv, *bp);

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations
   // (replayed below to apply the same row swaps to the other panels)


   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_reduce_struct red_struct = zz_p::red_struct();


   bool pivoting = false;

   // overflow budget for lazy reduction (see alt_tri_L)
   unsigned long ured_trigger =
      (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
   // NOTE: corner case at p == 2: need unsigned long to prevent overflow

   long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);

   long red_count = red_trigger;

   for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
      long k_max = min(kk+MAT_BLK_SZ, n);

      bool cleanup = false;

      // a whole panel consumes up to MAT_BLK_SZ steps of the budget
      if (red_count-MAT_BLK_SZ < 0) {
         red_count = red_trigger;
         cleanup = true;
      }

      red_count = red_count-MAT_BLK_SZ;
      unsigned long *kpanelp = &M[kpanel][0];

      if (cleanup) {
         for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
            kpanelp[r] = rem(kpanelp[r], p, red_struct);
      }

      // eliminate within the current panel, one column at a time
      for (long k = kk; k < k_max; k++) {

         long pos = -1;
         long pivot;
         long pivot_inv;

         for (long i = k; i < n; i++) {
            // NOTE: by using InvModStatus, this code will work
            // for prime-powers as well as primes
            pivot = rem(kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
            if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
               pos = i;
               break;
            }
         }

         if (pos == -1) {
            clear(d);   // singular
            return;
         }

         unsigned long *y = &kpanelp[k*MAT_BLK_SZ];
         if (k != pos) {
            // swap rows pos and k
            unsigned long *x = &kpanelp[pos*MAT_BLK_SZ];
            for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);

            det = NegateMod(det, p);
            P[k] = pos;
            pivoting = true;

            if (bp) _ntl_swap(bv[pos], bv[k]);
         }

         det = MulMod(det, pivot, p);

         {
            // multiply row k by pivot_inv
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
            for (long j = 0; j < MAT_BLK_SZ; j++) {
               long t2 = rem(y[j], p, red_struct);
               y[j] = MulModPrecon(t2, t1, p, t1pinv);
            }

            // diagonal slot carries pivot_inv (used by the panel-update
            // kernel), overriding the scaled value written above
            y[k-kk] = pivot_inv;

            if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
         }

         // eliminate column k from every other row of the panel
         for (long i = kk; i < n; i++) {
            if (i == k) continue; // skip row k

            unsigned long *x = &kpanelp[i*MAT_BLK_SZ];
            long t1 = rem(x[k-kk], p, red_struct);
            t1 = NegateMod(t1, p);
            x[k-kk] = 0;
            if (t1 == 0) continue;

            // add t1 * row k to row i
            unsigned long ut1 = t1;
            muladd_interval(x, y, ut1, MAT_BLK_SZ);
            if (bp)
            {
               long t2 = MulMod(bv[k], t1, p);
               bv[i] = AddMod(bv[i], t2, p);
            }
         }
      }


      // finished processing current kpanel
      // next, reduce and apply to all other kpanels

      for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
         kpanelp[r] = rem(kpanelp[r], p, red_struct);

      // special processing: subtract 1 off of diagonal
      // (the update kernel expects the transform-minus-identity form;
      // the 1's are restored after the panel updates below)

      for (long k = kk; k < k_max; k++)
         kpanelp[k*MAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);


      // apply the panel transform to all panels to the right, in
      // parallel when the remaining work is large enough
      bool seq = double(npanels-(kpanel+1))*double(n)*double(MAT_BLK_SZ)*double(MAT_BLK_SZ) < PAR_THRESH;
      NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
      NTL_IMPORT(p)
      NTL_IMPORT(n)
      NTL_IMPORT(red_struct)
      NTL_IMPORT(kpanel)
      NTL_IMPORT(kpanelp)
      NTL_IMPORT(kk)
      NTL_IMPORT(k_max)


      // per-thread scratch block
      UniqueArray<unsigned long> buf_store;
      buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
      unsigned long *buf = &buf_store[0];

      for (long index = first; index < last; index++) {
         long jpanel = index + kpanel+1;

         unsigned long *jpanelp = &M[jpanel][0];

         if (cleanup) {
            for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
               jpanelp[r] = rem(jpanelp[r], p, red_struct);
         }

         // perform swaps
         for (long k = kk; k < k_max; k++) {
            long pos = P[k];
            if (pos != k) {
               // swap rows pos and k
               unsigned long *pos_p = &jpanelp[pos*MAT_BLK_SZ];
               unsigned long *k_p = &jpanelp[k*MAT_BLK_SZ];
               for (long j = 0; j < MAT_BLK_SZ; j++)
                  _ntl_swap(pos_p[j], k_p[j]);
            }
         }

         // copy block number kpanel (the one on the diagonal)  into buf
         // here, we transpose it

         for (long k = kk; k < k_max; k++)
            for (long j = 0; j < MAT_BLK_SZ; j++)
               buf[j*MAT_BLK_SZ + (k-kk)] =
                  rem(jpanelp[k*MAT_BLK_SZ+j], p, red_struct);

         // jpanel += kpanel*buf

         muladd_all_by_32(kk, n, jpanelp, kpanelp, buf, k_max-kk);
      }

      NTL_GEXEC_RANGE_END

      // special processing: add 1 back to the diagonal

      for (long k = kk; k < k_max; k++)
         kpanelp[k*MAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);

   }

   // back substitution through the panels: only panels strictly to the
   // right of column i contribute (the diagonal is implicitly 1)
   if (bp) {
      xp->SetLength(n);
      zz_p *X = xp->elts();

      for (long i = n-1; i >= 0; i--) {
         long t1 = 0;
         long start_panel = ((i+1)+MAT_BLK_SZ-1)/MAT_BLK_SZ;
         for (long jj = MAT_BLK_SZ*start_panel, panel = start_panel;
             jj < n; jj += MAT_BLK_SZ, panel++) {
            long j_max = min(jj+MAT_BLK_SZ, n);
            unsigned long *row = &M[panel][i*MAT_BLK_SZ];
            for (long j = jj; j < j_max; j++) {
               long t0 = rem(row[j-jj], p, red_struct);
               long t2 = MulMod(rep(X[j]), t0, p);
               t1 = AddMod(t1, t2, p);
            }
         }
         X[i].LoopHole() = SubMod(bv[i], t1, p);
      }
   }

   d.LoopHole() = det;

}
6042 
6043 
// blk_tri_LL: blocked triangularization of A over zz_p using plain
// long arithmetic with long-long-based modular reduction (ll_red_struct).
// This is the fallback used by tri() when neither the AVX/double (DD)
// nor the unsigned-long (L) accumulation bounds hold.
//
// On output d gets the determinant of A (set to 0 and returns early if
// no usable pivot is found).  If bp != 0, *bp supplies a right-hand
// side and the solution is written to *xp via back substitution at the
// end.  If trans is true, A is loaded transposed.  relax is forwarded
// to relaxed_InvModStatus, which also accepts prime-power moduli.
//
// Storage layout: the matrix is held as npanels vertical panels, each
// n x MAT_BLK_SZ longs, stored row-major in M[panel].
static
void blk_tri_LL(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
               vec_zz_p *xp, bool trans, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("tri: nonsquare matrix");

   if (bp && bp->length() != n)
      LogicError("tri: dimension mismatch");

   if (bp && !xp)
      LogicError("tri: bad args");

   // empty matrix: determinant of a 0x0 matrix is 1
   if (n == 0) {
      set(d);
      if (xp) xp->SetLength(0);
      return;
   }

   if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");

   long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;

   // allocate and zero-fill the panels
   Vec< UniqueArray<long> > M;
   M.SetLength(npanels);
   for (long panel = 0; panel < npanels; panel++) {
      M[panel].SetLength(n*MAT_BLK_SZ);
      long *panelp = &M[panel][0];

      for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
   }

   if (trans) {
      // copy A transposed into panels
      for (long i = 0; i < n; i++) {
         const zz_p *row = &A[i][0];
         long *col = &M[i/MAT_BLK_SZ][i%MAT_BLK_SZ];
         for (long j = 0; j < n; j++)
            col[j*MAT_BLK_SZ] = rep(row[j]);
      }
   }
   else {
      // copy A into panels
      for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
         long j_max = min(jj+MAT_BLK_SZ, n);
         long *panelp = &M[panel][0];

         for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
            const zz_p *ap = A[i].elts() + jj;

            for (long j = jj; j < j_max; j++)
               panelp[j-jj] = rep(ap[j-jj]);
         }
      }
   }

   // bv is a working copy of the right-hand side, as raw longs
   Vec<long> bv;
   if (bp) conv(bv, *bp);

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations


   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();


   bool pivoting = false;

   // main loop: process one panel (MAT_BLK_SZ columns) at a time
   for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
      long k_max = min(kk+MAT_BLK_SZ, n);

      long *kpanelp = &M[kpanel][0];

      // eliminate within the current panel, one column at a time
      for (long k = kk; k < k_max; k++) {

         // pivot search: first row >= k whose entry in column k is
         // nonzero and invertible mod p
         long pos = -1;
         long pivot;
         long pivot_inv;

         for (long i = k; i < n; i++) {
            // NOTE: by using InvModStatus, this code will work
            // for prime-powers as well as primes
            pivot = kpanelp[i*MAT_BLK_SZ+(k-kk)];
            if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
               pos = i;
               break;
            }
         }

         // no usable pivot => singular: report determinant 0
         if (pos == -1) {
            clear(d);
            return;
         }

         long *y = &kpanelp[k*MAT_BLK_SZ];
         if (k != pos) {
            // swap rows pos and k
            long *x = &kpanelp[pos*MAT_BLK_SZ];
            for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);

            // a row swap flips the sign of the determinant
            det = NegateMod(det, p);
            P[k] = pos;
            pivoting = true;

            if (bp) _ntl_swap(bv[pos], bv[k]);
         }

         det = MulMod(det, pivot, p);

         {
            // multiply row k by pivot_inv
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
            for (long j = 0; j < MAT_BLK_SZ; j++) {
               y[j] = MulModPrecon(y[j], t1, p, t1pinv);
            }

            // the diagonal slot stores pivot_inv (not 1), which the
            // later panel updates rely on
            y[k-kk] = pivot_inv;

            if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
         }

         // clear column k in all rows of this panel (except row k)
         for (long i = kk; i < n; i++) {
            if (i == k) continue; // skip row k

            long *x = &kpanelp[i*MAT_BLK_SZ];
            long t1 = x[k-kk];
            t1 = NegateMod(t1, p);
            x[k-kk] = 0;
            if (t1 == 0) continue;

            // add t1 * row k to row i
            long ut1 = t1;
            muladd_interval(x, y, ut1, MAT_BLK_SZ, p, pinv);
            if (bp)
            {
               long t2 = MulMod(bv[k], t1, p);
               bv[i] = AddMod(bv[i], t2, p);
            }
         }
      }


      // finished processing current kpanel
      // next, reduce and apply to all other kpanels

      // special processing: subtract 1 off of diangonal
      // (temporary adjustment so the block muladd below applies the
      // panel's transformation correctly; undone after the update)

      for (long k = kk; k < k_max; k++)
         kpanelp[k*MAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);


      // update the remaining panels [kpanel+1..npanels), in parallel
      // when the amount of work exceeds PAR_THRESH
      bool seq = double(npanels-(kpanel+1))*double(n)*double(MAT_BLK_SZ)*double(MAT_BLK_SZ) < PAR_THRESH;
      NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
      NTL_IMPORT(p)
      NTL_IMPORT(n)
      NTL_IMPORT(ll_red_struct)
      NTL_IMPORT(kpanel)
      NTL_IMPORT(kpanelp)
      NTL_IMPORT(kk)
      NTL_IMPORT(k_max)


      // per-thread scratch block
      UniqueArray<long> buf_store;
      buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
      long *buf = &buf_store[0];

      for (long index = first; index < last; index++) {
         long jpanel = index + kpanel+1;

         long *jpanelp = &M[jpanel][0];

         // perform swaps
         for (long k = kk; k < k_max; k++) {
            long pos = P[k];
            if (pos != k) {
               // swap rows pos and k
               long *pos_p = &jpanelp[pos*MAT_BLK_SZ];
               long *k_p = &jpanelp[k*MAT_BLK_SZ];
               for (long j = 0; j < MAT_BLK_SZ; j++)
                  _ntl_swap(pos_p[j], k_p[j]);
            }
         }

         // copy block number kpanel (the one on the diagonal)  into buf
         // here, we transpose it

         for (long k = kk; k < k_max; k++)
            for (long j = 0; j < MAT_BLK_SZ; j++)
               buf[j*MAT_BLK_SZ + (k-kk)] = jpanelp[k*MAT_BLK_SZ+j];

         // jpanel += kpanel*buf

         muladd_all_by_32(kk, n, jpanelp, kpanelp, buf, k_max-kk, p, ll_red_struct);
      }

      NTL_GEXEC_RANGE_END

      // special processing: add 1 back to the diangonal

      for (long k = kk; k < k_max; k++)
         kpanelp[k*MAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);

   }

   // back substitution: solve for X from the transformed system,
   // working from the last row upward
   if (bp) {
      xp->SetLength(n);
      zz_p *X = xp->elts();

      for (long i = n-1; i >= 0; i--) {
         long t1 = 0;
         // first panel that contains columns strictly to the right of i
         long start_panel = ((i+1)+MAT_BLK_SZ-1)/MAT_BLK_SZ;
         for (long jj = MAT_BLK_SZ*start_panel, panel = start_panel;
             jj < n; jj += MAT_BLK_SZ, panel++) {
            long j_max = min(jj+MAT_BLK_SZ, n);
            long *row = &M[panel][i*MAT_BLK_SZ];
            for (long j = jj; j < j_max; j++) {
               long t0 = row[j-jj];
               long t2 = MulMod(rep(X[j]), t0, p);
               t1 = AddMod(t1, t2, p);
            }
         }
         X[i].LoopHole() = SubMod(bv[i], t1, p);
      }
   }

   d.LoopHole() = det;

}
6282 
6283 
6284 
6285 #endif
6286 
6287 
6288 
6289 static
tri(zz_p & d,const mat_zz_p & A,const vec_zz_p * bp,vec_zz_p * xp,bool trans,bool relax)6290 void tri(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
6291                vec_zz_p *xp, bool trans, bool relax)
6292 {
6293    long n = A.NumRows();
6294 
6295    if (A.NumCols() != n)
6296       LogicError("inv: nonsquare matrix");
6297 
6298    if (bp && bp->length() != n)
6299       LogicError("tri: dimension mismatch");
6300 
6301    if (bp && !xp)
6302       LogicError("tri: bad args");
6303 
6304 #ifndef NTL_HAVE_LL_TYPE
6305 
6306    basic_tri(d, A, bp, xp, trans, relax);
6307 
6308 #else
6309 
6310    long p = zz_p::modulus();
6311 
6312    if (n < 16) {
6313       //cerr << "basic_tri\n";
6314       basic_tri(d, A, bp, xp, trans, relax);
6315    }
6316    else if (n/MAT_BLK_SZ < 4) {
6317       long V = 64;
6318 
6319 #ifdef NTL_HAVE_AVX
6320       if (p-1 <= MAX_DBL_INT &&
6321           V <= (MAX_DBL_INT-(p-1))/(p-1) &&
6322           V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
6323 
6324          //cerr << "alt_tri_DD\n";
6325          alt_tri_DD(d, A, bp, xp, trans, relax);
6326       }
6327       else
6328 #endif
6329            if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
6330                cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1))  {
6331 
6332          //cerr << "alt_tri_L\n";
6333          alt_tri_L(d, A, bp, xp, trans, relax);
6334 
6335       }
6336       else {
6337 
6338          //cerr << "basic_tri\n";
6339          basic_tri(d, A, bp, xp, trans, relax);
6340       }
6341    }
6342    else {
6343       long V = 4*MAT_BLK_SZ;
6344 
6345 #ifdef NTL_HAVE_AVX
6346       if (p-1 <= MAX_DBL_INT &&
6347           V <= (MAX_DBL_INT-(p-1))/(p-1) &&
6348           V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
6349 
6350          //cerr << "blk_tri_DD\n";
6351          blk_tri_DD(d, A, bp, xp, trans, relax);
6352       }
6353       else
6354 #endif
6355            if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
6356                cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1))  {
6357 
6358          //cerr << "blk_tri_L\n";
6359          blk_tri_L(d, A, bp, xp, trans, relax);
6360 
6361       }
6362       else {
6363 
6364          //cerr << "blk_tri_LL\n";
6365          blk_tri_LL(d, A, bp, xp, trans, relax);
6366       }
6367 
6368    }
6369 
6370 #endif
6371 
6372 
6373 
6374 }
6375 
6376 
6377 
relaxed_determinant(zz_p & d,const mat_zz_p & A,bool relax)6378 void relaxed_determinant(zz_p& d, const mat_zz_p& A, bool relax)
6379 {
6380    tri(d, A, 0, 0, false, relax);
6381 }
6382 
6383 
relaxed_solve(zz_p & d,vec_zz_p & x,const mat_zz_p & A,const vec_zz_p & b,bool relax)6384 void relaxed_solve(zz_p& d, vec_zz_p& x,
6385            const mat_zz_p& A, const vec_zz_p& b, bool relax)
6386 {
6387    tri(d, A, &b, &x, true, relax);
6388 }
6389 
relaxed_solve(zz_p & d,const mat_zz_p & A,vec_zz_p & x,const vec_zz_p & b,bool relax)6390 void relaxed_solve(zz_p& d, const mat_zz_p& A, vec_zz_p& x, const vec_zz_p& b, bool relax)
6391 {
6392    tri(d, A, &b, &x, false, relax);
6393 }
6394 
6395 // ******************************************************************
6396 //
6397 // new image and kernel routines
6398 //
6399 // ******************************************************************
6400 
6401 
// elim_basic: straightforward (non-blocked) Gaussian elimination on the
// first w columns of A, working on a Mat<long> copy.  Returns the rank r
// of that column range.  If im != 0, *im receives the image: the first r
// rows of the reduced matrix (all n rows when full == true).  If
// ker != 0, *ker receives an (n-r) x n kernel basis, reconstructed by
// back substitution over the recorded pivot columns and row swaps.
static
long elim_basic(const mat_zz_p& A, mat_zz_p *im, mat_zz_p *ker,
                long w, bool full)
{
   long n = A.NumRows();
   long m = A.NumCols();

   if (w < 0 || w > m) LogicError("elim: bad args");

   // take care of corner cases
   if (n == 0) {
      if (im) im->SetDims(0, m);
      if (ker) ker->SetDims(0, 0);
      return 0;
   }

   if (w == 0) {
      if (im) {
         if (full)
            (*im) = A;
         else
            im->SetDims(0, m);
      }
      if (ker) ident(*ker, n);
      return 0;
   }

   // work on a raw long copy of A
   Mat<long> M;
   conv(M, A);

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations

   Vec<long> pcol;
   pcol.SetLength(n);
   // pcol[i] records pivot columns for row i

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();

   bool pivoting = false;

   long r = 0;   // rank found so far == next pivot row

   for (long k = 0; k < w; k++) {
      // find a pivot in column k, rows [r..n)
      long pos = -1;
      long pivot_inv;
      for (long i = r; i < n; i++) {
         long pivot = M[i][k];
         if (pivot != 0) {
            pivot_inv = InvMod(pivot, p);
            pos = i;
            break;
         }
      }

      // column k contributes nothing to the rank
      if (pos == -1)
         continue;

      if (r != pos) {
         swap(M[pos], M[r]);
         P[r] = pos;
         pivoting = true;
      }

      // eliminate below row r, in parallel when there is enough work
      bool seq = double(n-r)*double(m-k) < PAR_THRESH;

      NTL_GEXEC_RANGE(seq, n-(r+1), first, last)
      NTL_IMPORT(p)
      NTL_IMPORT(n)
      NTL_IMPORT(k)
      NTL_IMPORT(r)
      long *y = &M[r][0];

      for (long ii = first; ii < last; ii++) {
         long i = ii + r+1;

         long *x = &M[i][0];
         long t1 = x[k];
         t1 = MulMod(t1, pivot_inv, p);
         t1 = NegateMod(t1, p);
         // x[k] keeps the (negated, scaled) multiplier: this is exactly
         // what the kernel back-solve below reads from the pivot column
         x[k] = t1;
         if (t1 == 0) continue;

         // add t1 * row r to row i
         mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);

         for (long j = k+1; j < m; j++) {
            long t2 = MulModPrecon(y[j], t1, p, t1pinv);
            x[j] = AddMod(x[j], t2, p);
         }
      }
      NTL_GEXEC_RANGE_END

      pcol[r] = k;
      r++;
   }

   // extract the image rows
   if (im) {
      mat_zz_p& Im = *im;;
      if (full)
         Im.SetDims(n, m);
      else
         Im.SetDims(r, m);

      for (long i = 0; i < r; i++) {
         long pc = pcol[i];
         // everything left of the pivot column is zero by construction
         for (long j = 0; j < pc; j++) Im[i][j].LoopHole() = 0;
         for (long j = pc; j < m; j++) Im[i][j].LoopHole() = M[i][j];
      }

      if (full) {
         for (long i = r; i < n; i++) {
            for (long j = 0; j < w; j++) Im[i][j].LoopHole() = 0;
            for (long j = w; j < m; j++) Im[i][j].LoopHole() = M[i][j];
         }
      }
   }

   if (ker) {

      if (n == r) {
         // full rank: kernel is trivial (0 x n)
         mat_zz_p& Ker = *ker;
         Ker.SetDims(n-r, n);
      }
      else {
         // colbuf[k] gathers the pivot column pcol[k], rows [k+1..n):
         // the stored multipliers needed by the back-solve
         Mat<long> colbuf;
         colbuf.SetDims(r, n);

         for (long k = 0; k < r; k++) {
            long pc = pcol[k];
            for (long i = k+1; i < n; i++) colbuf[k][i] = M[i][pc];
         }

         M.kill();

         // X[i][k]: coefficient of kernel vector i on pivot row k,
         // computed by back substitution from k = r-1 down to 0
         Mat<long> X;
         X.SetDims(n-r, r);

         bool seq = double(n-r)*double(r)*double(r)/2 < PAR_THRESH;
         NTL_GEXEC_RANGE(seq, n-r, first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(r)

         for (long i = first; i < last; i++) {
            long *Xi = &X[i][0];

            for (long k = r-1; k >= 0; k--) {
               long *cvecp = &colbuf[k][0];

               long acc = cvecp[i+r];
               for (long j = k+1; j < r; j++) {
                  acc = AddMod( acc,  MulMod(Xi[j], cvecp[j], p), p );
               }
               Xi[k] = acc;
            }

         }

         NTL_GEXEC_RANGE_END

         // assemble kernel rows: [ X | I ] in the permuted basis
         mat_zz_p& Ker = *ker;
         Ker.SetDims(n-r, n);
         for (long i = 0; i < n-r; i++) {
            for (long j = 0; j < r; j++) Ker[i][j].LoopHole() = X[i][j];
            for (long j = r; j < n; j++) Ker[i][j].LoopHole() = 0;
            Ker[i][r+i].LoopHole() = 1;
         }

         // undo the row permutation (applied in reverse order)
         if (pivoting) {
            for (long i = 0; i < n-r; i++) {
               zz_p *x = Ker[i].elts();

               for (long k = n-1; k >= 0; k--) {
                  long pos = P[k];
                  if (pos != k) swap(x[pos], x[k]);
               }
            }
         }
      }
   }

   return r;
}
6588 
6589 #ifdef NTL_HAVE_LL_TYPE
6590 
6591 
6592 #ifdef NTL_HAVE_AVX
6593 
6594 
6595 static inline
CopyBlock(double * dst_ptr,long dst_blk,const double * src_ptr,long src_blk,long src_limit)6596 void CopyBlock(double *dst_ptr, long dst_blk, const double *src_ptr, long src_blk, long src_limit)
6597 {
6598    long src_row = src_blk*MAT_BLK_SZ;
6599    long dst_row = dst_blk*MAT_BLK_SZ;
6600 
6601    long nrows = min(MAT_BLK_SZ, src_limit - src_row);
6602 
6603    for (long i = 0; i < nrows; i++)
6604       for (long j = 0; j < MAT_BLK_SZ; j++)
6605          dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
6606 
6607    for (long i = nrows; i < MAT_BLK_SZ; i++)
6608       for (long j = 0; j < MAT_BLK_SZ; j++)
6609          dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = 0;
6610 
6611 }
6612 
6613 static inline
CopyBlock(double * dst_ptr,long dst_blk,const double * src_ptr,long src_blk)6614 void CopyBlock(double *dst_ptr, long dst_blk, const double *src_ptr, long src_blk)
6615 {
6616    long src_row = src_blk*MAT_BLK_SZ;
6617    long dst_row = dst_blk*MAT_BLK_SZ;
6618 
6619    long nrows = MAT_BLK_SZ;
6620 
6621    for (long i = 0; i < nrows; i++)
6622       for (long j = 0; j < MAT_BLK_SZ; j++)
6623          dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
6624 }
6625 
6626 static inline
SwapOneRow(double * panelp,long i,long pos)6627 void SwapOneRow(double *panelp, long i, long pos)
6628 {
6629    double *pos_p = &panelp[pos*MAT_BLK_SZ];
6630    double *i_p = &panelp[i*MAT_BLK_SZ];
6631    for (long j = 0; j < MAT_BLK_SZ; j++)
6632       _ntl_swap(pos_p[j], i_p[j]);
6633 }
6634 
6635 static inline
ApplySwaps(double * panelp,long start,long end,const Vec<long> & P)6636 void ApplySwaps(double *panelp, long start, long end, const Vec<long>& P)
6637 {
6638    for (long i = start; i < end; i++) {
6639       long pos = P[i];
6640       if (pos != i)
6641          SwapOneRow(panelp, i, pos);
6642    }
6643 }
6644 
6645 
// MulAddBlock: accumulate the product of two MAT_BLK_SZ x MAT_BLK_SZ
// blocks into a third, i.e. x += y*z, by delegating to the kernel
// muladd_all_by_32 over all MAT_BLK_SZ rows.
static inline
void MulAddBlock(double *x, const double *y, const double *z)
{
   // x += y*z
   muladd_all_by_32(0, MAT_BLK_SZ, x, y, z, MAT_BLK_SZ);
}
6652 
6653 
// elim_blk_DD: blocked Gaussian elimination on the first w columns of A,
// using double-precision (AVX) panel arithmetic with delayed modular
// reduction (red_count / red_trigger track how many unreduced
// accumulations are still safe before values could exceed MAX_DBL_INT).
// Returns the rank r.  If im != 0, *im gets the image rows (all n rows
// when full == true); if ker != 0, *ker gets an (n-r) x n kernel basis
// reconstructed blockwise at the end.
//
// Storage: the matrix is kept as npanels vertical panels of
// n x MAT_BLK_SZ doubles (M[panel]); aux_panel accumulates the
// elimination multipliers for the pivot rows found in the current
// sweep, and is later applied to all remaining panels.
static
long elim_blk_DD(const mat_zz_p& A, mat_zz_p *im, mat_zz_p *ker,
                 long w, bool full)
{
   long n = A.NumRows();
   long m = A.NumCols();

   if (w < 0 || w > m) LogicError("elim: bad args");

   // take care of corner cases
   if (n == 0) {
      if (im) im->SetDims(0, m);
      if (ker) ker->SetDims(0, 0);
      return 0;
   }

   if (w == 0) {
      if (im) {
         if (full)
            (*im) = A;
         else
            im->SetDims(0, m);
      }
      if (ker) ident(*ker, n);
      return 0;
   }

   if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
   if (NTL_OVERFLOW(m, MAT_BLK_SZ, 0)) ResourceError("dimension too large");

   long npanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;


   // allocate and zero-fill the panels (AVX-aligned)
   Vec< AlignedArray<double> > M;
   M.SetLength(npanels);
   for (long panel = 0; panel < npanels; panel++) {
      M[panel].SetLength(n*MAT_BLK_SZ);
      double *panelp = &M[panel][0];

      for (long h = 0; h < n*MAT_BLK_SZ; h++) panelp[h] = 0;
   }

   // copy A into panels
   for (long jj = 0, panel = 0; jj < m; jj += MAT_BLK_SZ, panel++) {
      long j_max = min(jj+MAT_BLK_SZ, m);
      double *panelp = &M[panel][0];

      for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
         const zz_p *ap = A[i].elts() + jj;

         for (long j = jj; j < j_max; j++)
            panelp[j-jj] = rep(ap[j-jj]);
      }
   }

   // scratch panel collecting the multipliers of the current sweep
   AlignedArray<double> aux_panel_store;
   aux_panel_store.SetLength(n*MAT_BLK_SZ);
   double *aux_panel = &aux_panel_store[0];


   AlignedArray<double> buf_store1;
   buf_store1.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
   double *buf1 = &buf_store1[0];

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations

   Vec<long> pcol;
   pcol.SetLength(n);
   // pcol[i] records pivot columns for row i

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_reduce_struct red_struct = zz_p::red_struct();

   bool pivoting = false;

   // delayed-reduction budget: how many MAT_BLK_SZ-deep accumulations
   // fit below MAX_DBL_INT before entries must be reduced mod p
   long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
   long red_count = red_trigger;

   // r  = rank found so far; rr = rank at start of current sweep
   // k  = current column;    kk = first column of current kpanel
   long r = 0, rr = 0, k = 0, kk = 0;
   long rpanel = 0, kpanel = 0;

   while (k < w) {

      if (r > rr && ker) {
         // we have a panel from a previous iteration
         // we store enough of it to facilitate the kernel
         // computation later. At this point, we have
         // r == rr+INV_BLK_SIZE, and it suffices to store
         // rows [r..n) into M[rpanel], and this will not
         // overwrite anything useful in M[rpanel]

         double *panelp = &M[rpanel][0];
         for (long h = r*MAT_BLK_SZ; h < n*MAT_BLK_SZ; h++) {
            panelp[h] = aux_panel[h];
         }

         rpanel++;
      }

      rr = r;

      // reset the multiplier panel for the new sweep
      for (long h = 0; h < n*MAT_BLK_SZ; h++) aux_panel[h] = 0;

      bool cleanup = false;

      if (red_count-MAT_BLK_SZ < 0) {
         // budget exhausted: force a reduction pass on panels this sweep
         red_count = red_trigger;
         cleanup = true;
      }

      red_count = red_count-MAT_BLK_SZ;

      for (; r < rr+MAT_BLK_SZ && k < w; k++) { // panel incomplete

         if (k == kk+MAT_BLK_SZ) { // start new kpanel
            kk = k;
            kpanel++;
         }

         double *kpanelp = &M[kpanel][0];

         if (k == kk) { // a fresh kpanel -- special processing

            if (cleanup) {
               for (long h = 0; h < n*MAT_BLK_SZ; h++)
                  kpanelp[h] = rem((unsigned long)(long)kpanelp[h], p, red_struct);
            }

            if (r > rr) {


               // apply current sequence of permutations

               ApplySwaps(kpanelp, rr, r, P);

               // clean aux_panel
               for (long h = 0; h < n*MAT_BLK_SZ; h++)
                  aux_panel[h] = rem((unsigned long)(long)aux_panel[h], p, red_struct);

               // copy rows [rr..r) of kpanel into buf1
               for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
                  buf1[i] = rem((unsigned long)(long)kpanelp[rr*MAT_BLK_SZ+i], p, red_struct);

               // kpanel[rr..n) += aux_panel[rr..n)*buf1

               muladd_all_by_32(rr, n, kpanelp, aux_panel, buf1, r-rr);
            }
         }

         // pivot search in column k, rows [r..n); entries are reduced
         // on the fly and written back
         long pos = -1;
         long pivot;
         long pivot_inv;
         for (long i = r; i < n; i++) {
            pivot = rem((unsigned long)(long)kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
            kpanelp[i*MAT_BLK_SZ+(k-kk)] = pivot;

            if (pivot != 0) {
               pivot_inv = InvMod(pivot, p);
               pos = i;
               break;
            }
         }

         // no pivot: column contributes nothing to the rank
         if (pos == -1) {
            continue;
         }

         double *y = &kpanelp[r*MAT_BLK_SZ];
         double *y1 = &aux_panel[r*MAT_BLK_SZ];
         if (r != pos) {
            // swap rows pos and r
            double *x = &kpanelp[pos*MAT_BLK_SZ];
            double *x1 = &aux_panel[pos*MAT_BLK_SZ];

            for (long j = k-kk; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
            for (long j = 0; j < r-rr; j++) _ntl_swap(x1[j], y1[j]);

            P[r] = pos;
            pivoting = true;
         }

         // clean up row r of kpanel and aux_panel
         for (long j = k-kk; j < MAT_BLK_SZ; j++)
            y[j] = rem((unsigned long)(long)y[j], p, red_struct);
         for (long j = 0; j < r-rr; j++)
            y1[j] = rem((unsigned long)(long)y1[j], p, red_struct);

         // clear column
         for (long i = r+1; i < n; i++) {
            double *x = &kpanelp[i*MAT_BLK_SZ];
            double *x1 = &aux_panel[i*MAT_BLK_SZ];
            long t1 = rem((unsigned long)(long)x[k-kk], p, red_struct);
            t1 = MulMod(t1, pivot_inv, p);
            t1 = NegateMod(t1, p);
            x[k-kk] = 0;
            // record the multiplier in aux_panel for the later
            // panel-wide update
            x1[r-rr] = t1;
            if (t1 == 0) continue;

            // add t1 * row r to row i
            double ut1 = t1;

            for (long j = k-kk+1; j < MAT_BLK_SZ; j++)
               x[j] += y[j]*ut1;
            for (long j = 0; j < r-rr; j++)
               x1[j] += y1[j]*ut1;
         }

         pcol[r] = k;
         r++;
      }

      if (r > rr) {

         // we have a panel

         // clean it up
         for (long h = 0; h < n*MAT_BLK_SZ; h++)
            aux_panel[h] = rem((unsigned long)(long)aux_panel[h], p, red_struct);

         bool seq =
            double(npanels-(kpanel+1))*double(n-rr)*double(r-rr)*double(MAT_BLK_SZ) < PAR_THRESH;

         // apply aux_panel to remaining panels: [kpanel+1..npanels)
         NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(n)
         NTL_IMPORT(red_struct)
         NTL_IMPORT(aux_panel)
         NTL_IMPORT(rr)
         NTL_IMPORT(r)


         // per-thread scratch block
         AlignedArray<double> buf_store;
         buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
         double *buf = &buf_store[0];


         for (long index = first; index < last; index++) {
            long jpanel = index + kpanel+1;

            double *jpanelp = &M[jpanel][0];

            if (cleanup) {
               for (long h = 0; h < n*MAT_BLK_SZ; h++)
                  jpanelp[h] = rem((unsigned long)(long)jpanelp[h], p, red_struct);
            }

            // perform swaps
            ApplySwaps(jpanelp, rr, r, P);

            // copy rows [rr..r) of jpanel into buf
            for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
               buf[i] = rem((unsigned long)(long)jpanelp[rr*MAT_BLK_SZ+i], p, red_struct);

            // jpanel[rr..n) += aux_panel[rr..n)*buf

            muladd_all_by_32(rr, n, jpanelp, aux_panel, buf, r-rr);
         }

         NTL_GEXEC_RANGE_END

      }

   }

   // extract image rows (reducing the stored doubles mod p)
   if (im) {
      mat_zz_p& Im = *im;;
      if (full)
         Im.SetDims(n, m);
      else
         Im.SetDims(r, m);

      for (long i = 0; i < r; i++) {
         long pc = pcol[i];
         for (long j = 0; j < pc; j++) Im[i][j].LoopHole() = 0;
         for (long j = pc; j < m; j++) {
            double t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
            Im[i][j].LoopHole() = rem((unsigned long)(long)t0, p, red_struct);
         }
      }

      if (full) {
         for (long i = r; i < n; i++) {
            for (long j = 0; j < w; j++) Im[i][j].LoopHole() = 0;
            for (long j = w; j < m; j++) {
               double t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
               Im[i][j].LoopHole() = rem((unsigned long)(long)t0, p, red_struct);
            }
         }
      }
   }

   if (ker) {
      if (r == 0) {
         ident(*ker, n);
         return 0;
      }

      mat_zz_p& Ker = *ker;
      Ker.SetDims(n-r, n);
      if (r < n) {

         // blockwise back-solve for the kernel coefficients
         long start_block = r/MAT_BLK_SZ;
         long end_block = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
         long vblocks = end_block-start_block;
         long hblocks = (r+MAT_BLK_SZ-1)/MAT_BLK_SZ;

         Vec< AlignedArray<double> > kerbuf;
         kerbuf.SetLength(vblocks);
         for (long i = 0; i < vblocks; i++)
            kerbuf[i].SetLength(hblocks*MAT_BLK_SZ*MAT_BLK_SZ);

         long colblocks = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;

         // if r > rr, we have a panel sitting in
         // aux_panel, which may or may not be a full panel

         double *initial_panel = 0;
         if (r > rr) {
            initial_panel = aux_panel;
         }
         else {
            initial_panel = &M[hblocks-1][0];
         }

         for (long vb = start_block; vb < end_block; vb++)
            CopyBlock(&kerbuf[vb-start_block][0], hblocks-1, initial_panel, vb, n);

         // shift each earlier multiplier panel into position, applying
         // any row swaps that happened after it was saved
         for (long hb = hblocks-2; hb >= 0; hb--) {

            ApplySwaps(&M[hb][0], (hb+1)*MAT_BLK_SZ, r, P);

            for (long b = hb+1; b < end_block; b++)
               CopyBlock(&M[hb][0], b-1, &M[hb][0], b, n);
         }

         bool seq = double(n-r)*double(r)*double(r)/2 < PAR_THRESH;


         NTL_GEXEC_RANGE(seq, end_block-start_block, first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(red_struct)
         NTL_IMPORT(hblocks)

         for (long index = first; index < last; index++) {
            long vb = index + start_block;
            double *kerbufp = &kerbuf[vb-start_block][0];

            for (long hb = hblocks-2; hb >= 0; hb--) {
               double *colbuf = &M[hb][0];
               double *acc = &kerbufp[hb*MAT_BLK_SZ*MAT_BLK_SZ];

               CopyBlock(acc, 0, colbuf, vb-1);

               // local delayed-reduction budget for this accumulation
               long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
               long red_count = red_trigger;

               for (long b = hb+1; b < hblocks; b++) {

                  if (red_count-MAT_BLK_SZ < 0) {
                     red_count = red_trigger;
                     for (long h = 0; h < MAT_BLK_SZ*MAT_BLK_SZ; h++)
                        acc[h] = rem((unsigned long)(long)acc[h], p, red_struct);

                  }
                  red_count = red_count-MAT_BLK_SZ;

                  MulAddBlock(acc, &kerbufp[b*MAT_BLK_SZ*MAT_BLK_SZ],
                                   &colbuf[(b-1)*MAT_BLK_SZ*MAT_BLK_SZ]);
               }

               for (long h = 0; h < MAT_BLK_SZ*MAT_BLK_SZ; h++)
                  acc[h] = rem((unsigned long)(long)acc[h], p, red_struct);
            }
         }

         NTL_GEXEC_RANGE_END

         // scatter kerbuf into the first r columns of Ker
         for (long i = r; i < n; i++) {

            double *kerbufp = &kerbuf[(i/MAT_BLK_SZ)-start_block][0];

            for (long j = 0; j < r; j++) {
               double t0 =
                  kerbufp[(j/MAT_BLK_SZ)*MAT_BLK_SZ*MAT_BLK_SZ+
                          (i%MAT_BLK_SZ)*MAT_BLK_SZ+(j%MAT_BLK_SZ)];

               Ker[i-r][j].LoopHole() = long(t0);
            }
         }

         // identity block in the trailing n-r columns
         for (long i = 0; i < n-r; i++) {
            for (long j = 0; j < n-r; j++) {
               Ker[i][j+r].LoopHole() = 0;
            }
            Ker[i][i+r].LoopHole() = 1;
         }

         // undo the row permutation (applied in reverse order)
         if (pivoting) {
            for (long i = 0; i < n-r; i++) {
               zz_p *x = Ker[i].elts();

               for (long k = n-1; k >= 0; k--) {
                  long pos = P[k];
                  if (pos != k) swap(x[pos], x[k]);
               }
            }
         }
      }
   }

   return r;

}
7072 
7073 #endif
7074 
7075 
7076 
7077 static inline
CopyBlock(unsigned long * dst_ptr,long dst_blk,const unsigned long * src_ptr,long src_blk,long src_limit)7078 void CopyBlock(unsigned long *dst_ptr, long dst_blk, const unsigned long *src_ptr, long src_blk, long src_limit)
7079 {
7080    long src_row = src_blk*MAT_BLK_SZ;
7081    long dst_row = dst_blk*MAT_BLK_SZ;
7082 
7083    long nrows = min(MAT_BLK_SZ, src_limit - src_row);
7084 
7085    for (long i = 0; i < nrows; i++)
7086       for (long j = 0; j < MAT_BLK_SZ; j++)
7087          dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
7088 
7089    for (long i = nrows; i < MAT_BLK_SZ; i++)
7090       for (long j = 0; j < MAT_BLK_SZ; j++)
7091          dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = 0;
7092 
7093 }
7094 
7095 static inline
CopyBlock(unsigned long * dst_ptr,long dst_blk,const unsigned long * src_ptr,long src_blk)7096 void CopyBlock(unsigned long *dst_ptr, long dst_blk, const unsigned long *src_ptr, long src_blk)
7097 {
7098    long src_row = src_blk*MAT_BLK_SZ;
7099    long dst_row = dst_blk*MAT_BLK_SZ;
7100 
7101    long nrows = MAT_BLK_SZ;
7102 
7103    for (long i = 0; i < nrows; i++)
7104       for (long j = 0; j < MAT_BLK_SZ; j++)
7105          dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
7106 }
7107 
7108 static inline
TransposeBlock(unsigned long * dst_ptr,long dst_blk)7109 void TransposeBlock(unsigned long *dst_ptr, long dst_blk)
7110 {
7111    dst_ptr += dst_blk*MAT_BLK_SZ*MAT_BLK_SZ;
7112 
7113    for (long i = 0; i < MAT_BLK_SZ; i++)
7114       for (long j = 0; j < i; j++)
7115          _ntl_swap(dst_ptr[i*MAT_BLK_SZ+j], dst_ptr[i+j*MAT_BLK_SZ]);
7116 }
7117 
7118 static inline
SwapOneRow(unsigned long * panelp,long i,long pos)7119 void SwapOneRow(unsigned long *panelp, long i, long pos)
7120 {
7121    unsigned long *pos_p = &panelp[pos*MAT_BLK_SZ];
7122    unsigned long *i_p = &panelp[i*MAT_BLK_SZ];
7123    for (long j = 0; j < MAT_BLK_SZ; j++)
7124       _ntl_swap(pos_p[j], i_p[j]);
7125 }
7126 
7127 static inline
ApplySwaps(unsigned long * panelp,long start,long end,const Vec<long> & P)7128 void ApplySwaps(unsigned long *panelp, long start, long end, const Vec<long>& P)
7129 {
7130    for (long i = start; i < end; i++) {
7131       long pos = P[i];
7132       if (pos != i)
7133          SwapOneRow(panelp, i, pos);
7134    }
7135 }
7136 
7137 
// Block multiply-accumulate: x += y*z, where x, y, z are
// MAT_BLK_SZ x MAT_BLK_SZ blocks of unsigned longs.  Arithmetic is plain
// (wrap-around) integer mul-add; reduction mod p is the caller's
// responsibility, scheduled via the caller's red_count bookkeeping.
static inline
void MulAddBlock(unsigned long *x, const unsigned long *y, const unsigned long *z)
{
   // x += y*z

   muladd_all_by_32(0, MAT_BLK_SZ, x, y, z, MAT_BLK_SZ);
}
7145 
7146 
// Blocked Gaussian elimination over zz_p, "L" variant: entries are stored
// as unsigned longs and reduced mod p lazily -- plain integer mul-adds are
// accumulated and a full reduction is performed only when red_count says
// the values could otherwise overflow.  Used when p is small enough that
// many accumulation steps fit in an unsigned long.
//
// Eliminates the first w columns of A (0 <= w <= m = A.NumCols()).
// Returns r, the rank of those columns.  If im != 0, *im receives the
// image: the r pivot rows in echelon form (plus, when full, the remaining
// n-r rows restricted to columns [w..m)).  If ker != 0, *ker receives an
// (n-r) x n basis of the kernel of the first w columns.
//
// Storage layout: the matrix is split into column panels, M[panel] holding
// all n rows of MAT_BLK_SZ consecutive columns (zero padded), so panel
// updates are cache friendly and parallelize across panels.
static
long elim_blk_L(const mat_zz_p& A, mat_zz_p *im, mat_zz_p *ker,
                 long w, bool full)
{
   long n = A.NumRows();
   long m = A.NumCols();

   if (w < 0 || w > m) LogicError("elim: bad args");

   // take care of corner cases
   if (n == 0) {
      if (im) im->SetDims(0, m);
      if (ker) ker->SetDims(0, 0);
      return 0;
   }

   if (w == 0) {
      if (im) {
         if (full)
            (*im) = A;
         else
            im->SetDims(0, m);
      }
      if (ker) ident(*ker, n);
      return 0;
   }

   if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
   if (NTL_OVERFLOW(m, MAT_BLK_SZ, 0)) ResourceError("dimension too large");

   long npanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;


   // M[panel]: n x MAT_BLK_SZ column panel, zero initialized (padding
   // columns past m stay zero)
   Vec< UniqueArray<unsigned long> > M;
   M.SetLength(npanels);
   for (long panel = 0; panel < npanels; panel++) {
      M[panel].SetLength(n*MAT_BLK_SZ);
      unsigned long *panelp = &M[panel][0];

      for (long h = 0; h < n*MAT_BLK_SZ; h++) panelp[h] = 0;
   }

   // copy A into panels
   for (long jj = 0, panel = 0; jj < m; jj += MAT_BLK_SZ, panel++) {
      long j_max = min(jj+MAT_BLK_SZ, m);
      unsigned long *panelp = &M[panel][0];

      for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
         const zz_p *ap = A[i].elts() + jj;

         for (long j = jj; j < j_max; j++)
            panelp[j-jj] = rep(ap[j-jj]);
      }
   }

   // aux_panel accumulates the elimination multipliers for the current
   // block of up to MAT_BLK_SZ pivot rows; it is later applied to all
   // remaining panels in one blocked update
   UniqueArray<unsigned long> aux_panel_store;
   aux_panel_store.SetLength(n*MAT_BLK_SZ);
   unsigned long *aux_panel = &aux_panel_store[0];


   // scratch block for transposed pivot rows
   UniqueArray<unsigned long> buf_store1;
   buf_store1.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
   unsigned long *buf1 = &buf_store1[0];

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations

   Vec<long> pcol;
   pcol.SetLength(n);
   // pcol[i] records pivot columns for row i

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();  // NOTE(review): not used in this variant
   sp_reduce_struct red_struct = zz_p::red_struct();

   bool pivoting = false;

   // lazy-reduction budget: red_trigger is how many mul-add steps fit in
   // an unsigned long before entries could overflow; red_count counts down
   unsigned long ured_trigger =
      (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
   // NOTE: corner case at p == 2: need unsigned long to prevent overflow

   long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);

   long red_count = red_trigger;

   // r: rank so far; rr: rank at start of the current pivot block;
   // k: current column; kk: first column of the current kpanel
   long r = 0, rr = 0, k = 0, kk = 0;
   long rpanel = 0, kpanel = 0;

   while (k < w) {

      if (r > rr && ker) {
         // we have a panel from a previous iteration
         // we store enough of it to facilitate the kernel
         // computation later. At this point, we have
         // r == rr+INV_BLK_SIZE, and it suffices to store
         // rows [r..n) into M[rpanel], and this will not
         // overwrite anything useful in M[rpanel]

         unsigned long *panelp = &M[rpanel][0];
         for (long h = r*MAT_BLK_SZ; h < n*MAT_BLK_SZ; h++) {
            panelp[h] = aux_panel[h];
         }

         rpanel++;
      }

      rr = r;

      for (long h = 0; h < n*MAT_BLK_SZ; h++) aux_panel[h] = 0;

      bool cleanup = false;

      // when the budget is exhausted, force a full reduction of each
      // panel before accumulating into it again this round
      if (red_count-MAT_BLK_SZ < 0) {
         red_count = red_trigger;
         cleanup = true;
      }

      red_count = red_count-MAT_BLK_SZ;

      for (; r < rr+MAT_BLK_SZ && k < w; k++) { // panel incomplete

         if (k == kk+MAT_BLK_SZ) { // start new kpanel
            kk = k;
            kpanel++;
         }

         unsigned long *kpanelp = &M[kpanel][0];

         if (k == kk) { // a fresh kpanel -- special processing

            if (cleanup) {
               for (long h = 0; h < n*MAT_BLK_SZ; h++)
                  kpanelp[h] = rem(kpanelp[h], p, red_struct);
            }

            if (r > rr) {


               // apply current sequence of permutations

               ApplySwaps(kpanelp, rr, r, P);

               // clean aux_panel
               for (long h = 0; h < n*MAT_BLK_SZ; h++)
                  aux_panel[h] = rem(aux_panel[h], p, red_struct);

               // copy rows [rr..r) of kpanel into buf1
               for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
                  buf1[i] = rem(kpanelp[rr*MAT_BLK_SZ+i], p, red_struct);

               TransposeBlock(buf1, 0);

               // kpanel[rr..n) += aux_panel[rr..n)*buf1

               muladd_all_by_32(rr, n, kpanelp, aux_panel, buf1, r-rr);
            }
         }

         // find a pivot in column k, scanning rows [r..n); entries are
         // reduced on the fly so the zero test is exact
         long pos = -1;
         long pivot;
         long pivot_inv;
         for (long i = r; i < n; i++) {
            pivot = rem(kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
            kpanelp[i*MAT_BLK_SZ+(k-kk)] = pivot;

            if (pivot != 0) {
               pivot_inv = InvMod(pivot, p);
               pos = i;
               break;
            }
         }

         if (pos == -1) {
            // column k is dependent: no pivot, rank unchanged
            continue;
         }

         unsigned long *y = &kpanelp[r*MAT_BLK_SZ];
         unsigned long *y1 = &aux_panel[r*MAT_BLK_SZ];
         if (r != pos) {
            // swap rows pos and r
            unsigned long *x = &kpanelp[pos*MAT_BLK_SZ];
            unsigned long *x1 = &aux_panel[pos*MAT_BLK_SZ];

            for (long j = k-kk; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
            for (long j = 0; j < r-rr; j++) _ntl_swap(x1[j], y1[j]);

            P[r] = pos;
            pivoting = true;
         }

         // clean up row r of kpanel and aux_panel
         for (long j = k-kk; j < MAT_BLK_SZ; j++)
            y[j] = rem(y[j], p, red_struct);
         for (long j = 0; j < r-rr; j++)
            y1[j] = rem(y1[j], p, red_struct);

         // clear column
         for (long i = r+1; i < n; i++) {
            unsigned long *x = &kpanelp[i*MAT_BLK_SZ];
            unsigned long *x1 = &aux_panel[i*MAT_BLK_SZ];
            long t1 = rem(x[k-kk], p, red_struct);
            t1 = MulMod(t1, pivot_inv, p);
            t1 = NegateMod(t1, p);
            x[k-kk] = 0;
            x1[r-rr] = t1;   // record multiplier for later panel updates
            if (t1 == 0) continue;

            // add t1 * row r to row i
            unsigned long ut1 = t1;

            // lazy: plain integer mul-add, reduced later under the
            // red_count/cleanup schedule
            for (long j = k-kk+1; j < MAT_BLK_SZ; j++)
               x[j] += y[j]*ut1;
            for (long j = 0; j < r-rr; j++)
               x1[j] += y1[j]*ut1;
         }

         pcol[r] = k;
         r++;
      }

      if (r > rr) {

         // we have a panel

         // clean it up
         for (long h = 0; h < n*MAT_BLK_SZ; h++)
            aux_panel[h] = rem(aux_panel[h], p, red_struct);

         bool seq =
            double(npanels-(kpanel+1))*double(n-rr)*double(r-rr)*double(MAT_BLK_SZ) < PAR_THRESH;

         // apply aux_panel to remaining panels: [kpanel+1..npanels)
         NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(n)
         NTL_IMPORT(red_struct)
         NTL_IMPORT(aux_panel)
         NTL_IMPORT(rr)
         NTL_IMPORT(r)


         // per-thread scratch block
         UniqueArray<unsigned long> buf_store;
         buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
         unsigned long *buf = &buf_store[0];


         for (long index = first; index < last; index++) {
            long jpanel = index + kpanel+1;

            unsigned long *jpanelp = &M[jpanel][0];

            if (cleanup) {
               for (long h = 0; h < n*MAT_BLK_SZ; h++)
                  jpanelp[h] = rem(jpanelp[h], p, red_struct);
            }

            // perform swaps
            ApplySwaps(jpanelp, rr, r, P);

            // copy rows [rr..r) of jpanel into buf
            for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
               buf[i] = rem(jpanelp[rr*MAT_BLK_SZ+i], p, red_struct);

            TransposeBlock(buf, 0);

            // jpanel[rr..n) += aux_panel[rr..n)*buf

            muladd_all_by_32(rr, n, jpanelp, aux_panel, buf, r-rr);
         }

         NTL_GEXEC_RANGE_END

      }

   }

   // extract the image, if requested
   if (im) {
      mat_zz_p& Im = *im;;
      if (full)
         Im.SetDims(n, m);
      else
         Im.SetDims(r, m);

      for (long i = 0; i < r; i++) {
         long pc = pcol[i];
         for (long j = 0; j < pc; j++) Im[i][j].LoopHole() = 0;
         for (long j = pc; j < m; j++) {
            unsigned long t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
            Im[i][j].LoopHole() = rem(t0, p, red_struct);
         }
      }

      if (full) {
         for (long i = r; i < n; i++) {
            for (long j = 0; j < w; j++) Im[i][j].LoopHole() = 0;
            for (long j = w; j < m; j++) {
               unsigned long t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
               Im[i][j].LoopHole() = rem(t0, p, red_struct);
            }
         }
      }
   }

   // back-substitution to build a kernel basis, if requested
   if (ker) {
      if (r == 0) {
         ident(*ker, n);
         return 0;
      }

      mat_zz_p& Ker = *ker;
      Ker.SetDims(n-r, n);
      if (r < n) {

         long start_block = r/MAT_BLK_SZ;
         long end_block = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
         long vblocks = end_block-start_block;
         long hblocks = (r+MAT_BLK_SZ-1)/MAT_BLK_SZ;

         // kerbuf[vb]: one horizontal strip of the kernel coefficients,
         // hblocks blocks wide
         Vec< UniqueArray<unsigned long> > kerbuf;
         kerbuf.SetLength(vblocks);
         for (long i = 0; i < vblocks; i++)
            kerbuf[i].SetLength(hblocks*MAT_BLK_SZ*MAT_BLK_SZ);

         long colblocks = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;  // NOTE(review): unused

         // if r > rr, we have a panel sitting in
         // aux_panel, which may or may not be a full panel

         unsigned long *initial_panel = 0;
         if (r > rr) {
            initial_panel = aux_panel;
         }
         else {
            initial_panel = &M[hblocks-1][0];
         }

         for (long vb = start_block; vb < end_block; vb++)
            CopyBlock(&kerbuf[vb-start_block][0], hblocks-1, initial_panel, vb, n);

         for (long hb = hblocks-2; hb >= 0; hb--) {

            ApplySwaps(&M[hb][0], (hb+1)*MAT_BLK_SZ, r, P);

            // shift blocks down by one and transpose, preparing each
            // panel for the blocked back-substitution below
            for (long b = hb+1; b < end_block; b++) {
               CopyBlock(&M[hb][0], b-1, &M[hb][0], b, n);
               TransposeBlock(&M[hb][0], b-1);
            }
         }

         bool seq = double(n-r)*double(r)*double(r)/2 < PAR_THRESH;


         NTL_GEXEC_RANGE(seq, end_block-start_block, first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(red_struct)
         NTL_IMPORT(hblocks)

         for (long index = first; index < last; index++) {
            long vb = index + start_block;
            unsigned long *kerbufp = &kerbuf[vb-start_block][0];

            for (long hb = hblocks-2; hb >= 0; hb--) {
               unsigned long *colbuf = &M[hb][0];
               unsigned long *acc = &kerbufp[hb*MAT_BLK_SZ*MAT_BLK_SZ];

               CopyBlock(acc, 0, colbuf, vb-1);
               TransposeBlock(acc, 0);


               // same lazy-reduction budget as in the elimination phase
               unsigned long ured_trigger =
                  (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
               // NOTE: corner case at p == 2: need unsigned long to prevent overflow

               long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);
               long red_count = red_trigger;

               for (long b = hb+1; b < hblocks; b++) {

                  if (red_count-MAT_BLK_SZ < 0) {
                     red_count = red_trigger;
                     for (long h = 0; h < MAT_BLK_SZ*MAT_BLK_SZ; h++)
                        acc[h] = rem(acc[h], p, red_struct);

                  }
                  red_count = red_count-MAT_BLK_SZ;

                  MulAddBlock(acc, &kerbufp[b*MAT_BLK_SZ*MAT_BLK_SZ],
                                   &colbuf[(b-1)*MAT_BLK_SZ*MAT_BLK_SZ]);
               }

               for (long h = 0; h < MAT_BLK_SZ*MAT_BLK_SZ; h++)
                  acc[h] = rem(acc[h], p, red_struct);
            }
         }

         NTL_GEXEC_RANGE_END

         // unpack kerbuf into the first r columns of Ker
         for (long i = r; i < n; i++) {

            unsigned long *kerbufp = &kerbuf[(i/MAT_BLK_SZ)-start_block][0];

            for (long j = 0; j < r; j++) {
               unsigned long t0 =
                  kerbufp[(j/MAT_BLK_SZ)*MAT_BLK_SZ*MAT_BLK_SZ+
                          (i%MAT_BLK_SZ)*MAT_BLK_SZ+(j%MAT_BLK_SZ)];

               Ker[i-r][j].LoopHole() = long(t0);
            }
         }

         // columns [r..n) of Ker form an identity block
         for (long i = 0; i < n-r; i++) {
            for (long j = 0; j < n-r; j++) {
               Ker[i][j+r].LoopHole() = 0;
            }
            Ker[i][i+r].LoopHole() = 1;
         }

         // undo the recorded row swaps (in reverse order) so kernel
         // coordinates refer to the original row order of A
         if (pivoting) {
            for (long i = 0; i < n-r; i++) {
               zz_p *x = Ker[i].elts();

               for (long k = n-1; k >= 0; k--) {
                  long pos = P[k];
                  if (pos != k) swap(x[pos], x[k]);
               }
            }
         }
      }
   }

   return r;

}
7582 
7583 
7584 static inline
CopyBlock(long * dst_ptr,long dst_blk,const long * src_ptr,long src_blk,long src_limit)7585 void CopyBlock(long *dst_ptr, long dst_blk, const long *src_ptr, long src_blk, long src_limit)
7586 {
7587    long src_row = src_blk*MAT_BLK_SZ;
7588    long dst_row = dst_blk*MAT_BLK_SZ;
7589 
7590    long nrows = min(MAT_BLK_SZ, src_limit - src_row);
7591 
7592    for (long i = 0; i < nrows; i++)
7593       for (long j = 0; j < MAT_BLK_SZ; j++)
7594          dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
7595 
7596    for (long i = nrows; i < MAT_BLK_SZ; i++)
7597       for (long j = 0; j < MAT_BLK_SZ; j++)
7598          dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = 0;
7599 
7600 }
7601 
7602 static inline
CopyBlock(long * dst_ptr,long dst_blk,const long * src_ptr,long src_blk)7603 void CopyBlock(long *dst_ptr, long dst_blk, const long *src_ptr, long src_blk)
7604 {
7605    long src_row = src_blk*MAT_BLK_SZ;
7606    long dst_row = dst_blk*MAT_BLK_SZ;
7607 
7608    long nrows = MAT_BLK_SZ;
7609 
7610    for (long i = 0; i < nrows; i++)
7611       for (long j = 0; j < MAT_BLK_SZ; j++)
7612          dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
7613 }
7614 
7615 static inline
TransposeBlock(long * dst_ptr,long dst_blk)7616 void TransposeBlock(long *dst_ptr, long dst_blk)
7617 {
7618    dst_ptr += dst_blk*MAT_BLK_SZ*MAT_BLK_SZ;
7619 
7620    for (long i = 0; i < MAT_BLK_SZ; i++)
7621       for (long j = 0; j < i; j++)
7622          _ntl_swap(dst_ptr[i*MAT_BLK_SZ+j], dst_ptr[i+j*MAT_BLK_SZ]);
7623 }
7624 
7625 static inline
SwapOneRow(long * panelp,long i,long pos)7626 void SwapOneRow(long *panelp, long i, long pos)
7627 {
7628    long *pos_p = &panelp[pos*MAT_BLK_SZ];
7629    long *i_p = &panelp[i*MAT_BLK_SZ];
7630    for (long j = 0; j < MAT_BLK_SZ; j++)
7631       _ntl_swap(pos_p[j], i_p[j]);
7632 }
7633 
7634 static inline
ApplySwaps(long * panelp,long start,long end,const Vec<long> & P)7635 void ApplySwaps(long *panelp, long start, long end, const Vec<long>& P)
7636 {
7637    for (long i = start; i < end; i++) {
7638       long pos = P[i];
7639       if (pos != i)
7640          SwapOneRow(panelp, i, pos);
7641    }
7642 }
7643 
7644 
// Block multiply-accumulate: x += y*z mod p, where x, y, z are
// MAT_BLK_SZ x MAT_BLK_SZ blocks of longs.  Unlike the unsigned long
// overload, this variant reduces mod p internally (via ll_red_struct),
// so entries stay fully reduced.
static inline
void MulAddBlock(long *x, const long *y, const long *z,
                 long p, sp_ll_reduce_struct ll_red_struct)
{
   // x += y*z

   muladd_all_by_32(0, MAT_BLK_SZ, x, y, z, MAT_BLK_SZ, p, ll_red_struct);
}
7653 
7654 
7655 
// Blocked Gaussian elimination over zz_p, "LL" variant: entries are stored
// as longs and kept fully reduced mod p at all times; products are handled
// with double-word (long long) reduction via ll_red_struct and
// MulModPrecon.  Used for moduli too large for the lazy unsigned-long
// accumulation of elim_blk_L.
//
// Eliminates the first w columns of A (0 <= w <= m = A.NumCols()).
// Returns r, the rank of those columns.  If im != 0, *im receives the
// image: the r pivot rows in echelon form (plus, when full, the remaining
// n-r rows restricted to columns [w..m)).  If ker != 0, *ker receives an
// (n-r) x n basis of the kernel of the first w columns.
//
// Storage layout: the matrix is split into column panels, M[panel] holding
// all n rows of MAT_BLK_SZ consecutive columns (zero padded), so panel
// updates are cache friendly and parallelize across panels.
static
long elim_blk_LL(const mat_zz_p& A, mat_zz_p *im, mat_zz_p *ker,
                 long w, bool full)
{
   long n = A.NumRows();
   long m = A.NumCols();

   if (w < 0 || w > m) LogicError("elim: bad args");

   // take care of corner cases
   if (n == 0) {
      if (im) im->SetDims(0, m);
      if (ker) ker->SetDims(0, 0);
      return 0;
   }

   if (w == 0) {
      if (im) {
         if (full)
            (*im) = A;
         else
            im->SetDims(0, m);
      }
      if (ker) ident(*ker, n);
      return 0;
   }

   if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
   if (NTL_OVERFLOW(m, MAT_BLK_SZ, 0)) ResourceError("dimension too large");

   long npanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;


   // M[panel]: n x MAT_BLK_SZ column panel, zero initialized (padding
   // columns past m stay zero)
   Vec< UniqueArray<long> > M;
   M.SetLength(npanels);
   for (long panel = 0; panel < npanels; panel++) {
      M[panel].SetLength(n*MAT_BLK_SZ);
      long *panelp = &M[panel][0];

      for (long h = 0; h < n*MAT_BLK_SZ; h++) panelp[h] = 0;
   }

   // copy A into panels
   for (long jj = 0, panel = 0; jj < m; jj += MAT_BLK_SZ, panel++) {
      long j_max = min(jj+MAT_BLK_SZ, m);
      long *panelp = &M[panel][0];

      for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
         const zz_p *ap = A[i].elts() + jj;

         for (long j = jj; j < j_max; j++)
            panelp[j-jj] = rep(ap[j-jj]);
      }
   }

   // aux_panel accumulates the elimination multipliers for the current
   // block of up to MAT_BLK_SZ pivot rows; it is later applied to all
   // remaining panels in one blocked update
   UniqueArray<long> aux_panel_store;
   aux_panel_store.SetLength(n*MAT_BLK_SZ);
   long *aux_panel = &aux_panel_store[0];


   // scratch block for transposed pivot rows
   UniqueArray<long> buf_store1;
   buf_store1.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
   long *buf1 = &buf_store1[0];

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations

   Vec<long> pcol;
   pcol.SetLength(n);
   // pcol[i] records pivot columns for row i

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();

   bool pivoting = false;

   // r: rank so far; rr: rank at start of the current pivot block;
   // k: current column; kk: first column of the current kpanel
   long r = 0, rr = 0, k = 0, kk = 0;
   long rpanel = 0, kpanel = 0;

   while (k < w) {

      if (r > rr && ker) {
         // we have a panel from a previous iteration
         // we store enough of it to facilitate the kernel
         // computation later. At this point, we have
         // r == rr+INV_BLK_SIZE, and it suffices to store
         // rows [r..n) into M[rpanel], and this will not
         // overwrite anything useful in M[rpanel]

         long *panelp = &M[rpanel][0];
         for (long h = r*MAT_BLK_SZ; h < n*MAT_BLK_SZ; h++) {
            panelp[h] = aux_panel[h];
         }

         rpanel++;
      }

      rr = r;

      for (long h = 0; h < n*MAT_BLK_SZ; h++) aux_panel[h] = 0;

      for (; r < rr+MAT_BLK_SZ && k < w; k++) { // panel incomplete

         if (k == kk+MAT_BLK_SZ) { // start new kpanel
            kk = k;
            kpanel++;
         }

         long *kpanelp = &M[kpanel][0];

         if (k == kk) { // a fresh kpanel -- special processing


            if (r > rr) {


               // apply current sequence of permutations

               ApplySwaps(kpanelp, rr, r, P);

               // copy rows [rr..r) of kpanel into buf1
               for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
                  buf1[i] = kpanelp[rr*MAT_BLK_SZ+i];

               TransposeBlock(buf1, 0);

               // kpanel[rr..n) += aux_panel[rr..n)*buf1

               muladd_all_by_32(rr, n, kpanelp, aux_panel, buf1, r-rr, p, ll_red_struct);
            }
         }

         // find a pivot in column k, scanning rows [r..n); entries are
         // already reduced, so a plain zero test suffices
         long pos = -1;
         long pivot;
         long pivot_inv;
         for (long i = r; i < n; i++) {
            pivot = kpanelp[i*MAT_BLK_SZ+(k-kk)];
            kpanelp[i*MAT_BLK_SZ+(k-kk)] = pivot;

            if (pivot != 0) {
               pivot_inv = InvMod(pivot, p);
               pos = i;
               break;
            }
         }

         if (pos == -1) {
            // column k is dependent: no pivot, rank unchanged
            continue;
         }

         long *y = &kpanelp[r*MAT_BLK_SZ];
         long *y1 = &aux_panel[r*MAT_BLK_SZ];
         if (r != pos) {
            // swap rows pos and r
            long *x = &kpanelp[pos*MAT_BLK_SZ];
            long *x1 = &aux_panel[pos*MAT_BLK_SZ];

            for (long j = k-kk; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
            for (long j = 0; j < r-rr; j++) _ntl_swap(x1[j], y1[j]);

            P[r] = pos;
            pivoting = true;
         }

         // clear column
         for (long i = r+1; i < n; i++) {
            long *x = &kpanelp[i*MAT_BLK_SZ];
            long *x1 = &aux_panel[i*MAT_BLK_SZ];
            long t1 = x[k-kk];
            t1 = MulMod(t1, pivot_inv, p);
            t1 = NegateMod(t1, p);
            x[k-kk] = 0;
            x1[r-rr] = t1;   // record multiplier for later panel updates
            if (t1 == 0) continue;

            // add t1 * row r to row i
            long ut1 = t1;
            // precondition the multiplier for fast repeated MulMod
            mulmod_precon_t ut1_pinv = PrepMulModPrecon(ut1, p, pinv);

            for (long j = k-kk+1; j < MAT_BLK_SZ; j++)
               x[j] = AddMod(x[j], MulModPrecon(y[j], ut1, p, ut1_pinv), p);
            for (long j = 0; j < r-rr; j++)
               x1[j] = AddMod(x1[j], MulModPrecon(y1[j], ut1, p, ut1_pinv), p);
         }

         pcol[r] = k;
         r++;
      }

      if (r > rr) {

         // we have a panel

         bool seq =
            double(npanels-(kpanel+1))*double(n-rr)*double(r-rr)*double(MAT_BLK_SZ) < PAR_THRESH;

         // apply aux_panel to remaining panels: [kpanel+1..npanels)
         NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(n)
         NTL_IMPORT(ll_red_struct)
         NTL_IMPORT(aux_panel)
         NTL_IMPORT(rr)
         NTL_IMPORT(r)


         // per-thread scratch block
         UniqueArray<long> buf_store;
         buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
         long *buf = &buf_store[0];


         for (long index = first; index < last; index++) {
            long jpanel = index + kpanel+1;

            long *jpanelp = &M[jpanel][0];

            // perform swaps
            ApplySwaps(jpanelp, rr, r, P);

            // copy rows [rr..r) of jpanel into buf
            for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
               buf[i] = jpanelp[rr*MAT_BLK_SZ+i];

            TransposeBlock(buf, 0);

            // jpanel[rr..n) += aux_panel[rr..n)*buf

            muladd_all_by_32(rr, n, jpanelp, aux_panel, buf, r-rr, p, ll_red_struct);
         }

         NTL_GEXEC_RANGE_END

      }

   }

   // extract the image, if requested
   if (im) {
      mat_zz_p& Im = *im;;
      if (full)
         Im.SetDims(n, m);
      else
         Im.SetDims(r, m);

      for (long i = 0; i < r; i++) {
         long pc = pcol[i];
         for (long j = 0; j < pc; j++) Im[i][j].LoopHole() = 0;
         for (long j = pc; j < m; j++) {
            long t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
            Im[i][j].LoopHole() = t0;
         }
      }

      if (full) {
         for (long i = r; i < n; i++) {
            for (long j = 0; j < w; j++) Im[i][j].LoopHole() = 0;
            for (long j = w; j < m; j++) {
               long t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
               Im[i][j].LoopHole() = t0;
            }
         }
      }
   }

   // back-substitution to build a kernel basis, if requested
   if (ker) {
      if (r == 0) {
         ident(*ker, n);
         return 0;
      }

      mat_zz_p& Ker = *ker;
      Ker.SetDims(n-r, n);
      if (r < n) {

         long start_block = r/MAT_BLK_SZ;
         long end_block = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
         long vblocks = end_block-start_block;
         long hblocks = (r+MAT_BLK_SZ-1)/MAT_BLK_SZ;

         // kerbuf[vb]: one horizontal strip of the kernel coefficients,
         // hblocks blocks wide
         Vec< UniqueArray<long> > kerbuf;
         kerbuf.SetLength(vblocks);
         for (long i = 0; i < vblocks; i++)
            kerbuf[i].SetLength(hblocks*MAT_BLK_SZ*MAT_BLK_SZ);

         long colblocks = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;  // NOTE(review): unused

         // if r > rr, we have a panel sitting in
         // aux_panel, which may or may not be a full panel

         long *initial_panel = 0;
         if (r > rr) {
            initial_panel = aux_panel;
         }
         else {
            initial_panel = &M[hblocks-1][0];
         }

         for (long vb = start_block; vb < end_block; vb++)
            CopyBlock(&kerbuf[vb-start_block][0], hblocks-1, initial_panel, vb, n);

         for (long hb = hblocks-2; hb >= 0; hb--) {

            ApplySwaps(&M[hb][0], (hb+1)*MAT_BLK_SZ, r, P);

            // shift blocks down by one and transpose, preparing each
            // panel for the blocked back-substitution below
            for (long b = hb+1; b < end_block; b++) {
               CopyBlock(&M[hb][0], b-1, &M[hb][0], b, n);
               TransposeBlock(&M[hb][0], b-1);
            }
         }

         bool seq = double(n-r)*double(r)*double(r)/2 < PAR_THRESH;


         NTL_GEXEC_RANGE(seq, end_block-start_block, first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(ll_red_struct)
         NTL_IMPORT(hblocks)

         for (long index = first; index < last; index++) {
            long vb = index + start_block;
            long *kerbufp = &kerbuf[vb-start_block][0];

            for (long hb = hblocks-2; hb >= 0; hb--) {
               long *colbuf = &M[hb][0];
               long *acc = &kerbufp[hb*MAT_BLK_SZ*MAT_BLK_SZ];

               CopyBlock(acc, 0, colbuf, vb-1);
               TransposeBlock(acc, 0);

               // MulAddBlock reduces mod p internally, so no lazy
               // reduction bookkeeping is needed in this variant
               for (long b = hb+1; b < hblocks; b++) {
                  MulAddBlock(acc, &kerbufp[b*MAT_BLK_SZ*MAT_BLK_SZ],
                                   &colbuf[(b-1)*MAT_BLK_SZ*MAT_BLK_SZ], p, ll_red_struct);
               }
            }
         }

         NTL_GEXEC_RANGE_END

         // unpack kerbuf into the first r columns of Ker
         for (long i = r; i < n; i++) {

            long *kerbufp = &kerbuf[(i/MAT_BLK_SZ)-start_block][0];

            for (long j = 0; j < r; j++) {
               long t0 =
                  kerbufp[(j/MAT_BLK_SZ)*MAT_BLK_SZ*MAT_BLK_SZ+
                          (i%MAT_BLK_SZ)*MAT_BLK_SZ+(j%MAT_BLK_SZ)];

               Ker[i-r][j].LoopHole() = long(t0);
            }
         }

         // columns [r..n) of Ker form an identity block
         for (long i = 0; i < n-r; i++) {
            for (long j = 0; j < n-r; j++) {
               Ker[i][j+r].LoopHole() = 0;
            }
            Ker[i][i+r].LoopHole() = 1;
         }

         // undo the recorded row swaps (in reverse order) so kernel
         // coordinates refer to the original row order of A
         if (pivoting) {
            for (long i = 0; i < n-r; i++) {
               zz_p *x = Ker[i].elts();

               for (long k = n-1; k >= 0; k--) {
                  long pos = P[k];
                  if (pos != k) swap(x[pos], x[k]);
               }
            }
         }
      }
   }

   return r;

}
8032 
8033 
8034 #endif
8035 
8036 
8037 
static
long elim(const mat_zz_p& A, mat_zz_p *im, mat_zz_p *ker, long w, bool full)
{
   // Dispatcher for Gaussian elimination on the first w columns of A.
   // im (if non-null) receives the image, ker (if non-null) the kernel;
   // full selects complete reduction (semantics defined by elim_basic
   // and the elim_blk_* kernels earlier in this file).
   //
   // Returns the value produced by the selected kernel (the rank, per
   // the elim_blk_* implementations visible above, which return r).

   long n = A.NumRows();
   long m = A.NumCols();

   if (w < 0 || w > m) LogicError("elim: bad args");

#ifndef NTL_HAVE_LL_TYPE

   // No long-long arithmetic available: only the basic kernel works.
   return elim_basic(A, im, ker, w, full);

#else

   long p = zz_p::modulus();

   // Blocked kernels only pay off when both dimensions span at least
   // a few MAT_BLK_SZ-sized blocks; otherwise fall back to basic.
   if (n/MAT_BLK_SZ < 4 || w/MAT_BLK_SZ < 4) {
      return elim_basic(A, im, ker, w, full);
   }
   else {
      // V is the inner accumulation depth used by the blocked kernels
      // (4 blocks of MAT_BLK_SZ entries accumulated before reduction).
      long V = 4*MAT_BLK_SZ;

#ifdef NTL_HAVE_AVX
      // Double-precision (AVX) kernel: require that V products of
      // values <= p-1 can accumulate without exceeding MAX_DBL_INT
      // (the largest integer exactly representable in a double).
      if (p-1 <= MAX_DBL_INT &&
          V <= (MAX_DBL_INT-(p-1))/(p-1) &&
          V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {

         return elim_blk_DD(A, im, ker, w, full);
      }
      else
#endif
      // Single-word unsigned kernel: same overflow test against the
      // full range of unsigned long.  NOTE: this "if" is the else-branch
      // of the AVX test when NTL_HAVE_AVX is defined, and the first
      // test otherwise — keep the preprocessor structure intact.
           if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
               cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1))  {

         return elim_blk_L(A, im, ker, w, full);

      }
      else {

         // Largest moduli: use the long-long (double-word) kernel.
         return elim_blk_LL(A, im, ker, w, full);
      }

   }

#endif

}
8087 
8088 
8089 // ******************************************************************
8090 //
8091 // High level interfaces
8092 //
8093 // ******************************************************************
8094 
8095 
8096 
gauss(mat_zz_p & M,long w)8097 long gauss(mat_zz_p& M, long w)
8098 {
8099    return elim(M, &M, 0, w, true);
8100 }
8101 
8102 
gauss(mat_zz_p & M)8103 long gauss(mat_zz_p& M)
8104 {
8105    return gauss(M, M.NumCols());
8106 }
8107 
image(mat_zz_p & X,const mat_zz_p & A)8108 void image(mat_zz_p& X, const mat_zz_p& A)
8109 {
8110    elim(A, &X, 0, A.NumCols(), false);
8111 }
8112 
kernel(mat_zz_p & X,const mat_zz_p & A)8113 void kernel(mat_zz_p& X, const mat_zz_p& A)
8114 {
8115    elim(A, 0, &X, A.NumCols(), false);
8116 }
8117 
8118 
8119 // ******************************************************************
8120 //
8121 // Operator/functional notation
8122 //
8123 // ******************************************************************
8124 
8125 
8126 
8127 
mat_zz_p operator+(const mat_zz_p& a, const mat_zz_p& b)
{
   // Functional form of matrix addition; forwards to add().
   mat_zz_p sum;
   add(sum, a, b);
   NTL_OPT_RETURN(mat_zz_p, sum);
}
8134 
mat_zz_p operator*(const mat_zz_p& a, const mat_zz_p& b)
{
   // Functional form of matrix multiplication; forwards to mul_aux().
   mat_zz_p prod;
   mul_aux(prod, a, b);
   NTL_OPT_RETURN(mat_zz_p, prod);
}
8141 
mat_zz_p operator-(const mat_zz_p& a, const mat_zz_p& b)
{
   // Functional form of matrix subtraction; forwards to sub().
   mat_zz_p diff;
   sub(diff, a, b);
   NTL_OPT_RETURN(mat_zz_p, diff);
}
8148 
8149 
mat_zz_p operator-(const mat_zz_p& a)
{
   // Unary negation; forwards to negate().
   mat_zz_p neg;
   negate(neg, a);
   NTL_OPT_RETURN(mat_zz_p, neg);
}
8156 
8157 
vec_zz_p operator*(const mat_zz_p& a, const vec_zz_p& b)
{
   // Matrix-times-vector product; forwards to mul_aux().
   vec_zz_p prod;
   mul_aux(prod, a, b);
   NTL_OPT_RETURN(vec_zz_p, prod);
}
8164 
vec_zz_p operator*(const vec_zz_p& a, const mat_zz_p& b)
{
   // Vector-times-matrix product; forwards to mul().
   vec_zz_p prod;
   mul(prod, a, b);
   NTL_OPT_RETURN(vec_zz_p, prod);
}
8171 
8172 
#if 0
// for testing purposes
//
// These wrappers expose internal multiplication kernels defined
// earlier in this file so each one can be called directly.  The
// whole section is compiled out; change "#if 0" to "#if 1" to
// enable them for ad-hoc testing.

void test_alt_mul_L(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
{
   alt_mul_L(X, A, B);
}

void test_alt_mul_LL(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
{
   alt_mul_LL(X, A, B);
}

void test_blk_mul_DD(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
{
   blk_mul_DD(X, A, B);
}

void test_blk_mul_LL(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
{
   blk_mul_LL(X, A, B);
}

void test_blk_mul_L(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
{
   blk_mul_L(X, A, B);
}

void test_basic_mul(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
{
   basic_mul(X, A, B);
}

#endif
8207 
random(mat_zz_p & x,long n,long m)8208 void random(mat_zz_p& x, long n, long m)
8209 {
8210    x.SetDims(n, m);
8211    for (long i = 0; i < n; i++) random(x[i], m);
8212 }
8213 
8214 NTL_END_IMPL
8215