1
2 #include <NTL/mat_lzz_p.h>
3 #include <NTL/vec_long.h>
4
5
6 #include <NTL/BasicThreadPool.h>
7
8
9
10 #ifdef NTL_HAVE_AVX
11 #include <immintrin.h>
12 #endif
13
14 NTL_START_IMPL
15
16
17 #define PAR_THRESH_SQ (200)
18 #define PAR_THRESH (40000.0)
19
20
21 // *******************************************************
22 //
23 // Matrix Window data structure: perhaps some day this
24 // will be made public.
25 //
26 // *******************************************************
27
// Lightweight, non-owning mutable view of a rectangular sub-matrix of a
// mat_zz_p.  (r_offset, c_offset) locate the window inside the underlying
// matrix, which must outlive the window.
struct mat_window_zz_p {
   mat_zz_p &A;
   long r_offset;   // row of A where the window starts
   long c_offset;   // column of A where the window starts
   long nrows;
   long ncols;

   // window covering all of _A
   mat_window_zz_p(mat_zz_p& _A) :
   A(_A), r_offset(0), c_offset(0), nrows(A.NumRows()), ncols(A.NumCols()) { }

   // sub-window of w: rows [r1..r2), columns [c1..c2) (half-open ranges)
   mat_window_zz_p(const mat_window_zz_p& w, long r1, long c1, long r2, long c2) :
   A(w.A)
   {
      if (r1 < 0 || c1 < 0 || r2 < r1 || c2 < c1 || r2-r1 > w.nrows || c2-c1 > w.ncols)
         LogicError("mat_window_zz_p: bad args");

      r_offset = w.r_offset + r1;
      c_offset = w.c_offset + c1;
      nrows = r2-r1;
      ncols = c2-c1;
   }

   // pointer to the first element of window row i
   zz_p * operator[](long i) const { return A[i+r_offset].elts() + c_offset; }

   long NumRows() const { return nrows; }
   long NumCols() const { return ncols; }

};
56
57
// Read-only counterpart of mat_window_zz_p: a non-owning view of a
// rectangular sub-matrix of a const mat_zz_p.
struct const_mat_window_zz_p {
   const mat_zz_p &A;
   long r_offset;   // row of A where the window starts
   long c_offset;   // column of A where the window starts
   long nrows;
   long ncols;

   // window covering all of _A
   const_mat_window_zz_p(const mat_zz_p& _A) :
   A(_A), r_offset(0), c_offset(0), nrows(A.NumRows()), ncols(A.NumCols()) { }

   // implicit conversion from a mutable window
   const_mat_window_zz_p(const mat_window_zz_p& w) :
   A(w.A), r_offset(w.r_offset), c_offset(w.c_offset), nrows(w.nrows), ncols(w.ncols) { }

   // sub-window of w: rows [r1..r2), columns [c1..c2) (half-open ranges)
   const_mat_window_zz_p(const const_mat_window_zz_p& w, long r1, long c1, long r2, long c2) :
   A(w.A)
   {
      if (r1 < 0 || c1 < 0 || r2 < r1 || c2 < c1 || r2-r1 > w.nrows || c2-c1 > w.ncols)
         LogicError("const_mat_window_zz_p: bad args");

      r_offset = w.r_offset + r1;
      c_offset = w.c_offset + c1;
      nrows = r2-r1;
      ncols = c2-c1;
   }

   // pointer to the first element of window row i
   const zz_p * operator[](long i) const { return A[i+r_offset].elts() + c_offset; }

   long NumRows() const { return nrows; }
   long NumCols() const { return ncols; }

};
89
add(const mat_window_zz_p & X,const const_mat_window_zz_p & A,const const_mat_window_zz_p & B)90 void add(const mat_window_zz_p& X,
91 const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
92 {
93 long n = A.NumRows();
94 long m = A.NumCols();
95
96 if (B.NumRows() != n || B.NumCols() != m)
97 LogicError("matrix add: dimension mismatch");
98
99 if (X.NumRows() != n || X.NumCols() != m)
100 LogicError("matrix add: dimension mismatch");
101
102 long p = zz_p::modulus();
103
104 for (long i = 0; i < n; i++) {
105 zz_p *x = X[i];
106 const zz_p *a = A[i];
107 const zz_p *b = B[i];
108 for (long j = 0; j < m; j++) {
109 x[j].LoopHole() = AddMod(rep(a[j]), rep(b[j]), p);
110 }
111 }
112 }
113
sub(const mat_window_zz_p & X,const const_mat_window_zz_p & A,const const_mat_window_zz_p & B)114 void sub(const mat_window_zz_p& X,
115 const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
116 {
117 long n = A.NumRows();
118 long m = A.NumCols();
119
120 if (B.NumRows() != n || B.NumCols() != m)
121 LogicError("matrix sub: dimension mismatch");
122
123 if (X.NumRows() != n || X.NumCols() != m)
124 LogicError("matrix sub: dimension mismatch");
125
126 long p = zz_p::modulus();
127
128 for (long i = 0; i < n; i++) {
129 zz_p *x = X[i];
130 const zz_p *a = A[i];
131 const zz_p *b = B[i];
132 for (long j = 0; j < m; j++) {
133 x[j].LoopHole() = SubMod(rep(a[j]), rep(b[j]), p);
134 }
135 }
136 }
137
138
clear(const mat_window_zz_p & X)139 void clear(const mat_window_zz_p& X)
140 {
141 long n = X.NumRows();
142 long m = X.NumCols();
143
144 for (long i = 0; i < n; i++)
145 for (long j = 0; j < m; j++)
146 clear(X[i][j]);
147 }
148
149
150
151 // ***********************************************************
152
153
154
155
156
157
add(mat_zz_p & X,const mat_zz_p & A,const mat_zz_p & B)158 void add(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
159 {
160 long n = A.NumRows();
161 long m = A.NumCols();
162
163 if (B.NumRows() != n || B.NumCols() != m)
164 LogicError("matrix add: dimension mismatch");
165
166 X.SetDims(n, m);
167
168 long p = zz_p::modulus();
169
170 for (long i = 0; i < n; i++) {
171 zz_p *x = X[i].elts();
172 const zz_p *a = A[i].elts();
173 const zz_p *b = B[i].elts();
174 for (long j = 0; j < m; j++) {
175 x[j].LoopHole() = AddMod(rep(a[j]), rep(b[j]), p);
176 }
177 }
178 }
179
sub(mat_zz_p & X,const mat_zz_p & A,const mat_zz_p & B)180 void sub(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
181 {
182 long n = A.NumRows();
183 long m = A.NumCols();
184
185 if (B.NumRows() != n || B.NumCols() != m)
186 LogicError("matrix sub: dimension mismatch");
187
188 X.SetDims(n, m);
189
190 long p = zz_p::modulus();
191
192 for (long i = 0; i < n; i++) {
193 zz_p *x = X[i].elts();
194 const zz_p *a = A[i].elts();
195 const zz_p *b = B[i].elts();
196 for (long j = 0; j < m; j++) {
197 x[j].LoopHole() = SubMod(rep(a[j]), rep(b[j]), p);
198 }
199 }
200
201 }
202
203
204
205
206
diag(mat_zz_p & X,long n,zz_p d)207 void diag(mat_zz_p& X, long n, zz_p d)
208 {
209 X.SetDims(n, n);
210 long i, j;
211
212 for (i = 1; i <= n; i++)
213 for (j = 1; j <= n; j++)
214 if (i == j)
215 X(i, j) = d;
216 else
217 clear(X(i, j));
218 }
219
IsDiag(const mat_zz_p & A,long n,zz_p d)220 long IsDiag(const mat_zz_p& A, long n, zz_p d)
221 {
222 if (A.NumRows() != n || A.NumCols() != n)
223 return 0;
224
225 long i, j;
226
227 for (i = 1; i <= n; i++)
228 for (j = 1; j <= n; j++)
229 if (i != j) {
230 if (!IsZero(A(i, j))) return 0;
231 }
232 else {
233 if (A(i, j) != d) return 0;
234 }
235
236 return 1;
237 }
238
negate(mat_zz_p & X,const mat_zz_p & A)239 void negate(mat_zz_p& X, const mat_zz_p& A)
240 {
241 long n = A.NumRows();
242 long m = A.NumCols();
243
244
245 X.SetDims(n, m);
246
247 long p = zz_p::modulus();
248
249 for (long i = 0; i < n; i++) {
250 zz_p *x = X[i].elts();
251 const zz_p *a = A[i].elts();
252 for (long j = 0; j < m; j++) {
253 x[j].LoopHole() = NegateMod(rep(a[j]), p);
254 }
255 }
256 }
257
IsZero(const mat_zz_p & a)258 long IsZero(const mat_zz_p& a)
259 {
260 long n = a.NumRows();
261 long i;
262
263 for (i = 0; i < n; i++)
264 if (!IsZero(a[i]))
265 return 0;
266
267 return 1;
268 }
269
clear(mat_zz_p & x)270 void clear(mat_zz_p& x)
271 {
272 long n = x.NumRows();
273 long i;
274 for (i = 0; i < n; i++)
275 clear(x[i]);
276 }
277
278
ident(mat_zz_p & X,long n)279 void ident(mat_zz_p& X, long n)
280 {
281 X.SetDims(n, n);
282 long i, j;
283
284 for (i = 1; i <= n; i++)
285 for (j = 1; j <= n; j++)
286 if (i == j)
287 set(X(i, j));
288 else
289 clear(X(i, j));
290 }
291
292
IsIdent(const mat_zz_p & A,long n)293 long IsIdent(const mat_zz_p& A, long n)
294 {
295 if (A.NumRows() != n || A.NumCols() != n)
296 return 0;
297
298 long i, j;
299
300 for (i = 1; i <= n; i++)
301 for (j = 1; j <= n; j++)
302 if (i != j) {
303 if (!IsZero(A(i, j))) return 0;
304 }
305 else {
306 if (!IsOne(A(i, j))) return 0;
307 }
308
309 return 1;
310 }
311
312
transpose(mat_zz_p & X,const mat_zz_p & A)313 void transpose(mat_zz_p& X, const mat_zz_p& A)
314 {
315 long n = A.NumRows();
316 long m = A.NumCols();
317
318 long i, j;
319
320 if (&X == & A) {
321 if (n == m)
322 for (i = 1; i <= n; i++)
323 for (j = i+1; j <= n; j++)
324 swap(X(i, j), X(j, i));
325 else {
326 mat_zz_p tmp;
327 tmp.SetDims(m, n);
328 for (i = 1; i <= n; i++)
329 for (j = 1; j <= m; j++)
330 tmp(j, i) = A(i, j);
331 X.kill();
332 X = tmp;
333 }
334 }
335 else {
336 X.SetDims(m, n);
337 for (i = 1; i <= n; i++)
338 for (j = 1; j <= m; j++)
339 X(j, i) = A(i, j);
340 }
341 }
342
343
344
345
// X = A^e for a square matrix A, using left-to-right binary
// exponentiation on |e|.  For e < 0, the result is inverted via
// relaxed_inv; the meaning of `relax` is delegated to relaxed_inv
// (NOTE(review): presumably it controls acceptance of non-prime
// moduli -- confirm against relaxed_inv's definition).
void relaxed_power(mat_zz_p& X, const mat_zz_p& A, const ZZ& e, bool relax)
{
   if (A.NumRows() != A.NumCols()) LogicError("power: non-square matrix");

   if (e == 0) {
      // A^0 = identity, regardless of sign conventions
      ident(X, A.NumRows());
      return;
   }

   mat_zz_p T1, T2;
   long i, k;

   k = NumBits(e);   // number of bits of |e|
   T1 = A;           // invariant: T1 = A^(bits of e seen so far)

   // process bits of |e| from second-highest down to bit 0
   for (i = k-2; i >= 0; i--) {
      sqr(T2, T1);
      if (bit(e, i))
         mul(T1, T2, A);
      else
         T1 = T2;
   }

   // here T1 = A^|e|; invert if the exponent was negative
   if (e < 0)
      relaxed_inv(X, T1, relax);
   else
      X = T1;
}
374
375
376
377 // ******************************************************************
378 //
379 // matrix-vector multiplication code
380 //
381 // ******************************************************************
382
383
384
385
386
387
// x = a * B (row vector times matrix).  Three cases:
//   m == 0 : empty result;
//   m == 1 : a single inner product;
//   m >  1 : column-accumulator loop, preconditioning each nonzero
//            scalar a[k] and optionally running in parallel.
void mul(vec_zz_p& x, const vec_zz_p& a, const mat_zz_p& B)
{
   long l = a.length();
   long m = B.NumCols();

   if (l != B.NumRows())
      LogicError("matrix mul: dimension mismatch");

   if (m == 0) {

      x.SetLength(0);

   }
   else if (m == 1) {

      long p = zz_p::modulus();
      mulmod_t pinv = zz_p::ModulusInverse();

      long acc, tmp;
      long k;

      // single inner product of a with B's only column
      acc = 0;
      for(k = 1; k <= l; k++) {
         tmp = MulMod(rep(a(k)), rep(B(k,1)), p, pinv);
         acc = AddMod(acc, tmp, p);
      }

      // set the output only after the accumulation, so x may alias a
      x.SetLength(1);
      x(1).LoopHole() = acc;

   }
   else {  // m > 1. precondition and EXEC_RANGE

      long p = zz_p::modulus();
      mulmod_t pinv = zz_p::ModulusInverse();

      // thread-local scratch accumulators; the Watcher restores the
      // vector's state when this scope exits
      NTL_TLS_LOCAL(vec_long, mul_aux_vec);
      vec_long::Watcher watch_mul_aux_vec(mul_aux_vec);
      mul_aux_vec.SetLength(m);
      long *acc = mul_aux_vec.elts();

      const zz_p* ap = a.elts();

      for (long j = 0; j < m; j++) acc[j] = 0;

      // too-small problems run sequentially
      const bool seq = double(l)*double(m) < PAR_THRESH;

      NTL_GEXEC_RANGE(seq, m, first, last) {

         // each worker owns the disjoint column range [first..last)
         for (long k = 0; k < l; k++) {
            long aa = rep(ap[k]);
            if (aa != 0) {   // skip rows multiplied by zero
               const zz_p* bp = B[k].elts();
               long T1;
               // precondition on aa: cheap repeated MulMod by a fixed scalar
               mulmod_precon_t aapinv = PrepMulModPrecon(aa, p, pinv);

               for (long j = first; j < last; j++) {
                  T1 = MulModPrecon(rep(bp[j]), aa, p, aapinv);
                  acc[j] = AddMod(acc[j], T1, p);
               }
            }
         }

      } NTL_GEXEC_RANGE_END

      // copy accumulators into the (possibly resized) output
      x.SetLength(m);
      zz_p *xp = x.elts();
      for (long j = 0; j < m; j++)
         xp[j].LoopHole() = acc[j];
   }
}
460
461
// x = A * b (matrix times column vector), no aliasing allowed between
// x and A or b -- callers go through mul(), which handles aliasing.
// With NTL_HAVE_LL_TYPE, each row is an InnerProd_L / InnerProd_LL
// call; otherwise a MulModPrecon-based fallback is used.  Rows are
// distributed over threads when the problem is large enough.
void mul_aux(vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b)
{
   long n = A.NumRows();
   long l = A.NumCols();

   if (l != b.length())
      LogicError("matrix mul: dimension mismatch");

   x.SetLength(n);
   zz_p* xp = x.elts();

   long p = zz_p::modulus();
   const zz_p* bp = b.elts();

   // too-small problems run sequentially
   const bool seq = double(n)*double(l) < PAR_THRESH;


#ifdef NTL_HAVE_LL_TYPE

   if (InnerProd_L_viable(l, p)) {

      // single-word reduction path
      sp_reduce_struct red_struct = zz_p::red_struct();
      long bound = InnerProd_L_bound(p);

      NTL_GEXEC_RANGE(seq, n, first, last) {

         for (long i = first; i < last; i++) {
            xp[i].LoopHole() = InnerProd_L(A[i].elts(), bp, l, p, red_struct, bound);
         }

      } NTL_GEXEC_RANGE_END
   }
   else {
      // double-word reduction path for larger p or longer rows
      sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();

      NTL_GEXEC_RANGE(seq, n, first, last) {

         for (long i = first; i < last; i++) {
            xp[i].LoopHole() = InnerProd_LL(A[i].elts(), bp, l, p, ll_red_struct);
         }

      } NTL_GEXEC_RANGE_END

   }

#else

   mulmod_t pinv = zz_p::ModulusInverse();

   if (n <= 1) {

      // one row (or none): not worth preconditioning b
      for (long i = 0; i < n; i++) {
         long acc = 0;
         const zz_p* ap = A[i].elts();

         for (long k = 0; k < l; k++) {
            long tmp = MulMod(rep(ap[k]), rep(bp[k]), p, pinv);
            acc = AddMod(acc, tmp, p);
         }

         xp[i].LoopHole() = acc;
      }

   }
   else {

      // precondition every entry of b once, then reuse across all rows;
      // the Watcher restores the thread-local buffer on scope exit
      NTL_TLS_LOCAL(Vec<mulmod_precon_t>, precon_vec);
      Vec<mulmod_precon_t>::Watcher watch_precon_vec(precon_vec);
      precon_vec.SetLength(l);
      mulmod_precon_t *bpinv = precon_vec.elts();

      for (long k = 0; k < l; k++)
         bpinv[k] = PrepMulModPrecon(rep(bp[k]), p, pinv);


      NTL_GEXEC_RANGE(seq, n, first, last) {
         for (long i = first; i < last; i++) {
            long acc = 0;
            const zz_p* ap = A[i].elts();

            for (long k = 0; k < l; k++) {
               long tmp = MulModPrecon(rep(ap[k]), rep(bp[k]), p, bpinv[k]);
               acc = AddMod(acc, tmp, p);
            }

            xp[i].LoopHole() = acc;
         }
      } NTL_GEXEC_RANGE_END

   }

#endif
}
555
mul(vec_zz_p & x,const mat_zz_p & A,const vec_zz_p & b)556 void mul(vec_zz_p& x, const mat_zz_p& A, const vec_zz_p& b)
557 {
558 if (&b == &x || A.alias(x)) {
559 vec_zz_p tmp;
560 mul_aux(tmp, A, b);
561 x = tmp;
562 }
563 else
564 mul_aux(x, A, b);
565
566 }
567
568
// X = b * A (scalar times matrix).  Trivial sizes use plain mul();
// anything larger preconditions b once and uses MulModPrecon over the
// rows, optionally in parallel.
void mul(mat_zz_p& X, const mat_zz_p& A, zz_p b)
{
   long n = A.NumRows();
   long m = A.NumCols();

   X.SetDims(n, m);


   if (n == 0 || m == 0 || (n == 1 && m == 1)) {
      // degenerate sizes: preconditioning would cost more than it saves
      long i, j;

      for (i = 0; i < n; i++)
         for (j = 0; j < m; j++)
            mul(X[i][j], A[i][j], b);

   }
   else {

      long p = zz_p::modulus();
      mulmod_t pinv = zz_p::ModulusInverse();
      long bb = rep(b);
      // precondition on b: cheap repeated multiplication by a fixed scalar
      mulmod_precon_t bpinv = PrepMulModPrecon(bb, p, pinv);

      // too-small problems run sequentially
      const bool seq = double(n)*double(m) < PAR_THRESH;

      NTL_GEXEC_RANGE(seq, n, first, last)
      long i, j;
      // each worker scales the disjoint row range [first..last)
      for (i = first; i < last; i++) {
         const zz_p *ap = A[i].elts();
         zz_p *xp = X[i].elts();

         for (j = 0; j < m; j++)
            xp[j].LoopHole() = MulModPrecon(rep(ap[j]), bb, p, bpinv);
      }
      NTL_GEXEC_RANGE_END


   }
}
608
mul(mat_zz_p & X,const mat_zz_p & A,long b_in)609 void mul(mat_zz_p& X, const mat_zz_p& A, long b_in)
610 {
611 zz_p b;
612 b = b_in;
613 mul(X, A, b);
614 }
615
616
617 // ******************************************************************
618 //
619 // Code shared by block-matrix code
620 //
621 // ******************************************************************
622
623 //#undef NTL_HAVE_AVX
624 //#undef NTL_HAVE_FMA
625 //#undef NTL_HAVE_AVX512F
626 // for testing purposes
627
628 #if (defined(NTL_HAVE_AVX512F) && defined(NTL_AVOID_AVX512))
629 #undef NTL_HAVE_AVX512F
630 #endif
631
632 #define MAT_BLK_SZ (32)
633
634
635 #ifdef NTL_HAVE_LL_TYPE
636
637 #ifdef NTL_HAVE_AVX
638
639 #define MAX_DBL_INT ((1L << NTL_DOUBLE_PRECISION)-1)
640 // max int representable exactly as a double
641 // this assumes NTL_DBL_PRECISION <= NTL_BITS_PER_LONG-2, which is
642 // checked in the code that tests for HAVE_AVX, but we check it here as
643 // well
644
645 #if (NTL_DBL_PRECISION > NTL_BITS_PER_LONG-2)
646 #error "NTL_DBL_PRECISION > NTL_BITS_PER_LONG-2"
647 #endif
648
649
650 // MUL_ADD(a, b, c): a += b*c
651 #ifdef NTL_HAVE_FMA
652 #define MUL_ADD(a, b, c) a = _mm256_fmadd_pd(b, c, a)
653 #else
654 #define MUL_ADD(a, b, c) a = _mm256_add_pd(a, _mm256_mul_pd(b, c))
655 #endif
656
657
658 #ifdef NTL_HAVE_AVX512F
659 #define MUL_ADD512(a, b, c) a = _mm512_fmadd_pd(b, c, a)
660 #endif
661
662
663
664 #ifdef NTL_HAVE_AVX512F
665
// AVX-512: x[0..31] += a[0..n-1] * b, where b is an n x 32 block of
// doubles with row stride MAT_BLK_SZ.  One row of accumulators kept in
// four 8-double registers; pointers must satisfy _mm512_load_pd alignment.
static
void muladd1_by_32(double *x, const double *a, const double *b, long n)
{
   __m512d avec0, bvec;

   __m512d acc00, acc01, acc02, acc03;

   // load the 32 accumulators from x
   acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
   acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);
   acc02=_mm512_load_pd(x + 2*8 + 0*MAT_BLK_SZ);
   acc03=_mm512_load_pd(x + 3*8 + 0*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      // broadcast a[i] and multiply-accumulate it against row i of b
      avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
      MUL_ADD512(acc00, avec0, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
      MUL_ADD512(acc01, avec0, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+2*8]);
      MUL_ADD512(acc02, avec0, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+3*8]);
      MUL_ADD512(acc03, avec0, bvec);
   }

   // write the accumulators back
   _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
   _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);
   _mm512_store_pd(x + 2*8 + 0*MAT_BLK_SZ, acc02);
   _mm512_store_pd(x + 3*8 + 0*MAT_BLK_SZ, acc03);

}
701
// AVX-512: like muladd1_by_32, but processes two rows of a / x at once
// (rows are MAT_BLK_SZ apart), amortizing the loads of b across both.
static
void muladd2_by_32(double *x, const double *a, const double *b, long n)
{
   __m512d avec0, avec1, bvec;

   __m512d acc00, acc01, acc02, acc03;
   __m512d acc10, acc11, acc12, acc13;



   // accumulators for output row 0
   acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
   acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);
   acc02=_mm512_load_pd(x + 2*8 + 0*MAT_BLK_SZ);
   acc03=_mm512_load_pd(x + 3*8 + 0*MAT_BLK_SZ);

   // accumulators for output row 1
   acc10=_mm512_load_pd(x + 0*8 + 1*MAT_BLK_SZ);
   acc11=_mm512_load_pd(x + 1*8 + 1*MAT_BLK_SZ);
   acc12=_mm512_load_pd(x + 2*8 + 1*MAT_BLK_SZ);
   acc13=_mm512_load_pd(x + 3*8 + 1*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
      avec1 = _mm512_set1_pd(a[i+1*MAT_BLK_SZ]);

      // each b-row load feeds both accumulator rows
      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
      MUL_ADD512(acc00, avec0, bvec); MUL_ADD512(acc10, avec1, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
      MUL_ADD512(acc01, avec0, bvec); MUL_ADD512(acc11, avec1, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+2*8]);
      MUL_ADD512(acc02, avec0, bvec); MUL_ADD512(acc12, avec1, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+3*8]);
      MUL_ADD512(acc03, avec0, bvec); MUL_ADD512(acc13, avec1, bvec);
   }


   _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
   _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);
   _mm512_store_pd(x + 2*8 + 0*MAT_BLK_SZ, acc02);
   _mm512_store_pd(x + 3*8 + 0*MAT_BLK_SZ, acc03);

   _mm512_store_pd(x + 0*8 + 1*MAT_BLK_SZ, acc10);
   _mm512_store_pd(x + 1*8 + 1*MAT_BLK_SZ, acc11);
   _mm512_store_pd(x + 2*8 + 1*MAT_BLK_SZ, acc12);
   _mm512_store_pd(x + 3*8 + 1*MAT_BLK_SZ, acc13);

}
751
752
// AVX-512: like muladd1_by_32, but processes three rows of a / x at once
// (rows are MAT_BLK_SZ apart), sharing each b-row load across all three.
static
void muladd3_by_32(double *x, const double *a, const double *b, long n)
{
   __m512d avec0, avec1, avec2, bvec;

   __m512d acc00, acc01, acc02, acc03;
   __m512d acc10, acc11, acc12, acc13;
   __m512d acc20, acc21, acc22, acc23;



   // accumulators for output rows 0, 1, 2
   acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
   acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);
   acc02=_mm512_load_pd(x + 2*8 + 0*MAT_BLK_SZ);
   acc03=_mm512_load_pd(x + 3*8 + 0*MAT_BLK_SZ);

   acc10=_mm512_load_pd(x + 0*8 + 1*MAT_BLK_SZ);
   acc11=_mm512_load_pd(x + 1*8 + 1*MAT_BLK_SZ);
   acc12=_mm512_load_pd(x + 2*8 + 1*MAT_BLK_SZ);
   acc13=_mm512_load_pd(x + 3*8 + 1*MAT_BLK_SZ);

   acc20=_mm512_load_pd(x + 0*8 + 2*MAT_BLK_SZ);
   acc21=_mm512_load_pd(x + 1*8 + 2*MAT_BLK_SZ);
   acc22=_mm512_load_pd(x + 2*8 + 2*MAT_BLK_SZ);
   acc23=_mm512_load_pd(x + 3*8 + 2*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
      avec1 = _mm512_set1_pd(a[i+1*MAT_BLK_SZ]);
      avec2 = _mm512_set1_pd(a[i+2*MAT_BLK_SZ]);

      // each b-row load feeds all three accumulator rows
      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
      MUL_ADD512(acc00, avec0, bvec); MUL_ADD512(acc10, avec1, bvec);
      MUL_ADD512(acc20, avec2, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
      MUL_ADD512(acc01, avec0, bvec); MUL_ADD512(acc11, avec1, bvec);
      MUL_ADD512(acc21, avec2, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+2*8]);
      MUL_ADD512(acc02, avec0, bvec); MUL_ADD512(acc12, avec1, bvec);
      MUL_ADD512(acc22, avec2, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+3*8]);
      MUL_ADD512(acc03, avec0, bvec); MUL_ADD512(acc13, avec1, bvec);
      MUL_ADD512(acc23, avec2, bvec);
   }

   _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
   _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);
   _mm512_store_pd(x + 2*8 + 0*MAT_BLK_SZ, acc02);
   _mm512_store_pd(x + 3*8 + 0*MAT_BLK_SZ, acc03);

   _mm512_store_pd(x + 0*8 + 1*MAT_BLK_SZ, acc10);
   _mm512_store_pd(x + 1*8 + 1*MAT_BLK_SZ, acc11);
   _mm512_store_pd(x + 2*8 + 1*MAT_BLK_SZ, acc12);
   _mm512_store_pd(x + 3*8 + 1*MAT_BLK_SZ, acc13);

   _mm512_store_pd(x + 0*8 + 2*MAT_BLK_SZ, acc20);
   _mm512_store_pd(x + 1*8 + 2*MAT_BLK_SZ, acc21);
   _mm512_store_pd(x + 2*8 + 2*MAT_BLK_SZ, acc22);
   _mm512_store_pd(x + 3*8 + 2*MAT_BLK_SZ, acc23);


}
818
819
// AVX-512: x[0..15] += a[0..n-1] * b, touching only the first 16 columns
// of b's n x MAT_BLK_SZ block.  Two 8-double accumulator registers.
static
void muladd1_by_16(double *x, const double *a, const double *b, long n)
{
   __m512d avec0, bvec;

   __m512d acc00, acc01;



   // load the 16 accumulators
   acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
   acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
      MUL_ADD512(acc00, avec0, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
      MUL_ADD512(acc01, avec0, bvec);
   }


   _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
   _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);

}
847
// AVX-512: 16-column variant of muladd2_by_32 -- two rows of a / x at
// once, only the first 16 columns of each b row.
static
void muladd2_by_16(double *x, const double *a, const double *b, long n)
{
   __m512d avec0, avec1, bvec;

   __m512d acc00, acc01;
   __m512d acc10, acc11;



   // accumulators for output rows 0 and 1
   acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
   acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);

   acc10=_mm512_load_pd(x + 0*8 + 1*MAT_BLK_SZ);
   acc11=_mm512_load_pd(x + 1*8 + 1*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
      avec1 = _mm512_set1_pd(a[i+1*MAT_BLK_SZ]);

      // each b-row load feeds both accumulator rows
      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
      MUL_ADD512(acc00, avec0, bvec); MUL_ADD512(acc10, avec1, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
      MUL_ADD512(acc01, avec0, bvec); MUL_ADD512(acc11, avec1, bvec);
   }


   _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
   _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);

   _mm512_store_pd(x + 0*8 + 1*MAT_BLK_SZ, acc10);
   _mm512_store_pd(x + 1*8 + 1*MAT_BLK_SZ, acc11);
}
882
883
// AVX-512: 16-column variant of muladd3_by_32 -- three rows of a / x at
// once, only the first 16 columns of each b row.
static
void muladd3_by_16(double *x, const double *a, const double *b, long n)
{
   __m512d avec0, avec1, avec2, bvec;

   __m512d acc00, acc01;
   __m512d acc10, acc11;
   __m512d acc20, acc21;



   // accumulators for output rows 0, 1, 2
   acc00=_mm512_load_pd(x + 0*8 + 0*MAT_BLK_SZ);
   acc01=_mm512_load_pd(x + 1*8 + 0*MAT_BLK_SZ);

   acc10=_mm512_load_pd(x + 0*8 + 1*MAT_BLK_SZ);
   acc11=_mm512_load_pd(x + 1*8 + 1*MAT_BLK_SZ);

   acc20=_mm512_load_pd(x + 0*8 + 2*MAT_BLK_SZ);
   acc21=_mm512_load_pd(x + 1*8 + 2*MAT_BLK_SZ);


   for (long i = 0; i < n; i++) {
      avec0 = _mm512_set1_pd(a[i+0*MAT_BLK_SZ]);
      avec1 = _mm512_set1_pd(a[i+1*MAT_BLK_SZ]);
      avec2 = _mm512_set1_pd(a[i+2*MAT_BLK_SZ]);

      // each b-row load feeds all three accumulator rows
      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+0*8]);
      MUL_ADD512(acc00, avec0, bvec); MUL_ADD512(acc10, avec1, bvec);
      MUL_ADD512(acc20, avec2, bvec);

      bvec = _mm512_load_pd(&b[i*MAT_BLK_SZ+1*8]);
      MUL_ADD512(acc01, avec0, bvec); MUL_ADD512(acc11, avec1, bvec);
      MUL_ADD512(acc21, avec2, bvec);
   }


   _mm512_store_pd(x + 0*8 + 0*MAT_BLK_SZ, acc00);
   _mm512_store_pd(x + 1*8 + 0*MAT_BLK_SZ, acc01);

   _mm512_store_pd(x + 0*8 + 1*MAT_BLK_SZ, acc10);
   _mm512_store_pd(x + 1*8 + 1*MAT_BLK_SZ, acc11);

   _mm512_store_pd(x + 0*8 + 2*MAT_BLK_SZ, acc20);
   _mm512_store_pd(x + 1*8 + 2*MAT_BLK_SZ, acc21);

}
930
931
932
933 #else
934
// AVX(256): x[0..31] += a[0..n-1] * b, where b is an n x 32 row-major
// block of doubles.  Eight 4-double accumulator registers; b is walked
// linearly (8 loads of 4 per row), so no explicit stride arithmetic.
static
void muladd1_by_32(double *x, const double *a, const double *b, long n)
{
   __m256d avec, bvec;


   // load the 32 accumulators from x
   __m256d acc0=_mm256_load_pd(x + 0*4);
   __m256d acc1=_mm256_load_pd(x + 1*4);
   __m256d acc2=_mm256_load_pd(x + 2*4);
   __m256d acc3=_mm256_load_pd(x + 3*4);
   __m256d acc4=_mm256_load_pd(x + 4*4);
   __m256d acc5=_mm256_load_pd(x + 5*4);
   __m256d acc6=_mm256_load_pd(x + 6*4);
   __m256d acc7=_mm256_load_pd(x + 7*4);


   for (long i = 0; i < n; i++) {
      // broadcast the next scalar of a
      avec = _mm256_broadcast_sd(a); a++;


      // consume one full 32-double row of b
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec, bvec);
   }


   // write the accumulators back
   _mm256_store_pd(x + 0*4, acc0);
   _mm256_store_pd(x + 1*4, acc1);
   _mm256_store_pd(x + 2*4, acc2);
   _mm256_store_pd(x + 3*4, acc3);
   _mm256_store_pd(x + 4*4, acc4);
   _mm256_store_pd(x + 5*4, acc5);
   _mm256_store_pd(x + 6*4, acc6);
   _mm256_store_pd(x + 7*4, acc7);
}
975
// AVX(256): two rows of a / x at once over an n x 32 block of b.
// AVX1 lacks registers for all 16 accumulators, so the 32 columns are
// handled in two rounds of 16 columns each (hence the two loops).
static
void muladd2_by_32(double *x, const double *a, const double *b, long n)
{
   __m256d avec0, avec1, bvec;
   __m256d acc00, acc01, acc02, acc03;
   __m256d acc10, acc11, acc12, acc13;


   // round 0: columns [0..16)

   acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
   acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
   acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
   acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);

   acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
   acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
   acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
   acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      avec0 = _mm256_broadcast_sd(&a[i]);
      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);

      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
   }


   _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
   _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
   _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
   _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);

   _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
   _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
   _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
   _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);

   // round 1: columns [16..32), offset by MAT_BLK_SZ/2

   acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
   acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
   acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
   acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);

   acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
   acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
   acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
   acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      avec0 = _mm256_broadcast_sd(&a[i]);
      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);

      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
   }


   _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
   _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
   _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
   _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);

   _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
   _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
   _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
   _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);

}
1051
1052 // NOTE: this makes things slower on an AVX1 platform --- not enough registers
1053 // it could be faster on AVX2/FMA, where there should be enough registers
// AVX(256): three rows of a / x at once over an n x 32 block of b,
// split into two 16-column rounds like muladd2_by_32.  (Per the note
// above: register pressure makes this slower on AVX1, potentially
// faster on AVX2/FMA.)
static
void muladd3_by_32(double *x, const double *a, const double *b, long n)
{
   __m256d avec0, avec1, avec2, bvec;
   __m256d acc00, acc01, acc02, acc03;
   __m256d acc10, acc11, acc12, acc13;
   __m256d acc20, acc21, acc22, acc23;


   // round 0: columns [0..16)

   acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
   acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
   acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
   acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);

   acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
   acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
   acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
   acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);

   acc20=_mm256_load_pd(x + 0*4 + 2*MAT_BLK_SZ);
   acc21=_mm256_load_pd(x + 1*4 + 2*MAT_BLK_SZ);
   acc22=_mm256_load_pd(x + 2*4 + 2*MAT_BLK_SZ);
   acc23=_mm256_load_pd(x + 3*4 + 2*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      avec0 = _mm256_broadcast_sd(&a[i]);
      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
      avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);

      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); MUL_ADD(acc20, avec2, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); MUL_ADD(acc21, avec2, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); MUL_ADD(acc22, avec2, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); MUL_ADD(acc23, avec2, bvec);
   }


   _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
   _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
   _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
   _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);

   _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
   _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
   _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
   _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);

   _mm256_store_pd(x + 0*4 + 2*MAT_BLK_SZ, acc20);
   _mm256_store_pd(x + 1*4 + 2*MAT_BLK_SZ, acc21);
   _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22);
   _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23);

   // round 1: columns [16..32), offset by MAT_BLK_SZ/2

   acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
   acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
   acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
   acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);

   acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
   acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
   acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
   acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);

   acc20=_mm256_load_pd(x + 4*4 + 2*MAT_BLK_SZ);
   acc21=_mm256_load_pd(x + 5*4 + 2*MAT_BLK_SZ);
   acc22=_mm256_load_pd(x + 6*4 + 2*MAT_BLK_SZ);
   acc23=_mm256_load_pd(x + 7*4 + 2*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      avec0 = _mm256_broadcast_sd(&a[i]);
      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);
      avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);

      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); MUL_ADD(acc20, avec2, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); MUL_ADD(acc21, avec2, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); MUL_ADD(acc22, avec2, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); MUL_ADD(acc23, avec2, bvec);
   }


   _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
   _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
   _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
   _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);

   _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
   _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
   _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
   _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);

   _mm256_store_pd(x + 4*4 + 2*MAT_BLK_SZ, acc20);
   _mm256_store_pd(x + 5*4 + 2*MAT_BLK_SZ, acc21);
   _mm256_store_pd(x + 6*4 + 2*MAT_BLK_SZ, acc22);
   _mm256_store_pd(x + 7*4 + 2*MAT_BLK_SZ, acc23);

}
1152
static
void muladd1_by_16(double *x, const double *a, const double *b, long n)
// One-row, half-width kernel: x[0..15] += sum_{i<n} a[i] * b_i[0..15],
// where b_i is the i-th row of b.  Rows of b have stride MAT_BLK_SZ (32
// doubles); only the first 16 doubles of each row are used.
// All pointers must be 32-byte aligned (aligned AVX loads/stores).
{
   __m256d avec, bvec;


   // accumulators for the 16 output doubles (4 lanes per register)
   __m256d acc0=_mm256_load_pd(x + 0*4);
   __m256d acc1=_mm256_load_pd(x + 1*4);
   __m256d acc2=_mm256_load_pd(x + 2*4);
   __m256d acc3=_mm256_load_pd(x + 3*4);


   for (long i = 0; i < n; i++) {
      avec = _mm256_broadcast_sd(a); a++;   // splat a[i] across all lanes


      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec, bvec);
      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec, bvec);
      b += 16;   // skip the unused second half of this b row (stride is 32)
   }


   _mm256_store_pd(x + 0*4, acc0);
   _mm256_store_pd(x + 1*4, acc1);
   _mm256_store_pd(x + 2*4, acc2);
   _mm256_store_pd(x + 3*4, acc3);
}
1182
1183
1184
static
void muladd2_by_16(double *x, const double *a, const double *b, long n)
// Two-row, half-width kernel: for r = 0,1
//   x_r[0..15] += sum_{i<n} a_r[i] * b_i[0..15],
// where x_r and a_r are consecutive rows of x and a (stride MAT_BLK_SZ)
// and b_i is the i-th row of b (stride MAT_BLK_SZ; only its first 16
// doubles are used).  Each bvec load is shared between the two rows,
// halving b traffic.  All pointers must be 32-byte aligned.
{
   __m256d avec0, avec1, bvec;
   __m256d acc00, acc01, acc02, acc03;
   __m256d acc10, acc11, acc12, acc13;


   // round 0

   // load the 2x16 accumulator tile from x
   acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
   acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
   acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
   acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);

   acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
   acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
   acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
   acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      avec0 = _mm256_broadcast_sd(&a[i]);                // a row 0
      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);     // a row 1

      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
   }


   // write the tile back
   _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
   _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
   _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
   _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);

   _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
   _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
   _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
   _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);

}
1227
1228
static
void muladd3_by_16(double *x, const double *a, const double *b, long n)
// Three-row, half-width kernel: for r = 0,1,2
//   x_r[0..15] += sum_{i<n} a_r[i] * b_i[0..15]
// (rows of x/a/b have stride MAT_BLK_SZ; only the first 16 doubles of
// each b row are used).  Needs 12 accumulator registers plus 4 scratch,
// so this variant is only dispatched when FMA/AVX512 register pressure
// allows (see muladd_all_by_16).  All pointers must be 32-byte aligned.
{
   __m256d avec0, avec1, avec2, bvec;
   __m256d acc00, acc01, acc02, acc03;
   __m256d acc10, acc11, acc12, acc13;
   __m256d acc20, acc21, acc22, acc23;


   // round 0

   // load the 3x16 accumulator tile from x
   acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
   acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
   acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
   acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);

   acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
   acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
   acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
   acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);

   acc20=_mm256_load_pd(x + 0*4 + 2*MAT_BLK_SZ);
   acc21=_mm256_load_pd(x + 1*4 + 2*MAT_BLK_SZ);
   acc22=_mm256_load_pd(x + 2*4 + 2*MAT_BLK_SZ);
   acc23=_mm256_load_pd(x + 3*4 + 2*MAT_BLK_SZ);

   for (long i = 0; i < n; i++) {
      avec0 = _mm256_broadcast_sd(&a[i]);                 // a row 0
      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]);      // a row 1
      avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]);    // a row 2

      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); MUL_ADD(acc20, avec2, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); MUL_ADD(acc21, avec2, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); MUL_ADD(acc22, avec2, bvec);
      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); MUL_ADD(acc23, avec2, bvec);
   }


   // write the tile back
   _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
   _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
   _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
   _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);

   _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
   _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
   _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
   _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);

   _mm256_store_pd(x + 0*4 + 2*MAT_BLK_SZ, acc20);
   _mm256_store_pd(x + 1*4 + 2*MAT_BLK_SZ, acc21);
   _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22);
   _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23);

}
1283
1284
1285
1286
1287 #endif
1288
1289
1290
1291
1292 static inline
muladd_all_by_32(long first,long last,double * x,const double * a,const double * b,long n)1293 void muladd_all_by_32(long first, long last, double *x, const double *a, const double *b, long n)
1294 {
1295 long i = first;
1296 #if (defined(NTL_HAVE_FMA) || defined(NTL_HAVE_AVX512F))
1297 // process three rows at a time
1298 for (; i <= last-3; i+=3)
1299 muladd3_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1300 for (; i < last; i++)
1301 muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1302 #else
1303 // process only two rows at a time: not enough registers :-(
1304 for (; i <= last-2; i+=2)
1305 muladd2_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1306 for (; i < last; i++)
1307 muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1308 #endif
1309 }
1310
1311
1312 static inline
muladd_all_by_16(long first,long last,double * x,const double * a,const double * b,long n)1313 void muladd_all_by_16(long first, long last, double *x, const double *a, const double *b, long n)
1314 {
1315 long i = first;
1316 #if (defined(NTL_HAVE_FMA) || defined(NTL_HAVE_AVX512F))
1317 // processing three rows at a time is faster
1318 for (; i <= last-3; i+=3)
1319 muladd3_by_16(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1320 for (; i < last; i++)
1321 muladd1_by_16(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1322 #else
1323 // process only two rows at a time: not enough registers :-(
1324 for (; i <= last-2; i+=2)
1325 muladd2_by_16(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1326 for (; i < last; i++)
1327 muladd1_by_16(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1328 #endif
1329 }
1330
1331 static inline
muladd_all_by_32_width(long first,long last,double * x,const double * a,const double * b,long n,long width)1332 void muladd_all_by_32_width(long first, long last, double *x, const double *a, const double *b, long n, long width)
1333 {
1334 if (width > MAT_BLK_SZ/2)
1335 muladd_all_by_32(first, last, x, a, b, n);
1336 else
1337 muladd_all_by_16(first, last, x, a, b, n);
1338 }
1339
1340 // muladd_interval1 used in alt_inv_DD and alt_tri_DD
1341 // muladd_interval used in blk_inv_DD and blk_tri_DD, with an
1342 // argument of MAT_BLK_SZ
1343
1344
1345 // this assumes n is a multiple of 16
static inline
void muladd_interval(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
// x[i] += y[i]*c for 0 <= i < n, vectorized 16 doubles per iteration.
// Requires: n a multiple of 16; x and y 32-byte aligned (aligned AVX
// loads/stores); x and y must not alias (NTL_RESTRICT).
{
   __m256d xvec0, xvec1, xvec2, xvec3;
   __m256d yvec0, yvec1, yvec2, yvec3;

   __m256d cvec = _mm256_broadcast_sd(&c);   // splat c across all lanes

   for (long i = 0; i < n; i += 16, x += 16, y += 16) {
      xvec0 = _mm256_load_pd(x+0*4);
      xvec1 = _mm256_load_pd(x+1*4);
      xvec2 = _mm256_load_pd(x+2*4);
      xvec3 = _mm256_load_pd(x+3*4);

      yvec0 = _mm256_load_pd(y+0*4);
      yvec1 = _mm256_load_pd(y+1*4);
      yvec2 = _mm256_load_pd(y+2*4);
      yvec3 = _mm256_load_pd(y+3*4);

      MUL_ADD(xvec0, yvec0, cvec);
      MUL_ADD(xvec1, yvec1, cvec);
      MUL_ADD(xvec2, yvec2, cvec);
      MUL_ADD(xvec3, yvec3, cvec);

      _mm256_store_pd(x + 0*4, xvec0);
      _mm256_store_pd(x + 1*4, xvec1);
      _mm256_store_pd(x + 2*4, xvec2);
      _mm256_store_pd(x + 3*4, xvec3);
   }
}
1376
1377 // this one is more general: does not assume that n is a
1378 // multiple of 16
static inline
void muladd_interval1(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
// General-n version of muladd_interval: x[i] += y[i]*c for 0 <= i < n.
// Processes 16 doubles at a time, then 4 at a time, then a scalar tail.
// x and y must still be 32-byte aligned (the vector loops use aligned
// loads/stores) and must not alias (NTL_RESTRICT).
{

   __m256d xvec0, xvec1, xvec2, xvec3;
   __m256d yvec0, yvec1, yvec2, yvec3;
   __m256d cvec;

   // cvec is only consumed by the vector loops, which require n >= 4
   if (n >= 4)
      cvec = _mm256_broadcast_sd(&c);

   long i=0;
   // 16-wide main loop
   for (; i <= n-16; i += 16, x += 16, y += 16) {
      xvec0 = _mm256_load_pd(x+0*4);
      xvec1 = _mm256_load_pd(x+1*4);
      xvec2 = _mm256_load_pd(x+2*4);
      xvec3 = _mm256_load_pd(x+3*4);

      yvec0 = _mm256_load_pd(y+0*4);
      yvec1 = _mm256_load_pd(y+1*4);
      yvec2 = _mm256_load_pd(y+2*4);
      yvec3 = _mm256_load_pd(y+3*4);

      MUL_ADD(xvec0, yvec0, cvec);
      MUL_ADD(xvec1, yvec1, cvec);
      MUL_ADD(xvec2, yvec2, cvec);
      MUL_ADD(xvec3, yvec3, cvec);

      _mm256_store_pd(x + 0*4, xvec0);
      _mm256_store_pd(x + 1*4, xvec1);
      _mm256_store_pd(x + 2*4, xvec2);
      _mm256_store_pd(x + 3*4, xvec3);
   }

   // 4-wide cleanup
   for (; i <= n-4; i += 4, x += 4, y += 4) {
      xvec0 = _mm256_load_pd(x+0*4);
      yvec0 = _mm256_load_pd(y+0*4);
      MUL_ADD(xvec0, yvec0, cvec);
      _mm256_store_pd(x + 0*4, xvec0);
   }

   // scalar tail (fewer than 4 remaining)
   for (; i < n; i++, x++, y++) {
      *x += (*y)*c;
   }
}
1424
1425
1426 #endif
1427
1428
1429 //#define DO_MUL(a, b) ((unsigned long) (long(a)*long(b)))
1430
static inline
unsigned long
DO_MUL(unsigned long a, unsigned long b)
// Single-word product of a and b; unsigned overflow wraps mod 2^w,
// which is exactly what the block kernels below rely on.
{
   unsigned long prod = a * b;
   return prod;
}
1435
1436
1437 static
muladd_interval(unsigned long * NTL_RESTRICT x,unsigned long * NTL_RESTRICT y,unsigned long c,long n)1438 inline void muladd_interval(unsigned long * NTL_RESTRICT x, unsigned long * NTL_RESTRICT y,
1439 unsigned long c, long n)
1440 {
1441 for (long i = 0; i < n; i++)
1442 x[i] += DO_MUL(y[i], c);
1443 }
1444
1445 static
muladd1_by_32(unsigned long * x,const unsigned long * a,const unsigned long * b,long n)1446 void muladd1_by_32(unsigned long *x, const unsigned long *a, const unsigned long *b,
1447 long n)
1448 {
1449 for (long j = 0; j < MAT_BLK_SZ; j++) {
1450 unsigned long sum = x[j];
1451 long i = 0;
1452
1453 for (; i <= n-4; i += 4) {
1454 sum += DO_MUL(a[i+0], b[i+0]);
1455 sum += DO_MUL(a[i+1], b[i+1]);
1456 sum += DO_MUL(a[i+2], b[i+2]);
1457 sum += DO_MUL(a[i+3], b[i+3]);
1458 }
1459
1460 for (; i < n; i++)
1461 sum += DO_MUL(a[i], b[i]);
1462
1463 x[j] = sum;
1464 b += MAT_BLK_SZ;
1465 }
1466 }
1467
1468 static
muladd1_by_32_width(unsigned long * x,const unsigned long * a,const unsigned long * b,long n,long width)1469 void muladd1_by_32_width(unsigned long *x, const unsigned long *a, const unsigned long *b,
1470 long n, long width)
1471 {
1472 for (long j = 0; j < width; j++) {
1473 unsigned long sum = x[j];
1474 long i = 0;
1475
1476 for (; i <= n-4; i += 4) {
1477 sum += DO_MUL(a[i+0], b[i+0]);
1478 sum += DO_MUL(a[i+1], b[i+1]);
1479 sum += DO_MUL(a[i+2], b[i+2]);
1480 sum += DO_MUL(a[i+3], b[i+3]);
1481 }
1482
1483 for (; i < n; i++)
1484 sum += DO_MUL(a[i], b[i]);
1485
1486 x[j] = sum;
1487 b += MAT_BLK_SZ;
1488 }
1489 }
1490
// experiment with shorter ints
1492 static
muladd1_by_32(unsigned long * x,const unsigned int * a,const unsigned int * b,long n)1493 void muladd1_by_32(unsigned long *x, const unsigned int *a, const unsigned int *b,
1494 long n)
1495 {
1496 for (long j = 0; j < MAT_BLK_SZ; j++) {
1497 unsigned long sum = x[j];
1498 long i = 0;
1499
1500 for (; i <= n-4; i += 4) {
1501 sum += DO_MUL(a[i+0], b[i+0]);
1502 sum += DO_MUL(a[i+1], b[i+1]);
1503 sum += DO_MUL(a[i+2], b[i+2]);
1504 sum += DO_MUL(a[i+3], b[i+3]);
1505 }
1506
1507 for (; i < n; i++)
1508 sum += DO_MUL(a[i], b[i]);
1509
1510 x[j] = sum;
1511 b += MAT_BLK_SZ;
1512 }
1513 }
1514
1515 static
muladd1_by_32_width(unsigned long * x,const unsigned int * a,const unsigned int * b,long n,long width)1516 void muladd1_by_32_width(unsigned long *x, const unsigned int *a, const unsigned int *b,
1517 long n, long width)
1518 {
1519 for (long j = 0; j < width; j++) {
1520 unsigned long sum = x[j];
1521 long i = 0;
1522
1523 for (; i <= n-4; i += 4) {
1524 sum += DO_MUL(a[i+0], b[i+0]);
1525 sum += DO_MUL(a[i+1], b[i+1]);
1526 sum += DO_MUL(a[i+2], b[i+2]);
1527 sum += DO_MUL(a[i+3], b[i+3]);
1528 }
1529
1530 for (; i < n; i++)
1531 sum += DO_MUL(a[i], b[i]);
1532
1533 x[j] = sum;
1534 b += MAT_BLK_SZ;
1535 }
1536 }
1537
1538 #if 0
1539 static
1540 void muladd1_by_32_full(unsigned long *x, const unsigned long *a, const unsigned long *b)
1541 {
1542 for (long j = 0; j < MAT_BLK_SZ; j++) {
1543 unsigned long sum = x[j];
1544 long i = 0;
1545
1546 sum += DO_MUL(a[i+0], b[i+0]);
1547 sum += DO_MUL(a[i+1], b[i+1]);
1548 sum += DO_MUL(a[i+2], b[i+2]);
1549 sum += DO_MUL(a[i+3], b[i+3]);
1550 sum += DO_MUL(a[i+4], b[i+4]);
1551 sum += DO_MUL(a[i+5], b[i+5]);
1552 sum += DO_MUL(a[i+6], b[i+6]);
1553 sum += DO_MUL(a[i+7], b[i+7]);
1554 sum += DO_MUL(a[i+8], b[i+8]);
1555 sum += DO_MUL(a[i+9], b[i+9]);
1556 sum += DO_MUL(a[i+10], b[i+10]);
1557 sum += DO_MUL(a[i+11], b[i+11]);
1558 sum += DO_MUL(a[i+12], b[i+12]);
1559 sum += DO_MUL(a[i+13], b[i+13]);
1560 sum += DO_MUL(a[i+14], b[i+14]);
1561 sum += DO_MUL(a[i+15], b[i+15]);
1562 sum += DO_MUL(a[i+16], b[i+16]);
1563 sum += DO_MUL(a[i+17], b[i+17]);
1564 sum += DO_MUL(a[i+18], b[i+18]);
1565 sum += DO_MUL(a[i+19], b[i+19]);
1566 sum += DO_MUL(a[i+20], b[i+20]);
1567 sum += DO_MUL(a[i+21], b[i+21]);
1568 sum += DO_MUL(a[i+22], b[i+22]);
1569 sum += DO_MUL(a[i+23], b[i+23]);
1570 sum += DO_MUL(a[i+24], b[i+24]);
1571 sum += DO_MUL(a[i+25], b[i+25]);
1572 sum += DO_MUL(a[i+26], b[i+26]);
1573 sum += DO_MUL(a[i+27], b[i+27]);
1574 sum += DO_MUL(a[i+28], b[i+28]);
1575 sum += DO_MUL(a[i+29], b[i+29]);
1576 sum += DO_MUL(a[i+30], b[i+30]);
1577 sum += DO_MUL(a[i+31], b[i+31]);
1578
1579 x[j] = sum;
1580 b += MAT_BLK_SZ;
1581 }
1582 }
1583 #else
1584
1585 // this version is faster (by about 25%) on a Sandybridge machine
1586
1587 #define ONE_STEP_L(i) \
1588 sum += DO_MUL(a[i],b[i]);\
1589 sum_1 += DO_MUL(a[i],b_1[i]);\
1590 sum_2 += DO_MUL(a[i],b_2[i]);\
1591 sum_3 += DO_MUL(a[i],b_3[i])\
1592
1593
static
void muladd1_by_32_full(unsigned long *x, const unsigned long *a, const unsigned long *b)
// Specialization of muladd1_by_32 for n == MAT_BLK_SZ (32):
//   x[j] += sum_{i<32} a[i] * b_j[i]   for j = 0..31,
// where b_j is the j-th row of b (row stride MAT_BLK_SZ); all
// arithmetic wraps mod 2^w.  Four output slots are accumulated per
// pass so each a[i] load feeds four dot products (ONE_STEP_L updates
// sum, sum_1, sum_2, sum_3); per the note above, this is about 25%
// faster than the naive version on Sandybridge.
{
   for (long j = 0; j < MAT_BLK_SZ; j+=4) {

      unsigned long sum = x[j];
      unsigned long sum_1 = x[j+1];
      unsigned long sum_2 = x[j+2];
      unsigned long sum_3 = x[j+3];

      // the four b rows being combined in this pass
      const unsigned long *b_1 = b+MAT_BLK_SZ;
      const unsigned long *b_2 = b+2*MAT_BLK_SZ;
      const unsigned long *b_3 = b+3*MAT_BLK_SZ;

      // fully unrolled over the 32-element inner dimension
      ONE_STEP_L(0);
      ONE_STEP_L(1);
      ONE_STEP_L(2);
      ONE_STEP_L(3);
      ONE_STEP_L(4);
      ONE_STEP_L(5);
      ONE_STEP_L(6);
      ONE_STEP_L(7);
      ONE_STEP_L(8);
      ONE_STEP_L(9);
      ONE_STEP_L(10);
      ONE_STEP_L(11);
      ONE_STEP_L(12);
      ONE_STEP_L(13);
      ONE_STEP_L(14);
      ONE_STEP_L(15);
      ONE_STEP_L(16);
      ONE_STEP_L(17);
      ONE_STEP_L(18);
      ONE_STEP_L(19);
      ONE_STEP_L(20);
      ONE_STEP_L(21);
      ONE_STEP_L(22);
      ONE_STEP_L(23);
      ONE_STEP_L(24);
      ONE_STEP_L(25);
      ONE_STEP_L(26);
      ONE_STEP_L(27);
      ONE_STEP_L(28);
      ONE_STEP_L(29);
      ONE_STEP_L(30);
      ONE_STEP_L(31);

      x[j] = sum;
      x[j+1] = sum_1;
      x[j+2] = sum_2;
      x[j+3] = sum_3;

      b += 4*MAT_BLK_SZ;
   }
}
1649
static
void muladd1_by_32_full_width(unsigned long *x, const unsigned long *a, const unsigned long *b, long width)
// Same as muladd1_by_32_full, but only the first `width` output slots
// of x are updated.  Slots are handled four at a time (sharing each
// a[i] load across four dot products), then the remaining width mod 4
// slots one at a time.
{
   long j = 0;
   // groups of four output slots
   for (; j <= width-4; j+=4) {

      unsigned long sum = x[j];
      unsigned long sum_1 = x[j+1];
      unsigned long sum_2 = x[j+2];
      unsigned long sum_3 = x[j+3];

      const unsigned long *b_1 = b+MAT_BLK_SZ;
      const unsigned long *b_2 = b+2*MAT_BLK_SZ;
      const unsigned long *b_3 = b+3*MAT_BLK_SZ;

      // fully unrolled over the 32-element inner dimension
      ONE_STEP_L(0);
      ONE_STEP_L(1);
      ONE_STEP_L(2);
      ONE_STEP_L(3);
      ONE_STEP_L(4);
      ONE_STEP_L(5);
      ONE_STEP_L(6);
      ONE_STEP_L(7);
      ONE_STEP_L(8);
      ONE_STEP_L(9);
      ONE_STEP_L(10);
      ONE_STEP_L(11);
      ONE_STEP_L(12);
      ONE_STEP_L(13);
      ONE_STEP_L(14);
      ONE_STEP_L(15);
      ONE_STEP_L(16);
      ONE_STEP_L(17);
      ONE_STEP_L(18);
      ONE_STEP_L(19);
      ONE_STEP_L(20);
      ONE_STEP_L(21);
      ONE_STEP_L(22);
      ONE_STEP_L(23);
      ONE_STEP_L(24);
      ONE_STEP_L(25);
      ONE_STEP_L(26);
      ONE_STEP_L(27);
      ONE_STEP_L(28);
      ONE_STEP_L(29);
      ONE_STEP_L(30);
      ONE_STEP_L(31);

      x[j] = sum;
      x[j+1] = sum_1;
      x[j+2] = sum_2;
      x[j+3] = sum_3;

      b += 4*MAT_BLK_SZ;
   }

   // leftover slots (width mod 4), one 32-term dot product each
   for (; j < width; j++) {
      unsigned long sum = x[j];
      long i = 0;

      sum += DO_MUL(a[i+0], b[i+0]);
      sum += DO_MUL(a[i+1], b[i+1]);
      sum += DO_MUL(a[i+2], b[i+2]);
      sum += DO_MUL(a[i+3], b[i+3]);
      sum += DO_MUL(a[i+4], b[i+4]);
      sum += DO_MUL(a[i+5], b[i+5]);
      sum += DO_MUL(a[i+6], b[i+6]);
      sum += DO_MUL(a[i+7], b[i+7]);
      sum += DO_MUL(a[i+8], b[i+8]);
      sum += DO_MUL(a[i+9], b[i+9]);
      sum += DO_MUL(a[i+10], b[i+10]);
      sum += DO_MUL(a[i+11], b[i+11]);
      sum += DO_MUL(a[i+12], b[i+12]);
      sum += DO_MUL(a[i+13], b[i+13]);
      sum += DO_MUL(a[i+14], b[i+14]);
      sum += DO_MUL(a[i+15], b[i+15]);
      sum += DO_MUL(a[i+16], b[i+16]);
      sum += DO_MUL(a[i+17], b[i+17]);
      sum += DO_MUL(a[i+18], b[i+18]);
      sum += DO_MUL(a[i+19], b[i+19]);
      sum += DO_MUL(a[i+20], b[i+20]);
      sum += DO_MUL(a[i+21], b[i+21]);
      sum += DO_MUL(a[i+22], b[i+22]);
      sum += DO_MUL(a[i+23], b[i+23]);
      sum += DO_MUL(a[i+24], b[i+24]);
      sum += DO_MUL(a[i+25], b[i+25]);
      sum += DO_MUL(a[i+26], b[i+26]);
      sum += DO_MUL(a[i+27], b[i+27]);
      sum += DO_MUL(a[i+28], b[i+28]);
      sum += DO_MUL(a[i+29], b[i+29]);
      sum += DO_MUL(a[i+30], b[i+30]);
      sum += DO_MUL(a[i+31], b[i+31]);

      x[j] = sum;
      b += MAT_BLK_SZ;
   }
}
1747
1748
// experiment with shorter ints
static
void muladd1_by_32_full(unsigned long *x, const unsigned int *a, const unsigned int *b)
// 32-bit-input variant of muladd1_by_32_full above: operands are
// widened to unsigned long inside DO_MUL; same four-output-column
// blocking over a fully unrolled 32-element inner dimension.
{
   for (long j = 0; j < MAT_BLK_SZ; j+=4) {

      unsigned long sum = x[j];
      unsigned long sum_1 = x[j+1];
      unsigned long sum_2 = x[j+2];
      unsigned long sum_3 = x[j+3];

      // the four b rows being combined in this pass
      const unsigned int *b_1 = b+MAT_BLK_SZ;
      const unsigned int *b_2 = b+2*MAT_BLK_SZ;
      const unsigned int *b_3 = b+3*MAT_BLK_SZ;

      ONE_STEP_L(0);
      ONE_STEP_L(1);
      ONE_STEP_L(2);
      ONE_STEP_L(3);
      ONE_STEP_L(4);
      ONE_STEP_L(5);
      ONE_STEP_L(6);
      ONE_STEP_L(7);
      ONE_STEP_L(8);
      ONE_STEP_L(9);
      ONE_STEP_L(10);
      ONE_STEP_L(11);
      ONE_STEP_L(12);
      ONE_STEP_L(13);
      ONE_STEP_L(14);
      ONE_STEP_L(15);
      ONE_STEP_L(16);
      ONE_STEP_L(17);
      ONE_STEP_L(18);
      ONE_STEP_L(19);
      ONE_STEP_L(20);
      ONE_STEP_L(21);
      ONE_STEP_L(22);
      ONE_STEP_L(23);
      ONE_STEP_L(24);
      ONE_STEP_L(25);
      ONE_STEP_L(26);
      ONE_STEP_L(27);
      ONE_STEP_L(28);
      ONE_STEP_L(29);
      ONE_STEP_L(30);
      ONE_STEP_L(31);

      x[j] = sum;
      x[j+1] = sum_1;
      x[j+2] = sum_2;
      x[j+3] = sum_3;

      b += 4*MAT_BLK_SZ;
   }
}
1805
static
void muladd1_by_32_full_width(unsigned long *x, const unsigned int *a, const unsigned int *b, long width)
// 32-bit-input variant of muladd1_by_32_full_width above: only the
// first `width` output slots are updated, four at a time, with a
// one-at-a-time tail for width mod 4.
{
   long j = 0;
   // groups of four output slots
   for (; j <= width-4; j+=4) {

      unsigned long sum = x[j];
      unsigned long sum_1 = x[j+1];
      unsigned long sum_2 = x[j+2];
      unsigned long sum_3 = x[j+3];

      const unsigned int *b_1 = b+MAT_BLK_SZ;
      const unsigned int *b_2 = b+2*MAT_BLK_SZ;
      const unsigned int *b_3 = b+3*MAT_BLK_SZ;

      ONE_STEP_L(0);
      ONE_STEP_L(1);
      ONE_STEP_L(2);
      ONE_STEP_L(3);
      ONE_STEP_L(4);
      ONE_STEP_L(5);
      ONE_STEP_L(6);
      ONE_STEP_L(7);
      ONE_STEP_L(8);
      ONE_STEP_L(9);
      ONE_STEP_L(10);
      ONE_STEP_L(11);
      ONE_STEP_L(12);
      ONE_STEP_L(13);
      ONE_STEP_L(14);
      ONE_STEP_L(15);
      ONE_STEP_L(16);
      ONE_STEP_L(17);
      ONE_STEP_L(18);
      ONE_STEP_L(19);
      ONE_STEP_L(20);
      ONE_STEP_L(21);
      ONE_STEP_L(22);
      ONE_STEP_L(23);
      ONE_STEP_L(24);
      ONE_STEP_L(25);
      ONE_STEP_L(26);
      ONE_STEP_L(27);
      ONE_STEP_L(28);
      ONE_STEP_L(29);
      ONE_STEP_L(30);
      ONE_STEP_L(31);

      x[j] = sum;
      x[j+1] = sum_1;
      x[j+2] = sum_2;
      x[j+3] = sum_3;

      b += 4*MAT_BLK_SZ;
   }

   // leftover slots (width mod 4), one 32-term dot product each
   for (; j < width; j++) {
      unsigned long sum = x[j];
      long i = 0;

      sum += DO_MUL(a[i+0], b[i+0]);
      sum += DO_MUL(a[i+1], b[i+1]);
      sum += DO_MUL(a[i+2], b[i+2]);
      sum += DO_MUL(a[i+3], b[i+3]);
      sum += DO_MUL(a[i+4], b[i+4]);
      sum += DO_MUL(a[i+5], b[i+5]);
      sum += DO_MUL(a[i+6], b[i+6]);
      sum += DO_MUL(a[i+7], b[i+7]);
      sum += DO_MUL(a[i+8], b[i+8]);
      sum += DO_MUL(a[i+9], b[i+9]);
      sum += DO_MUL(a[i+10], b[i+10]);
      sum += DO_MUL(a[i+11], b[i+11]);
      sum += DO_MUL(a[i+12], b[i+12]);
      sum += DO_MUL(a[i+13], b[i+13]);
      sum += DO_MUL(a[i+14], b[i+14]);
      sum += DO_MUL(a[i+15], b[i+15]);
      sum += DO_MUL(a[i+16], b[i+16]);
      sum += DO_MUL(a[i+17], b[i+17]);
      sum += DO_MUL(a[i+18], b[i+18]);
      sum += DO_MUL(a[i+19], b[i+19]);
      sum += DO_MUL(a[i+20], b[i+20]);
      sum += DO_MUL(a[i+21], b[i+21]);
      sum += DO_MUL(a[i+22], b[i+22]);
      sum += DO_MUL(a[i+23], b[i+23]);
      sum += DO_MUL(a[i+24], b[i+24]);
      sum += DO_MUL(a[i+25], b[i+25]);
      sum += DO_MUL(a[i+26], b[i+26]);
      sum += DO_MUL(a[i+27], b[i+27]);
      sum += DO_MUL(a[i+28], b[i+28]);
      sum += DO_MUL(a[i+29], b[i+29]);
      sum += DO_MUL(a[i+30], b[i+30]);
      sum += DO_MUL(a[i+31], b[i+31]);

      x[j] = sum;
      b += MAT_BLK_SZ;
   }
}
1903
1904 #endif
1905
1906 static inline
muladd_all_by_32(long first,long last,unsigned long * x,const unsigned int * a,const unsigned int * b,long n)1907 void muladd_all_by_32(long first, long last, unsigned long *x, const unsigned int *a, const unsigned int *b, long n)
1908 {
1909 if (n == MAT_BLK_SZ) {
1910 for (long i = first; i < last; i++)
1911 muladd1_by_32_full(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b);
1912 }
1913 else {
1914 for (long i = first; i < last; i++)
1915 muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1916 }
1917 }
1918
1919 static inline
muladd_all_by_32_width(long first,long last,unsigned long * x,const unsigned long * a,const unsigned long * b,long n,long width)1920 void muladd_all_by_32_width(long first, long last, unsigned long *x, const unsigned long *a, const unsigned long *b, long n, long width)
1921 {
1922 if (width == MAT_BLK_SZ) {
1923 if (n == MAT_BLK_SZ) {
1924 for (long i = first; i < last; i++)
1925 muladd1_by_32_full(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b);
1926 }
1927 else {
1928 for (long i = first; i < last; i++)
1929 muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1930 }
1931 }
1932 else {
1933 if (n == MAT_BLK_SZ) {
1934 for (long i = first; i < last; i++)
1935 muladd1_by_32_full_width(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, width);
1936 }
1937 else {
1938 for (long i = first; i < last; i++)
1939 muladd1_by_32_width(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n, width);
1940 }
1941 }
1942 }
1943
1944 static inline
muladd_all_by_32(long first,long last,unsigned long * x,const unsigned long * a,const unsigned long * b,long n)1945 void muladd_all_by_32(long first, long last, unsigned long *x, const unsigned long *a, const unsigned long *b, long n)
1946 {
1947 if (n == MAT_BLK_SZ) {
1948 for (long i = first; i < last; i++)
1949 muladd1_by_32_full(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b);
1950 }
1951 else {
1952 for (long i = first; i < last; i++)
1953 muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1954 }
1955 }
1956
1957 static inline
muladd_all_by_32_width(long first,long last,unsigned long * x,const unsigned int * a,const unsigned int * b,long n,long width)1958 void muladd_all_by_32_width(long first, long last, unsigned long *x, const unsigned int *a, const unsigned int *b, long n, long width)
1959 {
1960 if (width == MAT_BLK_SZ) {
1961 if (n == MAT_BLK_SZ) {
1962 for (long i = first; i < last; i++)
1963 muladd1_by_32_full(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b);
1964 }
1965 else {
1966 for (long i = first; i < last; i++)
1967 muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
1968 }
1969 }
1970 else {
1971 if (n == MAT_BLK_SZ) {
1972 for (long i = first; i < last; i++)
1973 muladd1_by_32_full_width(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, width);
1974 }
1975 else {
1976 for (long i = first; i < last; i++)
1977 muladd1_by_32_width(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n, width);
1978 }
1979 }
1980 }
1981
1982 #if (!defined(__INTEL_COMPILER) && (NTL_BITS_PER_INT >= NTL_BITS_PER_LONG/2))
1983 // Something goes wrong with the Intel ICC (version 16.0.3) compiler
1984 // in this case.
1985 // It goes away with -O1, so I suspect it is a compiler bug.
1986
1987 typedef unsigned int uhlong;
1988
1989 #else
1990
1991 typedef unsigned long uhlong;
1992
1993 #endif
1994
1995
1996
1997
1998 // NOTE: the following code is hardcoded for MAT_BLK_SZ == 32.
1999 // Also, we special case NTL_BITS_PER_LONG-NTL_SP_NBITS > 2, which
2000 // allows us to accumulate all 32 products without additional carries.
2001
2002 #if (NTL_BITS_PER_LONG-NTL_SP_NBITS > 2)
2003
static
void muladd1_by_32(long *x, const long *a, const long *b,
                   long n, long p, sp_ll_reduce_struct ll_red_struct)
// x[j] = (x[j] + sum_{i<n} a[i]*b_j[i]) mod p, for j = 0..MAT_BLK_SZ-1,
// where b_j is the j-th row of b (row stride MAT_BLK_SZ).
// All n <= 32 products are accumulated into a double word (ll_type)
// without intermediate reduction; per the note above, this is valid
// because NTL_BITS_PER_LONG-NTL_SP_NBITS > 2 leaves enough headroom.
// The double word is then reduced mod p with sp_ll_red_31, using the
// cheaper normalized variant when p has exactly NTL_SP_NBITS bits.
{
   for (long j = 0; j < MAT_BLK_SZ; j++) {

      ll_type sum;
      ll_init(sum, x[j]);
#if 0
      for (long i = 0; i < n; i++)
         ll_imul_add(sum, a[i], b[i]);
#else
      // 8x unrolled accumulation
      long i=0;
      for(; i <= n-8; i+= 8) {
         ll_imul_add(sum, a[i+0], b[i+0]);
         ll_imul_add(sum, a[i+1], b[i+1]);
         ll_imul_add(sum, a[i+2], b[i+2]);
         ll_imul_add(sum, a[i+3], b[i+3]);

         ll_imul_add(sum, a[i+4], b[i+4]);
         ll_imul_add(sum, a[i+5], b[i+5]);
         ll_imul_add(sum, a[i+6], b[i+6]);
         ll_imul_add(sum, a[i+7], b[i+7]);
      }

      for (; i < n; i++)
         ll_imul_add(sum, a[i], b[i]);

#endif

      unsigned long sum0 = ll_get_lo(sum);
      unsigned long sum1 = ll_get_hi(sum);

      long res;

      // normalized variant applies when p has exactly NTL_SP_NBITS bits
      if (ll_red_struct.nbits == NTL_SP_NBITS)
         res = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
      else
         res = sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);


      x[j] = res;
      b += MAT_BLK_SZ;
   }
}
2049
static
void muladd1_by_32_width(long *x, const long *a, const long *b,
                         long n, long p, sp_ll_reduce_struct ll_red_struct, long width)
// Same as muladd1_by_32 above, but only the first `width` output slots
// of x are updated (b rows still have stride MAT_BLK_SZ).
{
   for (long j = 0; j < width; j++) {

      ll_type sum;
      ll_init(sum, x[j]);
#if 0
      for (long i = 0; i < n; i++)
         ll_imul_add(sum, a[i], b[i]);
#else
      // 8x unrolled accumulation
      long i=0;
      for(; i <= n-8; i+= 8) {
         ll_imul_add(sum, a[i+0], b[i+0]);
         ll_imul_add(sum, a[i+1], b[i+1]);
         ll_imul_add(sum, a[i+2], b[i+2]);
         ll_imul_add(sum, a[i+3], b[i+3]);

         ll_imul_add(sum, a[i+4], b[i+4]);
         ll_imul_add(sum, a[i+5], b[i+5]);
         ll_imul_add(sum, a[i+6], b[i+6]);
         ll_imul_add(sum, a[i+7], b[i+7]);
      }

      for (; i < n; i++)
         ll_imul_add(sum, a[i], b[i]);

#endif

      unsigned long sum0 = ll_get_lo(sum);
      unsigned long sum1 = ll_get_hi(sum);

      long res;

      // normalized variant applies when p has exactly NTL_SP_NBITS bits
      if (ll_red_struct.nbits == NTL_SP_NBITS)
         res = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
      else
         res = sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);


      x[j] = res;
      b += MAT_BLK_SZ;
   }
}
2095
2096 #if 0
2097 static
2098 void muladd1_by_32_full(long *x, const long *a, const long *b,
2099 long p, sp_ll_reduce_struct ll_red_struct)
2100 {
2101 for (long j = 0; j < MAT_BLK_SZ; j++) {
2102
2103 ll_type sum;
2104 ll_init(sum, x[j]);
2105
2106 ll_imul_add(sum, a[0], b[0]);
2107 ll_imul_add(sum, a[1], b[1]);
2108 ll_imul_add(sum, a[2], b[2]);
2109 ll_imul_add(sum, a[3], b[3]);
2110 ll_imul_add(sum, a[4], b[4]);
2111 ll_imul_add(sum, a[5], b[5]);
2112 ll_imul_add(sum, a[6], b[6]);
2113 ll_imul_add(sum, a[7], b[7]);
2114 ll_imul_add(sum, a[8], b[8]);
2115 ll_imul_add(sum, a[9], b[9]);
2116 ll_imul_add(sum, a[10], b[10]);
2117 ll_imul_add(sum, a[11], b[11]);
2118 ll_imul_add(sum, a[12], b[12]);
2119 ll_imul_add(sum, a[13], b[13]);
2120 ll_imul_add(sum, a[14], b[14]);
2121 ll_imul_add(sum, a[15], b[15]);
2122 ll_imul_add(sum, a[16], b[16]);
2123 ll_imul_add(sum, a[17], b[17]);
2124 ll_imul_add(sum, a[18], b[18]);
2125 ll_imul_add(sum, a[19], b[19]);
2126 ll_imul_add(sum, a[20], b[20]);
2127 ll_imul_add(sum, a[21], b[21]);
2128 ll_imul_add(sum, a[22], b[22]);
2129 ll_imul_add(sum, a[23], b[23]);
2130 ll_imul_add(sum, a[24], b[24]);
2131 ll_imul_add(sum, a[25], b[25]);
2132 ll_imul_add(sum, a[26], b[26]);
2133 ll_imul_add(sum, a[27], b[27]);
2134 ll_imul_add(sum, a[28], b[28]);
2135 ll_imul_add(sum, a[29], b[29]);
2136 ll_imul_add(sum, a[30], b[30]);
2137 ll_imul_add(sum, a[31], b[31]);
2138
2139 unsigned long sum0 = ll_get_lo(sum);
2140 unsigned long sum1 = ll_get_hi(sum);
2141
2142 long res;
2143
2144 if (ll_red_struct.nbits == NTL_SP_NBITS)
2145 res = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
2146 else
2147 res = sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);
2148
2149
2150 x[j] = res;
2151 b += MAT_BLK_SZ;
2152 }
2153 }
2154
// Same contract as muladd1_by_32_full, but only the first `width`
// entries of x are updated (the b block is still MAT_BLK_SZ-strided).
static
void muladd1_by_32_full_width(long *x, const long *a, const long *b,
                              long p, sp_ll_reduce_struct ll_red_struct, long width)
{
   for (long j = 0; j < width; j++) {

      // accumulator seeded with the current value of x[j]
      ll_type sum;
      ll_init(sum, x[j]);

      ll_imul_add(sum, a[0], b[0]);
      ll_imul_add(sum, a[1], b[1]);
      ll_imul_add(sum, a[2], b[2]);
      ll_imul_add(sum, a[3], b[3]);
      ll_imul_add(sum, a[4], b[4]);
      ll_imul_add(sum, a[5], b[5]);
      ll_imul_add(sum, a[6], b[6]);
      ll_imul_add(sum, a[7], b[7]);
      ll_imul_add(sum, a[8], b[8]);
      ll_imul_add(sum, a[9], b[9]);
      ll_imul_add(sum, a[10], b[10]);
      ll_imul_add(sum, a[11], b[11]);
      ll_imul_add(sum, a[12], b[12]);
      ll_imul_add(sum, a[13], b[13]);
      ll_imul_add(sum, a[14], b[14]);
      ll_imul_add(sum, a[15], b[15]);
      ll_imul_add(sum, a[16], b[16]);
      ll_imul_add(sum, a[17], b[17]);
      ll_imul_add(sum, a[18], b[18]);
      ll_imul_add(sum, a[19], b[19]);
      ll_imul_add(sum, a[20], b[20]);
      ll_imul_add(sum, a[21], b[21]);
      ll_imul_add(sum, a[22], b[22]);
      ll_imul_add(sum, a[23], b[23]);
      ll_imul_add(sum, a[24], b[24]);
      ll_imul_add(sum, a[25], b[25]);
      ll_imul_add(sum, a[26], b[26]);
      ll_imul_add(sum, a[27], b[27]);
      ll_imul_add(sum, a[28], b[28]);
      ll_imul_add(sum, a[29], b[29]);
      ll_imul_add(sum, a[30], b[30]);
      ll_imul_add(sum, a[31], b[31]);

      // split and reduce the double-word accumulator
      unsigned long sum0 = ll_get_lo(sum);
      unsigned long sum1 = ll_get_hi(sum);

      long res;

      // specialized reduction when p has exactly NTL_SP_NBITS bits
      if (ll_red_struct.nbits == NTL_SP_NBITS)
         res = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
      else
         res = sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);


      x[j] = res;
      b += MAT_BLK_SZ;   // next row of the b block
   }
}
2212
2213 #elif 1
2214 // This version is consistently fastest on tests on Sandybridge and Haswell
2215
2216
2217
// ONE_STEP(i): accumulate a[i] times the i-th entry of four adjacent rows
// of b (b, b_1, b_2, b_3) into the four running accumulators
// sum, sum_1, sum_2, sum_3.
#define ONE_STEP(i) \
  ll_imul_add(sum, a[i], b[i]);\
  ll_imul_add(sum_1, a[i], b_1[i]);\
  ll_imul_add(sum_2, a[i], b_2[i]);\
  ll_imul_add(sum_3, a[i], b_3[i]);\
2223
2224
// x[j] += <a[0..31], j-th row of b> (mod p), for j = 0..MAT_BLK_SZ-1.
// Processes four output entries (four adjacent rows of b) per iteration
// via the ONE_STEP macro; assumes MAT_BLK_SZ is divisible by 4.
void muladd1_by_32_full(long *x, const long *a, const long *b,
                        long p, sp_ll_reduce_struct ll_red_struct)
{
   for (long j = 0; j < MAT_BLK_SZ; j+=4) {

      // four accumulators, seeded with the current x values
      ll_type sum, sum_1, sum_2, sum_3;
      ll_init(sum, x[j]);
      ll_init(sum_1, x[j+1]);
      ll_init(sum_2, x[j+2]);
      ll_init(sum_3, x[j+3]);

      // the four rows of b processed this iteration
      const long *b_1 = b+MAT_BLK_SZ;
      const long *b_2 = b+2*MAT_BLK_SZ;
      const long *b_3 = b+3*MAT_BLK_SZ;

      ONE_STEP(0);
      ONE_STEP(1);
      ONE_STEP(2);
      ONE_STEP(3);
      ONE_STEP(4);
      ONE_STEP(5);
      ONE_STEP(6);
      ONE_STEP(7);
      ONE_STEP(8);
      ONE_STEP(9);
      ONE_STEP(10);
      ONE_STEP(11);
      ONE_STEP(12);
      ONE_STEP(13);
      ONE_STEP(14);
      ONE_STEP(15);
      ONE_STEP(16);
      ONE_STEP(17);
      ONE_STEP(18);
      ONE_STEP(19);
      ONE_STEP(20);
      ONE_STEP(21);
      ONE_STEP(22);
      ONE_STEP(23);
      ONE_STEP(24);
      ONE_STEP(25);
      ONE_STEP(26);
      ONE_STEP(27);
      ONE_STEP(28);
      ONE_STEP(29);
      ONE_STEP(30);
      ONE_STEP(31);

      // split each double-word accumulator into low/high words
      unsigned long sum0 = ll_get_lo(sum);
      unsigned long sum1 = ll_get_hi(sum);

      unsigned long sum0_1 = ll_get_lo(sum_1);
      unsigned long sum1_1 = ll_get_hi(sum_1);

      unsigned long sum0_2 = ll_get_lo(sum_2);
      unsigned long sum1_2 = ll_get_hi(sum_2);

      unsigned long sum0_3 = ll_get_lo(sum_3);
      unsigned long sum1_3 = ll_get_hi(sum_3);

      // specialized reduction when p has exactly NTL_SP_NBITS bits
      if (ll_red_struct.nbits == NTL_SP_NBITS) {
         x[j] = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
         x[j+1] = sp_ll_red_31_normalized(0, sum1_1, sum0_1, p, ll_red_struct);
         x[j+2] = sp_ll_red_31_normalized(0, sum1_2, sum0_2, p, ll_red_struct);
         x[j+3] = sp_ll_red_31_normalized(0, sum1_3, sum0_3, p, ll_red_struct);
      }
      else {
         x[j] = sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);
         x[j+1] = sp_ll_red_31(0, sum1_1, sum0_1, p, ll_red_struct);
         x[j+2] = sp_ll_red_31(0, sum1_2, sum0_2, p, ll_red_struct);
         x[j+3] = sp_ll_red_31(0, sum1_3, sum0_3, p, ll_red_struct);
      }


      b += 4*MAT_BLK_SZ;   // skip the four rows just consumed
   }
}
2302
// Same as muladd1_by_32_full, but only the first `width` entries of x are
// updated.  Four entries are handled per iteration of the main loop; the
// scalar tail loop below handles width mod 4 leftover entries.
void muladd1_by_32_full_width(long *x, const long *a, const long *b,
                              long p, sp_ll_reduce_struct ll_red_struct, long width)
{
   long j = 0;
   for (; j <= width-4; j+=4) {

      // four accumulators, seeded with the current x values
      ll_type sum, sum_1, sum_2, sum_3;
      ll_init(sum, x[j]);
      ll_init(sum_1, x[j+1]);
      ll_init(sum_2, x[j+2]);
      ll_init(sum_3, x[j+3]);

      // the four rows of b processed this iteration
      const long *b_1 = b+MAT_BLK_SZ;
      const long *b_2 = b+2*MAT_BLK_SZ;
      const long *b_3 = b+3*MAT_BLK_SZ;

      ONE_STEP(0);
      ONE_STEP(1);
      ONE_STEP(2);
      ONE_STEP(3);
      ONE_STEP(4);
      ONE_STEP(5);
      ONE_STEP(6);
      ONE_STEP(7);
      ONE_STEP(8);
      ONE_STEP(9);
      ONE_STEP(10);
      ONE_STEP(11);
      ONE_STEP(12);
      ONE_STEP(13);
      ONE_STEP(14);
      ONE_STEP(15);
      ONE_STEP(16);
      ONE_STEP(17);
      ONE_STEP(18);
      ONE_STEP(19);
      ONE_STEP(20);
      ONE_STEP(21);
      ONE_STEP(22);
      ONE_STEP(23);
      ONE_STEP(24);
      ONE_STEP(25);
      ONE_STEP(26);
      ONE_STEP(27);
      ONE_STEP(28);
      ONE_STEP(29);
      ONE_STEP(30);
      ONE_STEP(31);

      // split each double-word accumulator into low/high words
      unsigned long sum0 = ll_get_lo(sum);
      unsigned long sum1 = ll_get_hi(sum);

      unsigned long sum0_1 = ll_get_lo(sum_1);
      unsigned long sum1_1 = ll_get_hi(sum_1);

      unsigned long sum0_2 = ll_get_lo(sum_2);
      unsigned long sum1_2 = ll_get_hi(sum_2);

      unsigned long sum0_3 = ll_get_lo(sum_3);
      unsigned long sum1_3 = ll_get_hi(sum_3);

      // specialized reduction when p has exactly NTL_SP_NBITS bits
      if (ll_red_struct.nbits == NTL_SP_NBITS) {
         x[j] = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
         x[j+1] = sp_ll_red_31_normalized(0, sum1_1, sum0_1, p, ll_red_struct);
         x[j+2] = sp_ll_red_31_normalized(0, sum1_2, sum0_2, p, ll_red_struct);
         x[j+3] = sp_ll_red_31_normalized(0, sum1_3, sum0_3, p, ll_red_struct);
      }
      else {
         x[j] = sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);
         x[j+1] = sp_ll_red_31(0, sum1_1, sum0_1, p, ll_red_struct);
         x[j+2] = sp_ll_red_31(0, sum1_2, sum0_2, p, ll_red_struct);
         x[j+3] = sp_ll_red_31(0, sum1_3, sum0_3, p, ll_red_struct);
      }


      b += 4*MAT_BLK_SZ;   // skip the four rows just consumed
   }

   // scalar tail: remaining width mod 4 entries, one row of b at a time
   for (; j < width; j++) {

      ll_type sum;
      ll_init(sum, x[j]);

      ll_imul_add(sum, a[0], b[0]);
      ll_imul_add(sum, a[1], b[1]);
      ll_imul_add(sum, a[2], b[2]);
      ll_imul_add(sum, a[3], b[3]);
      ll_imul_add(sum, a[4], b[4]);
      ll_imul_add(sum, a[5], b[5]);
      ll_imul_add(sum, a[6], b[6]);
      ll_imul_add(sum, a[7], b[7]);
      ll_imul_add(sum, a[8], b[8]);
      ll_imul_add(sum, a[9], b[9]);
      ll_imul_add(sum, a[10], b[10]);
      ll_imul_add(sum, a[11], b[11]);
      ll_imul_add(sum, a[12], b[12]);
      ll_imul_add(sum, a[13], b[13]);
      ll_imul_add(sum, a[14], b[14]);
      ll_imul_add(sum, a[15], b[15]);
      ll_imul_add(sum, a[16], b[16]);
      ll_imul_add(sum, a[17], b[17]);
      ll_imul_add(sum, a[18], b[18]);
      ll_imul_add(sum, a[19], b[19]);
      ll_imul_add(sum, a[20], b[20]);
      ll_imul_add(sum, a[21], b[21]);
      ll_imul_add(sum, a[22], b[22]);
      ll_imul_add(sum, a[23], b[23]);
      ll_imul_add(sum, a[24], b[24]);
      ll_imul_add(sum, a[25], b[25]);
      ll_imul_add(sum, a[26], b[26]);
      ll_imul_add(sum, a[27], b[27]);
      ll_imul_add(sum, a[28], b[28]);
      ll_imul_add(sum, a[29], b[29]);
      ll_imul_add(sum, a[30], b[30]);
      ll_imul_add(sum, a[31], b[31]);

      unsigned long sum0 = ll_get_lo(sum);
      unsigned long sum1 = ll_get_hi(sum);

      long res;

      if (ll_red_struct.nbits == NTL_SP_NBITS)
         res = sp_ll_red_31_normalized(0, sum1, sum0, p, ll_red_struct);
      else
         res = sp_ll_red_31(0, sum1, sum0, p, ll_red_struct);


      x[j] = res;
      b += MAT_BLK_SZ;
   }
}
2434
2435
2436 #endif
2437
2438
2439 #else
2440
2441
// x[j] += <a[0..n-1], j-th row of b> (mod p), for j = 0..MAT_BLK_SZ-1,
// for general n <= MAT_BLK_SZ (fallback branch: no wide unrolled kernel).
// The accumulation is split into two phases: after the first n-16 products
// the high word is moved into a separate accumulator (acc21) and the low
// word is re-seeded, so the remaining (at most 16) products cannot
// overflow the double-word accumulator.
static
void muladd1_by_32(long *x, const long *a, const long *b,
                   long n, long p, sp_ll_reduce_struct ll_red_struct)
{
   for (long j = 0; j < MAT_BLK_SZ; j++) {

      // accumulator seeded with the current value of x[j]
      ll_type sum;
      ll_init(sum, x[j]);

      long i = 0;
      for (; i < n-16; i++)
         ll_imul_add(sum, a[i], b[i]);

      // phase split: bank the high word, keep only the low word in sum
      ll_type acc21;
      ll_init(acc21, ll_get_hi(sum));
      unsigned long acc0 = ll_get_lo(sum);
      ll_init(sum, acc0);

      for (; i < n; i++)
         ll_imul_add(sum, a[i], b[i]);

      // recombine: acc21 holds the middle/high words, acc0 the low word
      acc0 = ll_get_lo(sum);
      ll_add(acc21, ll_get_hi(sum));

      long res;

      // three-word reduction; specialized path for full-width p
      if (ll_red_struct.nbits == NTL_SP_NBITS)
         res = sp_ll_red_31_normalized(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);
      else
         res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);

      x[j] = res;
      b += MAT_BLK_SZ;   // next row of the b block
   }
}
2477
// Same as muladd1_by_32, but only the first `width` entries of x are
// updated (the b block is still MAT_BLK_SZ-strided).
static
void muladd1_by_32_width(long *x, const long *a, const long *b,
                         long n, long p, sp_ll_reduce_struct ll_red_struct, long width)
{
   for (long j = 0; j < width; j++) {

      // accumulator seeded with the current value of x[j]
      ll_type sum;
      ll_init(sum, x[j]);

      long i = 0;
      for (; i < n-16; i++)
         ll_imul_add(sum, a[i], b[i]);

      // phase split: bank the high word so the remaining (at most 16)
      // products cannot overflow the double-word accumulator
      ll_type acc21;
      ll_init(acc21, ll_get_hi(sum));
      unsigned long acc0 = ll_get_lo(sum);
      ll_init(sum, acc0);

      for (; i < n; i++)
         ll_imul_add(sum, a[i], b[i]);

      // recombine the banked high part with the second phase
      acc0 = ll_get_lo(sum);
      ll_add(acc21, ll_get_hi(sum));

      long res;

      // three-word reduction; specialized path for full-width p
      if (ll_red_struct.nbits == NTL_SP_NBITS)
         res = sp_ll_red_31_normalized(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);
      else
         res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);

      x[j] = res;
      b += MAT_BLK_SZ;   // next row of the b block
   }
}
2513
// Fully unrolled n == MAT_BLK_SZ case of muladd1_by_32 (fallback branch).
// The 32 products are accumulated in two phases of 16, banking the high
// word into acc21 between phases so neither phase can overflow the
// double-word accumulator.
static
void muladd1_by_32_full(long *x, const long *a, const long *b,
                        long p, sp_ll_reduce_struct ll_red_struct)
{
   for (long j = 0; j < MAT_BLK_SZ; j++) {

      // accumulator seeded with the current value of x[j]
      ll_type sum;
      ll_init(sum, x[j]);

      ll_imul_add(sum, a[0], b[0]);
      ll_imul_add(sum, a[1], b[1]);
      ll_imul_add(sum, a[2], b[2]);
      ll_imul_add(sum, a[3], b[3]);
      ll_imul_add(sum, a[4], b[4]);
      ll_imul_add(sum, a[5], b[5]);
      ll_imul_add(sum, a[6], b[6]);
      ll_imul_add(sum, a[7], b[7]);
      ll_imul_add(sum, a[8], b[8]);
      ll_imul_add(sum, a[9], b[9]);
      ll_imul_add(sum, a[10], b[10]);
      ll_imul_add(sum, a[11], b[11]);
      ll_imul_add(sum, a[12], b[12]);
      ll_imul_add(sum, a[13], b[13]);
      ll_imul_add(sum, a[14], b[14]);
      ll_imul_add(sum, a[15], b[15]);

      // phase split after 16 products: bank the high word
      ll_type acc21;
      ll_init(acc21, ll_get_hi(sum));
      unsigned long acc0 = ll_get_lo(sum);
      ll_init(sum, acc0);

      ll_imul_add(sum, a[16], b[16]);
      ll_imul_add(sum, a[17], b[17]);
      ll_imul_add(sum, a[18], b[18]);
      ll_imul_add(sum, a[19], b[19]);
      ll_imul_add(sum, a[20], b[20]);
      ll_imul_add(sum, a[21], b[21]);
      ll_imul_add(sum, a[22], b[22]);
      ll_imul_add(sum, a[23], b[23]);
      ll_imul_add(sum, a[24], b[24]);
      ll_imul_add(sum, a[25], b[25]);
      ll_imul_add(sum, a[26], b[26]);
      ll_imul_add(sum, a[27], b[27]);
      ll_imul_add(sum, a[28], b[28]);
      ll_imul_add(sum, a[29], b[29]);
      ll_imul_add(sum, a[30], b[30]);
      ll_imul_add(sum, a[31], b[31]);

      // recombine the banked high part with the second phase
      acc0 = ll_get_lo(sum);
      ll_add(acc21, ll_get_hi(sum));

      long res;

      // three-word reduction; specialized path for full-width p
      if (ll_red_struct.nbits == NTL_SP_NBITS)
         res = sp_ll_red_31_normalized(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);
      else
         res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);

      x[j] = res;
      b += MAT_BLK_SZ;   // next row of the b block
   }
}
2576
// Same as the fallback muladd1_by_32_full, but only the first `width`
// entries of x are updated.
static
void muladd1_by_32_full_width(long *x, const long *a, const long *b,
                              long p, sp_ll_reduce_struct ll_red_struct, long width)
{
   for (long j = 0; j < width; j++) {

      // accumulator seeded with the current value of x[j]
      ll_type sum;
      ll_init(sum, x[j]);

      ll_imul_add(sum, a[0], b[0]);
      ll_imul_add(sum, a[1], b[1]);
      ll_imul_add(sum, a[2], b[2]);
      ll_imul_add(sum, a[3], b[3]);
      ll_imul_add(sum, a[4], b[4]);
      ll_imul_add(sum, a[5], b[5]);
      ll_imul_add(sum, a[6], b[6]);
      ll_imul_add(sum, a[7], b[7]);
      ll_imul_add(sum, a[8], b[8]);
      ll_imul_add(sum, a[9], b[9]);
      ll_imul_add(sum, a[10], b[10]);
      ll_imul_add(sum, a[11], b[11]);
      ll_imul_add(sum, a[12], b[12]);
      ll_imul_add(sum, a[13], b[13]);
      ll_imul_add(sum, a[14], b[14]);
      ll_imul_add(sum, a[15], b[15]);

      // phase split after 16 products: bank the high word
      ll_type acc21;
      ll_init(acc21, ll_get_hi(sum));
      unsigned long acc0 = ll_get_lo(sum);
      ll_init(sum, acc0);

      ll_imul_add(sum, a[16], b[16]);
      ll_imul_add(sum, a[17], b[17]);
      ll_imul_add(sum, a[18], b[18]);
      ll_imul_add(sum, a[19], b[19]);
      ll_imul_add(sum, a[20], b[20]);
      ll_imul_add(sum, a[21], b[21]);
      ll_imul_add(sum, a[22], b[22]);
      ll_imul_add(sum, a[23], b[23]);
      ll_imul_add(sum, a[24], b[24]);
      ll_imul_add(sum, a[25], b[25]);
      ll_imul_add(sum, a[26], b[26]);
      ll_imul_add(sum, a[27], b[27]);
      ll_imul_add(sum, a[28], b[28]);
      ll_imul_add(sum, a[29], b[29]);
      ll_imul_add(sum, a[30], b[30]);
      ll_imul_add(sum, a[31], b[31]);

      // recombine the banked high part with the second phase
      acc0 = ll_get_lo(sum);
      ll_add(acc21, ll_get_hi(sum));

      long res;

      // three-word reduction; specialized path for full-width p
      if (ll_red_struct.nbits == NTL_SP_NBITS)
         res = sp_ll_red_31_normalized(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);
      else
         res = sp_ll_red_31(ll_get_hi(acc21), ll_get_lo(acc21), acc0, p, ll_red_struct);

      x[j] = res;
      b += MAT_BLK_SZ;   // next row of the b block
   }
}
2639
2640
2641 #endif
2642
2643
// Variant for "half-word" moduli: x[j] += <a[0..n-1], j-th row of b>
// (mod p).  Products and partial sums are accumulated in plain unsigned
// longs, 16 at a time (presumably 16 products of half-word values fit in
// one word without overflow -- see the size test in muladd_all_by_32);
// each 16-product chunk lands in its own word of sum[], and the two words
// are reduced separately and combined with AddMod.
static
void muladd1_by_32_half2(long *x, const long *a, const long *b,
                         long n, long p, sp_ll_reduce_struct ll_red_struct)
{
   for (long j = 0; j < MAT_BLK_SZ; j++) {

      // sum[0] is seeded with the current x[j]; sum[1] starts empty
      unsigned long sum[2];
      sum[0] = x[j];
      sum[1] = 0;

      long k=0;   // index of the next sum[] slot to fill
      long i=0;
      for(; i <= n-16; i+= 16) {
         unsigned long lsum = a[i+0]*b[i+0];
         lsum += a[i+1]*b[i+1];
         lsum += a[i+2]*b[i+2];
         lsum += a[i+3]*b[i+3];
         lsum += a[i+4]*b[i+4];
         lsum += a[i+5]*b[i+5];
         lsum += a[i+6]*b[i+6];
         lsum += a[i+7]*b[i+7];
         lsum += a[i+8]*b[i+8];
         lsum += a[i+9]*b[i+9];
         lsum += a[i+10]*b[i+10];
         lsum += a[i+11]*b[i+11];
         lsum += a[i+12]*b[i+12];
         lsum += a[i+13]*b[i+13];
         lsum += a[i+14]*b[i+14];
         lsum += a[i+15]*b[i+15];
         sum[k++] += lsum;
      }

      // leftover n mod 16 products go into the next slot
      if (i < n) {
         unsigned long lsum = a[i]*b[i];
         for (i++; i < n; i++)
            lsum += a[i]*b[i];
         sum[k++] += lsum;
      }


      // reduce each word separately, then combine mod p
      long t0 = sp_ll_red_21(0, sum[0], p, ll_red_struct);
      long t1 = sp_ll_red_21(0, sum[1], p, ll_red_struct);
      x[j] = AddMod(t0, t1, p);

      b += MAT_BLK_SZ;   // next row of the b block
   }
}
2691
// Same as muladd1_by_32_half2, but only the first `width` entries of x
// are updated.
static
void muladd1_by_32_half2_width(long *x, const long *a, const long *b,
                               long n, long p, sp_ll_reduce_struct ll_red_struct, long width)
{
   for (long j = 0; j < width; j++) {

      // sum[0] is seeded with the current x[j]; sum[1] starts empty
      unsigned long sum[2];
      sum[0] = x[j];
      sum[1] = 0;

      long k=0;   // index of the next sum[] slot to fill
      long i=0;
      for(; i <= n-16; i+= 16) {
         unsigned long lsum = a[i+0]*b[i+0];
         lsum += a[i+1]*b[i+1];
         lsum += a[i+2]*b[i+2];
         lsum += a[i+3]*b[i+3];
         lsum += a[i+4]*b[i+4];
         lsum += a[i+5]*b[i+5];
         lsum += a[i+6]*b[i+6];
         lsum += a[i+7]*b[i+7];
         lsum += a[i+8]*b[i+8];
         lsum += a[i+9]*b[i+9];
         lsum += a[i+10]*b[i+10];
         lsum += a[i+11]*b[i+11];
         lsum += a[i+12]*b[i+12];
         lsum += a[i+13]*b[i+13];
         lsum += a[i+14]*b[i+14];
         lsum += a[i+15]*b[i+15];
         sum[k++] += lsum;
      }

      // leftover n mod 16 products go into the next slot
      if (i < n) {
         unsigned long lsum = a[i]*b[i];
         for (i++; i < n; i++)
            lsum += a[i]*b[i];
         sum[k++] += lsum;
      }


      // reduce each word separately, then combine mod p
      long t0 = sp_ll_red_21(0, sum[0], p, ll_red_struct);
      long t1 = sp_ll_red_21(0, sum[1], p, ll_red_struct);
      x[j] = AddMod(t0, t1, p);

      b += MAT_BLK_SZ;   // next row of the b block
   }
}
2739
2740
2741
2742 // NOTE: oddly, this is slightly faster than the half2 routine, which
2743 // I would have thought would be faster
2744 // DIRT: this assumes MAT_BLK_SZ < (1L << NTL_BITS_PER_LONG/2),
2745 // which will hold unconditionally for MAT_BLK_SZ < 2^16.
2746
// Half-word-modulus variant: x[j] += <a[0..n-1], j-th row of b> (mod p).
// Four products at a time are summed in a single unsigned long (safe for
// half-word operands given the DIRT note above this routine), then folded
// into a double-word accumulator; one two-word reduction per entry.
static
void muladd1_by_32_half1(long *x, const long *a, const long *b,
                         long n, long p, sp_ll_reduce_struct ll_red_struct)
{
   for (long j = 0; j < MAT_BLK_SZ; j++) {

      // double-word accumulator, seeded with the current x[j]
      ll_type sum;
      ll_init(sum, x[j]);

      long i=0;
      for(; i <= n-4; i+= 4) {
         unsigned long lsum = a[i+0]*b[i+0];
         lsum += a[i+1]*b[i+1];
         lsum += a[i+2]*b[i+2];
         lsum += a[i+3]*b[i+3];
         ll_add(sum, lsum);
      }

      // leftover n mod 4 products
      if (i < n) {
         unsigned long lsum = a[i]*b[i];
         for (i++; i < n; i++)
            lsum += a[i]*b[i];
         ll_add(sum, lsum);
      }

      // single two-word reduction mod p
      unsigned long sum0 = ll_get_lo(sum);
      unsigned long sum1 = ll_get_hi(sum);
      x[j] = sp_ll_red_21(sum1, sum0, p, ll_red_struct);

      b += MAT_BLK_SZ;   // next row of the b block
   }
}
2779
// Same as muladd1_by_32_half1, but only the first `width` entries of x
// are updated.
static
void muladd1_by_32_half1_width(long *x, const long *a, const long *b,
                               long n, long p, sp_ll_reduce_struct ll_red_struct, long width)
{
   for (long j = 0; j < width; j++) {

      // double-word accumulator, seeded with the current x[j]
      ll_type sum;
      ll_init(sum, x[j]);

      long i=0;
      for(; i <= n-4; i+= 4) {
         unsigned long lsum = a[i+0]*b[i+0];
         lsum += a[i+1]*b[i+1];
         lsum += a[i+2]*b[i+2];
         lsum += a[i+3]*b[i+3];
         ll_add(sum, lsum);
      }

      // leftover n mod 4 products
      if (i < n) {
         unsigned long lsum = a[i]*b[i];
         for (i++; i < n; i++)
            lsum += a[i]*b[i];
         ll_add(sum, lsum);
      }

      // single two-word reduction mod p
      unsigned long sum0 = ll_get_lo(sum);
      unsigned long sum1 = ll_get_hi(sum);
      x[j] = sp_ll_red_21(sum1, sum0, p, ll_red_struct);

      b += MAT_BLK_SZ;   // next row of the b block
   }
}
2812
2813 static inline
muladd_all_by_32(long first,long last,long * x,const long * a,const long * b,long n,long p,sp_ll_reduce_struct ll_red_struct)2814 void muladd_all_by_32(long first, long last, long *x, const long *a, const long *b, long n,
2815 long p, sp_ll_reduce_struct ll_red_struct)
2816 {
2817 if ((p-1) >= (1L << ((NTL_BITS_PER_LONG/2)-1))) {
2818 if (n == MAT_BLK_SZ) {
2819 for (long i = first; i < last; i++)
2820 muladd1_by_32_full(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, p, ll_red_struct);
2821 }
2822 else {
2823 for (long i = first; i < last; i++)
2824 muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n, p, ll_red_struct);
2825 }
2826 }
2827 else {
2828 for (long i = first; i < last; i++)
2829 muladd1_by_32_half1(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n, p, ll_red_struct);
2830 }
2831 }
2832
2833 static inline
muladd_all_by_32_width(long first,long last,long * x,const long * a,const long * b,long n,long p,sp_ll_reduce_struct ll_red_struct,long width)2834 void muladd_all_by_32_width(long first, long last, long *x, const long *a, const long *b, long n,
2835 long p, sp_ll_reduce_struct ll_red_struct, long width)
2836 {
2837 if (width == MAT_BLK_SZ) {
2838 if ((p-1) >= (1L << ((NTL_BITS_PER_LONG/2)-1))) {
2839 if (n == MAT_BLK_SZ) {
2840 for (long i = first; i < last; i++)
2841 muladd1_by_32_full(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, p, ll_red_struct);
2842 }
2843 else {
2844 for (long i = first; i < last; i++)
2845 muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n, p, ll_red_struct);
2846 }
2847 }
2848 else {
2849 for (long i = first; i < last; i++)
2850 muladd1_by_32_half1(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n, p, ll_red_struct);
2851 }
2852 }
2853 else {
2854 if ((p-1) >= (1L << ((NTL_BITS_PER_LONG/2)-1))) {
2855 if (n == MAT_BLK_SZ) {
2856 for (long i = first; i < last; i++)
2857 muladd1_by_32_full_width(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, p, ll_red_struct, width);
2858 }
2859 else {
2860 for (long i = first; i < last; i++)
2861 muladd1_by_32_width(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n, p, ll_red_struct, width);
2862 }
2863 }
2864 else {
2865 for (long i = first; i < last; i++)
2866 muladd1_by_32_half1_width(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n, p, ll_red_struct, width);
2867 }
2868 }
2869 }
2870
2871
2872 #endif
2873
2874
2875
2876 static
muladd_interval(long * NTL_RESTRICT x,long * NTL_RESTRICT y,long c,long n,long p,mulmod_t pinv)2877 inline void muladd_interval(long * NTL_RESTRICT x, long * NTL_RESTRICT y,
2878 long c, long n, long p, mulmod_t pinv)
2879 {
2880 mulmod_precon_t cpinv = PrepMulModPrecon(c, p, pinv);
2881 for (long i = 0; i < n; i++) {
2882 long t = MulModPrecon(y[i], c, p, cpinv);
2883 x[i] = AddMod(x[i], t, p);
2884 }
2885 }
2886
2887
2888 // ******************************************************************
2889 //
2890 // General matrix multiplication code
2891 //
2892 // ******************************************************************
2893
2894
2895
2896
2897
// X = A*B by the classical row-oriented triple loop, operating directly
// on zz_p representatives.  Rows of X are independent, so the outer loop
// is distributed over threads via NTL_GEXEC_RANGE unless the total work
// n*l*m is below PAR_THRESH.
static
void basic_mul(const mat_window_zz_p& X,
               const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
   long n = A.NumRows();
   long l = A.NumCols();
   long m = B.NumCols();

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();

   // run sequentially when the problem is too small to parallelize
   const bool seq = double(n)*double(l)*double(m) < PAR_THRESH;

   NTL_GEXEC_RANGE(seq, n, first, last) {

   for (long i = first; i < last; i++) {
      long j, k;
      const zz_p* ap = &A[i][0];

      // clear row i of X, then accumulate rank-1 updates into it
      zz_p *xp = &X[i][0];
      for (j = 0; j < m; j++) xp[j].LoopHole() = 0;

      for (k = 0; k < l; k++) {
         long aa = rep(ap[k]);
         if (aa != 0) {   // skip zero entries of A entirely
            const zz_p* bp = &B[k][0];
            long T1;
            // precompute the quotient approximation for the scalar aa
            mulmod_precon_t aapinv = PrepMulModPrecon(aa, p, pinv);

            for (j = 0; j < m; j++) {
               T1 = MulModPrecon(rep(bp[j]), aa, p, aapinv);
               xp[j].LoopHole() = AddMod(rep(xp[j]), T1, p);
            }
         }
      }
   }

   } NTL_GEXEC_RANGE_END
}
2937
2938
2939
2940
2941 #ifdef NTL_HAVE_LL_TYPE
2942
// X = A*B, computed one column of X at a time: each column of B is copied
// into a contiguous scratch buffer so InnerProd_L can run over contiguous
// data.  Parallelized over columns of B.  "L" = long-accumulator inner
// product with an explicit overflow bound from InnerProd_L_bound.
static
void alt_mul_L(const mat_window_zz_p& X,
               const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
   long n = A.NumRows();
   long l = A.NumCols();
   long m = B.NumCols();

   long p = zz_p::modulus();
   sp_reduce_struct red_struct = zz_p::red_struct();
   long bound = InnerProd_L_bound(p);

   // run sequentially when the problem is too small to parallelize
   const bool seq = double(n)*double(l)*double(m) < PAR_THRESH;

   NTL_GEXEC_RANGE(seq, m, first, last) {

   // per-thread scratch column of B
   Vec<long> B_col;
   B_col.SetLength(l);
   long *bp = B_col.elts();

   long i, j, k;

   for (j = first; j < last; j++) {
      // gather column j of B into contiguous storage
      for (k = 0; k < l; k++) bp[k] = rep(B[k][j]);

      for (i = 0; i < n; i++) {
         const zz_p *ap = &A[i][0];
         X[i][j].LoopHole() = InnerProd_L(bp, ap, l, p, red_struct, bound);
      }
   }

   } NTL_GEXEC_RANGE_END
}
2976
2977
// Same strategy as alt_mul_L (column-at-a-time with a contiguous copy of
// each B column), but using the double-word ("LL") inner product for
// larger moduli, reduced via the precomputed ll_red_struct.
static
void alt_mul_LL(const mat_window_zz_p& X,
                const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
   long n = A.NumRows();
   long l = A.NumCols();
   long m = B.NumCols();

   long p = zz_p::modulus();
   sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();

   // run sequentially when the problem is too small to parallelize
   const bool seq = double(n)*double(l)*double(m) < PAR_THRESH;

   NTL_GEXEC_RANGE(seq, m, first, last) {

   // per-thread scratch column of B
   Vec<long> B_col;
   B_col.SetLength(l);
   long *bp = B_col.elts();

   long i, j, k;

   for (j = first; j < last; j++) {
      // gather column j of B into contiguous storage
      for (k = 0; k < l; k++) bp[k] = rep(B[k][j]);

      for (i = 0; i < n; i++) {
         const zz_p *ap = &A[i][0];
         X[i][j].LoopHole() = InnerProd_LL(bp, ap, l, p, ll_red_struct);
      }
   }

   } NTL_GEXEC_RANGE_END
}
3010
3011
3012 #ifdef NTL_HAVE_AVX
3013
// Blocked X = A*B using double-precision (AVX) kernels.  A is pre-packed
// once into zero-padded MAT_BLK_SZ-wide panels of doubles; each thread
// then processes a panel of output columns, packing the corresponding
// block of B and accumulating into a per-thread double buffer.
// Accumulated values are reduced mod p every red_trigger products so the
// double accumulators stay exact (within MAX_DBL_INT).
static
void blk_mul_DD(const mat_window_zz_p& X,
                const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
   long n = A.NumRows();
   long l = A.NumCols();
   long m = B.NumCols();

   long p = zz_p::modulus();
   sp_reduce_struct red_struct = zz_p::red_struct();

   // pack A: one aligned panel of n x MAT_BLK_SZ doubles per column block
   UniqueArray< AlignedArray<double> > A_buf;
   long npanels = (l+MAT_BLK_SZ-1)/MAT_BLK_SZ;
   A_buf.SetLength(npanels);

   for (long kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
      long k_max = min(kk+MAT_BLK_SZ, l);

      A_buf[panel].SetLength(n * MAT_BLK_SZ);
      double *abp = &A_buf[panel][0];

      for (long i = 0; i < n; i++, abp += MAT_BLK_SZ) {
         const zz_p *ap1 = &A[i][0];
         for (long k = kk; k < k_max; k++) {
            abp[k-kk] = rep(ap1[k]);
         }
         // zero-pad the last (partial) panel
         for (long k = k_max; k < kk+MAT_BLK_SZ; k++) {
            abp[k-kk] = 0;
         }
      }
   }

   long nxpanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;

   // run sequentially when the problem is too small to parallelize
   const bool seq = double(n)*double(l)*double(m) < PAR_THRESH;

   // parallelize over panels of output columns
   NTL_GEXEC_RANGE(seq, nxpanels, first, last)
   NTL_IMPORT(n)
   NTL_IMPORT(l)
   NTL_IMPORT(m)
   NTL_IMPORT(p)
   NTL_IMPORT(red_struct)

   // per-thread scratch: one packed block of B ...
   AlignedArray<double> B_rec;
   B_rec.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
   double *brec = B_rec.get();

   // ... and the running output panel (n x MAT_BLK_SZ doubles)
   AlignedArray<double> X_buf;
   X_buf.SetLength(n*MAT_BLK_SZ);
   double *xbp = X_buf.get();

   long jj, kk;
   long i, j, k;
   long panel;
   long xpanel;

   for (xpanel = first, jj = first*MAT_BLK_SZ; xpanel < last;
        xpanel++, jj += MAT_BLK_SZ) {

      long j_max = min(jj+MAT_BLK_SZ, m);

      for (i = 0; i < n*MAT_BLK_SZ; i++) xbp[i] = 0;

      // how many products can be accumulated before the doubles might
      // lose exactness (relative to MAX_DBL_INT)
      long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
      long red_count = red_trigger;

      for (kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
         long k_max = min(kk+MAT_BLK_SZ, l);

         // pack the (kk, jj) block of B, zero-padded at the edges
         for (k = kk; k < k_max; k++) {
            const zz_p *bp = &B[k][0];
            for (j = jj; j < j_max; j++)
               brec[(k-kk)*MAT_BLK_SZ+(j-jj)] = rep(bp[j]);
            for (j = j_max; j < jj+MAT_BLK_SZ; j++)
               brec[(k-kk)*MAT_BLK_SZ+(j-jj)] = 0;
         }


         // periodic reduction keeps the accumulated doubles exact
         if (red_count-MAT_BLK_SZ < 0) {
            red_count = red_trigger;
            for (i = 0; i < n*MAT_BLK_SZ; i++)
               xbp[i] = rem((unsigned long)(long)xbp[i], p, red_struct);
         }

         red_count = red_count-MAT_BLK_SZ;

         const double *abp = &A_buf[panel][0];

         // double-precision (AVX) kernel overload: no modular reduction here
         muladd_all_by_32_width(0, n, xbp, abp, brec, k_max-kk, j_max-jj);
      }


      // final reduction and write-back of the output panel
      for (i = 0; i < n; i++) {
         zz_p *xp = &X[i][0];
         for (j = jj; j < j_max; j++)
            xp[j].LoopHole() =
               rem((unsigned long)(long)xbp[i*MAT_BLK_SZ + (j-jj)], p, red_struct);
      }
   }

   NTL_GEXEC_RANGE_END
}
3116
3117 #endif
3118
3119
// Blocked X = A*B using the double-word ("LL") integer kernels.  A is
// pre-packed once into zero-padded MAT_BLK_SZ-wide panels of longs; each
// thread processes a panel of output columns, packing the matching block
// of B transposed.  The LL kernels reduce mod p on every entry, so the
// output buffer always holds fully reduced values and no periodic
// reduction pass is needed.
static
void blk_mul_LL(const mat_window_zz_p& X,
                const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
   long n = A.NumRows();
   long l = A.NumCols();
   long m = B.NumCols();

   long p = zz_p::modulus();
   sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();

   // pack A: one panel of n x MAT_BLK_SZ longs per column block
   Vec< Vec<long> > A_buf;
   Vec<long *> abufp;
   long npanels = (l+MAT_BLK_SZ-1)/MAT_BLK_SZ;
   A_buf.SetLength(npanels);
   abufp.SetLength(npanels);

   for (long kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
      long k_max = min(kk+MAT_BLK_SZ, l);

      A_buf[panel].SetLength(n * MAT_BLK_SZ);
      long *abp = A_buf[panel].elts();
      abufp[panel] = abp;

      for (long i = 0; i < n; i++, abp += MAT_BLK_SZ) {
         const zz_p *ap1 = &A[i][0];
         for (long k = kk; k < k_max; k++) {
            abp[k-kk] = rep(ap1[k]);
         }
         // zero-pad the last (partial) panel
         for (long k = k_max; k < kk+MAT_BLK_SZ; k++) {
            abp[k-kk] = 0;
         }
      }
   }

   long nxpanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;

   // run sequentially when the problem is too small to parallelize
   const bool seq = double(n)*double(l)*double(m) < PAR_THRESH;

   // parallelize over panels of output columns
   NTL_GEXEC_RANGE(seq, nxpanels, first, last)
   NTL_IMPORT(n)
   NTL_IMPORT(l)
   NTL_IMPORT(m)
   NTL_IMPORT(p)
   NTL_IMPORT(ll_red_struct)

   // per-thread scratch: one packed (transposed) block of B ...
   UniqueArray<long> B_rec;
   B_rec.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
   long *brec = B_rec.get();

   // ... and the running output panel (n x MAT_BLK_SZ longs)
   UniqueArray<long> X_buf;
   X_buf.SetLength(n*MAT_BLK_SZ);
   long *xbp = X_buf.get();

   long jj, kk;
   long i, j, k;
   long panel;
   long xpanel;

   for (xpanel = first, jj = first*MAT_BLK_SZ; xpanel < last;
        xpanel++, jj += MAT_BLK_SZ) {

      long j_max = min(jj+MAT_BLK_SZ, m);

      for (i = 0; i < n*MAT_BLK_SZ; i++) xbp[i] = 0;

      for (kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
         long k_max = min(kk+MAT_BLK_SZ, l);

         // fill brec, transposed

         for (k = kk; k < k_max; k++) {
            const zz_p *bp = &B[k][0];
            for (j = jj; j < j_max; j++)
               brec[(k-kk)+(j-jj)*MAT_BLK_SZ] = rep(bp[j]);
            for (j = j_max; j < jj+MAT_BLK_SZ; j++)
               brec[(k-kk)+(j-jj)*MAT_BLK_SZ] = 0;
         }

         const long *abp = abufp[panel];

         // LL kernel reduces mod p per entry, so xbp stays reduced
         muladd_all_by_32_width(0, n, xbp, abp, brec, k_max-kk, p, ll_red_struct, j_max-jj);
      }


      // write-back: values are already reduced
      for (i = 0; i < n; i++) {
         zz_p *xp = &X[i][0];
         for (j = jj; j < j_max; j++)
            xp[j].LoopHole() = xbp[i*MAT_BLK_SZ + (j-jj)];
      }
   }

   NTL_GEXEC_RANGE_END
}
3214
3215
// Blocked X = A*B for half-word moduli, using uhlong operands and plain
// unsigned long accumulators.  Layout mirrors blk_mul_LL (A pre-packed in
// panels, B packed transposed per block), but here the kernel does not
// reduce, so accumulated values are reduced mod p every red_trigger
// products to avoid unsigned-long overflow.
static
void blk_mul_L(const mat_window_zz_p& X,
               const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
   long n = A.NumRows();
   long l = A.NumCols();
   long m = B.NumCols();

   long p = zz_p::modulus();
   sp_reduce_struct red_struct = zz_p::red_struct();

   // pack A: one panel of n x MAT_BLK_SZ half-word values per column block
   Vec< Vec<uhlong> > A_buf;
   Vec<uhlong*> abufp;
   long npanels = (l+MAT_BLK_SZ-1)/MAT_BLK_SZ;
   A_buf.SetLength(npanels);
   abufp.SetLength(npanels);

   for (long kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
      long k_max = min(kk+MAT_BLK_SZ, l);

      A_buf[panel].SetLength(n * MAT_BLK_SZ);
      uhlong *abp = A_buf[panel].elts();
      abufp[panel] = abp;

      for (long i = 0; i < n; i++, abp += MAT_BLK_SZ) {
         const zz_p *ap1 = &A[i][0];
         for (long k = kk; k < k_max; k++) {
            abp[k-kk] = rep(ap1[k]);
         }
         // zero-pad the last (partial) panel
         for (long k = k_max; k < kk+MAT_BLK_SZ; k++) {
            abp[k-kk] = 0;
         }
      }
   }

   long nxpanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;

   // run sequentially when the problem is too small to parallelize
   const bool seq = double(n)*double(l)*double(m) < PAR_THRESH;

   // parallelize over panels of output columns
   NTL_GEXEC_RANGE(seq, nxpanels, first, last)
   NTL_IMPORT(n)
   NTL_IMPORT(l)
   NTL_IMPORT(m)
   NTL_IMPORT(p)
   NTL_IMPORT(red_struct)

   // per-thread scratch: one packed (transposed) block of B ...
   UniqueArray<uhlong> B_rec;
   B_rec.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
   uhlong *brec = B_rec.get();

   // ... and the running output panel of unsigned accumulators
   UniqueArray<unsigned long> X_buf;
   X_buf.SetLength(n*MAT_BLK_SZ);
   unsigned long *xbp = X_buf.get();

   long jj, kk;
   long i, j, k;
   long panel;
   long xpanel;

   for (xpanel = first, jj = first*MAT_BLK_SZ; xpanel < last;
        xpanel++, jj += MAT_BLK_SZ) {

      long j_max = min(jj+MAT_BLK_SZ, m);

      for (i = 0; i < n*MAT_BLK_SZ; i++) xbp[i] = 0;

      // how many products fit in an unsigned long before reduction is
      // required
      unsigned long ured_trigger =
         (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
      // NOTE: corner case at p == 2: need unsigned long to prevent overflow

      long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);

      long red_count = red_trigger;

      for (kk = 0, panel = 0; kk < l; kk += MAT_BLK_SZ, panel++) {
         long k_max = min(kk+MAT_BLK_SZ, l);

         // fill brec, transposed

         for (k = kk; k < k_max; k++) {
            const zz_p *bp = &B[k][0];
            for (j = jj; j < j_max; j++)
               brec[(k-kk)+(j-jj)*MAT_BLK_SZ] = rep(bp[j]);
            for (j = j_max; j < jj+MAT_BLK_SZ; j++)
               brec[(k-kk)+(j-jj)*MAT_BLK_SZ] = 0;
         }

         // periodic reduction to keep the accumulators from overflowing
         if (red_count-MAT_BLK_SZ < 0) {
            red_count = red_trigger;
            for (i = 0; i < n*MAT_BLK_SZ; i++)
               xbp[i] = rem(xbp[i], p, red_struct);
         }

         red_count = red_count-MAT_BLK_SZ;

         const uhlong *abp = abufp[panel];

         // half-word kernel overload: accumulates without reducing
         muladd_all_by_32_width(0, n, xbp, abp, brec, k_max-kk, j_max-jj);
      }


      // final reduction and write-back of the output panel
      for (i = 0; i < n; i++) {
         zz_p *xp = &X[i][0];
         for (j = jj; j < j_max; j++)
            xp[j].LoopHole() =
               rem(xbp[i*MAT_BLK_SZ + (j-jj)], p, red_struct);
      }
   }

   NTL_GEXEC_RANGE_END
}
3327
3328
3329 #endif
3330
3331
static
// Base-case matrix multiplication X = A*B (called below the Strassen
// crossover).  Dispatches to one of several kernels based on the matrix
// dimensions and how much headroom the modulus p leaves for unreduced
// accumulation:
//   - blk_mul_DD:  AVX double-precision blocks (needs p small enough that
//                  V = 4*MAT_BLK_SZ accumulations fit in MAX_DBL_INT)
//   - alt_mul_L / alt_mul_LL:  non-blocked kernels for small dimensions
//   - blk_mul_L / blk_mul_LL:  blocked single-word / double-word kernels
void mul_base (const mat_window_zz_p& X,
               const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
{
   long n = A.NumRows();
   long l = A.NumCols();
   long m = B.NumCols();

   // empty product: result is the zero matrix
   if (n == 0 || l == 0 || m == 0) {
      clear(X);
      return;
   }

#ifndef NTL_HAVE_LL_TYPE

   // no double-word type available: fall back to the generic kernel
   basic_mul(X, A, B);

#else

   long p = zz_p::modulus();
   long V = MAT_BLK_SZ*4;

#ifdef NTL_HAVE_AVX

   // experimentally, blk_mul_DD beats all the alternatives
   // if each dimension is at least 16

   if (n >= 16 && l >= 16 && m >= 16 &&
       p-1 <= MAX_DBL_INT &&
       V <= (MAX_DBL_INT-(p-1))/(p-1) &&
       V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1))
   {
      if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("number too big");
      if (NTL_OVERFLOW(l, MAT_BLK_SZ, 0)) ResourceError("number too big");
      if (NTL_OVERFLOW(m, MAT_BLK_SZ, 0)) ResourceError("number too big");

      //cerr << "blk_mul_DD\n";
      blk_mul_DD(X, A, B);
      return;
   }
#endif


   if (n < 32 || l < 32 || m < 32) {

      // small dimensions: the non-blocked kernels win
      if (InnerProd_L_viable(l, p)) {
         //cerr << "alt_mul_L\n";
         alt_mul_L(X, A, B);
      }
      else {
         //cerr << "alt_mul_LL\n";
         alt_mul_LL(X, A, B);
      }

   }
   else {

      // Experimentally, the block versions are better when all dimensions
      // are at least 32

      if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("number too big");
      if (NTL_OVERFLOW(l, MAT_BLK_SZ, 0)) ResourceError("number too big");
      if (NTL_OVERFLOW(m, MAT_BLK_SZ, 0)) ResourceError("number too big");

      // use the single-word kernel when V accumulations of (p-1)^2
      // cannot overflow an unsigned long; otherwise the double-word one
      if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
          cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1)) {
         //cerr << "blk_mul_L\n";
         blk_mul_L(X, A, B);

      }
      else {
         //cerr << "blk_mul_LL\n";
         blk_mul_LL(X, A, B);
      }

   }

#endif


}
3415
3416
3417
3418 // The following implementation of Strassen is derived directly
3419 // from the implementation in FLINT (see http://www.flintlib.org),
3420 // although a number of details have changed.
3421 // The following copyright notice appears in the relevant
3422 // file, which can be obtained at
3423 // https://github.com/fredrik-johansson/flint2/blob/trunk/nmod_mat/mul_strassen.c
3424 // committed on April 26, 2016.
3425
3426 /*
3427 Copyright (C) 2008, Martin Albrecht
3428 Copyright (C) 2008, 2009 William Hart.
3429 Copyright (C) 2010, Fredrik Johansson
3430 This file is part of FLINT.
3431 FLINT is free software: you can redistribute it and/or modify it under
3432 the terms of the GNU Lesser General Public License (LGPL) as published
3433 by the Free Software Foundation; either version 2.1 of the License, or
3434 (at your option) any later version. See <http://www.gnu.org/licenses/>.
3435 */
3436
3437
mul_strassen(const mat_window_zz_p & C,const const_mat_window_zz_p & A,const const_mat_window_zz_p & B)3438 void mul_strassen(const mat_window_zz_p& C,
3439 const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)
3440 {
3441 long a, b, c;
3442 long anr, anc, bnr, bnc;
3443
3444
3445 a = A.NumRows();
3446 b = A.NumCols();
3447 c = B.NumCols();
3448
3449
3450 bool use_DD = false;
3451 // this code determines if mul_base triggers blk_mul_DD,
3452 // in which case a higher crossover is used
3453
3454 #if (defined(NTL_HAVE_LL_TYPE) && defined(NTL_HAVE_AVX))
3455 {
3456 long V = MAT_BLK_SZ*4;
3457 long p = zz_p::modulus();
3458
3459 if (p-1 <= MAX_DBL_INT &&
3460 V <= (MAX_DBL_INT-(p-1))/(p-1) &&
3461 V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1))
3462 {
3463 use_DD = true;
3464 }
3465 }
3466 #endif
3467
3468 long nt = AvailableThreads();
3469
3470 long xover;
3471 // now we set the crossover -- it is kind of a heauristic
3472 // mess based on nt and use_DD...I've run some tests to
3473 // make sure these settings are reasonable, but a more
3474 // rational approach would be preferable
3475
3476 if (nt > 1) {
3477 if (use_DD || nt > 8192/(2*MAT_BLK_SZ))
3478 xover = 8192;
3479 else
3480 xover = max(800, nt*2*MAT_BLK_SZ);
3481 }
3482 else {
3483 if (use_DD)
3484 xover = 800;
3485 else
3486 xover = 448;
3487 }
3488
3489 if (a <= xover || b <= xover || c <= xover)
3490 {
3491 mul_base(C, A, B);
3492 return;
3493 }
3494
3495 anr = a / 2;
3496 anc = b / 2;
3497 bnr = anc;
3498 bnc = c / 2;
3499
3500 const_mat_window_zz_p A11(A, 0, 0, anr, anc);
3501 const_mat_window_zz_p A12(A, 0, anc, anr, 2*anc);
3502 const_mat_window_zz_p A21(A, anr, 0, 2*anr, anc);
3503 const_mat_window_zz_p A22(A, anr, anc, 2*anr, 2*anc);
3504
3505 const_mat_window_zz_p B11(B, 0, 0, bnr, bnc);
3506 const_mat_window_zz_p B12(B, 0, bnc, bnr, 2*bnc);
3507 const_mat_window_zz_p B21(B, bnr, 0, 2*bnr, bnc);
3508 const_mat_window_zz_p B22(B, bnr, bnc, 2*bnr, 2*bnc);
3509
3510 mat_window_zz_p C11(C, 0, 0, anr, bnc);
3511 mat_window_zz_p C12(C, 0, bnc, anr, 2*bnc);
3512 mat_window_zz_p C21(C, anr, 0, 2*anr, bnc);
3513 mat_window_zz_p C22(C, anr, bnc, 2*anr, 2*bnc);
3514
3515 mat_zz_p X1_store;
3516 X1_store.SetDims(anr, max(bnc, anc));
3517
3518 mat_window_zz_p X1a(X1_store, 0, 0, anr, anc);
3519 mat_window_zz_p X1b(X1_store, 0, 0, anr, bnc);
3520
3521 mat_zz_p X2;
3522 X2.SetDims(anc, bnc);
3523
3524 /*
3525 See Jean-Guillaume Dumas, Clement Pernet, Wei Zhou; "Memory
3526 efficient scheduling of Strassen-Winograd's matrix multiplication
3527 algorithm"; http://arxiv.org/pdf/0707.2347v3 for reference on the
3528 used operation scheduling.
3529 */
3530
3531 sub(X1a, A11, A21);
3532 sub(X2, B22, B12);
3533 mul_strassen(C21, X1a, X2);
3534
3535 add(X1a, A21, A22);
3536 sub(X2, B12, B11);
3537 mul_strassen(C22, X1a, X2);
3538
3539 sub(X1a, X1a, A11);
3540 sub(X2, B22, X2);
3541 mul_strassen(C12, X1a, X2);
3542
3543 sub(X1a, A12, X1a);
3544 mul_strassen(C11, X1a, B22);
3545
3546
3547 mul_strassen(X1b, A11, B11);
3548
3549 add(C12, X1b, C12);
3550 add(C21, C12, C21);
3551 add(C12, C12, C22);
3552 add(C22, C21, C22);
3553 add(C12, C12, C11);
3554 sub(X2, X2, B21);
3555 mul_strassen(C11, A22, X2);
3556
3557 X2.kill();
3558
3559 sub(C21, C21, C11);
3560 mul_strassen(C11, A12, B21);
3561
3562 add(C11, X1b, C11);
3563
3564 X1_store.kill();
3565
3566 if (c > 2*bnc) /* A by last col of B -> last col of C */
3567 {
3568 const_mat_window_zz_p Bc(B, 0, 2*bnc, b, c);
3569 mat_window_zz_p Cc(C, 0, 2*bnc, a, c);
3570
3571 mul_strassen(Cc, A, Bc);
3572 }
3573
3574 if (a > 2*anr) /* last row of A by B -> last row of C */
3575 {
3576 const_mat_window_zz_p Ar(A, 2*anr, 0, a, b);
3577 mat_window_zz_p Cr(C, 2*anr, 0, a, c);
3578 mul_strassen(Cr, Ar, B);
3579 }
3580
3581 if (b > 2*anc) /* last col of A by last row of B -> C */
3582 {
3583 const_mat_window_zz_p Ac(A, 0, 2*anc, 2*anr, b);
3584 const_mat_window_zz_p Br(B, 2*bnr, 0, b, 2*bnc);
3585 mat_window_zz_p Cb(C, 0, 0, 2*anr, 2*bnc);
3586
3587 // Cb += Ac*Br
3588 mat_zz_p tmp;
3589 tmp.SetDims(Cb.NumRows(), Cb.NumCols());
3590 mul_strassen(tmp, Ac, Br);
3591 add(Cb, Cb, tmp);
3592 }
3593 }
3594
3595
3596
3597
3598
3599
3600
3601 static
mul_aux(mat_zz_p & X,const mat_zz_p & A,const mat_zz_p & B)3602 void mul_aux(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
3603 {
3604 long n = A.NumRows();
3605 long l = A.NumCols();
3606 long m = B.NumCols();
3607
3608 if (l != B.NumRows())
3609 LogicError("matrix mul: dimension mismatch");
3610
3611 X.SetDims(n, m);
3612
3613 if (n == 0 || l == 0 || m == 0) {
3614 clear(X);
3615 return;
3616 }
3617
3618 mul_strassen(X, A, B);
3619 }
3620
3621
mul(mat_zz_p & X,const mat_zz_p & A,const mat_zz_p & B)3622 void mul(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
3623 {
3624 if (&X == &A || &X == &B) {
3625 mat_zz_p tmp;
3626 mul_aux(tmp, A, B);
3627 X = tmp;
3628 }
3629 else
3630 mul_aux(X, A, B);
3631 }
3632
3633
3634 // ******************************************************************
3635 //
3636 // Matrix inversion code
3637 //
3638 // ******************************************************************
3639
3640 static
relaxed_InvModStatus(long & x,long a,long n,bool relax)3641 long relaxed_InvModStatus(long& x, long a, long n, bool relax)
3642 {
3643 if (relax) {
3644 return InvModStatus(x, a, n);
3645 }
3646 else {
3647 x = InvMod(a, n);
3648 return 0;
3649 }
3650 }
3651
static
// Generic Gauss-Jordan matrix inversion over zz_p.
// On success: X = A^{-1} and d = det(A).  If no invertible pivot can be
// found in some column, d is set to 0 and X is left unspecified.
// relax selects the pivot-inversion policy (see relaxed_InvModStatus):
// with relax == false the code also works modulo prime powers.
// Elimination of the non-pivot rows is parallelized via NTL_GEXEC_RANGE.
void basic_inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("inv: nonsquare matrix");

   // 0x0 matrix: determinant 1, empty inverse
   if (n == 0) {
      set(d);
      X.SetDims(0, 0);
      return;
   }


   Mat<long> M;
   conv(M, A);
   // scratch space

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations

   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();

   // parallelize only for sufficiently large matrices
   bool seq = n < PAR_THRESH_SQ;

   bool pivoting = false;

   for (long k = 0; k < n; k++) {
      // find the first invertible pivot in column k at or below row k
      long pos = -1;
      long pivot_inv;
      for (long i = k; i < n; i++) {
         // NOTE: by using InvModStatus, this code will work
         // for prime-powers as well as primes
         long pivot = M[i][k];
         if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
            pos = i;
            break;
         }
      }

      if (pos != -1) {
         if (k != pos) {
            swap(M[pos], M[k]);
            det = NegateMod(det, p);   // row swap flips the sign of det
            P[k] = pos;
            pivoting = true;
         }

         det = MulMod(det, M[k][k], p);

         {
            // multiply row k by pivot_inv
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
            long *y = &M[k][0];
            for (long j = 0; j < n; j++)
               y[j] = MulModPrecon(y[j], t1, p, t1pinv);

            // Gauss-Jordan trick: the pivot column of the work matrix
            // accumulates the corresponding column of the inverse
            y[k] = pivot_inv;
         }



         // eliminate column k from every other row (parallel over rows)
         NTL_GEXEC_RANGE(seq, n, first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(n)
         NTL_IMPORT(k)
         long *y = &M[k][0];
         for (long i = first; i < last; i++) {
            if (i == k) continue; // skip row k

            long *x = &M[i][0];
            long t1 = x[k];
            t1 = NegateMod(t1, p);
            x[k] = 0;
            if (t1 == 0) continue;

            // add t1 * row k to row i
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);

            for (long j = 0; j < n; j++) {
               long t2 = MulModPrecon(y[j], t1, p, t1pinv);
               x[j] = AddMod(x[j], t2, p);
            }
         }
         NTL_GEXEC_RANGE_END
      }
      else {
         // no invertible pivot found: matrix is singular
         clear(d);
         return;
      }
   }

   if (pivoting) {
      // pivot columns, using reverse swap sequence

      for (long i = 0; i < n; i++) {
         long *x = &M[i][0];

         for (long k = n-1; k >= 0; k--) {
            long pos = P[k];
            if (pos != k) _ntl_swap(x[pos], x[k]);
         }
      }
   }

   X.SetDims(n, n);
   for (long i = 0; i < n; i++)
      for (long j = 0; j < n; j++)
         X[i][j].LoopHole() = M[i][j];

   d.LoopHole() = det;
}
3772
3773
3774
3775 #ifdef NTL_HAVE_LL_TYPE
3776
3777
3778
static
// Gauss-Jordan inversion over zz_p using an unsigned long work matrix
// with lazy modular reduction: entries are allowed to grow unreduced and
// are only reduced mod p when the red_count budget says the next
// elimination step could overflow.  Otherwise the structure mirrors
// basic_inv: d = det(A) and X = A^{-1} on success, d = 0 if singular.
void alt_inv_L(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("inv: nonsquare matrix");

   if (n == 0) {
      set(d);
      X.SetDims(0, 0);
      return;
   }


   Mat<unsigned long> M;
   conv(M, A);
   // scratch space

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations

   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_reduce_struct red_struct = zz_p::red_struct();



   bool seq = n < PAR_THRESH_SQ;

   bool pivoting = false;

   // number of unreduced multiply-add steps that fit in an unsigned long
   unsigned long ured_trigger =
      (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
   // NOTE: corner case at p == 2: need unsigned long to prevent overflow

   long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);

   long red_count = red_trigger;


   for (long k = 0; k < n; k++) {
      bool cleanup = false;

      // budget exhausted: schedule a full reduction of the work matrix
      if (red_count-1 < 0) {
         red_count = red_trigger;
         cleanup = true;
      }

      red_count = red_count-1;

      long pos = -1;
      long pivot;
      long pivot_inv;

      for (long i = k; i < n; i++) {
         // NOTE: by using InvModStatus, this code will work
         // for prime-powers as well as primes
         pivot = rem(M[i][k], p, red_struct);
         if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
            pos = i;
            break;
         }
      }

      if (pos != -1) {
         if (k != pos) {
            swap(M[pos], M[k]);
            det = NegateMod(det, p);
            P[k] = pos;
            pivoting = true;
         }

         det = MulMod(det, pivot, p);

         {
            // multiply row k by pivot_inv (reducing entries first)
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
            unsigned long *y = &M[k][0];
            for (long j = 0; j < n; j++) {
               long t2 = rem(y[j], p, red_struct);
               y[j] = MulModPrecon(t2, t1, p, t1pinv);
            }

            // pivot column accumulates the inverse (Gauss-Jordan trick)
            y[k] = pivot_inv;
         }


         NTL_GEXEC_RANGE(seq, n, first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(n)
         NTL_IMPORT(k)
         NTL_IMPORT(red_struct)
         unsigned long *y = &M[k][0];
         if (cleanup) {
            for (long i = first; i < last; i++) {
               if (i == k) continue;
               // skip row k: the data won't change, but it
               // technically is a race condition in a multi-threaded
               // execution

               unsigned long *x = &M[i][0];
               for (long j = 0; j < n; j++) {
                  x[j] = rem(x[j], p, red_struct);
               }
            }
         }


         for (long i = first; i < last; i++) {
            if (i == k) continue; // skip row k

            unsigned long *x = &M[i][0];
            long t1 = rem(x[k], p, red_struct);
            t1 = NegateMod(t1, p);
            x[k] = 0;
            if (t1 == 0) continue;

            // add t1 * row k to row i -- unreduced accumulation,
            // manually unrolled by 4
            unsigned long ut1 = t1;
            long j;
            for (j = 0; j <= n-4; j+=4) {
               unsigned long xj0 = x[j+0] + DO_MUL(y[j+0], ut1);
               unsigned long xj1 = x[j+1] + DO_MUL(y[j+1], ut1);
               unsigned long xj2 = x[j+2] + DO_MUL(y[j+2], ut1);
               unsigned long xj3 = x[j+3] + DO_MUL(y[j+3], ut1);
               x[j+0] = xj0;
               x[j+1] = xj1;
               x[j+2] = xj2;
               x[j+3] = xj3;
            }
            for (; j < n; j++) {
               x[j] += DO_MUL(y[j], ut1);
            }
         }
         NTL_GEXEC_RANGE_END
      }
      else {
         // no invertible pivot: singular
         clear(d);
         return;
      }
   }

   if (pivoting) {
      // pivot columns, using reverse swap sequence

      for (long i = 0; i < n; i++) {
         unsigned long *x = &M[i][0];

         for (long k = n-1; k >= 0; k--) {
            long pos = P[k];
            if (pos != k) _ntl_swap(x[pos], x[k]);
         }
      }
   }

   // final reduction while copying out
   X.SetDims(n, n);
   for (long i = 0; i < n; i++)
      for (long j = 0; j < n; j++)
         X[i][j].LoopHole() = rem(M[i][j], p, red_struct);

   d.LoopHole() = det;
}
3948
3949
3950
3951
3952
3953 #ifdef NTL_HAVE_AVX
3954
static
// AVX variant of Gauss-Jordan inversion: the work matrix is stored in
// aligned double-precision rows so the elimination inner loop can use
// the vectorized muladd_interval1 kernel.  Entries are kept unreduced
// (as integers representable in a double) and reduced lazily when the
// red_count budget runs out.  d = det(A), X = A^{-1}; d = 0 if singular.
void alt_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("inv: nonsquare matrix");

   if (n == 0) {
      set(d);
      X.SetDims(0, 0);
      return;
   }

   // aligned rows, as required by the AVX kernels
   Vec< AlignedArray<double> > M;
   M.SetLength(n);
   for (long i = 0; i < n; i++) M[i].SetLength(n);

   for (long i = 0; i < n; i++) {
      for (long j = 0; j < n; j++)
         M[i][j] = rep(A[i][j]);
   }


   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations

   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_reduce_struct red_struct = zz_p::red_struct();



   bool seq = n < PAR_THRESH_SQ;

   bool pivoting = false;

   // number of unreduced multiply-add steps before a double can no
   // longer represent the accumulated value exactly
   long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
   long red_count = red_trigger;

   for (long k = 0; k < n; k++) {
      bool cleanup = false;

      if (red_count-1 < 0) {
         red_count = red_trigger;
         cleanup = true;
      }

      red_count = red_count-1;

      long pos = -1;
      long pivot;
      long pivot_inv;



      for (long i = k; i < n; i++) {
         // NOTE: by using InvModStatus, this code will work
         // for prime-powers as well as primes
         pivot = rem((unsigned long)(long)M[i][k], p, red_struct);
         if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
            pos = i;
            break;
         }
      }

      if (pos != -1) {
         if (k != pos) {
            swap(M[pos], M[k]);
            det = NegateMod(det, p);
            P[k] = pos;
            pivoting = true;
         }

         det = MulMod(det, pivot, p);

         {
            // multiply row k by pivot_inv (reducing entries first)
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
            double *y = &M[k][0];
            for (long j = 0; j < n; j++) {
               long t2 = rem((unsigned long)(long)y[j], p, red_struct);
               y[j] = MulModPrecon(t2, t1, p, t1pinv);
            }

            // pivot column accumulates the inverse (Gauss-Jordan trick)
            y[k] = pivot_inv;
         }


         NTL_GEXEC_RANGE(seq, n, first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(n)
         NTL_IMPORT(k)
         NTL_IMPORT(red_struct)
         double *y = &M[k][0];
         if (cleanup) {
            for (long i = first; i < last; i++) {
               if (i == k) continue;
               // skip row k: the data won't change, but it
               // technically is a race condition in a multi-threaded
               // execution

               double *x = &M[i][0];
               for (long j = 0; j < n; j++) {
                  x[j] = rem((unsigned long)(long)x[j], p, red_struct);
               }
            }
         }


         for (long i = first; i < last; i++) {
            if (i == k) continue; // skip row k

            double *x = &M[i][0];
            long t1 = rem((unsigned long)(long)x[k], p, red_struct);
            t1 = NegateMod(t1, p);
            x[k] = 0;
            if (t1 == 0) continue;

            // add t1 * row k to row i (vectorized AVX kernel)
            double ut1 = t1;
            muladd_interval1(x, y, ut1, n);
         }
         NTL_GEXEC_RANGE_END
      }
      else {
         // no invertible pivot: singular
         clear(d);
         return;
      }
   }


   if (pivoting) {
      // pivot columns, using reverse swap sequence

      for (long i = 0; i < n; i++) {
         double *x = &M[i][0];

         for (long k = n-1; k >= 0; k--) {
            long pos = P[k];
            if (pos != k) _ntl_swap(x[pos], x[k]);
         }
      }
   }


   // final reduction while copying out
   X.SetDims(n, n);
   for (long i = 0; i < n; i++)
      for (long j = 0; j < n; j++)
         X[i][j].LoopHole() = rem((unsigned long)(long)M[i][j], p, red_struct);

   d.LoopHole() = det;
}
4114
4115 #endif
4116
4117
4118
4119
4120
4121 #ifdef NTL_HAVE_AVX
4122
static
// Panel-blocked AVX Gauss-Jordan inversion.  The matrix is stored as
// column panels of width MAT_BLK_SZ (each panel an aligned array of
// doubles, n rows).  For each diagonal panel the code performs the full
// pivot search / row scaling / elimination within that panel, then
// applies the accumulated transformation to all other panels in
// parallel via the blocked muladd_all_by_32 kernel.  Reduction mod p is
// lazy, budgeted by red_count.  d = det(A), X = A^{-1}; d = 0 if singular.
void blk_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("inv: nonsquare matrix");

   if (n == 0) {
      set(d);
      X.SetDims(0, 0);
      return;
   }

   if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");

   long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;


   Vec< AlignedArray<double> > M;
   M.SetLength(npanels);
   for (long panel = 0; panel < npanels; panel++) {
      M[panel].SetLength(n*MAT_BLK_SZ);
      double *panelp = &M[panel][0];

      // zero-fill so the ragged last panel is zero-padded
      for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
   }

   // copy A into panels
   for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
      long j_max = min(jj+MAT_BLK_SZ, n);
      double *panelp = &M[panel][0];

      for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
         const zz_p *ap = A[i].elts() + jj;

         for (long j = jj; j < j_max; j++)
            panelp[j-jj] = rep(ap[j-jj]);
      }
   }

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations


   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_reduce_struct red_struct = zz_p::red_struct();


   bool seq = double(n)*double(n)*double(MAT_BLK_SZ) < PAR_THRESH;

   bool pivoting = false;

   // number of unreduced multiply-add steps before a double can no
   // longer represent the accumulated value exactly
   long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
   long red_count = red_trigger;

   for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
      long k_max = min(kk+MAT_BLK_SZ, n);

      bool cleanup = false;

      // budget exhausted: schedule a full reduction of every panel
      if (red_count-MAT_BLK_SZ < 0) {
         red_count = red_trigger;
         cleanup = true;
      }

      red_count = red_count-MAT_BLK_SZ;
      double *kpanelp = &M[kpanel][0];

      if (cleanup) {
         for (long r = 0; r < n*MAT_BLK_SZ; r++)
            kpanelp[r] = rem((unsigned long)(long)kpanelp[r], p, red_struct);
      }

      // Gauss-Jordan elimination restricted to the diagonal panel
      for (long k = kk; k < k_max; k++) {

         long pos = -1;
         long pivot;
         long pivot_inv;

         for (long i = k; i < n; i++) {
            // NOTE: by using InvModStatus, this code will work
            // for prime-powers as well as primes
            pivot = rem((unsigned long)(long)kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
            if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
               pos = i;
               break;
            }
         }

         if (pos == -1) {
            // no invertible pivot: singular
            clear(d);
            return;
         }

         double *y = &kpanelp[k*MAT_BLK_SZ];
         if (k != pos) {
            // swap rows pos and k
            double *x = &kpanelp[pos*MAT_BLK_SZ];
            for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);

            det = NegateMod(det, p);
            P[k] = pos;
            pivoting = true;
         }

         det = MulMod(det, pivot, p);

         {
            // multiply row k by pivot_inv
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
            for (long j = 0; j < MAT_BLK_SZ; j++) {
               long t2 = rem((unsigned long)(long)y[j], p, red_struct);
               y[j] = MulModPrecon(t2, t1, p, t1pinv);
            }

            // pivot column accumulates the inverse (Gauss-Jordan trick)
            y[k-kk] = pivot_inv;
         }

         for (long i = 0; i < n; i++) {
            if (i == k) continue; // skip row k

            double *x = &kpanelp[i*MAT_BLK_SZ];
            long t1 = rem((unsigned long)(long)x[k-kk], p, red_struct);
            t1 = NegateMod(t1, p);
            x[k-kk] = 0;
            if (t1 == 0) continue;

            // add t1 * row k to row i
            double ut1 = t1;
            muladd_interval(x, y, ut1, MAT_BLK_SZ);
         }
      }


      // finished processing current kpanel
      // next, reduce and apply to all other kpanels

      for (long r = 0; r < n*MAT_BLK_SZ; r++)
         kpanelp[r] = rem((unsigned long)(long)kpanelp[r], p, red_struct);

      // special processing: subtract 1 off of diagonal

      for (long k = kk; k < k_max; k++)
         kpanelp[k*MAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);


      // apply the recorded swaps and the elimination transform to all
      // other panels, parallelized over panels
      NTL_GEXEC_RANGE(seq, npanels, first, last)
      NTL_IMPORT(p)
      NTL_IMPORT(n)
      NTL_IMPORT(red_struct)
      NTL_IMPORT(kpanel)
      NTL_IMPORT(kpanelp)
      NTL_IMPORT(kk)
      NTL_IMPORT(k_max)


      AlignedArray<double> buf_store;
      buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
      double *buf = &buf_store[0];

      for (long jpanel = first; jpanel < last; jpanel++) {
         if (jpanel == kpanel) continue;

         double *jpanelp = &M[jpanel][0];

         if (cleanup) {
            for (long r = 0; r < n*MAT_BLK_SZ; r++)
               jpanelp[r] = rem((unsigned long)(long)jpanelp[r], p, red_struct);
         }

         // perform swaps
         for (long k = kk; k < k_max; k++) {
            long pos = P[k];
            if (pos != k) {
               // swap rows pos and k
               double *pos_p = &jpanelp[pos*MAT_BLK_SZ];
               double *k_p = &jpanelp[k*MAT_BLK_SZ];
               for (long j = 0; j < MAT_BLK_SZ; j++)
                  _ntl_swap(pos_p[j], k_p[j]);
            }
         }

         // copy block number kpanel (the one on the diagonal) into buf

         for (long i = 0; i < (k_max-kk)*MAT_BLK_SZ; i++)
            buf[i] = rem((unsigned long)(long)jpanelp[kk*MAT_BLK_SZ+i], p, red_struct);

         // jpanel += kpanel*buf

         muladd_all_by_32(0, n, jpanelp, kpanelp, buf, k_max-kk);
      }

      NTL_GEXEC_RANGE_END

      // special processing: add 1 back to the diagonal

      for (long k = kk; k < k_max; k++)
         kpanelp[k*MAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);

   }

   if (pivoting) {
      // pivot columns, using reverse swap sequence

      for (long k = n-1; k >= 0; k--) {
         long pos = P[k];
         if (pos != k) {
            // swap columns pos and k

            double *x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
            double *y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
            for (long i = 0; i < n; i++) {
               _ntl_swap(x[i*MAT_BLK_SZ], y[i*MAT_BLK_SZ]);
            }
         }
      }
   }


   // copy panels into X
   X.SetDims(n, n);
   for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
      long j_max = min(jj+MAT_BLK_SZ, n);
      double *panelp = &M[panel][0];

      for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
         zz_p *xp = X[i].elts() + jj;

         for (long j = jj; j < j_max; j++)
            xp[j-jj].LoopHole() = rem((unsigned long)(long)panelp[j-jj], p, red_struct);
      }
   }

   d.LoopHole() = det;

}
4367
4368 #endif
4369
4370
4371
4372 static
blk_inv_L(zz_p & d,mat_zz_p & X,const mat_zz_p & A,bool relax)4373 void blk_inv_L(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
4374 {
4375 long n = A.NumRows();
4376
4377 if (A.NumCols() != n)
4378 LogicError("inv: nonsquare matrix");
4379
4380 if (n == 0) {
4381 set(d);
4382 X.SetDims(0, 0);
4383 return;
4384 }
4385
4386 if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
4387
4388 long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
4389
4390 Vec< UniqueArray<unsigned long> > M;
4391 M.SetLength(npanels);
4392 for (long panel = 0; panel < npanels; panel++) {
4393 M[panel].SetLength(n*MAT_BLK_SZ);
4394 unsigned long *panelp = &M[panel][0];
4395
4396 for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
4397 }
4398
4399 // copy A into panels
4400 for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
4401 long j_max = min(jj+MAT_BLK_SZ, n);
4402 unsigned long *panelp = &M[panel][0];
4403
4404 for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
4405 const zz_p *ap = A[i].elts() + jj;
4406
4407 for (long j = jj; j < j_max; j++)
4408 panelp[j-jj] = rep(ap[j-jj]);
4409 }
4410 }
4411
4412 Vec<long> P;
4413 P.SetLength(n);
4414 for (long k = 0; k < n; k++) P[k] = k;
4415 // records swap operations
4416
4417
4418 long det;
4419 det = 1;
4420
4421 long p = zz_p::modulus();
4422 mulmod_t pinv = zz_p::ModulusInverse();
4423 sp_reduce_struct red_struct = zz_p::red_struct();
4424
4425
4426 bool seq = double(n)*double(n)*double(MAT_BLK_SZ) < PAR_THRESH;
4427
4428 bool pivoting = false;
4429
4430 unsigned long ured_trigger =
4431 (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
4432 // NOTE: corner case at p == 2: need unsigned long to prevent overflow
4433
4434 long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);
4435
4436 long red_count = red_trigger;
4437
4438 for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
4439 long k_max = min(kk+MAT_BLK_SZ, n);
4440
4441 bool cleanup = false;
4442
4443 if (red_count-MAT_BLK_SZ < 0) {
4444 red_count = red_trigger;
4445 cleanup = true;
4446 }
4447
4448 red_count = red_count-MAT_BLK_SZ;
4449 unsigned long *kpanelp = &M[kpanel][0];
4450
4451 if (cleanup) {
4452 for (long r = 0; r < n*MAT_BLK_SZ; r++)
4453 kpanelp[r] = rem(kpanelp[r], p, red_struct);
4454 }
4455
4456 for (long k = kk; k < k_max; k++) {
4457
4458 long pos = -1;
4459 long pivot;
4460 long pivot_inv;
4461
4462 for (long i = k; i < n; i++) {
4463 // NOTE: by using InvModStatus, this code will work
4464 // for prime-powers as well as primes
4465 pivot = rem(kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
4466 if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
4467 pos = i;
4468 break;
4469 }
4470 }
4471
4472 if (pos == -1) {
4473 clear(d);
4474 return;
4475 }
4476
4477 unsigned long *y = &kpanelp[k*MAT_BLK_SZ];
4478 if (k != pos) {
4479 // swap rows pos and k
4480 unsigned long *x = &kpanelp[pos*MAT_BLK_SZ];
4481 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
4482
4483 det = NegateMod(det, p);
4484 P[k] = pos;
4485 pivoting = true;
4486 }
4487
4488 det = MulMod(det, pivot, p);
4489
4490 {
4491 // multiply row k by pivot_inv
4492 long t1 = pivot_inv;
4493 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
4494 for (long j = 0; j < MAT_BLK_SZ; j++) {
4495 long t2 = rem(y[j], p, red_struct);
4496 y[j] = MulModPrecon(t2, t1, p, t1pinv);
4497 }
4498
4499 y[k-kk] = pivot_inv;
4500 }
4501
4502 for (long i = 0; i < n; i++) {
4503 if (i == k) continue; // skip row k
4504
4505 unsigned long *x = &kpanelp[i*MAT_BLK_SZ];
4506 long t1 = rem(x[k-kk], p, red_struct);
4507 t1 = NegateMod(t1, p);
4508 x[k-kk] = 0;
4509 if (t1 == 0) continue;
4510
4511 // add t1 * row k to row i
4512 unsigned long ut1 = t1;
4513 muladd_interval(x, y, ut1, MAT_BLK_SZ);
4514 }
4515 }
4516
4517
4518 // finished processing current kpanel
4519 // next, reduce and apply to all other kpanels
4520
4521 for (long r = 0; r < n*MAT_BLK_SZ; r++)
4522 kpanelp[r] = rem(kpanelp[r], p, red_struct);
4523
4524 // special processing: subtract 1 off of diangonal
4525
4526 for (long k = kk; k < k_max; k++)
4527 kpanelp[k*MAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);
4528
4529
4530 NTL_GEXEC_RANGE(seq, npanels, first, last)
4531 NTL_IMPORT(p)
4532 NTL_IMPORT(n)
4533 NTL_IMPORT(red_struct)
4534 NTL_IMPORT(kpanel)
4535 NTL_IMPORT(kpanelp)
4536 NTL_IMPORT(kk)
4537 NTL_IMPORT(k_max)
4538
4539
4540 UniqueArray<unsigned long> buf_store;
4541 buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
4542 unsigned long *buf = &buf_store[0];
4543
4544 for (long jpanel = first; jpanel < last; jpanel++) {
4545 if (jpanel == kpanel) continue;
4546
4547 unsigned long *jpanelp = &M[jpanel][0];
4548
4549 if (cleanup) {
4550 for (long r = 0; r < n*MAT_BLK_SZ; r++)
4551 jpanelp[r] = rem(jpanelp[r], p, red_struct);
4552 }
4553
4554 // perform swaps
4555 for (long k = kk; k < k_max; k++) {
4556 long pos = P[k];
4557 if (pos != k) {
4558 // swap rows pos and k
4559 unsigned long *pos_p = &jpanelp[pos*MAT_BLK_SZ];
4560 unsigned long *k_p = &jpanelp[k*MAT_BLK_SZ];
4561 for (long j = 0; j < MAT_BLK_SZ; j++)
4562 _ntl_swap(pos_p[j], k_p[j]);
4563 }
4564 }
4565
4566 // copy block number kpanel (the one on the diagonal) into buf
4567 // here, we transpose it
4568
4569 for (long k = kk; k < k_max; k++)
4570 for (long j = 0; j < MAT_BLK_SZ; j++)
4571 buf[j*MAT_BLK_SZ + (k-kk)] =
4572 rem(jpanelp[k*MAT_BLK_SZ+j], p, red_struct);
4573
4574 // jpanel += kpanel*buf
4575
4576 muladd_all_by_32(0, n, jpanelp, kpanelp, buf, k_max-kk);
4577 }
4578
4579 NTL_GEXEC_RANGE_END
4580
4581 // special processing: add 1 back to the diagonal
4582
4583 for (long k = kk; k < k_max; k++)
4584 kpanelp[k*MAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);
4585
4586 }
4587
4588 if (pivoting) {
4589 // pivot columns, using reverse swap sequence
4590
4591 for (long k = n-1; k >= 0; k--) {
4592 long pos = P[k];
4593 if (pos != k) {
4594 // swap columns pos and k
4595
4596 unsigned long *x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
4597 unsigned long *y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
4598 for (long i = 0; i < n; i++) {
4599 _ntl_swap(x[i*MAT_BLK_SZ], y[i*MAT_BLK_SZ]);
4600 }
4601 }
4602 }
4603 }
4604
4605 // copy panels into X
4606 X.SetDims(n, n);
4607 for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
4608 long j_max = min(jj+MAT_BLK_SZ, n);
4609 unsigned long *panelp = &M[panel][0];
4610
4611 for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
4612 zz_p *xp = X[i].elts() + jj;
4613
4614 for (long j = jj; j < j_max; j++)
4615 xp[j-jj].LoopHole() = rem(panelp[j-jj], p, red_struct);
4616 }
4617 }
4618
4619 d.LoopHole() = det;
4620
4621 }
4622
4623
4624
4625
4626
4627
4628
4629
// blk_inv_LL: inverse of a square matrix over zz_p using a cache-friendly
// panel layout (each panel holds MAT_BLK_SZ columns of all n rows) and fully
// reduced long arithmetic.  On success, X = A^{-1} and d = det(A); if no
// usable pivot is found (A singular, or the pivot is not invertible under
// the given relax policy), d is set to 0 and X is left untouched.
// Unlike the unsigned-long/double variants, every elementary operation here
// is done with modular arithmetic (muladd_interval takes p and pinv), so no
// lazy-reduction "cleanup" passes are needed.
static
void blk_inv_LL(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("inv: nonsquare matrix");

   if (n == 0) {
      // empty matrix: det = 1, inverse is the 0x0 matrix
      set(d);
      X.SetDims(0, 0);
      return;
   }

   if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too big");

   long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;

   // M[panel] stores MAT_BLK_SZ consecutive columns of the work matrix,
   // row-major within the panel; trailing columns of the last panel are 0
   Vec< UniqueArray<long> > M;
   M.SetLength(npanels);
   for (long panel = 0; panel < npanels; panel++) {
      M[panel].SetLength(n*MAT_BLK_SZ);
      long *panelp = &M[panel][0];

      for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
   }

   // copy A into panels
   for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
      long j_max = min(jj+MAT_BLK_SZ, n);
      long *panelp = &M[panel][0];

      for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
         const zz_p *ap = A[i].elts() + jj;

         for (long j = jj; j < j_max; j++)
            panelp[j-jj] = rep(ap[j-jj]);
      }
   }

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations

   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();

   // run the panel-update loop sequentially when the total work is small
   bool seq = double(n)*double(n)*double(MAT_BLK_SZ) < PAR_THRESH;

   bool pivoting = false;

   // Gauss-Jordan elimination, one panel (block of MAT_BLK_SZ columns) at a time
   for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
      long k_max = min(kk+MAT_BLK_SZ, n);

      long *kpanelp = &M[kpanel][0];

      // eliminate columns kk..k_max-1 within the current panel
      for (long k = kk; k < k_max; k++) {

         long pos = -1;
         long pivot;
         long pivot_inv;

         for (long i = k; i < n; i++) {
            // NOTE: by using InvModStatus, this code will work
            // for prime-powers as well as primes
            pivot = kpanelp[i*MAT_BLK_SZ+(k-kk)];
            if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
               pos = i;
               break;
            }
         }

         if (pos == -1) {
            // no invertible pivot: signal failure via d = 0
            clear(d);
            return;
         }

         long *y = &kpanelp[k*MAT_BLK_SZ];
         if (k != pos) {
            // swap rows pos and k
            long *x = &kpanelp[pos*MAT_BLK_SZ];
            for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);

            det = NegateMod(det, p);
            P[k] = pos;
            pivoting = true;
         }

         det = MulMod(det, pivot, p);

         {
            // multiply row k by pivot_inv
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
            for (long j = 0; j < MAT_BLK_SZ; j++) {
               y[j] = MulModPrecon(y[j], t1, p, t1pinv);
            }

            // the pivot column entry becomes pivot_inv (building the inverse in place)
            y[k-kk] = pivot_inv;
         }

         // eliminate column k from every other row of this panel
         for (long i = 0; i < n; i++) {
            if (i == k) continue; // skip row k

            long *x = &kpanelp[i*MAT_BLK_SZ];
            long t1 = x[k-kk];
            t1 = NegateMod(t1, p);
            x[k-kk] = 0;
            if (t1 == 0) continue;

            // add t1 * row k to row i
            long ut1 = t1;
            muladd_interval(x, y, ut1, MAT_BLK_SZ, p, pinv);
         }
      }

      // finished processing current kpanel
      // next, reduce and apply to all other kpanels

      // special processing: subtract 1 off of diagonal
      // (so that the block update below can add kpanel*buf uniformly)

      for (long k = kk; k < k_max; k++)
         kpanelp[k*MAT_BLK_SZ+(k-kk)] = SubMod(kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);

      // apply the recorded row operations to all other panels, possibly in parallel
      NTL_GEXEC_RANGE(seq, npanels, first, last)
      NTL_IMPORT(p)
      NTL_IMPORT(n)
      NTL_IMPORT(ll_red_struct)
      NTL_IMPORT(kpanel)
      NTL_IMPORT(kpanelp)
      NTL_IMPORT(kk)
      NTL_IMPORT(k_max)

      UniqueArray<long> buf_store;
      buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
      long *buf = &buf_store[0];

      for (long jpanel = first; jpanel < last; jpanel++) {
         if (jpanel == kpanel) continue;

         long *jpanelp = &M[jpanel][0];

         // perform swaps
         for (long k = kk; k < k_max; k++) {
            long pos = P[k];
            if (pos != k) {
               // swap rows pos and k
               long *pos_p = &jpanelp[pos*MAT_BLK_SZ];
               long *k_p = &jpanelp[k*MAT_BLK_SZ];
               for (long j = 0; j < MAT_BLK_SZ; j++)
                  _ntl_swap(pos_p[j], k_p[j]);
            }
         }

         // copy block number kpanel (the one on the diagonal) into buf
         // here, we transpose it

         for (long k = kk; k < k_max; k++)
            for (long j = 0; j < MAT_BLK_SZ; j++)
               buf[j*MAT_BLK_SZ + (k-kk)] =
                  jpanelp[k*MAT_BLK_SZ+j];

         // jpanel += kpanel*buf

         muladd_all_by_32(0, n, jpanelp, kpanelp, buf, k_max-kk, p, ll_red_struct);
      }

      NTL_GEXEC_RANGE_END

      // special processing: add 1 back to the diagonal

      for (long k = kk; k < k_max; k++)
         kpanelp[k*MAT_BLK_SZ+(k-kk)] = AddMod(kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);

   }

   if (pivoting) {
      // pivot columns, using reverse swap sequence
      // (undoes the row swaps on the inverse, which appear as column swaps)

      for (long k = n-1; k >= 0; k--) {
         long pos = P[k];
         if (pos != k) {
            // swap columns pos and k

            long *x = &M[pos / MAT_BLK_SZ][pos % MAT_BLK_SZ];
            long *y = &M[k / MAT_BLK_SZ][k % MAT_BLK_SZ];
            for (long i = 0; i < n; i++) {
               _ntl_swap(x[i*MAT_BLK_SZ], y[i*MAT_BLK_SZ]);
            }
         }
      }
   }

   // copy panels into X
   X.SetDims(n, n);
   for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
      long j_max = min(jj+MAT_BLK_SZ, n);
      long *panelp = &M[panel][0];

      for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
         zz_p *xp = X[i].elts() + jj;

         for (long j = jj; j < j_max; j++)
            xp[j-jj].LoopHole() = panelp[j-jj];
      }
   }

   d.LoopHole() = det;

}
4853
4854
4855
4856 #endif
4857
4858
4859
relaxed_inv(zz_p & d,mat_zz_p & X,const mat_zz_p & A,bool relax)4860 void relaxed_inv(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
4861 {
4862 long n = A.NumRows();
4863
4864 if (A.NumCols() != n)
4865 LogicError("inv: nonsquare matrix");
4866
4867 #ifndef NTL_HAVE_LL_TYPE
4868
4869 basic_inv(d, X, A, relax);
4870
4871 #else
4872
4873 long p = zz_p::modulus();
4874
4875 if (n < 16) {
4876 //cerr << "basic_inv\n";
4877 basic_inv(d, X, A, relax);
4878 }
4879 else if (n/MAT_BLK_SZ < 4) {
4880 long V = 64;
4881
4882 #ifdef NTL_HAVE_AVX
4883 if (p-1 <= MAX_DBL_INT &&
4884 V <= (MAX_DBL_INT-(p-1))/(p-1) &&
4885 V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
4886
4887 //cerr << "alt_inv_DD\n";
4888 alt_inv_DD(d, X, A, relax);
4889 }
4890 else
4891 #endif
4892 if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
4893 cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1)) {
4894
4895 //cerr << "alt_inv_L\n";
4896 alt_inv_L(d, X, A, relax);
4897
4898 }
4899 else {
4900
4901 //cerr << "basic_inv\n";
4902 basic_inv(d, X, A, relax);
4903 }
4904 }
4905 else {
4906 long V = 4*MAT_BLK_SZ;
4907
4908 #ifdef NTL_HAVE_AVX
4909 if (p-1 <= MAX_DBL_INT &&
4910 V <= (MAX_DBL_INT-(p-1))/(p-1) &&
4911 V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
4912
4913 //cerr << "blk_inv_DD\n";
4914 blk_inv_DD(d, X, A, relax);
4915 }
4916 else
4917 #endif
4918 if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
4919 cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1)) {
4920
4921 //cerr << "blk_inv_L\n";
4922 blk_inv_L(d, X, A, relax);
4923
4924 }
4925 else {
4926
4927 //cerr << "blk_inv_LL\n";
4928 blk_inv_LL(d, X, A, relax);
4929 }
4930
4931 }
4932
4933 #endif
4934
4935
4936
4937 }
4938
4939
4940
4941 // ******************************************************************
4942 //
4943 // Triangularizing square matrices, with applications
4944 // to solving linear systems and computing determinants.
4945 // Should be about 3x faster than the matrix inverse
4946 // algorithms.
4947 //
4948 // ******************************************************************
4949
4950
// basic_tri: reduce A (or its transpose, when trans is true) to upper
// triangular form by Gaussian elimination with row pivoting, accumulating
// d = det(A).  If bp != 0, the right-hand side *bp is carried through the
// elimination and the system is then solved by back substitution into *xp.
// If no usable pivot exists in some column, d is set to 0 and the function
// returns early.  relax is forwarded to relaxed_InvModStatus.
// NOTE(review): the "// adjust" comments appear to mark lines that deviate
// from the corresponding inv routine this code was derived from; P and
// `pivoting` are recorded but not read again in this routine.
static
void basic_tri(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
               vec_zz_p *xp, bool trans, bool relax)
{
   long n = A.NumRows();

   // adjust
   if (A.NumCols() != n)
      LogicError("tri: nonsquare matrix");

   // adjust
   if (bp && bp->length() != n)
      LogicError("tri: dimension mismatch");

   // adjust
   if (bp && !xp)
      LogicError("tri: bad args");

   if (n == 0) {
      set(d);
      // adjust
      if (xp) xp->SetLength(0);
      return;
   }

   // adjust (several lines)
   // scratch space
   Mat<long> M;
   if (!trans) {
      conv(M, A);
   }
   else {
      // work on the transpose of A
      M.SetDims(n, n);
      for (long i = 0; i < n; i++)
         for (long j = 0; j < n; j++)
            M[i][j] = rep(A[j][i]);
   }

   Vec<long> bv;
   if (bp) conv(bv, *bp);
   // end adjust

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations

   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();

   bool pivoting = false;

   for (long k = 0; k < n; k++) {
      long pos = -1;
      long pivot_inv;
      for (long i = k; i < n; i++) {
         // NOTE: by using InvModStatus, this code will work
         // for prime-powers as well as primes
         long pivot = M[i][k];
         if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
            pos = i;
            break;
         }
      }

      if (pos != -1) {
         if (k != pos) {
            swap(M[pos], M[k]);
            det = NegateMod(det, p);
            P[k] = pos;
            pivoting = true;

            // adjust
            if (bp) _ntl_swap(bv[pos], bv[k]);
         }

         det = MulMod(det, M[k][k], p);

         {
            // multiply row k by pivot_inv
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
            long *y = &M[k][0];
            // adjust
            // only the part right of the diagonal is needed for back substitution
            for (long j = k+1; j < n; j++)
               y[j] = MulModPrecon(y[j], t1, p, t1pinv);

            // adjust // y[k] = pivot_inv;

            // adjust
            if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
         }

         // adjust
         // eliminate column k from the rows below, possibly in parallel
         bool seq = n-(k+1) < PAR_THRESH_SQ;
         NTL_GEXEC_RANGE(seq, n-(k+1), first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(n)
         NTL_IMPORT(k)
         long *y = &M[k][0];

         // adjust
         for (long ii = first; ii < last; ii++) {
            long i = ii + k+1;

            long *x = &M[i][0];
            long t1 = x[k];
            t1 = NegateMod(t1, p);
            // adjust // x[k] = 0;
            if (t1 == 0) continue;

            // add t1 * row k to row i
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);

            // adjust
            for (long j = k+1; j < n; j++) {
               long t2 = MulModPrecon(y[j], t1, p, t1pinv);
               x[j] = AddMod(x[j], t2, p);
            }

            // adjust
            // carry the row operation through the right-hand side
            if (bp)
            {
               long t2 = MulModPrecon(bv[k], t1, p, t1pinv);
               bv[i] = AddMod(bv[i], t2, p);
            }
         }
         NTL_GEXEC_RANGE_END
      }
      else {
         // no invertible pivot: signal failure via d = 0
         clear(d);
         return;
      }
   }

   // adjust
   // back substitution: M is now unit upper triangular (diagonal implicitly 1)
   if (bp) {
      xp->SetLength(n);
      zz_p *X = xp->elts();

      for (long i = n-1; i >= 0; i--) {
         long t1 = 0;
         for (long j = i+1; j < n; j++) {
            long t2 = MulMod(rep(X[j]), M[i][j], p);
            t1 = AddMod(t1, t2, p);
         }
         X[i].LoopHole() = SubMod(bv[i], t1, p);
      }
   }

   d.LoopHole() = det;
}
5111
5112
5113
5114
5115 #ifdef NTL_HAVE_LL_TYPE
5116
5117
5118
// alt_tri_L: same contract as basic_tri (d = det(A); optionally solve
// A*x = *bp, or the transposed system when trans is true), but using
// unsigned long arithmetic with lazy reduction: products are accumulated
// unreduced, and a full "cleanup" reduction pass is triggered every
// red_trigger elimination steps so the accumulators cannot overflow.
// Entries are reduced on demand with rem(..., p, red_struct) whenever an
// actual residue is needed.
static
void alt_tri_L(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
               vec_zz_p *xp, bool trans, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("tri: nonsquare matrix");

   // adjust
   if (bp && bp->length() != n)
      LogicError("tri: dimension mismatch");

   // adjust
   if (bp && !xp)
      LogicError("tri: bad args");

   if (n == 0) {
      set(d);
      if (xp) xp->SetLength(0);
      return;
   }

   // scratch space
   Mat<unsigned long> M;
   if (!trans) {
      conv(M, A);
   }
   else {
      // work on the transpose of A
      M.SetDims(n, n);
      for (long i = 0; i < n; i++)
         for (long j = 0; j < n; j++)
            M[i][j] = rep(A[j][i]);
   }

   Vec<long> bv;
   if (bp) conv(bv, *bp);

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations

   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_reduce_struct red_struct = zz_p::red_struct();

   bool pivoting = false;

   // how many unreduced accumulation steps fit in an unsigned long
   unsigned long ured_trigger =
      (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
   // NOTE: corner case at p == 2: need unsigned long to prevent overflow

   long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);

   long red_count = red_trigger;

   for (long k = 0; k < n; k++) {
      bool cleanup = false;

      // budget exhausted: schedule a full reduction pass for this step
      if (red_count-1 < 0) {
         red_count = red_trigger;
         cleanup = true;
      }

      red_count = red_count-1;

      long pos = -1;
      long pivot;
      long pivot_inv;

      for (long i = k; i < n; i++) {
         // NOTE: by using InvModStatus, this code will work
         // for prime-powers as well as primes
         pivot = rem(M[i][k], p, red_struct);
         if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
            pos = i;
            break;
         }
      }

      if (pos != -1) {
         if (k != pos) {
            swap(M[pos], M[k]);
            det = NegateMod(det, p);
            P[k] = pos;
            pivoting = true;

            if (bp) _ntl_swap(bv[pos], bv[k]);
         }

         det = MulMod(det, pivot, p);

         {
            // multiply row k by pivot_inv
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
            unsigned long *y = &M[k][0];
            for (long j = k+1; j < n; j++) {
               long t2 = rem(y[j], p, red_struct);
               y[j] = MulModPrecon(t2, t1, p, t1pinv);
            }

            if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
         }

         // eliminate column k from the rows below, possibly in parallel
         bool seq = n-(k+1) < PAR_THRESH_SQ;
         NTL_GEXEC_RANGE(seq, n-(k+1), first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(n)
         NTL_IMPORT(k)
         NTL_IMPORT(red_struct)
         unsigned long *y = &M[k][0];
         if (cleanup) {
            // periodic full reduction of the trailing submatrix
            for (long ii = first; ii < last; ii++) {
               long i = ii + k+1;

               unsigned long *x = &M[i][0];
               for (long j = k+1; j < n; j++) {
                  x[j] = rem(x[j], p, red_struct);
               }
            }
         }

         for (long ii = first; ii < last; ii++) {
            long i = ii + k+1;

            unsigned long *x = &M[i][0];
            long t1 = rem(x[k], p, red_struct);
            t1 = NegateMod(t1, p);
            if (t1 == 0) continue;

            // add t1 * row k to row i
            // (accumulated unreduced; 4-way unrolled inner loop)
            unsigned long ut1 = t1;
            long j;
            for (j = k+1; j <= n-4; j+=4) {
               unsigned long xj0 = x[j+0] + DO_MUL(y[j+0], ut1);
               unsigned long xj1 = x[j+1] + DO_MUL(y[j+1], ut1);
               unsigned long xj2 = x[j+2] + DO_MUL(y[j+2], ut1);
               unsigned long xj3 = x[j+3] + DO_MUL(y[j+3], ut1);
               x[j+0] = xj0;
               x[j+1] = xj1;
               x[j+2] = xj2;
               x[j+3] = xj3;
            }
            for (; j < n; j++) {
               x[j] += DO_MUL(y[j], ut1);
            }

            // carry the row operation through the right-hand side
            if (bp)
            {
               long t2 = MulMod(bv[k], t1, p);
               bv[i] = AddMod(bv[i], t2, p);
            }
         }
         NTL_GEXEC_RANGE_END
      }
      else {
         // no invertible pivot: signal failure via d = 0
         clear(d);
         return;
      }
   }

   // back substitution (entries reduced on the fly)
   if (bp) {
      xp->SetLength(n);
      zz_p *X = xp->elts();

      for (long i = n-1; i >= 0; i--) {
         long t1 = 0;
         for (long j = i+1; j < n; j++) {
            long t0 = rem(M[i][j], p, red_struct);
            long t2 = MulMod(rep(X[j]), t0, p);
            t1 = AddMod(t1, t2, p);
         }
         X[i].LoopHole() = SubMod(bv[i], t1, p);
      }
   }

   d.LoopHole() = det;
}
5311
5312
5313
5314
5315 #ifdef NTL_HAVE_AVX
5316
// alt_tri_DD: same contract as basic_tri (d = det(A); optionally solve
// A*x = *bp, or the transposed system when trans is true), but storing the
// work matrix in AVX-aligned doubles so the inner elimination loop can use
// the vectorized muladd_interval1.  Products are accumulated unreduced in
// the doubles; a "cleanup" reduction pass runs every red_trigger steps so
// the accumulators stay below MAX_DBL_INT.  Residues are recovered with
// rem((unsigned long)(long)value, p, red_struct) on demand.
static
void alt_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
                vec_zz_p *xp, bool trans, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("tri: nonsquare matrix");

   // adjust
   if (bp && bp->length() != n)
      LogicError("tri: dimension mismatch");

   // adjust
   if (bp && !xp)
      LogicError("tri: bad args");

   if (n == 0) {
      set(d);
      if (xp) xp->SetLength(0);
      return;
   }

   // scratch space
   // each row is separately aligned for AVX loads

   Vec< AlignedArray<double> > M;
   M.SetLength(n);
   for (long i = 0; i < n; i++) M[i].SetLength(n);
   if (!trans) {
      for (long i = 0; i < n; i++)
         for (long j = 0; j < n; j++)
            M[i][j] = rep(A[i][j]);
   }
   else {
      // work on the transpose of A
      for (long i = 0; i < n; i++)
         for (long j = 0; j < n; j++)
            M[i][j] = rep(A[j][i]);
   }

   Vec<long> bv;
   if (bp) conv(bv, *bp);

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations

   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_reduce_struct red_struct = zz_p::red_struct();

   bool pivoting = false;

   // how many unreduced accumulation steps keep values within MAX_DBL_INT
   long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
   long red_count = red_trigger;

   for (long k = 0; k < n; k++) {
      bool cleanup = false;

      // budget exhausted: schedule a full reduction pass for this step
      if (red_count-1 < 0) {
         red_count = red_trigger;
         cleanup = true;
      }

      red_count = red_count-1;

      long pos = -1;
      long pivot;
      long pivot_inv;

      for (long i = k; i < n; i++) {
         // NOTE: by using InvModStatus, this code will work
         // for prime-powers as well as primes
         pivot = rem((unsigned long)(long)M[i][k], p, red_struct);
         if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
            pos = i;
            break;
         }
      }

      if (pos != -1) {
         if (k != pos) {
            swap(M[pos], M[k]);
            det = NegateMod(det, p);
            P[k] = pos;
            pivoting = true;

            if (bp) _ntl_swap(bv[pos], bv[k]);
         }

         det = MulMod(det, pivot, p);

         {
            // multiply row k by pivot_inv
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv); // t1*pinv;
            double *y = &M[k][0];
            for (long j = k+1; j < n; j++) {
               long t2 = rem((unsigned long)(long)y[j], p, red_struct);
               y[j] = MulModPrecon(t2, t1, p, t1pinv);
            }

            if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
         }

         // eliminate column k from the rows below, possibly in parallel
         bool seq = n-(k+1) < PAR_THRESH_SQ;
         NTL_GEXEC_RANGE(seq, n-(k+1), first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(n)
         NTL_IMPORT(k)
         NTL_IMPORT(red_struct)
         double *y = &M[k][0];
         if (cleanup) {
            // periodic full reduction of the trailing submatrix
            for (long ii = first; ii < last; ii++) {
               long i = ii + k+1;

               double *x = &M[i][0];
               for (long j = k+1; j < n; j++) {
                  x[j] = rem((unsigned long)(long)x[j], p, red_struct);
               }
            }
         }

         // first AVX-aligned column index at or after k+1 (capped at n);
         // columns before it are handled scalar, the rest vectorized
         long align_boundary =
            min((((k+1)+(NTL_AVX_DBL_ALIGN-1))/NTL_AVX_DBL_ALIGN)*NTL_AVX_DBL_ALIGN, n);

         for (long ii = first; ii < last; ii++) {
            long i = ii + k+1;

            double *x = &M[i][0];
            long t1 = rem((unsigned long)(long)x[k], p, red_struct);
            t1 = NegateMod(t1, p);
            if (t1 == 0) continue;

            // add t1 * row k to row i
            double ut1 = t1;
            for (long j = k+1; j < align_boundary; j++) x[j] += y[j]*ut1;
            muladd_interval1(x+align_boundary, y+align_boundary, ut1, n-align_boundary);

            // carry the row operation through the right-hand side
            if (bp)
            {
               long t2 = MulMod(bv[k], t1, p);
               bv[i] = AddMod(bv[i], t2, p);
            }
         }
         NTL_GEXEC_RANGE_END
      }
      else {
         // no invertible pivot: signal failure via d = 0
         clear(d);
         return;
      }
   }

   // back substitution (entries reduced on the fly)
   if (bp) {
      xp->SetLength(n);
      zz_p *X = xp->elts();

      for (long i = n-1; i >= 0; i--) {
         long t1 = 0;
         for (long j = i+1; j < n; j++) {
            long t0 = rem((unsigned long)(long)M[i][j], p, red_struct);
            long t2 = MulMod(rep(X[j]), t0, p);
            t1 = AddMod(t1, t2, p);
         }
         X[i].LoopHole() = SubMod(bv[i], t1, p);
      }
   }

   d.LoopHole() = det;
}
5498
5499
5500 #endif
5501
5502
5503
5504
5505 #ifdef NTL_HAVE_AVX
5506
// blk_tri_DD: blocked variant of the triangularization/solve routine using
// AVX-aligned double panels (each panel holds MAT_BLK_SZ columns of all n
// rows).  Computes d = det(A) and, if bp != 0, solves A*x = *bp (or the
// transposed system when trans is true) into *xp via back substitution.
// Arithmetic is lazily reduced: products accumulate unreduced in the
// doubles, with cleanup reduction passes every red_trigger steps to stay
// below MAX_DBL_INT.  Panel updates against the remaining panels are done
// with muladd_all_by_32, possibly in parallel.
static
void blk_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
                vec_zz_p *xp, bool trans, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("tri: nonsquare matrix");

   if (bp && bp->length() != n)
      LogicError("tri: dimension mismatch");

   if (bp && !xp)
      LogicError("tri: bad args");

   if (n == 0) {
      set(d);
      if (xp) xp->SetLength(0);
      return;
   }

   if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");

   long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;

   // M[panel] stores MAT_BLK_SZ consecutive columns, row-major within the
   // panel; trailing columns of the last panel are 0
   Vec< AlignedArray<double> > M;
   M.SetLength(npanels);
   for (long panel = 0; panel < npanels; panel++) {
      M[panel].SetLength(n*MAT_BLK_SZ);
      double *panelp = &M[panel][0];

      for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
   }

   if (trans) {
      // copy A transposed into panels
      for (long i = 0; i < n; i++) {
         const zz_p *row = &A[i][0];
         double *col = &M[i/MAT_BLK_SZ][i%MAT_BLK_SZ];
         for (long j = 0; j < n; j++)
            col[j*MAT_BLK_SZ] = rep(row[j]);
      }
   }
   else {
      // copy A into panels
      for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
         long j_max = min(jj+MAT_BLK_SZ, n);
         double *panelp = &M[panel][0];

         for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
            const zz_p *ap = A[i].elts() + jj;

            for (long j = jj; j < j_max; j++)
               panelp[j-jj] = rep(ap[j-jj]);
         }
      }
   }

   Vec<long> bv;
   if (bp) conv(bv, *bp);

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations

   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_reduce_struct red_struct = zz_p::red_struct();

   bool pivoting = false;

   // how many unreduced accumulation steps keep values within MAX_DBL_INT
   long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
   long red_count = red_trigger;

   for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
      long k_max = min(kk+MAT_BLK_SZ, n);

      bool cleanup = false;

      // budget exhausted: schedule a full reduction pass for this block step
      if (red_count-MAT_BLK_SZ < 0) {
         red_count = red_trigger;
         cleanup = true;
      }

      red_count = red_count-MAT_BLK_SZ;
      double *kpanelp = &M[kpanel][0];

      if (cleanup) {
         // rows above kk are final and need no further reduction
         for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
            kpanelp[r] = rem((unsigned long)(long)kpanelp[r], p, red_struct);
      }

      // eliminate columns kk..k_max-1 within the current panel
      for (long k = kk; k < k_max; k++) {

         long pos = -1;
         long pivot;
         long pivot_inv;

         for (long i = k; i < n; i++) {
            // NOTE: by using InvModStatus, this code will work
            // for prime-powers as well as primes
            pivot = rem((unsigned long)(long)kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
            if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
               pos = i;
               break;
            }
         }

         if (pos == -1) {
            // no invertible pivot: signal failure via d = 0
            clear(d);
            return;
         }

         double *y = &kpanelp[k*MAT_BLK_SZ];
         if (k != pos) {
            // swap rows pos and k
            double *x = &kpanelp[pos*MAT_BLK_SZ];
            for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);

            det = NegateMod(det, p);
            P[k] = pos;
            pivoting = true;

            if (bp) _ntl_swap(bv[pos], bv[k]);
         }

         det = MulMod(det, pivot, p);

         {
            // multiply row k by pivot_inv
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
            for (long j = 0; j < MAT_BLK_SZ; j++) {
               long t2 = rem((unsigned long)(long)y[j], p, red_struct);
               y[j] = MulModPrecon(t2, t1, p, t1pinv);
            }

            y[k-kk] = pivot_inv;

            if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
         }

         // eliminate column k from all rows kk..n-1 of this panel
         // (rows above the current block are left untouched)
         for (long i = kk; i < n; i++) {
            if (i == k) continue; // skip row k

            double *x = &kpanelp[i*MAT_BLK_SZ];
            long t1 = rem((unsigned long)(long)x[k-kk], p, red_struct);
            t1 = NegateMod(t1, p);
            x[k-kk] = 0;
            if (t1 == 0) continue;

            // add t1 * row k to row i
            double ut1 = t1;
            muladd_interval(x, y, ut1, MAT_BLK_SZ);
            // carry the row operation through the right-hand side
            if (bp)
            {
               long t2 = MulMod(bv[k], t1, p);
               bv[i] = AddMod(bv[i], t2, p);
            }
         }
      }

      // finished processing current kpanel
      // next, reduce and apply to all other kpanels

      for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
         kpanelp[r] = rem((unsigned long)(long)kpanelp[r], p, red_struct);

      // special processing: subtract 1 off of diagonal
      // (so that the block update below can add kpanel*buf uniformly)

      for (long k = kk; k < k_max; k++)
         kpanelp[k*MAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);

      // run the panel-update loop sequentially when the remaining work is small
      bool seq = double(npanels-(kpanel+1))*double(n)*double(MAT_BLK_SZ)*double(MAT_BLK_SZ) < PAR_THRESH;

      // only panels to the right of kpanel need updating in the tri variant
      NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
      NTL_IMPORT(p)
      NTL_IMPORT(n)
      NTL_IMPORT(red_struct)
      NTL_IMPORT(kpanel)
      NTL_IMPORT(kpanelp)
      NTL_IMPORT(kk)
      NTL_IMPORT(k_max)

      AlignedArray<double> buf_store;
      buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
      double *buf = &buf_store[0];

      for (long index = first; index < last; index++) {
         long jpanel = index + kpanel+1;

         double *jpanelp = &M[jpanel][0];

         if (cleanup) {
            for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
               jpanelp[r] = rem((unsigned long)(long)jpanelp[r], p, red_struct);
         }

         // perform swaps
         for (long k = kk; k < k_max; k++) {
            long pos = P[k];
            if (pos != k) {
               // swap rows pos and k
               double *pos_p = &jpanelp[pos*MAT_BLK_SZ];
               double *k_p = &jpanelp[k*MAT_BLK_SZ];
               for (long j = 0; j < MAT_BLK_SZ; j++)
                  _ntl_swap(pos_p[j], k_p[j]);
            }
         }

         // copy block number kpanel (the one on the diagonal) into buf

         for (long i = 0; i < (k_max-kk)*MAT_BLK_SZ; i++)
            buf[i] = rem((unsigned long)(long)jpanelp[kk*MAT_BLK_SZ+i], p, red_struct);

         // jpanel += kpanel*buf

         muladd_all_by_32(kk, n, jpanelp, kpanelp, buf, k_max-kk);
      }

      NTL_GEXEC_RANGE_END

      // special processing: add 1 back to the diagonal

      for (long k = kk; k < k_max; k++)
         kpanelp[k*MAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);

   }

   // back substitution; within each row, columns in the row's own block are
   // already eliminated, so start at the first panel strictly past column i
   if (bp) {
      xp->SetLength(n);
      zz_p *X = xp->elts();

      for (long i = n-1; i >= 0; i--) {
         long t1 = 0;
         long start_panel = ((i+1)+MAT_BLK_SZ-1)/MAT_BLK_SZ;
         for (long jj = MAT_BLK_SZ*start_panel, panel = start_panel;
              jj < n; jj += MAT_BLK_SZ, panel++) {
            long j_max = min(jj+MAT_BLK_SZ, n);
            double *row = &M[panel][i*MAT_BLK_SZ];
            for (long j = jj; j < j_max; j++) {
               long t0 = rem((unsigned long)(long)row[j-jj], p, red_struct);
               long t2 = MulMod(rep(X[j]), t0, p);
               t1 = AddMod(t1, t2, p);
            }
         }
         X[i].LoopHole() = SubMod(bv[i], t1, p);
      }
   }

   d.LoopHole() = det;

}
5769
5770 #endif
5771
5772
5773 static
blk_tri_L(zz_p & d,const mat_zz_p & A,const vec_zz_p * bp,vec_zz_p * xp,bool trans,bool relax)5774 void blk_tri_L(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
5775 vec_zz_p *xp, bool trans, bool relax)
5776 {
5777 long n = A.NumRows();
5778
5779 if (A.NumCols() != n)
5780 LogicError("tri: nonsquare matrix");
5781
5782 if (bp && bp->length() != n)
5783 LogicError("tri: dimension mismatch");
5784
5785 if (bp && !xp)
5786 LogicError("tri: bad args");
5787
5788 if (n == 0) {
5789 set(d);
5790 if (xp) xp->SetLength(0);
5791 return;
5792 }
5793
5794 if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
5795
5796 long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
5797
5798 Vec< UniqueArray<unsigned long> > M;
5799 M.SetLength(npanels);
5800 for (long panel = 0; panel < npanels; panel++) {
5801 M[panel].SetLength(n*MAT_BLK_SZ);
5802 unsigned long *panelp = &M[panel][0];
5803
5804 for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
5805 }
5806
5807 if (trans) {
5808 // copy A transposed into panels
5809 for (long i = 0; i < n; i++) {
5810 const zz_p *row = &A[i][0];
5811 unsigned long *col = &M[i/MAT_BLK_SZ][i%MAT_BLK_SZ];
5812 for (long j = 0; j < n; j++)
5813 col[j*MAT_BLK_SZ] = rep(row[j]);
5814 }
5815 }
5816 else {
5817 // copy A into panels
5818 for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
5819 long j_max = min(jj+MAT_BLK_SZ, n);
5820 unsigned long *panelp = &M[panel][0];
5821
5822 for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
5823 const zz_p *ap = A[i].elts() + jj;
5824
5825 for (long j = jj; j < j_max; j++)
5826 panelp[j-jj] = rep(ap[j-jj]);
5827 }
5828 }
5829 }
5830
5831 Vec<long> bv;
5832 if (bp) conv(bv, *bp);
5833
5834 Vec<long> P;
5835 P.SetLength(n);
5836 for (long k = 0; k < n; k++) P[k] = k;
5837 // records swap operations
5838
5839
5840 long det;
5841 det = 1;
5842
5843 long p = zz_p::modulus();
5844 mulmod_t pinv = zz_p::ModulusInverse();
5845 sp_reduce_struct red_struct = zz_p::red_struct();
5846
5847
5848 bool pivoting = false;
5849
5850 unsigned long ured_trigger =
5851 (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
5852 // NOTE: corner case at p == 2: need unsigned long to prevent overflow
5853
5854 long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);
5855
5856 long red_count = red_trigger;
5857
5858 for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
5859 long k_max = min(kk+MAT_BLK_SZ, n);
5860
5861 bool cleanup = false;
5862
5863 if (red_count-MAT_BLK_SZ < 0) {
5864 red_count = red_trigger;
5865 cleanup = true;
5866 }
5867
5868 red_count = red_count-MAT_BLK_SZ;
5869 unsigned long *kpanelp = &M[kpanel][0];
5870
5871 if (cleanup) {
5872 for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
5873 kpanelp[r] = rem(kpanelp[r], p, red_struct);
5874 }
5875
5876 for (long k = kk; k < k_max; k++) {
5877
5878 long pos = -1;
5879 long pivot;
5880 long pivot_inv;
5881
5882 for (long i = k; i < n; i++) {
5883 // NOTE: by using InvModStatus, this code will work
5884 // for prime-powers as well as primes
5885 pivot = rem(kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
5886 if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
5887 pos = i;
5888 break;
5889 }
5890 }
5891
5892 if (pos == -1) {
5893 clear(d);
5894 return;
5895 }
5896
5897 unsigned long *y = &kpanelp[k*MAT_BLK_SZ];
5898 if (k != pos) {
5899 // swap rows pos and k
5900 unsigned long *x = &kpanelp[pos*MAT_BLK_SZ];
5901 for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
5902
5903 det = NegateMod(det, p);
5904 P[k] = pos;
5905 pivoting = true;
5906
5907 if (bp) _ntl_swap(bv[pos], bv[k]);
5908 }
5909
5910 det = MulMod(det, pivot, p);
5911
5912 {
5913 // multiply row k by pivot_inv
5914 long t1 = pivot_inv;
5915 mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
5916 for (long j = 0; j < MAT_BLK_SZ; j++) {
5917 long t2 = rem(y[j], p, red_struct);
5918 y[j] = MulModPrecon(t2, t1, p, t1pinv);
5919 }
5920
5921 y[k-kk] = pivot_inv;
5922
5923 if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
5924 }
5925
5926 for (long i = kk; i < n; i++) {
5927 if (i == k) continue; // skip row k
5928
5929 unsigned long *x = &kpanelp[i*MAT_BLK_SZ];
5930 long t1 = rem(x[k-kk], p, red_struct);
5931 t1 = NegateMod(t1, p);
5932 x[k-kk] = 0;
5933 if (t1 == 0) continue;
5934
5935 // add t1 * row k to row i
5936 unsigned long ut1 = t1;
5937 muladd_interval(x, y, ut1, MAT_BLK_SZ);
5938 if (bp)
5939 {
5940 long t2 = MulMod(bv[k], t1, p);
5941 bv[i] = AddMod(bv[i], t2, p);
5942 }
5943 }
5944 }
5945
5946
5947 // finished processing current kpanel
5948 // next, reduce and apply to all other kpanels
5949
5950 for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
5951 kpanelp[r] = rem(kpanelp[r], p, red_struct);
5952
5953 // special processing: subtract 1 off of diangonal
5954
5955 for (long k = kk; k < k_max; k++)
5956 kpanelp[k*MAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);
5957
5958
5959 bool seq = double(npanels-(kpanel+1))*double(n)*double(MAT_BLK_SZ)*double(MAT_BLK_SZ) < PAR_THRESH;
5960 NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
5961 NTL_IMPORT(p)
5962 NTL_IMPORT(n)
5963 NTL_IMPORT(red_struct)
5964 NTL_IMPORT(kpanel)
5965 NTL_IMPORT(kpanelp)
5966 NTL_IMPORT(kk)
5967 NTL_IMPORT(k_max)
5968
5969
5970 UniqueArray<unsigned long> buf_store;
5971 buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
5972 unsigned long *buf = &buf_store[0];
5973
5974 for (long index = first; index < last; index++) {
5975 long jpanel = index + kpanel+1;
5976
5977 unsigned long *jpanelp = &M[jpanel][0];
5978
5979 if (cleanup) {
5980 for (long r = kk*MAT_BLK_SZ; r < n*MAT_BLK_SZ; r++)
5981 jpanelp[r] = rem(jpanelp[r], p, red_struct);
5982 }
5983
5984 // perform swaps
5985 for (long k = kk; k < k_max; k++) {
5986 long pos = P[k];
5987 if (pos != k) {
5988 // swap rows pos and k
5989 unsigned long *pos_p = &jpanelp[pos*MAT_BLK_SZ];
5990 unsigned long *k_p = &jpanelp[k*MAT_BLK_SZ];
5991 for (long j = 0; j < MAT_BLK_SZ; j++)
5992 _ntl_swap(pos_p[j], k_p[j]);
5993 }
5994 }
5995
5996 // copy block number kpanel (the one on the diagonal) into buf
5997 // here, we transpose it
5998
5999 for (long k = kk; k < k_max; k++)
6000 for (long j = 0; j < MAT_BLK_SZ; j++)
6001 buf[j*MAT_BLK_SZ + (k-kk)] =
6002 rem(jpanelp[k*MAT_BLK_SZ+j], p, red_struct);
6003
6004 // jpanel += kpanel*buf
6005
6006 muladd_all_by_32(kk, n, jpanelp, kpanelp, buf, k_max-kk);
6007 }
6008
6009 NTL_GEXEC_RANGE_END
6010
6011 // special processing: add 1 back to the diangonal
6012
6013 for (long k = kk; k < k_max; k++)
6014 kpanelp[k*MAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);
6015
6016 }
6017
6018 if (bp) {
6019 xp->SetLength(n);
6020 zz_p *X = xp->elts();
6021
6022 for (long i = n-1; i >= 0; i--) {
6023 long t1 = 0;
6024 long start_panel = ((i+1)+MAT_BLK_SZ-1)/MAT_BLK_SZ;
6025 for (long jj = MAT_BLK_SZ*start_panel, panel = start_panel;
6026 jj < n; jj += MAT_BLK_SZ, panel++) {
6027 long j_max = min(jj+MAT_BLK_SZ, n);
6028 unsigned long *row = &M[panel][i*MAT_BLK_SZ];
6029 for (long j = jj; j < j_max; j++) {
6030 long t0 = rem(row[j-jj], p, red_struct);
6031 long t2 = MulMod(rep(X[j]), t0, p);
6032 t1 = AddMod(t1, t2, p);
6033 }
6034 }
6035 X[i].LoopHole() = SubMod(bv[i], t1, p);
6036 }
6037 }
6038
6039 d.LoopHole() = det;
6040
6041 }
6042
6043
// blk_tri_LL: block-oriented Gaussian elimination over zz_p, used to compute
// the determinant of A and (optionally) solve a linear system.
//
//   d     -- receives det(A); set to 0 if some pivot cannot be inverted
//   A     -- n x n input matrix
//   bp    -- optional right-hand side (may be null)
//   xp    -- receives the solution (required if bp is non-null)
//   trans -- if true, the elimination is performed on the transpose of A
//   relax -- forwarded to relaxed_InvModStatus; with relax == true this
//            code also works modulo prime powers (see note below)
//
// This is the "LL" variant: entries are stored as long's, kept fully
// reduced mod p, and block updates use muladd_all_by_32 with the
// sp_ll_reduce_struct (double-word) reduction machinery.  The matrix is
// stored column-block-wise: M[panel] is an n x MAT_BLK_SZ vertical panel.
static
void blk_tri_LL(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
                vec_zz_p *xp, bool trans, bool relax)
{
   long n = A.NumRows();

   if (A.NumCols() != n)
      LogicError("tri: nonsquare matrix");

   if (bp && bp->length() != n)
      LogicError("tri: dimension mismatch");

   if (bp && !xp)
      LogicError("tri: bad args");

   // empty matrix: determinant is 1, solution is empty
   if (n == 0) {
      set(d);
      if (xp) xp->SetLength(0);
      return;
   }

   if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");

   long npanels = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;

   // allocate the panels, zero-initialized (the last panel may have
   // unused columns when n is not a multiple of MAT_BLK_SZ)
   Vec< UniqueArray<long> > M;
   M.SetLength(npanels);
   for (long panel = 0; panel < npanels; panel++) {
      M[panel].SetLength(n*MAT_BLK_SZ);
      long *panelp = &M[panel][0];

      for (long r = 0; r < n*MAT_BLK_SZ; r++) panelp[r] = 0;
   }

   if (trans) {
      // copy A transposed into panels: row i of A becomes column i of M
      for (long i = 0; i < n; i++) {
         const zz_p *row = &A[i][0];
         long *col = &M[i/MAT_BLK_SZ][i%MAT_BLK_SZ];
         for (long j = 0; j < n; j++)
            col[j*MAT_BLK_SZ] = rep(row[j]);
      }
   }
   else {
      // copy A into panels
      for (long jj = 0, panel = 0; jj < n; jj += MAT_BLK_SZ, panel++) {
         long j_max = min(jj+MAT_BLK_SZ, n);
         long *panelp = &M[panel][0];

         for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
            const zz_p *ap = A[i].elts() + jj;

            for (long j = jj; j < j_max; j++)
               panelp[j-jj] = rep(ap[j-jj]);
         }
      }
   }

   // working copy of the right-hand side, as raw longs
   Vec<long> bv;
   if (bp) conv(bv, *bp);

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations: P[k] = row swapped with row k at step k


   long det;
   det = 1;

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();


   // NOTE(review): pivoting is recorded but not otherwise used in this
   // routine (the parallel elim_* routines use the analogous flag)
   bool pivoting = false;

   // process one MAT_BLK_SZ-wide panel of pivot columns at a time
   for (long kk = 0, kpanel = 0; kk < n; kk += MAT_BLK_SZ, kpanel++) {
      long k_max = min(kk+MAT_BLK_SZ, n);

      long *kpanelp = &M[kpanel][0];

      // ordinary Gauss-Jordan elimination within the current panel
      for (long k = kk; k < k_max; k++) {

         long pos = -1;
         long pivot;
         long pivot_inv;

         for (long i = k; i < n; i++) {
            // NOTE: by using InvModStatus, this code will work
            // for prime-powers as well as primes
            pivot = kpanelp[i*MAT_BLK_SZ+(k-kk)];
            if (pivot != 0 && !relaxed_InvModStatus(pivot_inv, pivot, p, relax)) {
               pos = i;
               break;
            }
         }

         // no usable pivot: matrix is singular (det = 0)
         if (pos == -1) {
            clear(d);
            return;
         }

         long *y = &kpanelp[k*MAT_BLK_SZ];
         if (k != pos) {
            // swap rows pos and k; each swap negates the determinant
            long *x = &kpanelp[pos*MAT_BLK_SZ];
            for (long j = 0; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);

            det = NegateMod(det, p);
            P[k] = pos;
            pivoting = true;

            if (bp) _ntl_swap(bv[pos], bv[k]);
         }

         det = MulMod(det, pivot, p);

         {
            // multiply row k by pivot_inv, so the pivot entry becomes 1
            long t1 = pivot_inv;
            mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);
            for (long j = 0; j < MAT_BLK_SZ; j++) {
               y[j] = MulModPrecon(y[j], t1, p, t1pinv);
            }

            // store pivot_inv in the pivot position: the panel accumulates
            // the inverse transformation, applied to other panels below
            y[k-kk] = pivot_inv;

            if (bp) bv[k] = MulModPrecon(bv[k], t1, p, t1pinv);
         }

         // eliminate column k in all other rows of this panel
         for (long i = kk; i < n; i++) {
            if (i == k) continue; // skip row k

            long *x = &kpanelp[i*MAT_BLK_SZ];
            long t1 = x[k-kk];
            t1 = NegateMod(t1, p);
            x[k-kk] = 0;
            if (t1 == 0) continue;

            // add t1 * row k to row i
            long ut1 = t1;
            muladd_interval(x, y, ut1, MAT_BLK_SZ, p, pinv);
            if (bp)
            {
               long t2 = MulMod(bv[k], t1, p);
               bv[i] = AddMod(bv[i], t2, p);
            }
         }
      }


      // finished processing current kpanel
      // next, apply the accumulated transformation to all later panels

      // special processing: subtract 1 off of the diagonal, so that the
      // block multiply below effectively adds (T - I)*jpanel to jpanel,
      // i.e. replaces jpanel by T*jpanel
      for (long k = kk; k < k_max; k++)
         kpanelp[k*MAT_BLK_SZ+(k-kk)] = SubMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);


      // run sequentially when the remaining work is too small to parallelize
      bool seq = double(npanels-(kpanel+1))*double(n)*double(MAT_BLK_SZ)*double(MAT_BLK_SZ) < PAR_THRESH;
      NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
      NTL_IMPORT(p)
      NTL_IMPORT(n)
      NTL_IMPORT(ll_red_struct)
      NTL_IMPORT(kpanel)
      NTL_IMPORT(kpanelp)
      NTL_IMPORT(kk)
      NTL_IMPORT(k_max)


      // per-thread scratch block
      UniqueArray<long> buf_store;
      buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
      long *buf = &buf_store[0];

      for (long index = first; index < last; index++) {
         long jpanel = index + kpanel+1;

         long *jpanelp = &M[jpanel][0];

         // perform the same row swaps that were applied to kpanel
         for (long k = kk; k < k_max; k++) {
            long pos = P[k];
            if (pos != k) {
               // swap rows pos and k
               long *pos_p = &jpanelp[pos*MAT_BLK_SZ];
               long *k_p = &jpanelp[k*MAT_BLK_SZ];
               for (long j = 0; j < MAT_BLK_SZ; j++)
                  _ntl_swap(pos_p[j], k_p[j]);
            }
         }

         // copy block number kpanel (the one on the diagonal) into buf
         // here, we transpose it

         for (long k = kk; k < k_max; k++)
            for (long j = 0; j < MAT_BLK_SZ; j++)
               buf[j*MAT_BLK_SZ + (k-kk)] = jpanelp[k*MAT_BLK_SZ+j];

         // jpanel += kpanel*buf

         muladd_all_by_32(kk, n, jpanelp, kpanelp, buf, k_max-kk, p, ll_red_struct);
      }

      NTL_GEXEC_RANGE_END

      // special processing: add 1 back to the diagonal (undo the SubMod above)

      for (long k = kk; k < k_max; k++)
         kpanelp[k*MAT_BLK_SZ+(k-kk)] = AddMod((long)kpanelp[k*MAT_BLK_SZ+(k-kk)], 1, p);

   }

   // back-substitution: M now holds (a permuted form of) the upper
   // triangular factor; recover the solution X from bv
   if (bp) {
      xp->SetLength(n);
      zz_p *X = xp->elts();

      for (long i = n-1; i >= 0; i--) {
         long t1 = 0;
         long start_panel = ((i+1)+MAT_BLK_SZ-1)/MAT_BLK_SZ;
         for (long jj = MAT_BLK_SZ*start_panel, panel = start_panel;
              jj < n; jj += MAT_BLK_SZ, panel++) {
            long j_max = min(jj+MAT_BLK_SZ, n);
            long *row = &M[panel][i*MAT_BLK_SZ];
            for (long j = jj; j < j_max; j++) {
               long t0 = row[j-jj];
               long t2 = MulMod(rep(X[j]), t0, p);
               t1 = AddMod(t1, t2, p);
            }
         }
         X[i].LoopHole() = SubMod(bv[i], t1, p);
      }
   }

   d.LoopHole() = det;

}
6282
6283
6284
6285 #endif
6286
6287
6288
6289 static
tri(zz_p & d,const mat_zz_p & A,const vec_zz_p * bp,vec_zz_p * xp,bool trans,bool relax)6290 void tri(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
6291 vec_zz_p *xp, bool trans, bool relax)
6292 {
6293 long n = A.NumRows();
6294
6295 if (A.NumCols() != n)
6296 LogicError("inv: nonsquare matrix");
6297
6298 if (bp && bp->length() != n)
6299 LogicError("tri: dimension mismatch");
6300
6301 if (bp && !xp)
6302 LogicError("tri: bad args");
6303
6304 #ifndef NTL_HAVE_LL_TYPE
6305
6306 basic_tri(d, A, bp, xp, trans, relax);
6307
6308 #else
6309
6310 long p = zz_p::modulus();
6311
6312 if (n < 16) {
6313 //cerr << "basic_tri\n";
6314 basic_tri(d, A, bp, xp, trans, relax);
6315 }
6316 else if (n/MAT_BLK_SZ < 4) {
6317 long V = 64;
6318
6319 #ifdef NTL_HAVE_AVX
6320 if (p-1 <= MAX_DBL_INT &&
6321 V <= (MAX_DBL_INT-(p-1))/(p-1) &&
6322 V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
6323
6324 //cerr << "alt_tri_DD\n";
6325 alt_tri_DD(d, A, bp, xp, trans, relax);
6326 }
6327 else
6328 #endif
6329 if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
6330 cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1)) {
6331
6332 //cerr << "alt_tri_L\n";
6333 alt_tri_L(d, A, bp, xp, trans, relax);
6334
6335 }
6336 else {
6337
6338 //cerr << "basic_tri\n";
6339 basic_tri(d, A, bp, xp, trans, relax);
6340 }
6341 }
6342 else {
6343 long V = 4*MAT_BLK_SZ;
6344
6345 #ifdef NTL_HAVE_AVX
6346 if (p-1 <= MAX_DBL_INT &&
6347 V <= (MAX_DBL_INT-(p-1))/(p-1) &&
6348 V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
6349
6350 //cerr << "blk_tri_DD\n";
6351 blk_tri_DD(d, A, bp, xp, trans, relax);
6352 }
6353 else
6354 #endif
6355 if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
6356 cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1)) {
6357
6358 //cerr << "blk_tri_L\n";
6359 blk_tri_L(d, A, bp, xp, trans, relax);
6360
6361 }
6362 else {
6363
6364 //cerr << "blk_tri_LL\n";
6365 blk_tri_LL(d, A, bp, xp, trans, relax);
6366 }
6367
6368 }
6369
6370 #endif
6371
6372
6373
6374 }
6375
6376
6377
relaxed_determinant(zz_p & d,const mat_zz_p & A,bool relax)6378 void relaxed_determinant(zz_p& d, const mat_zz_p& A, bool relax)
6379 {
6380 tri(d, A, 0, 0, false, relax);
6381 }
6382
6383
relaxed_solve(zz_p & d,vec_zz_p & x,const mat_zz_p & A,const vec_zz_p & b,bool relax)6384 void relaxed_solve(zz_p& d, vec_zz_p& x,
6385 const mat_zz_p& A, const vec_zz_p& b, bool relax)
6386 {
6387 tri(d, A, &b, &x, true, relax);
6388 }
6389
relaxed_solve(zz_p & d,const mat_zz_p & A,vec_zz_p & x,const vec_zz_p & b,bool relax)6390 void relaxed_solve(zz_p& d, const mat_zz_p& A, vec_zz_p& x, const vec_zz_p& b, bool relax)
6391 {
6392 tri(d, A, &b, &x, false, relax);
6393 }
6394
6395 // ******************************************************************
6396 //
6397 // new image and kernel routines
6398 //
6399 // ******************************************************************
6400
6401
// elim_basic: straightforward (non-blocked) Gaussian elimination over zz_p.
//
//   A    -- n x m input matrix
//   im   -- if non-null, receives the image: the r echelon rows, or all
//           n processed rows when full == true
//   ker  -- if non-null, receives a basis for the kernel of the map
//           defined by the first w columns
//   w    -- eliminate on columns [0..w) only
//   full -- keep the non-pivot rows in the image as well
//
// Returns r, the rank of the first w columns.  Row operations are
// parallelized with the NTL thread pool.
static
long elim_basic(const mat_zz_p& A, mat_zz_p *im, mat_zz_p *ker,
                long w, bool full)
{
   long n = A.NumRows();
   long m = A.NumCols();

   if (w < 0 || w > m) LogicError("elim: bad args");

   // take care of corner cases
   if (n == 0) {
      if (im) im->SetDims(0, m);
      if (ker) ker->SetDims(0, 0);
      return 0;
   }

   if (w == 0) {
      if (im) {
         if (full)
            (*im) = A;
         else
            im->SetDims(0, m);
      }
      if (ker) ident(*ker, n);
      return 0;
   }

   // work on a plain long matrix (raw representatives mod p)
   Mat<long> M;
   conv(M, A);

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations: P[r] = row swapped into position r

   Vec<long> pcol;
   pcol.SetLength(n);
   // pcol[i] records pivot columns for row i

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();

   bool pivoting = false;

   long r = 0;   // current rank / next pivot row

   for (long k = 0; k < w; k++) {
      // find a pivot for column k among rows [r..n)
      long pos = -1;
      long pivot_inv;
      for (long i = r; i < n; i++) {
         long pivot = M[i][k];
         if (pivot != 0) {
            pivot_inv = InvMod(pivot, p);
            pos = i;
            break;
         }
      }

      // column k is dependent on earlier pivot columns: skip it
      if (pos == -1)
         continue;

      if (r != pos) {
         swap(M[pos], M[r]);
         P[r] = pos;
         pivoting = true;
      }

      // run sequentially if the remaining submatrix is small
      bool seq = double(n-r)*double(m-k) < PAR_THRESH;

      // NOTE(review): pivot_inv, pinv, and M are captured by the thread-pool
      // lambda without NTL_IMPORT; NTL_IMPORT just makes thread-local copies
      // of the hottest scalars
      NTL_GEXEC_RANGE(seq, n-(r+1), first, last)
      NTL_IMPORT(p)
      NTL_IMPORT(n)
      NTL_IMPORT(k)
      NTL_IMPORT(r)
      long *y = &M[r][0];

      for (long ii = first; ii < last; ii++) {
         long i = ii + r+1;

         long *x = &M[i][0];
         long t1 = x[k];
         t1 = MulMod(t1, pivot_inv, p);
         t1 = NegateMod(t1, p);
         // store the (negated) multiplier in the eliminated position;
         // the kernel computation below reads these values back
         x[k] = t1;
         if (t1 == 0) continue;

         // add t1 * row r to row i
         mulmod_precon_t t1pinv = PrepMulModPrecon(t1, p, pinv);

         for (long j = k+1; j < m; j++) {
            long t2 = MulModPrecon(y[j], t1, p, t1pinv);
            x[j] = AddMod(x[j], t2, p);
         }
      }
      NTL_GEXEC_RANGE_END

      pcol[r] = k;
      r++;
   }

   if (im) {
      mat_zz_p& Im = *im;;
      if (full)
         Im.SetDims(n, m);
      else
         Im.SetDims(r, m);

      // echelon rows: zero up to the pivot column, raw data after it
      for (long i = 0; i < r; i++) {
         long pc = pcol[i];
         for (long j = 0; j < pc; j++) Im[i][j].LoopHole() = 0;
         for (long j = pc; j < m; j++) Im[i][j].LoopHole() = M[i][j];
      }

      if (full) {
         // non-pivot rows: the first w columns were fully eliminated
         for (long i = r; i < n; i++) {
            for (long j = 0; j < w; j++) Im[i][j].LoopHole() = 0;
            for (long j = w; j < m; j++) Im[i][j].LoopHole() = M[i][j];
         }
      }
   }

   if (ker) {

      if (n == r) {
         // full row rank: kernel is trivial (0 x n)
         mat_zz_p& Ker = *ker;
         Ker.SetDims(n-r, n);
      }
      else {
         // gather, for each pivot row k, the multiplier column below it
         Mat<long> colbuf;
         colbuf.SetDims(r, n);

         for (long k = 0; k < r; k++) {
            long pc = pcol[k];
            for (long i = k+1; i < n; i++) colbuf[k][i] = M[i][pc];
         }

         M.kill();

         // X[i][k] = coefficient of pivot row k in kernel vector i,
         // computed by back-substitution over the multipliers
         Mat<long> X;
         X.SetDims(n-r, r);

         bool seq = double(n-r)*double(r)*double(r)/2 < PAR_THRESH;
         NTL_GEXEC_RANGE(seq, n-r, first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(r)

         for (long i = first; i < last; i++) {
            long *Xi = &X[i][0];

            for (long k = r-1; k >= 0; k--) {
               long *cvecp = &colbuf[k][0];

               long acc = cvecp[i+r];
               for (long j = k+1; j < r; j++) {
                  acc = AddMod( acc, MulMod(Xi[j], cvecp[j], p), p );
               }
               Xi[k] = acc;
            }

         }

         NTL_GEXEC_RANGE_END

         // assemble kernel rows: [ X | I ], then undo the row permutation
         mat_zz_p& Ker = *ker;
         Ker.SetDims(n-r, n);
         for (long i = 0; i < n-r; i++) {
            for (long j = 0; j < r; j++) Ker[i][j].LoopHole() = X[i][j];
            for (long j = r; j < n; j++) Ker[i][j].LoopHole() = 0;
            Ker[i][r+i].LoopHole() = 1;
         }

         if (pivoting) {
            // replay the recorded swaps in reverse on each kernel vector
            for (long i = 0; i < n-r; i++) {
               zz_p *x = Ker[i].elts();

               for (long k = n-1; k >= 0; k--) {
                  long pos = P[k];
                  if (pos != k) swap(x[pos], x[k]);
               }
            }
         }
      }
   }

   return r;
}
6588
6589 #ifdef NTL_HAVE_LL_TYPE
6590
6591
6592 #ifdef NTL_HAVE_AVX
6593
6594
6595 static inline
CopyBlock(double * dst_ptr,long dst_blk,const double * src_ptr,long src_blk,long src_limit)6596 void CopyBlock(double *dst_ptr, long dst_blk, const double *src_ptr, long src_blk, long src_limit)
6597 {
6598 long src_row = src_blk*MAT_BLK_SZ;
6599 long dst_row = dst_blk*MAT_BLK_SZ;
6600
6601 long nrows = min(MAT_BLK_SZ, src_limit - src_row);
6602
6603 for (long i = 0; i < nrows; i++)
6604 for (long j = 0; j < MAT_BLK_SZ; j++)
6605 dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
6606
6607 for (long i = nrows; i < MAT_BLK_SZ; i++)
6608 for (long j = 0; j < MAT_BLK_SZ; j++)
6609 dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = 0;
6610
6611 }
6612
6613 static inline
CopyBlock(double * dst_ptr,long dst_blk,const double * src_ptr,long src_blk)6614 void CopyBlock(double *dst_ptr, long dst_blk, const double *src_ptr, long src_blk)
6615 {
6616 long src_row = src_blk*MAT_BLK_SZ;
6617 long dst_row = dst_blk*MAT_BLK_SZ;
6618
6619 long nrows = MAT_BLK_SZ;
6620
6621 for (long i = 0; i < nrows; i++)
6622 for (long j = 0; j < MAT_BLK_SZ; j++)
6623 dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
6624 }
6625
6626 static inline
SwapOneRow(double * panelp,long i,long pos)6627 void SwapOneRow(double *panelp, long i, long pos)
6628 {
6629 double *pos_p = &panelp[pos*MAT_BLK_SZ];
6630 double *i_p = &panelp[i*MAT_BLK_SZ];
6631 for (long j = 0; j < MAT_BLK_SZ; j++)
6632 _ntl_swap(pos_p[j], i_p[j]);
6633 }
6634
6635 static inline
ApplySwaps(double * panelp,long start,long end,const Vec<long> & P)6636 void ApplySwaps(double *panelp, long start, long end, const Vec<long>& P)
6637 {
6638 for (long i = start; i < end; i++) {
6639 long pos = P[i];
6640 if (pos != i)
6641 SwapOneRow(panelp, i, pos);
6642 }
6643 }
6644
6645
// MulAddBlock: accumulate the block product x += y*z, where each operand is
// one MAT_BLK_SZ x MAT_BLK_SZ block (double variant; no modular reduction
// is performed here).
static inline
void MulAddBlock(double *x, const double *y, const double *z)
{
   // x += y*z
   muladd_all_by_32(0, MAT_BLK_SZ, x, y, z, MAT_BLK_SZ);
}
6652
6653
// elim_blk_DD: block-oriented Gaussian elimination over zz_p, AVX/double
// variant.  Entries are stored as doubles and reductions mod p are deferred
// (tracked via red_count/red_trigger) until the accumulated sums could
// overflow the exactly-representable double range.
//
//   A    -- n x m input matrix
//   im   -- if non-null, receives the image (r rows, or n rows if full)
//   ker  -- if non-null, receives a kernel basis for the first w columns
//   w    -- eliminate on columns [0..w) only
//   full -- keep the non-pivot rows in the image as well
//
// Returns r, the rank of the first w columns.  The matrix is stored
// column-block-wise: M[panel] is an n x MAT_BLK_SZ vertical panel.
// aux_panel accumulates the elimination multipliers (the "L" part) for the
// current band of up to MAT_BLK_SZ pivot rows.
static
long elim_blk_DD(const mat_zz_p& A, mat_zz_p *im, mat_zz_p *ker,
                 long w, bool full)
{
   long n = A.NumRows();
   long m = A.NumCols();

   if (w < 0 || w > m) LogicError("elim: bad args");

   // take care of corner cases
   if (n == 0) {
      if (im) im->SetDims(0, m);
      if (ker) ker->SetDims(0, 0);
      return 0;
   }

   if (w == 0) {
      if (im) {
         if (full)
            (*im) = A;
         else
            im->SetDims(0, m);
      }
      if (ker) ident(*ker, n);
      return 0;
   }

   if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
   if (NTL_OVERFLOW(m, MAT_BLK_SZ, 0)) ResourceError("dimension too large");

   long npanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;


   // allocate and zero the panels (AVX code requires aligned storage)
   Vec< AlignedArray<double> > M;
   M.SetLength(npanels);
   for (long panel = 0; panel < npanels; panel++) {
      M[panel].SetLength(n*MAT_BLK_SZ);
      double *panelp = &M[panel][0];

      for (long h = 0; h < n*MAT_BLK_SZ; h++) panelp[h] = 0;
   }

   // copy A into panels
   for (long jj = 0, panel = 0; jj < m; jj += MAT_BLK_SZ, panel++) {
      long j_max = min(jj+MAT_BLK_SZ, m);
      double *panelp = &M[panel][0];

      for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
         const zz_p *ap = A[i].elts() + jj;

         for (long j = jj; j < j_max; j++)
            panelp[j-jj] = rep(ap[j-jj]);
      }
   }

   // aux_panel holds the multipliers for the current band of pivot rows
   AlignedArray<double> aux_panel_store;
   aux_panel_store.SetLength(n*MAT_BLK_SZ);
   double *aux_panel = &aux_panel_store[0];


   AlignedArray<double> buf_store1;
   buf_store1.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
   double *buf1 = &buf_store1[0];

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations

   Vec<long> pcol;
   pcol.SetLength(n);
   // pcol[i] records pivot columns for row i

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_reduce_struct red_struct = zz_p::red_struct();

   bool pivoting = false;

   // number of unreduced accumulations that fit in a double without
   // losing exactness
   long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
   long red_count = red_trigger;

   // r = rank so far; rr = first pivot row of the current band;
   // k = current column, kk = first column of the current kpanel
   long r = 0, rr = 0, k = 0, kk = 0;
   long rpanel = 0, kpanel = 0;

   while (k < w) {

      if (r > rr && ker) {
         // we have a panel from a previous iteration
         // we store enough of it to facilitate the kernel
         // computation later. At this point, we have
         // r == rr+MAT_BLK_SZ (a full band), and it suffices to store
         // rows [r..n) into M[rpanel], and this will not
         // overwrite anything useful in M[rpanel]

         double *panelp = &M[rpanel][0];
         for (long h = r*MAT_BLK_SZ; h < n*MAT_BLK_SZ; h++) {
            panelp[h] = aux_panel[h];
         }

         rpanel++;
      }

      rr = r;

      for (long h = 0; h < n*MAT_BLK_SZ; h++) aux_panel[h] = 0;

      bool cleanup = false;

      if (red_count-MAT_BLK_SZ < 0) {
         red_count = red_trigger;
         cleanup = true;
      }

      red_count = red_count-MAT_BLK_SZ;

      for (; r < rr+MAT_BLK_SZ && k < w; k++) { // panel incomplete

         if (k == kk+MAT_BLK_SZ) { // start new kpanel
            kk = k;
            kpanel++;
         }

         double *kpanelp = &M[kpanel][0];

         if (k == kk) { // a fresh kpanel -- special processing

            if (cleanup) {
               // periodic full reduction mod p to keep sums exact
               for (long h = 0; h < n*MAT_BLK_SZ; h++)
                  kpanelp[h] = rem((unsigned long)(long)kpanelp[h], p, red_struct);
            }

            if (r > rr) {

               // bring this kpanel up to date with the partial band:

               // apply current sequence of permutations

               ApplySwaps(kpanelp, rr, r, P);

               // clean aux_panel
               for (long h = 0; h < n*MAT_BLK_SZ; h++)
                  aux_panel[h] = rem((unsigned long)(long)aux_panel[h], p, red_struct);

               // copy rows [rr..r) of kpanel into buf1
               for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
                  buf1[i] = rem((unsigned long)(long)kpanelp[rr*MAT_BLK_SZ+i], p, red_struct);

               // kpanel[rr..n) += aux_panel[rr..n)*buf1

               muladd_all_by_32(rr, n, kpanelp, aux_panel, buf1, r-rr);
            }
         }

         // search rows [r..n) for an invertible pivot in column k
         long pos = -1;
         long pivot;
         long pivot_inv;
         for (long i = r; i < n; i++) {
            pivot = rem((unsigned long)(long)kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
            kpanelp[i*MAT_BLK_SZ+(k-kk)] = pivot;

            if (pivot != 0) {
               pivot_inv = InvMod(pivot, p);
               pos = i;
               break;
            }
         }

         // dependent column: move on without advancing r
         if (pos == -1) {
            continue;
         }

         double *y = &kpanelp[r*MAT_BLK_SZ];
         double *y1 = &aux_panel[r*MAT_BLK_SZ];
         if (r != pos) {
            // swap rows pos and r; in kpanel only columns [k-kk..) are
            // live, in aux_panel only columns [0..r-rr)
            double *x = &kpanelp[pos*MAT_BLK_SZ];
            double *x1 = &aux_panel[pos*MAT_BLK_SZ];

            for (long j = k-kk; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
            for (long j = 0; j < r-rr; j++) _ntl_swap(x1[j], y1[j]);

            P[r] = pos;
            pivoting = true;
         }

         // clean up row r of kpanel and aux_panel
         for (long j = k-kk; j < MAT_BLK_SZ; j++)
            y[j] = rem((unsigned long)(long)y[j], p, red_struct);
         for (long j = 0; j < r-rr; j++)
            y1[j] = rem((unsigned long)(long)y1[j], p, red_struct);

         // clear column k below row r; record multipliers in aux_panel
         for (long i = r+1; i < n; i++) {
            double *x = &kpanelp[i*MAT_BLK_SZ];
            double *x1 = &aux_panel[i*MAT_BLK_SZ];
            long t1 = rem((unsigned long)(long)x[k-kk], p, red_struct);
            t1 = MulMod(t1, pivot_inv, p);
            t1 = NegateMod(t1, p);
            x[k-kk] = 0;
            x1[r-rr] = t1;
            if (t1 == 0) continue;

            // add t1 * row r to row i
            double ut1 = t1;

            for (long j = k-kk+1; j < MAT_BLK_SZ; j++)
               x[j] += y[j]*ut1;
            for (long j = 0; j < r-rr; j++)
               x1[j] += y1[j]*ut1;
         }

         pcol[r] = k;
         r++;
      }

      if (r > rr) {

         // we have a (possibly partial) band of pivot rows

         // clean it up
         for (long h = 0; h < n*MAT_BLK_SZ; h++)
            aux_panel[h] = rem((unsigned long)(long)aux_panel[h], p, red_struct);

         bool seq =
            double(npanels-(kpanel+1))*double(n-rr)*double(r-rr)*double(MAT_BLK_SZ) < PAR_THRESH;

         // apply aux_panel to remaining panels: [kpanel+1..npanels)
         NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(n)
         NTL_IMPORT(red_struct)
         NTL_IMPORT(aux_panel)
         NTL_IMPORT(rr)
         NTL_IMPORT(r)


         // per-thread scratch block
         AlignedArray<double> buf_store;
         buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
         double *buf = &buf_store[0];


         for (long index = first; index < last; index++) {
            long jpanel = index + kpanel+1;

            double *jpanelp = &M[jpanel][0];

            if (cleanup) {
               for (long h = 0; h < n*MAT_BLK_SZ; h++)
                  jpanelp[h] = rem((unsigned long)(long)jpanelp[h], p, red_struct);
            }

            // perform swaps
            ApplySwaps(jpanelp, rr, r, P);

            // copy rows [rr..r) of jpanel into buf
            for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
               buf[i] = rem((unsigned long)(long)jpanelp[rr*MAT_BLK_SZ+i], p, red_struct);

            // jpanel[rr..n) += aux_panel[rr..n)*buf

            muladd_all_by_32(rr, n, jpanelp, aux_panel, buf, r-rr);
         }

         NTL_GEXEC_RANGE_END

      }

   }

   if (im) {
      mat_zz_p& Im = *im;;
      if (full)
         Im.SetDims(n, m);
      else
         Im.SetDims(r, m);

      // echelon rows: entries must still be reduced mod p on the way out
      for (long i = 0; i < r; i++) {
         long pc = pcol[i];
         for (long j = 0; j < pc; j++) Im[i][j].LoopHole() = 0;
         for (long j = pc; j < m; j++) {
            double t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
            Im[i][j].LoopHole() = rem((unsigned long)(long)t0, p, red_struct);
         }
      }

      if (full) {
         for (long i = r; i < n; i++) {
            for (long j = 0; j < w; j++) Im[i][j].LoopHole() = 0;
            for (long j = w; j < m; j++) {
               double t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
               Im[i][j].LoopHole() = rem((unsigned long)(long)t0, p, red_struct);
            }
         }
      }
   }

   if (ker) {
      if (r == 0) {
         // zero rank: kernel is the full space
         ident(*ker, n);
         return 0;
      }

      mat_zz_p& Ker = *ker;
      Ker.SetDims(n-r, n);
      if (r < n) {

         // blocked back-substitution over the stored multiplier panels
         long start_block = r/MAT_BLK_SZ;
         long end_block = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
         long vblocks = end_block-start_block;
         long hblocks = (r+MAT_BLK_SZ-1)/MAT_BLK_SZ;

         Vec< AlignedArray<double> > kerbuf;
         kerbuf.SetLength(vblocks);
         for (long i = 0; i < vblocks; i++)
            kerbuf[i].SetLength(hblocks*MAT_BLK_SZ*MAT_BLK_SZ);

         long colblocks = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;

         // if r > rr, we have a panel sitting in
         // aux_panel, which may or may not be a full panel

         double *initial_panel = 0;
         if (r > rr) {
            initial_panel = aux_panel;
         }
         else {
            initial_panel = &M[hblocks-1][0];
         }

         // seed kerbuf with the last multiplier panel
         for (long vb = start_block; vb < end_block; vb++)
            CopyBlock(&kerbuf[vb-start_block][0], hblocks-1, initial_panel, vb, n);

         // bring the remaining stored panels into permuted, shifted form
         for (long hb = hblocks-2; hb >= 0; hb--) {

            ApplySwaps(&M[hb][0], (hb+1)*MAT_BLK_SZ, r, P);

            for (long b = hb+1; b < end_block; b++)
               CopyBlock(&M[hb][0], b-1, &M[hb][0], b, n);
         }

         bool seq = double(n-r)*double(r)*double(r)/2 < PAR_THRESH;


         NTL_GEXEC_RANGE(seq, end_block-start_block, first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(red_struct)
         NTL_IMPORT(hblocks)

         for (long index = first; index < last; index++) {
            long vb = index + start_block;
            double *kerbufp = &kerbuf[vb-start_block][0];

            // back-substitute right-to-left across the stored panels
            for (long hb = hblocks-2; hb >= 0; hb--) {
               double *colbuf = &M[hb][0];
               double *acc = &kerbufp[hb*MAT_BLK_SZ*MAT_BLK_SZ];

               CopyBlock(acc, 0, colbuf, vb-1);

               // same deferred-reduction bookkeeping as the main loop
               long red_trigger = (MAX_DBL_INT-(p-1))/((p-1)*(p-1));
               long red_count = red_trigger;

               for (long b = hb+1; b < hblocks; b++) {

                  if (red_count-MAT_BLK_SZ < 0) {
                     red_count = red_trigger;
                     for (long h = 0; h < MAT_BLK_SZ*MAT_BLK_SZ; h++)
                        acc[h] = rem((unsigned long)(long)acc[h], p, red_struct);

                  }
                  red_count = red_count-MAT_BLK_SZ;

                  MulAddBlock(acc, &kerbufp[b*MAT_BLK_SZ*MAT_BLK_SZ],
                              &colbuf[(b-1)*MAT_BLK_SZ*MAT_BLK_SZ]);
               }

               for (long h = 0; h < MAT_BLK_SZ*MAT_BLK_SZ; h++)
                  acc[h] = rem((unsigned long)(long)acc[h], p, red_struct);
            }
         }

         NTL_GEXEC_RANGE_END

         // unpack kerbuf into the first r columns of Ker
         for (long i = r; i < n; i++) {

            double *kerbufp = &kerbuf[(i/MAT_BLK_SZ)-start_block][0];

            for (long j = 0; j < r; j++) {
               double t0 =
                  kerbufp[(j/MAT_BLK_SZ)*MAT_BLK_SZ*MAT_BLK_SZ+
                          (i%MAT_BLK_SZ)*MAT_BLK_SZ+(j%MAT_BLK_SZ)];

               Ker[i-r][j].LoopHole() = long(t0);
            }
         }

         // append the identity part: kernel rows have the form [ X | I ]
         for (long i = 0; i < n-r; i++) {
            for (long j = 0; j < n-r; j++) {
               Ker[i][j+r].LoopHole() = 0;
            }
            Ker[i][i+r].LoopHole() = 1;
         }

         if (pivoting) {
            // replay the recorded swaps in reverse on each kernel vector
            for (long i = 0; i < n-r; i++) {
               zz_p *x = Ker[i].elts();

               for (long k = n-1; k >= 0; k--) {
                  long pos = P[k];
                  if (pos != k) swap(x[pos], x[k]);
               }
            }
         }
      }
   }

   return r;

}
7072
7073 #endif
7074
7075
7076
7077 static inline
CopyBlock(unsigned long * dst_ptr,long dst_blk,const unsigned long * src_ptr,long src_blk,long src_limit)7078 void CopyBlock(unsigned long *dst_ptr, long dst_blk, const unsigned long *src_ptr, long src_blk, long src_limit)
7079 {
7080 long src_row = src_blk*MAT_BLK_SZ;
7081 long dst_row = dst_blk*MAT_BLK_SZ;
7082
7083 long nrows = min(MAT_BLK_SZ, src_limit - src_row);
7084
7085 for (long i = 0; i < nrows; i++)
7086 for (long j = 0; j < MAT_BLK_SZ; j++)
7087 dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
7088
7089 for (long i = nrows; i < MAT_BLK_SZ; i++)
7090 for (long j = 0; j < MAT_BLK_SZ; j++)
7091 dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = 0;
7092
7093 }
7094
7095 static inline
CopyBlock(unsigned long * dst_ptr,long dst_blk,const unsigned long * src_ptr,long src_blk)7096 void CopyBlock(unsigned long *dst_ptr, long dst_blk, const unsigned long *src_ptr, long src_blk)
7097 {
7098 long src_row = src_blk*MAT_BLK_SZ;
7099 long dst_row = dst_blk*MAT_BLK_SZ;
7100
7101 long nrows = MAT_BLK_SZ;
7102
7103 for (long i = 0; i < nrows; i++)
7104 for (long j = 0; j < MAT_BLK_SZ; j++)
7105 dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
7106 }
7107
7108 static inline
TransposeBlock(unsigned long * dst_ptr,long dst_blk)7109 void TransposeBlock(unsigned long *dst_ptr, long dst_blk)
7110 {
7111 dst_ptr += dst_blk*MAT_BLK_SZ*MAT_BLK_SZ;
7112
7113 for (long i = 0; i < MAT_BLK_SZ; i++)
7114 for (long j = 0; j < i; j++)
7115 _ntl_swap(dst_ptr[i*MAT_BLK_SZ+j], dst_ptr[i+j*MAT_BLK_SZ]);
7116 }
7117
7118 static inline
SwapOneRow(unsigned long * panelp,long i,long pos)7119 void SwapOneRow(unsigned long *panelp, long i, long pos)
7120 {
7121 unsigned long *pos_p = &panelp[pos*MAT_BLK_SZ];
7122 unsigned long *i_p = &panelp[i*MAT_BLK_SZ];
7123 for (long j = 0; j < MAT_BLK_SZ; j++)
7124 _ntl_swap(pos_p[j], i_p[j]);
7125 }
7126
7127 static inline
ApplySwaps(unsigned long * panelp,long start,long end,const Vec<long> & P)7128 void ApplySwaps(unsigned long *panelp, long start, long end, const Vec<long>& P)
7129 {
7130 for (long i = start; i < end; i++) {
7131 long pos = P[i];
7132 if (pos != i)
7133 SwapOneRow(panelp, i, pos);
7134 }
7135 }
7136
7137
static inline
void MulAddBlock(unsigned long *x, const unsigned long *y, const unsigned long *z)
{
   // x += y*z, where x, y, z are MAT_BLK_SZ x MAT_BLK_SZ blocks.
   // Products are accumulated without modular reduction; the caller is
   // responsible for reducing before overflow (see the red_count logic
   // at the call sites).

   muladd_all_by_32(0, MAT_BLK_SZ, x, y, z, MAT_BLK_SZ);
}
7145
7146
// Blocked Gaussian elimination, "L" variant: entries live in unsigned
// longs and block products are accumulated WITHOUT per-step modular
// reduction; a counter (red_count) schedules reductions before the
// accumulated sums can overflow.  Eliminates over the first w columns
// of A.  If im != 0 it receives the row-echelon image (n rows when
// full, else just the r pivot rows); if ker != 0 it receives a kernel
// basis (n-r rows).  Returns the rank r.
static
long elim_blk_L(const mat_zz_p& A, mat_zz_p *im, mat_zz_p *ker,
                long w, bool full)
{
   long n = A.NumRows();
   long m = A.NumCols();

   if (w < 0 || w > m) LogicError("elim: bad args");

   // take care of corner cases
   if (n == 0) {
      if (im) im->SetDims(0, m);
      if (ker) ker->SetDims(0, 0);
      return 0;
   }

   if (w == 0) {
      if (im) {
         if (full)
            (*im) = A;
         else
            im->SetDims(0, m);
      }
      if (ker) ident(*ker, n);
      return 0;
   }

   if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
   if (NTL_OVERFLOW(m, MAT_BLK_SZ, 0)) ResourceError("dimension too large");

   long npanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;

   // M[panel] is a vertical strip of MAT_BLK_SZ columns, stored
   // row-major with n rows; the last strip is zero-padded on the right.
   Vec< UniqueArray<unsigned long> > M;
   M.SetLength(npanels);
   for (long panel = 0; panel < npanels; panel++) {
      M[panel].SetLength(n*MAT_BLK_SZ);
      unsigned long *panelp = &M[panel][0];

      for (long h = 0; h < n*MAT_BLK_SZ; h++) panelp[h] = 0;
   }

   // copy A into panels
   for (long jj = 0, panel = 0; jj < m; jj += MAT_BLK_SZ, panel++) {
      long j_max = min(jj+MAT_BLK_SZ, m);
      unsigned long *panelp = &M[panel][0];

      for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
         const zz_p *ap = A[i].elts() + jj;

         for (long j = jj; j < j_max; j++)
            panelp[j-jj] = rep(ap[j-jj]);
      }
   }

   // aux_panel accumulates the elimination multipliers for the current
   // block of (up to) MAT_BLK_SZ pivot rows.
   UniqueArray<unsigned long> aux_panel_store;
   aux_panel_store.SetLength(n*MAT_BLK_SZ);
   unsigned long *aux_panel = &aux_panel_store[0];


   UniqueArray<unsigned long> buf_store1;
   buf_store1.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
   unsigned long *buf1 = &buf_store1[0];

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations

   Vec<long> pcol;
   pcol.SetLength(n);
   // pcol[i] records pivot columns for row i

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   // NOTE(review): pinv appears unused in this variant (only the LL
   // variant uses MulModPrecon) -- confirm
   sp_reduce_struct red_struct = zz_p::red_struct();

   bool pivoting = false;

   // red_trigger = how many unreduced products of two values < p can be
   // summed in an unsigned long before reduction mod p is mandatory.
   unsigned long ured_trigger =
      (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
   // NOTE: corner case at p == 2: need unsigned long to prevent overflow

   long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);

   long red_count = red_trigger;

   // r: current rank / next pivot row; rr: first pivot row of the
   // current block; k: current column; kk: first column of kpanel.
   long r = 0, rr = 0, k = 0, kk = 0;
   long rpanel = 0, kpanel = 0;

   while (k < w) {

      if (r > rr && ker) {
         // we have a panel from a previous iteration
         // we store enough of it to facilitate the kernel
         // computation later. At this point, we have
         // r == rr+INV_BLK_SIZE, and it suffices to store
         // rows [r..n) into M[rpanel], and this will not
         // overwrite anything useful in M[rpanel]

         unsigned long *panelp = &M[rpanel][0];
         for (long h = r*MAT_BLK_SZ; h < n*MAT_BLK_SZ; h++) {
            panelp[h] = aux_panel[h];
         }

         rpanel++;
      }

      rr = r;

      for (long h = 0; h < n*MAT_BLK_SZ; h++) aux_panel[h] = 0;

      bool cleanup = false;

      // schedule a full reduction pass once enough unreduced block
      // updates have accumulated
      if (red_count-MAT_BLK_SZ < 0) {
         red_count = red_trigger;
         cleanup = true;
      }

      red_count = red_count-MAT_BLK_SZ;

      for (; r < rr+MAT_BLK_SZ && k < w; k++) { // panel incomplete

         if (k == kk+MAT_BLK_SZ) { // start new kpanel
            kk = k;
            kpanel++;
         }

         unsigned long *kpanelp = &M[kpanel][0];

         if (k == kk) { // a fresh kpanel -- special processing

            if (cleanup) {
               for (long h = 0; h < n*MAT_BLK_SZ; h++)
                  kpanelp[h] = rem(kpanelp[h], p, red_struct);
            }

            if (r > rr) {


               // apply current sequence of permutations

               ApplySwaps(kpanelp, rr, r, P);

               // clean aux_panel
               for (long h = 0; h < n*MAT_BLK_SZ; h++)
                  aux_panel[h] = rem(aux_panel[h], p, red_struct);

               // copy rows [rr..r) of kpanel into buf1
               for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
                  buf1[i] = rem(kpanelp[rr*MAT_BLK_SZ+i], p, red_struct);

               TransposeBlock(buf1, 0);

               // kpanel[rr..n) += aux_panel[rr..n)*buf1

               muladd_all_by_32(rr, n, kpanelp, aux_panel, buf1, r-rr);
            }
         }

         // search for a pivot in column k, rows [r..n); entries are
         // reduced mod p on the fly
         long pos = -1;
         long pivot;
         long pivot_inv;
         for (long i = r; i < n; i++) {
            pivot = rem(kpanelp[i*MAT_BLK_SZ+(k-kk)], p, red_struct);
            kpanelp[i*MAT_BLK_SZ+(k-kk)] = pivot;

            if (pivot != 0) {
               pivot_inv = InvMod(pivot, p);
               pos = i;
               break;
            }
         }

         if (pos == -1) {
            // no pivot in this column: proceed to the next column
            continue;
         }

         unsigned long *y = &kpanelp[r*MAT_BLK_SZ];
         unsigned long *y1 = &aux_panel[r*MAT_BLK_SZ];
         if (r != pos) {
            // swap rows pos and r
            unsigned long *x = &kpanelp[pos*MAT_BLK_SZ];
            unsigned long *x1 = &aux_panel[pos*MAT_BLK_SZ];

            for (long j = k-kk; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
            for (long j = 0; j < r-rr; j++) _ntl_swap(x1[j], y1[j]);

            P[r] = pos;
            pivoting = true;
         }

         // clean up row r of kpanel and aux_panel
         for (long j = k-kk; j < MAT_BLK_SZ; j++)
            y[j] = rem(y[j], p, red_struct);
         for (long j = 0; j < r-rr; j++)
            y1[j] = rem(y1[j], p, red_struct);

         // clear column
         for (long i = r+1; i < n; i++) {
            unsigned long *x = &kpanelp[i*MAT_BLK_SZ];
            unsigned long *x1 = &aux_panel[i*MAT_BLK_SZ];
            long t1 = rem(x[k-kk], p, red_struct);
            t1 = MulMod(t1, pivot_inv, p);
            t1 = NegateMod(t1, p);
            x[k-kk] = 0;
            x1[r-rr] = t1;   // record the multiplier for later panels
            if (t1 == 0) continue;

            // add t1 * row r to row i (unreduced accumulation)
            unsigned long ut1 = t1;

            for (long j = k-kk+1; j < MAT_BLK_SZ; j++)
               x[j] += y[j]*ut1;
            for (long j = 0; j < r-rr; j++)
               x1[j] += y1[j]*ut1;
         }

         pcol[r] = k;
         r++;
      }

      if (r > rr) {

         // we have a panel

         // clean it up
         for (long h = 0; h < n*MAT_BLK_SZ; h++)
            aux_panel[h] = rem(aux_panel[h], p, red_struct);

         bool seq =
            double(npanels-(kpanel+1))*double(n-rr)*double(r-rr)*double(MAT_BLK_SZ) < PAR_THRESH;

         // apply aux_panel to remaining panels: [kpanel+1..npanels)
         NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(n)
         NTL_IMPORT(red_struct)
         NTL_IMPORT(aux_panel)
         NTL_IMPORT(rr)
         NTL_IMPORT(r)


         UniqueArray<unsigned long> buf_store;
         buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
         unsigned long *buf = &buf_store[0];


         for (long index = first; index < last; index++) {
            long jpanel = index + kpanel+1;

            unsigned long *jpanelp = &M[jpanel][0];

            if (cleanup) {
               for (long h = 0; h < n*MAT_BLK_SZ; h++)
                  jpanelp[h] = rem(jpanelp[h], p, red_struct);
            }

            // perform swaps
            ApplySwaps(jpanelp, rr, r, P);

            // copy rows [rr..r) of jpanel into buf
            for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
               buf[i] = rem(jpanelp[rr*MAT_BLK_SZ+i], p, red_struct);

            TransposeBlock(buf, 0);

            // jpanel[rr..n) += aux_panel[rr..n)*buf

            muladd_all_by_32(rr, n, jpanelp, aux_panel, buf, r-rr);
         }

         NTL_GEXEC_RANGE_END

      }

   }

   if (im) {
      mat_zz_p& Im = *im;;
      if (full)
         Im.SetDims(n, m);
      else
         Im.SetDims(r, m);

      // pivot rows: zero below-left of the pivot column, reduce the rest
      for (long i = 0; i < r; i++) {
         long pc = pcol[i];
         for (long j = 0; j < pc; j++) Im[i][j].LoopHole() = 0;
         for (long j = pc; j < m; j++) {
            unsigned long t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
            Im[i][j].LoopHole() = rem(t0, p, red_struct);
         }
      }

      if (full) {
         for (long i = r; i < n; i++) {
            for (long j = 0; j < w; j++) Im[i][j].LoopHole() = 0;
            for (long j = w; j < m; j++) {
               unsigned long t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
               Im[i][j].LoopHole() = rem(t0, p, red_struct);
            }
         }
      }
   }

   if (ker) {
      if (r == 0) {
         ident(*ker, n);
         return 0;
      }

      mat_zz_p& Ker = *ker;
      Ker.SetDims(n-r, n);
      if (r < n) {

         // back-substitution over the stored multiplier panels to
         // build the left part of the kernel basis
         long start_block = r/MAT_BLK_SZ;
         long end_block = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
         long vblocks = end_block-start_block;
         long hblocks = (r+MAT_BLK_SZ-1)/MAT_BLK_SZ;

         Vec< UniqueArray<unsigned long> > kerbuf;
         kerbuf.SetLength(vblocks);
         for (long i = 0; i < vblocks; i++)
            kerbuf[i].SetLength(hblocks*MAT_BLK_SZ*MAT_BLK_SZ);

         long colblocks = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
         // NOTE(review): colblocks appears unused here -- confirm

         // if r > rr, we have a panel sitting in
         // aux_panel, which may or may not be a full panel

         unsigned long *initial_panel = 0;
         if (r > rr) {
            initial_panel = aux_panel;
         }
         else {
            initial_panel = &M[hblocks-1][0];
         }

         for (long vb = start_block; vb < end_block; vb++)
            CopyBlock(&kerbuf[vb-start_block][0], hblocks-1, initial_panel, vb, n);

         for (long hb = hblocks-2; hb >= 0; hb--) {

            ApplySwaps(&M[hb][0], (hb+1)*MAT_BLK_SZ, r, P);

            for (long b = hb+1; b < end_block; b++) {
               CopyBlock(&M[hb][0], b-1, &M[hb][0], b, n);
               TransposeBlock(&M[hb][0], b-1);
            }
         }

         bool seq = double(n-r)*double(r)*double(r)/2 < PAR_THRESH;


         NTL_GEXEC_RANGE(seq, end_block-start_block, first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(red_struct)
         NTL_IMPORT(hblocks)

         for (long index = first; index < last; index++) {
            long vb = index + start_block;
            unsigned long *kerbufp = &kerbuf[vb-start_block][0];

            for (long hb = hblocks-2; hb >= 0; hb--) {
               unsigned long *colbuf = &M[hb][0];
               unsigned long *acc = &kerbufp[hb*MAT_BLK_SZ*MAT_BLK_SZ];

               CopyBlock(acc, 0, colbuf, vb-1);
               TransposeBlock(acc, 0);


               // same lazy-reduction scheme as the main loop
               unsigned long ured_trigger =
                  (~(0UL)-cast_unsigned(p-1))/(cast_unsigned(p-1)*cast_unsigned(p-1));
               // NOTE: corner case at p == 2: need unsigned long to prevent overflow

               long red_trigger = min(cast_unsigned(NTL_MAX_LONG), ured_trigger);
               long red_count = red_trigger;

               for (long b = hb+1; b < hblocks; b++) {

                  if (red_count-MAT_BLK_SZ < 0) {
                     red_count = red_trigger;
                     for (long h = 0; h < MAT_BLK_SZ*MAT_BLK_SZ; h++)
                        acc[h] = rem(acc[h], p, red_struct);

                  }
                  red_count = red_count-MAT_BLK_SZ;

                  MulAddBlock(acc, &kerbufp[b*MAT_BLK_SZ*MAT_BLK_SZ],
                              &colbuf[(b-1)*MAT_BLK_SZ*MAT_BLK_SZ]);
               }

               for (long h = 0; h < MAT_BLK_SZ*MAT_BLK_SZ; h++)
                  acc[h] = rem(acc[h], p, red_struct);
            }
         }

         NTL_GEXEC_RANGE_END

         // unpack kerbuf into the left part of Ker
         for (long i = r; i < n; i++) {

            unsigned long *kerbufp = &kerbuf[(i/MAT_BLK_SZ)-start_block][0];

            for (long j = 0; j < r; j++) {
               unsigned long t0 =
                  kerbufp[(j/MAT_BLK_SZ)*MAT_BLK_SZ*MAT_BLK_SZ+
                          (i%MAT_BLK_SZ)*MAT_BLK_SZ+(j%MAT_BLK_SZ)];

               Ker[i-r][j].LoopHole() = long(t0);
            }
         }

         // right part is the identity
         for (long i = 0; i < n-r; i++) {
            for (long j = 0; j < n-r; j++) {
               Ker[i][j+r].LoopHole() = 0;
            }
            Ker[i][i+r].LoopHole() = 1;
         }

         // undo the recorded row swaps on the columns of Ker
         if (pivoting) {
            for (long i = 0; i < n-r; i++) {
               zz_p *x = Ker[i].elts();

               for (long k = n-1; k >= 0; k--) {
                  long pos = P[k];
                  if (pos != k) swap(x[pos], x[k]);
               }
            }
         }
      }
   }

   return r;

}
7582
7583
7584 static inline
CopyBlock(long * dst_ptr,long dst_blk,const long * src_ptr,long src_blk,long src_limit)7585 void CopyBlock(long *dst_ptr, long dst_blk, const long *src_ptr, long src_blk, long src_limit)
7586 {
7587 long src_row = src_blk*MAT_BLK_SZ;
7588 long dst_row = dst_blk*MAT_BLK_SZ;
7589
7590 long nrows = min(MAT_BLK_SZ, src_limit - src_row);
7591
7592 for (long i = 0; i < nrows; i++)
7593 for (long j = 0; j < MAT_BLK_SZ; j++)
7594 dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
7595
7596 for (long i = nrows; i < MAT_BLK_SZ; i++)
7597 for (long j = 0; j < MAT_BLK_SZ; j++)
7598 dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = 0;
7599
7600 }
7601
7602 static inline
CopyBlock(long * dst_ptr,long dst_blk,const long * src_ptr,long src_blk)7603 void CopyBlock(long *dst_ptr, long dst_blk, const long *src_ptr, long src_blk)
7604 {
7605 long src_row = src_blk*MAT_BLK_SZ;
7606 long dst_row = dst_blk*MAT_BLK_SZ;
7607
7608 long nrows = MAT_BLK_SZ;
7609
7610 for (long i = 0; i < nrows; i++)
7611 for (long j = 0; j < MAT_BLK_SZ; j++)
7612 dst_ptr[(dst_row + i)*MAT_BLK_SZ + j] = src_ptr[(src_row + i)*MAT_BLK_SZ + j];
7613 }
7614
7615 static inline
TransposeBlock(long * dst_ptr,long dst_blk)7616 void TransposeBlock(long *dst_ptr, long dst_blk)
7617 {
7618 dst_ptr += dst_blk*MAT_BLK_SZ*MAT_BLK_SZ;
7619
7620 for (long i = 0; i < MAT_BLK_SZ; i++)
7621 for (long j = 0; j < i; j++)
7622 _ntl_swap(dst_ptr[i*MAT_BLK_SZ+j], dst_ptr[i+j*MAT_BLK_SZ]);
7623 }
7624
7625 static inline
SwapOneRow(long * panelp,long i,long pos)7626 void SwapOneRow(long *panelp, long i, long pos)
7627 {
7628 long *pos_p = &panelp[pos*MAT_BLK_SZ];
7629 long *i_p = &panelp[i*MAT_BLK_SZ];
7630 for (long j = 0; j < MAT_BLK_SZ; j++)
7631 _ntl_swap(pos_p[j], i_p[j]);
7632 }
7633
7634 static inline
ApplySwaps(long * panelp,long start,long end,const Vec<long> & P)7635 void ApplySwaps(long *panelp, long start, long end, const Vec<long>& P)
7636 {
7637 for (long i = start; i < end; i++) {
7638 long pos = P[i];
7639 if (pos != i)
7640 SwapOneRow(panelp, i, pos);
7641 }
7642 }
7643
7644
static inline
void MulAddBlock(long *x, const long *y, const long *z,
                 long p, sp_ll_reduce_struct ll_red_struct)
{
   // x += y*z mod p, where x, y, z are MAT_BLK_SZ x MAT_BLK_SZ blocks.
   // Unlike the unsigned-long variant, reduction is performed inside
   // the kernel using ll_red_struct.

   muladd_all_by_32(0, MAT_BLK_SZ, x, y, z, MAT_BLK_SZ, p, ll_red_struct);
}
7653
7654
7655
// Blocked Gaussian elimination, "LL" variant: used when products could
// overflow an unsigned long, so every multiply-accumulate is reduced
// mod p immediately (via MulModPrecon / sp_ll_reduce_struct).  Entries
// are therefore always fully reduced and no lazy-reduction counters
// are needed.  Same interface and semantics as elim_blk_L: eliminates
// over the first w columns of A; im (if non-null) receives the image,
// ker (if non-null) receives a kernel basis; returns the rank r.
static
long elim_blk_LL(const mat_zz_p& A, mat_zz_p *im, mat_zz_p *ker,
                 long w, bool full)
{
   long n = A.NumRows();
   long m = A.NumCols();

   if (w < 0 || w > m) LogicError("elim: bad args");

   // take care of corner cases
   if (n == 0) {
      if (im) im->SetDims(0, m);
      if (ker) ker->SetDims(0, 0);
      return 0;
   }

   if (w == 0) {
      if (im) {
         if (full)
            (*im) = A;
         else
            im->SetDims(0, m);
      }
      if (ker) ident(*ker, n);
      return 0;
   }

   if (NTL_OVERFLOW(n, MAT_BLK_SZ, 0)) ResourceError("dimension too large");
   if (NTL_OVERFLOW(m, MAT_BLK_SZ, 0)) ResourceError("dimension too large");

   long npanels = (m+MAT_BLK_SZ-1)/MAT_BLK_SZ;

   // M[panel] is a vertical strip of MAT_BLK_SZ columns, stored
   // row-major with n rows; the last strip is zero-padded on the right.
   Vec< UniqueArray<long> > M;
   M.SetLength(npanels);
   for (long panel = 0; panel < npanels; panel++) {
      M[panel].SetLength(n*MAT_BLK_SZ);
      long *panelp = &M[panel][0];

      for (long h = 0; h < n*MAT_BLK_SZ; h++) panelp[h] = 0;
   }

   // copy A into panels
   for (long jj = 0, panel = 0; jj < m; jj += MAT_BLK_SZ, panel++) {
      long j_max = min(jj+MAT_BLK_SZ, m);
      long *panelp = &M[panel][0];

      for (long i = 0; i < n; i++, panelp += MAT_BLK_SZ) {
         const zz_p *ap = A[i].elts() + jj;

         for (long j = jj; j < j_max; j++)
            panelp[j-jj] = rep(ap[j-jj]);
      }
   }

   // aux_panel accumulates the elimination multipliers for the current
   // block of (up to) MAT_BLK_SZ pivot rows.
   UniqueArray<long> aux_panel_store;
   aux_panel_store.SetLength(n*MAT_BLK_SZ);
   long *aux_panel = &aux_panel_store[0];


   UniqueArray<long> buf_store1;
   buf_store1.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
   long *buf1 = &buf_store1[0];

   Vec<long> P;
   P.SetLength(n);
   for (long k = 0; k < n; k++) P[k] = k;
   // records swap operations

   Vec<long> pcol;
   pcol.SetLength(n);
   // pcol[i] records pivot columns for row i

   long p = zz_p::modulus();
   mulmod_t pinv = zz_p::ModulusInverse();
   sp_ll_reduce_struct ll_red_struct = zz_p::ll_red_struct();

   bool pivoting = false;

   // r: current rank / next pivot row; rr: first pivot row of the
   // current block; k: current column; kk: first column of kpanel.
   long r = 0, rr = 0, k = 0, kk = 0;
   long rpanel = 0, kpanel = 0;

   while (k < w) {

      if (r > rr && ker) {
         // we have a panel from a previous iteration
         // we store enough of it to facilitate the kernel
         // computation later. At this point, we have
         // r == rr+INV_BLK_SIZE, and it suffices to store
         // rows [r..n) into M[rpanel], and this will not
         // overwrite anything useful in M[rpanel]

         long *panelp = &M[rpanel][0];
         for (long h = r*MAT_BLK_SZ; h < n*MAT_BLK_SZ; h++) {
            panelp[h] = aux_panel[h];
         }

         rpanel++;
      }

      rr = r;

      for (long h = 0; h < n*MAT_BLK_SZ; h++) aux_panel[h] = 0;

      for (; r < rr+MAT_BLK_SZ && k < w; k++) { // panel incomplete

         if (k == kk+MAT_BLK_SZ) { // start new kpanel
            kk = k;
            kpanel++;
         }

         long *kpanelp = &M[kpanel][0];

         if (k == kk) { // a fresh kpanel -- special processing


            if (r > rr) {


               // apply current sequence of permutations

               ApplySwaps(kpanelp, rr, r, P);

               // copy rows [rr..r) of kpanel into buf1
               for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
                  buf1[i] = kpanelp[rr*MAT_BLK_SZ+i];

               TransposeBlock(buf1, 0);

               // kpanel[rr..n) += aux_panel[rr..n)*buf1

               muladd_all_by_32(rr, n, kpanelp, aux_panel, buf1, r-rr, p, ll_red_struct);
            }
         }

         // search for a pivot in column k, rows [r..n); entries are
         // already reduced in this variant
         long pos = -1;
         long pivot;
         long pivot_inv;
         for (long i = r; i < n; i++) {
            pivot = kpanelp[i*MAT_BLK_SZ+(k-kk)];
            kpanelp[i*MAT_BLK_SZ+(k-kk)] = pivot;

            if (pivot != 0) {
               pivot_inv = InvMod(pivot, p);
               pos = i;
               break;
            }
         }

         if (pos == -1) {
            // no pivot in this column: proceed to the next column
            continue;
         }

         long *y = &kpanelp[r*MAT_BLK_SZ];
         long *y1 = &aux_panel[r*MAT_BLK_SZ];
         if (r != pos) {
            // swap rows pos and r
            long *x = &kpanelp[pos*MAT_BLK_SZ];
            long *x1 = &aux_panel[pos*MAT_BLK_SZ];

            for (long j = k-kk; j < MAT_BLK_SZ; j++) _ntl_swap(x[j], y[j]);
            for (long j = 0; j < r-rr; j++) _ntl_swap(x1[j], y1[j]);

            P[r] = pos;
            pivoting = true;
         }

         // clear column
         for (long i = r+1; i < n; i++) {
            long *x = &kpanelp[i*MAT_BLK_SZ];
            long *x1 = &aux_panel[i*MAT_BLK_SZ];
            long t1 = x[k-kk];
            t1 = MulMod(t1, pivot_inv, p);
            t1 = NegateMod(t1, p);
            x[k-kk] = 0;
            x1[r-rr] = t1;   // record the multiplier for later panels
            if (t1 == 0) continue;

            // add t1 * row r to row i (fully reduced arithmetic)
            long ut1 = t1;
            mulmod_precon_t ut1_pinv = PrepMulModPrecon(ut1, p, pinv);

            for (long j = k-kk+1; j < MAT_BLK_SZ; j++)
               x[j] = AddMod(x[j], MulModPrecon(y[j], ut1, p, ut1_pinv), p);
            for (long j = 0; j < r-rr; j++)
               x1[j] = AddMod(x1[j], MulModPrecon(y1[j], ut1, p, ut1_pinv), p);
         }

         pcol[r] = k;
         r++;
      }

      if (r > rr) {

         // we have a panel

         bool seq =
            double(npanels-(kpanel+1))*double(n-rr)*double(r-rr)*double(MAT_BLK_SZ) < PAR_THRESH;

         // apply aux_panel to remaining panels: [kpanel+1..npanels)
         NTL_GEXEC_RANGE(seq, npanels-(kpanel+1), first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(n)
         NTL_IMPORT(ll_red_struct)
         NTL_IMPORT(aux_panel)
         NTL_IMPORT(rr)
         NTL_IMPORT(r)


         UniqueArray<long> buf_store;
         buf_store.SetLength(MAT_BLK_SZ*MAT_BLK_SZ);
         long *buf = &buf_store[0];


         for (long index = first; index < last; index++) {
            long jpanel = index + kpanel+1;

            long *jpanelp = &M[jpanel][0];

            // perform swaps
            ApplySwaps(jpanelp, rr, r, P);

            // copy rows [rr..r) of jpanel into buf
            for (long i = 0; i < (r-rr)*MAT_BLK_SZ; i++)
               buf[i] = jpanelp[rr*MAT_BLK_SZ+i];

            TransposeBlock(buf, 0);

            // jpanel[rr..n) += aux_panel[rr..n)*buf

            muladd_all_by_32(rr, n, jpanelp, aux_panel, buf, r-rr, p, ll_red_struct);
         }

         NTL_GEXEC_RANGE_END

      }

   }

   if (im) {
      mat_zz_p& Im = *im;;
      if (full)
         Im.SetDims(n, m);
      else
         Im.SetDims(r, m);

      // pivot rows: zero below-left of the pivot column, copy the rest
      for (long i = 0; i < r; i++) {
         long pc = pcol[i];
         for (long j = 0; j < pc; j++) Im[i][j].LoopHole() = 0;
         for (long j = pc; j < m; j++) {
            long t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
            Im[i][j].LoopHole() = t0;
         }
      }

      if (full) {
         for (long i = r; i < n; i++) {
            for (long j = 0; j < w; j++) Im[i][j].LoopHole() = 0;
            for (long j = w; j < m; j++) {
               long t0 = M[j/MAT_BLK_SZ][i*MAT_BLK_SZ+(j%MAT_BLK_SZ)];
               Im[i][j].LoopHole() = t0;
            }
         }
      }
   }

   if (ker) {
      if (r == 0) {
         ident(*ker, n);
         return 0;
      }

      mat_zz_p& Ker = *ker;
      Ker.SetDims(n-r, n);
      if (r < n) {

         // back-substitution over the stored multiplier panels to
         // build the left part of the kernel basis
         long start_block = r/MAT_BLK_SZ;
         long end_block = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
         long vblocks = end_block-start_block;
         long hblocks = (r+MAT_BLK_SZ-1)/MAT_BLK_SZ;

         Vec< UniqueArray<long> > kerbuf;
         kerbuf.SetLength(vblocks);
         for (long i = 0; i < vblocks; i++)
            kerbuf[i].SetLength(hblocks*MAT_BLK_SZ*MAT_BLK_SZ);

         long colblocks = (n+MAT_BLK_SZ-1)/MAT_BLK_SZ;
         // NOTE(review): colblocks appears unused here -- confirm

         // if r > rr, we have a panel sitting in
         // aux_panel, which may or may not be a full panel

         long *initial_panel = 0;
         if (r > rr) {
            initial_panel = aux_panel;
         }
         else {
            initial_panel = &M[hblocks-1][0];
         }

         for (long vb = start_block; vb < end_block; vb++)
            CopyBlock(&kerbuf[vb-start_block][0], hblocks-1, initial_panel, vb, n);

         for (long hb = hblocks-2; hb >= 0; hb--) {

            ApplySwaps(&M[hb][0], (hb+1)*MAT_BLK_SZ, r, P);

            for (long b = hb+1; b < end_block; b++) {
               CopyBlock(&M[hb][0], b-1, &M[hb][0], b, n);
               TransposeBlock(&M[hb][0], b-1);
            }
         }

         bool seq = double(n-r)*double(r)*double(r)/2 < PAR_THRESH;


         NTL_GEXEC_RANGE(seq, end_block-start_block, first, last)
         NTL_IMPORT(p)
         NTL_IMPORT(ll_red_struct)
         NTL_IMPORT(hblocks)

         for (long index = first; index < last; index++) {
            long vb = index + start_block;
            long *kerbufp = &kerbuf[vb-start_block][0];

            for (long hb = hblocks-2; hb >= 0; hb--) {
               long *colbuf = &M[hb][0];
               long *acc = &kerbufp[hb*MAT_BLK_SZ*MAT_BLK_SZ];

               CopyBlock(acc, 0, colbuf, vb-1);
               TransposeBlock(acc, 0);

               // reduction happens inside MulAddBlock, so no lazy
               // reduction counters are needed here
               for (long b = hb+1; b < hblocks; b++) {
                  MulAddBlock(acc, &kerbufp[b*MAT_BLK_SZ*MAT_BLK_SZ],
                              &colbuf[(b-1)*MAT_BLK_SZ*MAT_BLK_SZ], p, ll_red_struct);
               }
            }
         }

         NTL_GEXEC_RANGE_END

         // unpack kerbuf into the left part of Ker
         for (long i = r; i < n; i++) {

            long *kerbufp = &kerbuf[(i/MAT_BLK_SZ)-start_block][0];

            for (long j = 0; j < r; j++) {
               long t0 =
                  kerbufp[(j/MAT_BLK_SZ)*MAT_BLK_SZ*MAT_BLK_SZ+
                          (i%MAT_BLK_SZ)*MAT_BLK_SZ+(j%MAT_BLK_SZ)];

               Ker[i-r][j].LoopHole() = long(t0);
            }
         }

         // right part is the identity
         for (long i = 0; i < n-r; i++) {
            for (long j = 0; j < n-r; j++) {
               Ker[i][j+r].LoopHole() = 0;
            }
            Ker[i][i+r].LoopHole() = 1;
         }

         // undo the recorded row swaps on the columns of Ker
         if (pivoting) {
            for (long i = 0; i < n-r; i++) {
               zz_p *x = Ker[i].elts();

               for (long k = n-1; k >= 0; k--) {
                  long pos = P[k];
                  if (pos != k) swap(x[pos], x[k]);
               }
            }
         }
      }
   }

   return r;

}
8032
8033
8034 #endif
8035
8036
8037
// Dispatcher for Gaussian elimination: picks the fastest implementation
// for the current modulus p and problem size.
//  - elim_basic: small matrices, or builds without NTL_HAVE_LL_TYPE.
//  - elim_blk_DD: AVX/double-based kernel, when V = 4*MAT_BLK_SZ
//    accumulated products fit in the double-precision integer range.
//  - elim_blk_L: unsigned-long accumulation with lazy reduction, when
//    V products fit in an unsigned long.
//  - elim_blk_LL: fully-reduced long arithmetic, the general fallback.
static
long elim(const mat_zz_p& A, mat_zz_p *im, mat_zz_p *ker, long w, bool full)
{
   long n = A.NumRows();
   long m = A.NumCols();

   if (w < 0 || w > m) LogicError("elim: bad args");

#ifndef NTL_HAVE_LL_TYPE

   return elim_basic(A, im, ker, w, full);

#else

   long p = zz_p::modulus();

   if (n/MAT_BLK_SZ < 4 || w/MAT_BLK_SZ < 4) {
      // too small for blocking to pay off
      return elim_basic(A, im, ker, w, full);
   }
   else {
      long V = 4*MAT_BLK_SZ;

#ifdef NTL_HAVE_AVX
      // overflow tests: V products of values <= p-1 must fit
      if (p-1 <= MAX_DBL_INT &&
          V <= (MAX_DBL_INT-(p-1))/(p-1) &&
          V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {

         return elim_blk_DD(A, im, ker, w, full);
      }
      else
#endif
           if (cast_unsigned(V) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1) &&
               cast_unsigned(V)*cast_unsigned(p-1) <= (~(0UL)-cast_unsigned(p-1))/cast_unsigned(p-1)) {

         return elim_blk_L(A, im, ker, w, full);

      }
      else {

         return elim_blk_LL(A, im, ker, w, full);
      }

   }

#endif



}
8087
8088
8089 // ******************************************************************
8090 //
8091 // High level interfaces
8092 //
8093 // ******************************************************************
8094
8095
8096
gauss(mat_zz_p & M,long w)8097 long gauss(mat_zz_p& M, long w)
8098 {
8099 return elim(M, &M, 0, w, true);
8100 }
8101
8102
gauss(mat_zz_p & M)8103 long gauss(mat_zz_p& M)
8104 {
8105 return gauss(M, M.NumCols());
8106 }
8107
image(mat_zz_p & X,const mat_zz_p & A)8108 void image(mat_zz_p& X, const mat_zz_p& A)
8109 {
8110 elim(A, &X, 0, A.NumCols(), false);
8111 }
8112
kernel(mat_zz_p & X,const mat_zz_p & A)8113 void kernel(mat_zz_p& X, const mat_zz_p& A)
8114 {
8115 elim(A, 0, &X, A.NumCols(), false);
8116 }
8117
8118
8119 // ******************************************************************
8120 //
8121 // Operator/functional notation
8122 //
8123 // ******************************************************************
8124
8125
8126
8127
operator +(const mat_zz_p & a,const mat_zz_p & b)8128 mat_zz_p operator+(const mat_zz_p& a, const mat_zz_p& b)
8129 {
8130 mat_zz_p res;
8131 add(res, a, b);
8132 NTL_OPT_RETURN(mat_zz_p, res);
8133 }
8134
operator *(const mat_zz_p & a,const mat_zz_p & b)8135 mat_zz_p operator*(const mat_zz_p& a, const mat_zz_p& b)
8136 {
8137 mat_zz_p res;
8138 mul_aux(res, a, b);
8139 NTL_OPT_RETURN(mat_zz_p, res);
8140 }
8141
operator -(const mat_zz_p & a,const mat_zz_p & b)8142 mat_zz_p operator-(const mat_zz_p& a, const mat_zz_p& b)
8143 {
8144 mat_zz_p res;
8145 sub(res, a, b);
8146 NTL_OPT_RETURN(mat_zz_p, res);
8147 }
8148
8149
operator -(const mat_zz_p & a)8150 mat_zz_p operator-(const mat_zz_p& a)
8151 {
8152 mat_zz_p res;
8153 negate(res, a);
8154 NTL_OPT_RETURN(mat_zz_p, res);
8155 }
8156
8157
operator *(const mat_zz_p & a,const vec_zz_p & b)8158 vec_zz_p operator*(const mat_zz_p& a, const vec_zz_p& b)
8159 {
8160 vec_zz_p res;
8161 mul_aux(res, a, b);
8162 NTL_OPT_RETURN(vec_zz_p, res);
8163 }
8164
operator *(const vec_zz_p & a,const mat_zz_p & b)8165 vec_zz_p operator*(const vec_zz_p& a, const mat_zz_p& b)
8166 {
8167 vec_zz_p res;
8168 mul(res, a, b);
8169 NTL_OPT_RETURN(vec_zz_p, res);
8170 }
8171
8172
8173 #if 0
8174 // for testing purposes
8175
8176 void test_alt_mul_L(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
8177 {
8178 alt_mul_L(X, A, B);
8179 }
8180
8181 void test_alt_mul_LL(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
8182 {
8183 alt_mul_LL(X, A, B);
8184 }
8185
8186 void test_blk_mul_DD(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
8187 {
8188 blk_mul_DD(X, A, B);
8189 }
8190
8191 void test_blk_mul_LL(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
8192 {
8193 blk_mul_LL(X, A, B);
8194 }
8195
8196 void test_blk_mul_L(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
8197 {
8198 blk_mul_L(X, A, B);
8199 }
8200
8201 void test_basic_mul(mat_zz_p& X, const mat_zz_p& A, const mat_zz_p& B)
8202 {
8203 basic_mul(X, A, B);
8204 }
8205
8206 #endif
8207
random(mat_zz_p & x,long n,long m)8208 void random(mat_zz_p& x, long n, long m)
8209 {
8210 x.SetDims(n, m);
8211 for (long i = 0; i < n; i++) random(x[i], m);
8212 }
8213
8214 NTL_END_IMPL
8215