1
2 #include <NTL/ZZ.h>
3 #include <NTL/vec_ZZ.h>
4 #include <NTL/Lazy.h>
5 #include <NTL/fileio.h>
6 #include <NTL/SmartPtr.h>
7
8 #include <NTL/BasicThreadPool.h>
9
10
11
12 #if defined(NTL_HAVE_AVX2)
13 #include <immintrin.h>
14 #elif defined(NTL_HAVE_SSSE3)
15 #include <emmintrin.h>
16 #include <tmmintrin.h>
17 #endif
18
19 #if defined(NTL_HAVE_KMA)
20 #include <NTL/linux_s390x.h>
21 #endif
22
23
24
25 NTL_START_IMPL
26
27
28
29
30
zero()31 const ZZ& ZZ::zero()
32 {
33
34 static const ZZ z; // GLOBAL (relies on C++11 thread-safe init)
35 return z;
36 }
37
38
ZZ_expo(long e)39 const ZZ& ZZ_expo(long e)
40 {
41 NTL_TLS_LOCAL(ZZ, expo_helper);
42
43 conv(expo_helper, e);
44 return expo_helper;
45 }
46
47
48
AddMod(ZZ & x,const ZZ & a,long b,const ZZ & n)49 void AddMod(ZZ& x, const ZZ& a, long b, const ZZ& n)
50 {
51 NTL_ZZRegister(B);
52 conv(B, b);
53 AddMod(x, a, B, n);
54 }
55
56
SubMod(ZZ & x,const ZZ & a,long b,const ZZ & n)57 void SubMod(ZZ& x, const ZZ& a, long b, const ZZ& n)
58 {
59 NTL_ZZRegister(B);
60 conv(B, b);
61 SubMod(x, a, B, n);
62 }
63
SubMod(ZZ & x,long a,const ZZ & b,const ZZ & n)64 void SubMod(ZZ& x, long a, const ZZ& b, const ZZ& n)
65 {
66 NTL_ZZRegister(A);
67 conv(A, a);
68 SubMod(x, A, b, n);
69 }
70
71
72
73 // ****** input and output
74
75
76 static NTL_CHEAP_THREAD_LOCAL long iodigits = 0;
77 static NTL_CHEAP_THREAD_LOCAL long ioradix = 0;
78 // iodigits is the greatest integer such that 10^{iodigits} < NTL_WSP_BOUND
79 // ioradix = 10^{iodigits}
80
InitZZIO()81 static void InitZZIO()
82 {
83 long x;
84
85 x = (NTL_WSP_BOUND-1)/10;
86 iodigits = 0;
87 ioradix = 1;
88
89 while (x) {
90 x = x / 10;
91 iodigits++;
92 ioradix = ioradix * 10;
93 }
94
95 if (iodigits <= 0) TerminalError("problem with I/O");
96 }
97
98
operator >>(istream & s,ZZ & x)99 istream& operator>>(istream& s, ZZ& x)
100 {
101 long c;
102 long cval;
103 long sign;
104 long ndigits;
105 long acc;
106 NTL_ZZRegister(a);
107
108 if (!s) NTL_INPUT_ERROR(s, "bad ZZ input");
109
110 if (!iodigits) InitZZIO();
111
112 a = 0;
113
114 SkipWhiteSpace(s);
115 c = s.peek();
116
117 if (c == '-') {
118 sign = -1;
119 s.get();
120 c = s.peek();
121 }
122 else
123 sign = 1;
124
125 cval = CharToIntVal(c);
126
127 if (cval < 0 || cval > 9) NTL_INPUT_ERROR(s, "bad ZZ input");
128
129 ndigits = 0;
130 acc = 0;
131 while (cval >= 0 && cval <= 9) {
132 acc = acc*10 + cval;
133 ndigits++;
134
135 if (ndigits == iodigits) {
136 mul(a, a, ioradix);
137 add(a, a, acc);
138 ndigits = 0;
139 acc = 0;
140 }
141
142 s.get();
143 c = s.peek();
144 cval = CharToIntVal(c);
145 }
146
147 if (ndigits != 0) {
148 long mpy = 1;
149 while (ndigits > 0) {
150 mpy = mpy * 10;
151 ndigits--;
152 }
153
154 mul(a, a, mpy);
155 add(a, a, acc);
156 }
157
158 if (sign == -1)
159 negate(a, a);
160
161 x = a;
162 return s;
163 }
164
165
166 // The class _ZZ_local_stack should be defined in an empty namespace,
167 // but since I don't want to rely on namespaces, we just give it a funny
168 // name to avoid accidental name clashes.
169
170 struct _ZZ_local_stack {
171 long top;
172 Vec<long> data;
173
_ZZ_local_stack_ZZ_local_stack174 _ZZ_local_stack() { top = -1; }
175
pop_ZZ_local_stack176 long pop() { return data[top--]; }
empty_ZZ_local_stack177 long empty() { return (top == -1); }
178 void push(long x);
179 };
180
push(long x)181 void _ZZ_local_stack::push(long x)
182 {
183 if (top+1 >= data.length())
184 data.SetLength(max(32, long(1.414*data.length())));
185
186 top++;
187 data[top] = x;
188 }
189
190
191 static
PrintDigits(ostream & s,long d,long justify)192 void PrintDigits(ostream& s, long d, long justify)
193 {
194 NTL_TLS_LOCAL_INIT(Vec<char>, buf, (INIT_SIZE, iodigits));
195
196 long i = 0;
197
198 while (d) {
199 buf[i] = IntValToChar(d % 10);
200 d = d / 10;
201 i++;
202 }
203
204 if (justify) {
205 long j = iodigits - i;
206 while (j > 0) {
207 s << "0";
208 j--;
209 }
210 }
211
212 while (i > 0) {
213 i--;
214 s << buf[i];
215 }
216 }
217
218
219
220
operator <<(ostream & s,const ZZ & a)221 ostream& operator<<(ostream& s, const ZZ& a)
222 {
223 ZZ b;
224 _ZZ_local_stack S;
225 long r;
226 long k;
227
228 if (!iodigits) InitZZIO();
229
230 b = a;
231
232 k = sign(b);
233
234 if (k == 0) {
235 s << "0";
236 return s;
237 }
238
239 if (k < 0) {
240 s << "-";
241 negate(b, b);
242 }
243
244 do {
245 r = DivRem(b, b, ioradix);
246 S.push(r);
247 } while (!IsZero(b));
248
249 r = S.pop();
250 PrintDigits(s, r, 0);
251
252 while (!S.empty()) {
253 r = S.pop();
254 PrintDigits(s, r, 1);
255 }
256
257 return s;
258 }
259
260
261
GCD(long a,long b)262 long GCD(long a, long b)
263 {
264 long u, v, t, x;
265
266 if (a < 0) {
267 if (a < -NTL_MAX_LONG) ResourceError("GCD: integer overflow");
268 a = -a;
269 }
270
271 if (b < 0) {
272 if (b < -NTL_MAX_LONG) ResourceError("GCD: integer overflow");
273 b = -b;
274 }
275
276
277 if (b==0)
278 x = a;
279 else {
280 u = a;
281 v = b;
282 do {
283 t = u % v;
284 u = v;
285 v = t;
286 } while (v != 0);
287
288 x = u;
289 }
290
291 return x;
292 }
293
294
295
XGCD(long & d,long & s,long & t,long a,long b)296 void XGCD(long& d, long& s, long& t, long a, long b)
297 {
298 long u, v, u0, v0, u1, v1, u2, v2, q, r;
299
300 long aneg = 0, bneg = 0;
301
302 if (a < 0) {
303 if (a < -NTL_MAX_LONG) ResourceError("XGCD: integer overflow");
304 a = -a;
305 aneg = 1;
306 }
307
308 if (b < 0) {
309 if (b < -NTL_MAX_LONG) ResourceError("XGCD: integer overflow");
310 b = -b;
311 bneg = 1;
312 }
313
314 u1=1; v1=0;
315 u2=0; v2=1;
316 u = a; v = b;
317
318 while (v != 0) {
319 q = u / v;
320 r = u % v;
321 u = v;
322 v = r;
323 u0 = u2;
324 v0 = v2;
325 u2 = u1 - q*u2;
326 v2 = v1- q*v2;
327 u1 = u0;
328 v1 = v0;
329 }
330
331 if (aneg)
332 u1 = -u1;
333
334 if (bneg)
335 v1 = -v1;
336
337 d = u;
338 s = u1;
339 t = v1;
340 }
341
InvModStatus(long & x,long a,long n)342 long InvModStatus(long& x, long a, long n)
343 {
344 long d, s, t;
345
346 XGCD(d, s, t, a, n);
347 if (d != 1) {
348 x = d;
349 return 1;
350 }
351 else {
352 if (s < 0)
353 x = s + n;
354 else
355 x = s;
356
357 return 0;
358 }
359 }
360
InvMod(long a,long n)361 long InvMod(long a, long n)
362 {
363 long d, s, t;
364
365 XGCD(d, s, t, a, n);
366 if (d != 1) {
367 InvModError("InvMod: inverse undefined");
368 }
369 if (s < 0)
370 return s + n;
371 else
372 return s;
373 }
374
375
PowerMod(long a,long ee,long n)376 long PowerMod(long a, long ee, long n)
377 {
378 long x, y;
379
380 unsigned long e;
381
382 if (ee < 0)
383 e = - ((unsigned long) ee);
384 else
385 e = ee;
386
387 x = 1;
388 y = a;
389 while (e) {
390 if (e & 1) x = MulMod(x, y, n);
391 y = MulMod(y, y, n);
392 e = e >> 1;
393 }
394
395 if (ee < 0) x = InvMod(x, n);
396
397 return x;
398 }
399
400 static
MillerWitness_sp(long n,long x)401 long MillerWitness_sp(long n, long x)
402 {
403 long m, y, z;
404 long j, k;
405
406 if (x == 0) return 0;
407
408 m = n - 1;
409 k = 0;
410 while((m & 1) == 0) {
411 m = m >> 1;
412 k++;
413 }
414 // n - 1 == 2^k * m, m odd
415
416 z = PowerMod(x, m, n);
417 if (z == 1) return 0;
418
419 j = 0;
420 do {
421 y = z;
422 z = MulMod(y, y, n);
423 j++;
424 } while (j != k && z != 1);
425
426 if (z != 1 || y != n-1) return 1;
427 return 0;
428 }
429
ProbPrime(long n,long NumTrials)430 long ProbPrime(long n, long NumTrials)
431 {
432 if (NumTrials < 0) NumTrials = 0;
433
434 long m, x, y, z;
435 long i, j, k;
436
437 if (n <= 1) return 0;
438
439
440 if (n == 2) return 1;
441 if (n % 2 == 0) return 0;
442
443 if (n == 3) return 1;
444 if (n % 3 == 0) return 0;
445
446 if (n == 5) return 1;
447 if (n % 5 == 0) return 0;
448
449 if (n == 7) return 1;
450 if (n % 7 == 0) return 0;
451
452 if (n == 11) return 1;
453 if (n % 11 == 0) return 0;
454
455 if (n == 13) return 1;
456 if (n % 13 == 0) return 0;
457
458 if (n >= NTL_SP_BOUND) {
459 return ProbPrime(to_ZZ(n), NumTrials);
460 }
461
462 m = n - 1;
463 k = 0;
464 while((m & 1) == 0) {
465 m = m >> 1;
466 k++;
467 }
468
469 // n - 1 == 2^k * m, m odd
470
471 for (i = 0; i < NumTrials+1; i++) {
472 // for consistency with the multi-precision version,
473 // we first see if 2 is a witness, so we really do
474 // NumTrials+1 tests
475
476 if (i == 0)
477 x = 2;
478 else {
479 do {
480 x = RandomBnd(n);
481 } while (x == 0);
482 // x == 0 is not a useful candidate for a witness!
483 }
484
485 z = PowerMod(x, m, n);
486 if (z == 1) continue;
487
488 j = 0;
489 do {
490 y = z;
491 z = MulMod(y, y, n);
492 j++;
493 } while (j != k && z != 1);
494
495 if (z != 1 || y != n-1) return 0;
496 }
497
498 return 1;
499 }
500
501
MillerWitness(const ZZ & n,const ZZ & x)502 long MillerWitness(const ZZ& n, const ZZ& x)
503 {
504 if (n.SinglePrecision()) {
505 return MillerWitness_sp(to_long(n), to_long(x));
506 }
507
508 ZZ m, y, z;
509
510 long j, k;
511
512 if (x == 0) return 0;
513
514 add(m, n, -1);
515 k = MakeOdd(m);
516 // n - 1 == 2^k * m, m odd
517
518 PowerMod(z, x, m, n);
519 if (z == 1) return 0;
520
521 j = 0;
522 do {
523 y = z;
524 SqrMod(z, y, n);
525 j++;
526 } while (j != k && z != 1);
527
528 if (z != 1) return 1;
529 add(y, y, 1);
530 if (y != n) return 1;
531 return 0;
532 }
533
534
535 // ComputePrimeBound computes a reasonable bound for trial
536 // division in the Miller-Rabin test.
537 // It is computed a bit on the "low" side, since being a bit
538 // low doesn't hurt much, but being too high can hurt a lot.
539
540 // See the paper "Fast generation of prime numbers and secure
541 // public-key cryptographic parameters" by Ueli Maurer.
542 // In that paper, it is calculated that the optimal bound in
543 // roughly T_exp/T_div, where T_exp is the time for an exponentiation
544 // and T_div is is the time for a single precision division.
545 // Of course, estimating these times is a bit tricky, and
546 // the values we use are based on experimentation, assuming
547 // GMP is being used. I've tested this on various bit lengths
548 // up to 16,000, and they seem to be pretty close to optimal.
549
550 static
ComputePrimeBound(long bn)551 long ComputePrimeBound(long bn)
552 {
553 long wn = (bn+NTL_ZZ_NBITS-1)/NTL_ZZ_NBITS;
554
555 long fn;
556
557 if (wn <= 36)
558 fn = wn/4 + 1;
559 else
560 fn = long(1.67*sqrt(double(wn)));
561
562 long prime_bnd;
563
564 if (NumBits(bn) + NumBits(fn) > NTL_SP_NBITS)
565 prime_bnd = NTL_SP_BOUND;
566 else
567 prime_bnd = bn*fn;
568
569 return prime_bnd;
570
571
572 }
573
574
ProbPrime(const ZZ & n,long NumTrials)575 long ProbPrime(const ZZ& n, long NumTrials)
576 {
577 if (NumTrials < 0) NumTrials = 0;
578
579 if (n <= 1) return 0;
580
581 if (n.SinglePrecision()) {
582 return ProbPrime(to_long(n), NumTrials);
583 }
584
585
586 long prime_bnd = ComputePrimeBound(NumBits(n));
587
588
589 PrimeSeq s;
590 long p;
591
592 p = s.next();
593 while (p && p < prime_bnd) {
594 if (rem(n, p) == 0)
595 return 0;
596
597 p = s.next();
598 }
599
600 ZZ W;
601 W = 2;
602
603 // first try W == 2....the exponentiation
604 // algorithm runs slightly faster in this case
605
606 if (MillerWitness(n, W))
607 return 0;
608
609
610 long i;
611
612 for (i = 0; i < NumTrials; i++) {
613 do {
614 RandomBnd(W, n);
615 } while (W == 0);
616 // W == 0 is not a useful candidate for a witness!
617
618 if (MillerWitness(n, W))
619 return 0;
620 }
621
622 return 1;
623 }
624
625
626 static
MultiThreadedRandomPrime(ZZ & n,long l,long NumTrials)627 void MultiThreadedRandomPrime(ZZ& n, long l, long NumTrials)
628 {
629
630 long nt = AvailableThreads();
631
632
633
634 const long LOCAL_ITER_BOUND = 8;
635 // since resetting the PRG comes at a certain cost,
636 // we perform a few iterations with each reset to
637 // amortize the reset cost.
638
639 unsigned long initial_counter = 0;
640 ZZ seed;
641 RandomBits(seed, 256);
642
643 for (;;) {
644
645 AtomicLowWaterMark low_water_mark(-1UL);
646 AtomicCounter counter(initial_counter);
647
648 Vec< UniquePtr<ZZ> > result(INIT_SIZE, nt);
649 Vec<unsigned long> result_ctr(INIT_SIZE, nt, -1UL);
650
651 NTL_EXEC_INDEX(nt, index)
652
653 RandomStreamPush push;
654
655 SetSeed(seed);
656 RandomStream& stream = GetCurrentRandomStream();
657
658 ZZ cand;
659
660 while (low_water_mark == -1UL) {
661
662 unsigned long local_ctr = counter.inc();
663 if (local_ctr >> (NTL_BITS_PER_NONCE-1)) {
664 // counter overflow...rather academic
665 break;
666 }
667
668 stream.set_nonce(local_ctr);
669
670 for (long iter = 0; iter < LOCAL_ITER_BOUND &&
671 local_ctr <= low_water_mark; iter++) {
672
673 RandomLen(cand, l);
674 if (!IsOdd(cand)) add(cand, cand, 1);
675
676 if (ProbPrime(cand, 0)) {
677 result[index].make(cand);
678 result_ctr[index] = local_ctr;
679 low_water_mark.UpdateMin(local_ctr);
680 break;
681 }
682 }
683 }
684
685 NTL_EXEC_INDEX_END
686
687 // find index of low_water_mark
688
689 unsigned long low_water_mark1 = low_water_mark;
690 long low_water_index = -1;
691
692 for (long index = 0; index < nt; index++) {
693 if (result_ctr[index] == low_water_mark1) {
694 low_water_index = index;
695 break;
696 }
697 }
698
699 if (low_water_index == -1) {
700 // counter overflow...rather academic
701 initial_counter = 0;
702 RandomBits(seed, 256);
703 continue;
704 }
705
706 ZZ N;
707 N = *result[low_water_index];
708
709 Vec<ZZ> W(INIT_SIZE, NumTrials);
710
711 for (long i = 0; i < NumTrials; i++) {
712 do {
713 RandomBnd(W[i], N);
714 } while (W[i] == 0);
715 }
716
717 AtomicBool tests_pass(true);
718
719 NTL_EXEC_RANGE(NumTrials, first, last)
720
721 for (long i = first; i < last && tests_pass; i++) {
722 if (MillerWitness(N, W[i])) tests_pass = false;
723 }
724
725 NTL_EXEC_RANGE_END
726
727 if (tests_pass) {
728 n = N;
729 return;
730 }
731
732 // very unlikey to get here
733 initial_counter = low_water_mark1 + 1;
734 }
735 }
736
737
RandomPrime(ZZ & n,long l,long NumTrials)738 void RandomPrime(ZZ& n, long l, long NumTrials)
739 {
740 if (NumTrials < 0) NumTrials = 0;
741
742 if (l >= 256) {
743 MultiThreadedRandomPrime(n, l, NumTrials);
744 return;
745 }
746
747 if (l <= 1)
748 LogicError("RandomPrime: l out of range");
749
750 if (l == 2) {
751 if (RandomBnd(2))
752 n = 3;
753 else
754 n = 2;
755
756 return;
757 }
758
759 do {
760 RandomLen(n, l);
761 if (!IsOdd(n)) add(n, n, 1);
762 } while (!ProbPrime(n, NumTrials));
763 }
764
OldRandomPrime(ZZ & n,long l,long NumTrials)765 void OldRandomPrime(ZZ& n, long l, long NumTrials)
766 {
767 if (l <= 1)
768 LogicError("RandomPrime: l out of range");
769
770 if (l == 2) {
771 if (RandomBnd(2))
772 n = 3;
773 else
774 n = 2;
775
776 return;
777 }
778
779 do {
780 RandomLen(n, l);
781 if (!IsOdd(n)) add(n, n, 1);
782 } while (!ProbPrime(n, NumTrials));
783 }
784
NextPrime(ZZ & n,const ZZ & m,long NumTrials)785 void NextPrime(ZZ& n, const ZZ& m, long NumTrials)
786 {
787 ZZ x;
788
789 if (m <= 2) {
790 n = 2;
791 return;
792 }
793
794 x = m;
795
796 while (!ProbPrime(x, NumTrials))
797 add(x, x, 1);
798
799 n = x;
800 }
801
NextPrime(long m,long NumTrials)802 long NextPrime(long m, long NumTrials)
803 {
804 long x;
805
806 if (m <= 2)
807 return 2;
808
809 x = m;
810
811 while (x < NTL_SP_BOUND && !ProbPrime(x, NumTrials))
812 x++;
813
814 if (x >= NTL_SP_BOUND)
815 ResourceError("NextPrime: no more primes");
816
817 return x;
818 }
819
820
821
NextPowerOfTwo(long m)822 long NextPowerOfTwo(long m)
823 {
824 long k;
825 unsigned long n, um;
826
827 if (m < 0) return 0;
828
829 um = m;
830 n = 1;
831 k = 0;
832
833 while (n < um) {
834 n = n << 1;
835 k++;
836 }
837
838 if (k >= NTL_BITS_PER_LONG-1)
839 ResourceError("NextPowerOfTwo: overflow");
840
841 return k;
842 }
843
844
845
846
bit(long a,long k)847 long bit(long a, long k)
848 {
849 unsigned long aa;
850 if (a < 0)
851 aa = - ((unsigned long) a);
852 else
853 aa = a;
854
855 if (k < 0 || k >= NTL_BITS_PER_LONG)
856 return 0;
857 else
858 return long((aa >> k) & 1);
859 }
860
861
862
divide(ZZ & q,const ZZ & a,const ZZ & b)863 long divide(ZZ& q, const ZZ& a, const ZZ& b)
864 {
865 NTL_ZZRegister(qq);
866 NTL_ZZRegister(r);
867
868 if (IsZero(b)) {
869 if (IsZero(a)) {
870 clear(q);
871 return 1;
872 }
873 else
874 return 0;
875 }
876
877
878 if (IsOne(b)) {
879 q = a;
880 return 1;
881 }
882
883 DivRem(qq, r, a, b);
884 if (!IsZero(r)) return 0;
885 q = qq;
886 return 1;
887 }
888
divide(const ZZ & a,const ZZ & b)889 long divide(const ZZ& a, const ZZ& b)
890 {
891 NTL_ZZRegister(r);
892
893 if (IsZero(b)) return IsZero(a);
894 if (IsOne(b)) return 1;
895
896 rem(r, a, b);
897 return IsZero(r);
898 }
899
divide(ZZ & q,const ZZ & a,long b)900 long divide(ZZ& q, const ZZ& a, long b)
901 {
902 NTL_ZZRegister(qq);
903
904 if (!b) {
905 if (IsZero(a)) {
906 clear(q);
907 return 1;
908 }
909 else
910 return 0;
911 }
912
913 if (b == 1) {
914 q = a;
915 return 1;
916 }
917
918 long r = DivRem(qq, a, b);
919 if (r) return 0;
920 q = qq;
921 return 1;
922 }
923
divide(const ZZ & a,long b)924 long divide(const ZZ& a, long b)
925 {
926 if (!b) return IsZero(a);
927 if (b == 1) {
928 return 1;
929 }
930
931 long r = rem(a, b);
932 return (r == 0);
933 }
934
935
InvMod(ZZ & x,const ZZ & a,const ZZ & n)936 void InvMod(ZZ& x, const ZZ& a, const ZZ& n)
937 {
938 // NOTE: the underlying LIP routines write to the first argument,
939 // even if inverse is undefined
940
941 NTL_ZZRegister(xx);
942 if (InvModStatus(xx, a, n))
943 InvModError("InvMod: inverse undefined", a, n);
944 x = xx;
945 }
946
PowerMod(ZZ & x,const ZZ & a,const ZZ & e,const ZZ & n)947 void PowerMod(ZZ& x, const ZZ& a, const ZZ& e, const ZZ& n)
948 {
949 // NOTE: this ensures that all modular inverses are computed
950 // in the routine InvMod above, rather than the LIP-internal
951 // modular inverse routine
952 if (e < 0) {
953 ZZ a_inv;
954 ZZ e_neg;
955
956 InvMod(a_inv, a, n);
957 negate(e_neg, e);
958 LowLevelPowerMod(x, a_inv, e_neg, n);
959 }
960 else
961 LowLevelPowerMod(x, a, e, n);
962 }
963
964 #ifdef NTL_EXCEPTIONS
965
InvModError(const char * s,const ZZ & a,const ZZ & n)966 void InvModError(const char *s, const ZZ& a, const ZZ& n)
967 {
968 throw InvModErrorObject(s, a, n);
969 }
970
971 #else
972
InvModError(const char * s,const ZZ & a,const ZZ & n)973 void InvModError(const char *s, const ZZ& a, const ZZ& n)
974 {
975 TerminalError(s);
976 }
977
978
979 #endif
980
RandomPrime_long(long l,long NumTrials)981 long RandomPrime_long(long l, long NumTrials)
982 {
983 if (NumTrials < 0) NumTrials = 0;
984
985 if (l <= 1 || l >= NTL_BITS_PER_LONG)
986 ResourceError("RandomPrime: length out of range");
987
988 long n;
989 do {
990 n = RandomLen_long(l);
991 } while (!ProbPrime(n, NumTrials));
992
993 return n;
994 }
995
996
997 static Lazy< Vec<char> > lowsieve_storage;
998 // This is a GLOBAL VARIABLE
999
1000
PrimeSeq()1001 PrimeSeq::PrimeSeq()
1002 {
1003 movesieve = 0;
1004 pshift = -1;
1005 pindex = -1;
1006 exhausted = 0;
1007 }
1008
1009
next()1010 long PrimeSeq::next()
1011 {
1012 if (exhausted) {
1013 return 0;
1014 }
1015
1016 if (pshift < 0) {
1017 shift(0);
1018 return 2;
1019 }
1020
1021 for (;;) {
1022 const char *p = movesieve;
1023 long i = pindex;
1024
1025 while ((++i) < NTL_PRIME_BND) {
1026 if (p[i]) {
1027 pindex = i;
1028 return pshift + 2 * i + 3;
1029 }
1030 }
1031
1032 long newshift = pshift + 2*NTL_PRIME_BND;
1033
1034 if (newshift > 2 * NTL_PRIME_BND * (2 * NTL_PRIME_BND + 1)) {
1035 /* end of the road */
1036 exhausted = 1;
1037 return 0;
1038 }
1039
1040 shift(newshift);
1041 }
1042 }
1043
shift(long newshift)1044 void PrimeSeq::shift(long newshift)
1045 {
1046 long i;
1047 long j;
1048 long jstep;
1049 long jstart;
1050 long ibound;
1051 char *p;
1052
1053 if (!lowsieve_storage.built())
1054 start();
1055
1056 const char *lowsieve = lowsieve_storage->elts();
1057
1058
1059 if (newshift < 0) {
1060 pshift = -1;
1061 }
1062 else if (newshift == 0) {
1063 pshift = 0;
1064 movesieve = lowsieve;
1065 }
1066 else if (newshift != pshift) {
1067 if (movesieve_mem.length() == 0) {
1068 movesieve_mem.SetLength(NTL_PRIME_BND);
1069 }
1070
1071 pshift = newshift;
1072 movesieve = p = movesieve_mem.elts();
1073 for (i = 0; i < NTL_PRIME_BND; i++)
1074 p[i] = 1;
1075
1076 jstep = 3;
1077 ibound = pshift + 2 * NTL_PRIME_BND + 1;
1078 for (i = 0; jstep * jstep <= ibound; i++) {
1079 if (lowsieve[i]) {
1080 if (!((jstart = (pshift + 2) / jstep + 1) & 1))
1081 jstart++;
1082 if (jstart <= jstep)
1083 jstart = jstep;
1084 jstart = (jstart * jstep - pshift - 3) / 2;
1085 for (j = jstart; j < NTL_PRIME_BND; j += jstep)
1086 p[j] = 0;
1087 }
1088 jstep += 2;
1089 }
1090 }
1091
1092 pindex = -1;
1093 exhausted = 0;
1094 }
1095
1096
start()1097 void PrimeSeq::start()
1098 {
1099 long i;
1100 long j;
1101 long jstep;
1102 long jstart;
1103 long ibnd;
1104 char *p;
1105
1106 do {
1107 Lazy< Vec<char> >::Builder builder(lowsieve_storage);
1108 if (!builder()) break;
1109
1110 UniquePtr< Vec<char> > ptr;
1111 ptr.make();
1112 ptr->SetLength(NTL_PRIME_BND);
1113
1114 p = ptr->elts();
1115
1116 for (i = 0; i < NTL_PRIME_BND; i++)
1117 p[i] = 1;
1118
1119 jstep = 1;
1120 jstart = -1;
1121 ibnd = (SqrRoot(2 * NTL_PRIME_BND + 1) - 3) / 2;
1122 for (i = 0; i <= ibnd; i++) {
1123 jstart += 2 * ((jstep += 2) - 1);
1124 if (p[i])
1125 for (j = jstart; j < NTL_PRIME_BND; j += jstep)
1126 p[j] = 0;
1127 }
1128
1129 builder.move(ptr);
1130 } while (0);
1131
1132 }
1133
reset(long b)1134 void PrimeSeq::reset(long b)
1135 {
1136 if (b > (2*NTL_PRIME_BND+1)*(2*NTL_PRIME_BND+1)) {
1137 exhausted = 1;
1138 return;
1139 }
1140
1141 if (b <= 2) {
1142 shift(-1);
1143 return;
1144 }
1145
1146 if ((b & 1) == 0) b++;
1147
1148 shift(((b-3) / (2*NTL_PRIME_BND))* (2*NTL_PRIME_BND));
1149 pindex = (b - pshift - 3)/2 - 1;
1150 }
1151
Jacobi(const ZZ & aa,const ZZ & nn)1152 long Jacobi(const ZZ& aa, const ZZ& nn)
1153 {
1154 ZZ a, n;
1155 long t, k;
1156 long d;
1157
1158 a = aa;
1159 n = nn;
1160 t = 1;
1161
1162 while (a != 0) {
1163 k = MakeOdd(a);
1164 d = trunc_long(n, 3);
1165 if ((k & 1) && (d == 3 || d == 5)) t = -t;
1166
1167 if (trunc_long(a, 2) == 3 && (d & 3) == 3) t = -t;
1168 swap(a, n);
1169 rem(a, a, n);
1170 }
1171
1172 if (n == 1)
1173 return t;
1174 else
1175 return 0;
1176 }
1177
1178
SqrRootMod(ZZ & x,const ZZ & aa,const ZZ & nn)1179 void SqrRootMod(ZZ& x, const ZZ& aa, const ZZ& nn)
1180 {
1181 if (aa == 0 || aa == 1) {
1182 x = aa;
1183 return;
1184 }
1185
1186 // at this point, we must have nn >= 5
1187
1188 if (trunc_long(nn, 2) == 3) { // special case, n = 3 (mod 4)
1189 ZZ n, a, e, z;
1190
1191 n = nn;
1192 a = aa;
1193
1194 add(e, n, 1);
1195 RightShift(e, e, 2);
1196
1197 PowerMod(z, a, e, n);
1198 x = z;
1199
1200 return;
1201 }
1202
1203 ZZ n, m;
1204 int h, nlen;
1205
1206 n = nn;
1207 nlen = NumBits(n);
1208
1209 sub(m, n, 1);
1210 h = MakeOdd(m); // h >= 2
1211
1212
1213 if (nlen > 50 && h < SqrRoot(nlen)) {
1214 long i, j;
1215 ZZ a, b, a_inv, c, r, m1, d;
1216
1217 a = aa;
1218 InvMod(a_inv, a, n);
1219
1220 if (h == 2)
1221 b = 2;
1222 else {
1223 do {
1224 RandomBnd(b, n);
1225 } while (Jacobi(b, n) != -1);
1226 }
1227
1228
1229 PowerMod(c, b, m, n);
1230
1231 add(m1, m, 1);
1232 RightShift(m1, m1, 1);
1233 PowerMod(r, a, m1, n);
1234
1235 for (i = h-2; i >= 0; i--) {
1236 SqrMod(d, r, n);
1237 MulMod(d, d, a_inv, n);
1238 for (j = 0; j < i; j++)
1239 SqrMod(d, d, n);
1240 if (!IsOne(d))
1241 MulMod(r, r, c, n);
1242 SqrMod(c, c, n);
1243 }
1244
1245 x = r;
1246 return;
1247 }
1248
1249
1250
1251
1252
1253 long i, k;
1254 ZZ ma, t, u, v, e;
1255 ZZ t1, t2, t3, t4;
1256
1257 n = nn;
1258 NegateMod(ma, aa, n);
1259
1260 // find t such that t^2 - 4*a is not a square
1261
1262 MulMod(t1, ma, 4, n);
1263 do {
1264 RandomBnd(t, n);
1265 SqrMod(t2, t, n);
1266 AddMod(t2, t2, t1, n);
1267 } while (Jacobi(t2, n) != -1);
1268
1269 // compute u*X + v = X^{(n+1)/2} mod f, where f = X^2 - t*X + a
1270
1271 add(e, n, 1);
1272 RightShift(e, e, 1);
1273
1274 u = 0;
1275 v = 1;
1276
1277 k = NumBits(e);
1278
1279 for (i = k - 1; i >= 0; i--) {
1280 add(t2, u, v);
1281 sqr(t3, t2); // t3 = (u+v)^2
1282 sqr(t1, u);
1283 sqr(t2, v);
1284 sub(t3, t3, t1);
1285 sub(t3, t3, t2); // t1 = u^2, t2 = v^2, t3 = 2*u*v
1286 rem(t1, t1, n);
1287 mul(t4, t1, t);
1288 add(t4, t4, t3);
1289 rem(u, t4, n);
1290
1291 mul(t4, t1, ma);
1292 add(t4, t4, t2);
1293 rem(v, t4, n);
1294
1295 if (bit(e, i)) {
1296 MulMod(t1, u, t, n);
1297 AddMod(t1, t1, v, n);
1298 MulMod(v, u, ma, n);
1299 u = t1;
1300 }
1301
1302 }
1303
1304 x = v;
1305 }
1306
1307
1308
1309 // Chinese Remaindering.
1310 //
1311 // This version in new to v3.7, and is significantly
1312 // simpler and faster than the previous version.
1313 //
1314 // This function takes as input g, a, G, p,
1315 // such that a > 0, 0 <= G < p, and gcd(a, p) = 1.
1316 // It computes a' = a*p and g' such that
1317 // * g' = g (mod a);
1318 // * g' = G (mod p);
1319 // * -a'/2 < g' <= a'/2.
1320 // It then sets g := g' and a := a', and returns 1 iff g has changed.
1321 //
1322 // Under normal use, the input value g satisfies -a/2 < g <= a/2;
1323 // however, this was not documented or enforced in earlier versions,
1324 // so to maintain backward compatability, no restrictions are placed
1325 // on g. This routine runs faster, though, if -a/2 < g <= a/2,
1326 // and the first thing the routine does is to make this condition
1327 // hold.
1328 //
1329 // Also, under normal use, both a and p are odd; however, the routine
1330 // will still work even if this is not so.
1331 //
1332 // The routine is based on the following simple fact.
1333 //
1334 // Let -a/2 < g <= a/2, and let h satisfy
1335 // * g + a h = G (mod p);
1336 // * -p/2 < h <= p/2.
1337 // Further, if p = 2*h and g > 0, set
1338 // g' := g - a h;
1339 // otherwise, set
1340 // g' := g + a h.
1341 // Then g' so defined satisfies the above requirements.
1342 //
1343 // It is trivial to see that g's satisfies the congruence conditions.
1344 // The only thing is to check that the "balancing" condition
1345 // -a'/2 < g' <= a'/2 also holds.
1346
1347
CRT(ZZ & gg,ZZ & a,long G,long p)1348 long CRT(ZZ& gg, ZZ& a, long G, long p)
1349 {
1350 if (p >= NTL_SP_BOUND) {
1351 ZZ GG, pp;
1352 conv(GG, G);
1353 conv(pp, p);
1354 return CRT(gg, a, GG, pp);
1355 }
1356
1357 long modified = 0;
1358
1359 NTL_ZZRegister(g);
1360
1361 if (!CRTInRange(gg, a)) {
1362 modified = 1;
1363 ZZ a1;
1364 rem(g, gg, a);
1365 RightShift(a1, a, 1);
1366 if (g > a1) sub(g, g, a);
1367 }
1368 else
1369 g = gg;
1370
1371
1372 long p1;
1373 p1 = p >> 1;
1374
1375 long a_inv;
1376 a_inv = rem(a, p);
1377 a_inv = InvMod(a_inv, p);
1378
1379 long h;
1380 h = rem(g, p);
1381 h = SubMod(G, h, p);
1382 h = MulMod(h, a_inv, p);
1383 if (h > p1)
1384 h = h - p;
1385
1386 if (h != 0) {
1387 modified = 1;
1388
1389 if (!(p & 1) && g > 0 && (h == p1))
1390 MulSubFrom(g, a, h);
1391 else
1392 MulAddTo(g, a, h);
1393 }
1394
1395 mul(a, a, p);
1396 gg = g;
1397
1398 return modified;
1399 }
1400
CRT(ZZ & gg,ZZ & a,const ZZ & G,const ZZ & p)1401 long CRT(ZZ& gg, ZZ& a, const ZZ& G, const ZZ& p)
1402 {
1403 long modified = 0;
1404
1405 ZZ g;
1406
1407 if (!CRTInRange(gg, a)) {
1408 modified = 1;
1409 ZZ a1;
1410 rem(g, gg, a);
1411 RightShift(a1, a, 1);
1412 if (g > a1) sub(g, g, a);
1413 }
1414 else
1415 g = gg;
1416
1417
1418 ZZ p1;
1419 RightShift(p1, p, 1);
1420
1421 ZZ a_inv;
1422 rem(a_inv, a, p);
1423 InvMod(a_inv, a_inv, p);
1424
1425 ZZ h;
1426 rem(h, g, p);
1427 SubMod(h, G, h, p);
1428 MulMod(h, h, a_inv, p);
1429 if (h > p1)
1430 sub(h, h, p);
1431
1432 if (h != 0) {
1433 modified = 1;
1434 ZZ ah;
1435 mul(ah, a, h);
1436
1437 if (!IsOdd(p) && g > 0 && (h == p1))
1438 sub(g, g, ah);
1439 else
1440 add(g, g, ah);
1441 }
1442
1443 mul(a, a, p);
1444 gg = g;
1445
1446 return modified;
1447 }
1448
1449
1450
sub(ZZ & x,long a,const ZZ & b)1451 void sub(ZZ& x, long a, const ZZ& b)
1452 {
1453 NTL_ZZRegister(A);
1454 conv(A, a);
1455 sub(x, A, b);
1456 }
1457
1458
power2(ZZ & x,long e)1459 void power2(ZZ& x, long e)
1460 {
1461 if (e < 0) ArithmeticError("power2: negative exponent");
1462 set(x);
1463 LeftShift(x, x, e);
1464 }
1465
1466
1467
bit_and(ZZ & x,const ZZ & a,long b)1468 void bit_and(ZZ& x, const ZZ& a, long b)
1469 {
1470 NTL_ZZRegister(B);
1471 conv(B, b);
1472 bit_and(x, a, B);
1473 }
1474
bit_or(ZZ & x,const ZZ & a,long b)1475 void bit_or(ZZ& x, const ZZ& a, long b)
1476 {
1477 NTL_ZZRegister(B);
1478 conv(B, b);
1479 bit_or(x, a, B);
1480 }
1481
bit_xor(ZZ & x,const ZZ & a,long b)1482 void bit_xor(ZZ& x, const ZZ& a, long b)
1483 {
1484 NTL_ZZRegister(B);
1485 conv(B, b);
1486 bit_xor(x, a, B);
1487 }
1488
1489
power_long(long a,long e)1490 long power_long(long a, long e)
1491 {
1492 if (e < 0) ArithmeticError("power_long: negative exponent");
1493
1494 if (e == 0) return 1;
1495
1496 if (a == 1) return 1;
1497 if (a == -1) {
1498 if (e & 1)
1499 return -1;
1500 else
1501 return 1;
1502 }
1503
1504 // no overflow check --- result is computed correctly
1505 // modulo word size
1506
1507 unsigned long res = 1;
1508 unsigned long aa = a;
1509 long i;
1510
1511 for (i = 0; i < e; i++)
1512 res *= aa;
1513
1514 return to_long(res);
1515 }
1516
1517
1518
1519 // ======================= new PRG stuff ======================
1520
1521
1522
1523
1524 #if (NTL_BITS_PER_INT32 == 32)
1525 #define INT32MASK(x) (x)
1526 #else
1527 #define INT32MASK(x) ((x) & _ntl_uint32(0xffffffff))
1528 #endif
1529
1530
1531
1532 // SHA256 code adapted from an implementauin by Brad Conte.
1533 // The following is from his original source files.
1534 /*********************************************************************
1535 * Filename: sha256.c
1536 * Author: Brad Conte (brad AT bradconte.com)
1537 * Copyright:
1538 * Disclaimer: This code is presented "as is" without any guarantees.
1539 * Details: Implementation of the SHA-256 hashing algorithm.
1540 SHA-256 is one of the three algorithms in the SHA2
1541 specification. The others, SHA-384 and SHA-512, are not
1542 offered in this implementation.
1543 Algorithm specification can be found here:
1544 * http://csrc.nist.gov/publications/fips/fips180-2/fips180-2withchangenotice.pdf
1545 This implementation uses little endian byte order.
1546 *********************************************************************/
1547
1548 // And the following is from the description at
1549 // https://github.com/B-Con/crypto-algorithms
1550
1551 /*********************************************************************
1552
1553 These are basic implementations of standard cryptography algorithms, written by
1554 Brad Conte (brad@bradconte.com) from scratch and without any cross-licensing.
1555 They exist to provide publically accessible, restriction-free implementations
1556 of popular cryptographic algorithms, like AES and SHA-1. These are primarily
1557 intended for educational and pragmatic purposes (such as comparing a
1558 specification to actual implementation code, or for building an internal
1559 application that computes test vectors for a product). The algorithms have been
1560 tested against standard test vectors.
1561
1562 This code is released into the public domain free of any restrictions. The
1563 author requests acknowledgement if the code is used, but does not require it.
1564 This code is provided free of any liability and without any quality claims by
1565 the author.
1566
1567 Note that these are not cryptographically secure implementations. They have no
1568 resistence to side-channel attacks and should not be used in contexts that need
1569 cryptographically secure implementations.
1570
1571 These algorithms are not optimized for speed or space. They are primarily
1572 designed to be easy to read, although some basic optimization techniques have
1573 been employed.
1574
1575 *********************************************************************/
1576
1577
1578
1579
1580
1581
1582 #define SHA256_BLOCKSIZE (64)
1583 #define SHA256_HASHSIZE (32)
1584
1585 // DBL_INT_ADD treats two unsigned ints a and b as one 64-bit integer and adds c to it
1586 static inline
DBL_INT_ADD(_ntl_uint32 & a,_ntl_uint32 & b,_ntl_uint32 c)1587 void DBL_INT_ADD(_ntl_uint32& a, _ntl_uint32& b, _ntl_uint32 c)
1588 {
1589 _ntl_uint32 aa = INT32MASK(a);
1590 if (aa > INT32MASK(_ntl_uint32(0xffffffff) - c)) b++;
1591 a = aa + c;
1592 }
1593
1594 #define ROTLEFT(a,b) (((a) << (b)) | (INT32MASK(a) >> (32-(b))))
1595 #define ROTRIGHT(a,b) ((INT32MASK(a) >> (b)) | ((a) << (32-(b))))
1596
1597 #define CH(x,y,z) (((x) & (y)) ^ (~(x) & (z)))
1598 #define MAJ(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
1599 #define EP0(x) (ROTRIGHT(x,2) ^ ROTRIGHT(x,13) ^ ROTRIGHT(x,22))
1600 #define EP1(x) (ROTRIGHT(x,6) ^ ROTRIGHT(x,11) ^ ROTRIGHT(x,25))
1601 #define SIG0(x) (ROTRIGHT(x,7) ^ ROTRIGHT(x,18) ^ (INT32MASK(x) >> 3))
1602 #define SIG1(x) (ROTRIGHT(x,17) ^ ROTRIGHT(x,19) ^ (INT32MASK(x) >> 10))
1603
1604 struct SHA256_CTX {
1605 unsigned char data[64];
1606 _ntl_uint32 datalen;
1607 _ntl_uint32 bitlen[2];
1608 _ntl_uint32 state[8];
1609 };
1610
1611 static const _ntl_uint32 sha256_const[64] = {
1612 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
1613 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
1614 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
1615 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
1616 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
1617 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
1618 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
1619 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1620 };
1621
1622
1623 static
sha256_transform(SHA256_CTX & ctx,unsigned char * data)1624 void sha256_transform(SHA256_CTX& ctx, unsigned char *data)
1625 {
1626 _ntl_uint32 a,b,c,d,e,f,g,h,i,j,t1,t2,m[64];
1627
1628 for (i=0,j=0; i < 16; ++i, j += 4)
1629 m[i] = (data[j] << 24) | (data[j+1] << 16) | (data[j+2] << 8) | (data[j+3]);
1630 for ( ; i < 64; ++i)
1631 m[i] = SIG1(m[i-2]) + m[i-7] + SIG0(m[i-15]) + m[i-16];
1632
1633 a = ctx.state[0];
1634 b = ctx.state[1];
1635 c = ctx.state[2];
1636 d = ctx.state[3];
1637 e = ctx.state[4];
1638 f = ctx.state[5];
1639 g = ctx.state[6];
1640 h = ctx.state[7];
1641
1642 for (i = 0; i < 64; ++i) {
1643 t1 = h + EP1(e) + CH(e,f,g) + sha256_const[i] + m[i];
1644 t2 = EP0(a) + MAJ(a,b,c);
1645 h = g;
1646 g = f;
1647 f = e;
1648 e = d + t1;
1649 d = c;
1650 c = b;
1651 b = a;
1652 a = t1 + t2;
1653 }
1654
1655 ctx.state[0] += a;
1656 ctx.state[1] += b;
1657 ctx.state[2] += c;
1658 ctx.state[3] += d;
1659 ctx.state[4] += e;
1660 ctx.state[5] += f;
1661 ctx.state[6] += g;
1662 ctx.state[7] += h;
1663 }
1664
1665 static
sha256_init(SHA256_CTX & ctx)1666 void sha256_init(SHA256_CTX& ctx)
1667 {
1668 ctx.datalen = 0;
1669 ctx.bitlen[0] = 0;
1670 ctx.bitlen[1] = 0;
1671 ctx.state[0] = 0x6a09e667;
1672 ctx.state[1] = 0xbb67ae85;
1673 ctx.state[2] = 0x3c6ef372;
1674 ctx.state[3] = 0xa54ff53a;
1675 ctx.state[4] = 0x510e527f;
1676 ctx.state[5] = 0x9b05688c;
1677 ctx.state[6] = 0x1f83d9ab;
1678 ctx.state[7] = 0x5be0cd19;
1679 }
1680
1681 static
sha256_update(SHA256_CTX & ctx,const unsigned char * data,_ntl_uint32 len)1682 void sha256_update(SHA256_CTX& ctx, const unsigned char *data, _ntl_uint32 len)
1683 {
1684 _ntl_uint32 i;
1685
1686 for (i=0; i < len; ++i) {
1687 ctx.data[ctx.datalen] = data[i];
1688 ctx.datalen++;
1689 if (ctx.datalen == 64) {
1690 sha256_transform(ctx,ctx.data);
1691 DBL_INT_ADD(ctx.bitlen[0],ctx.bitlen[1],512);
1692 ctx.datalen = 0;
1693 }
1694 }
1695 }
1696
1697 static
sha256_final(SHA256_CTX & ctx,unsigned char * hash,long hlen=SHA256_HASHSIZE)1698 void sha256_final(SHA256_CTX& ctx, unsigned char *hash,
1699 long hlen=SHA256_HASHSIZE)
1700 {
1701 _ntl_uint32 i, j;
1702
1703 i = ctx.datalen;
1704
1705 // Pad whatever data is left in the buffer.
1706 if (ctx.datalen < 56) {
1707 ctx.data[i++] = 0x80;
1708 while (i < 56)
1709 ctx.data[i++] = 0x00;
1710 }
1711 else {
1712 ctx.data[i++] = 0x80;
1713 while (i < 64)
1714 ctx.data[i++] = 0x00;
1715 sha256_transform(ctx,ctx.data);
1716 memset(ctx.data,0,56);
1717 }
1718
1719 // Append to the padding the total message's length in bits and transform.
1720 DBL_INT_ADD(ctx.bitlen[0],ctx.bitlen[1],ctx.datalen * 8);
1721
1722 ctx.data[63] = ctx.bitlen[0];
1723 ctx.data[62] = ctx.bitlen[0] >> 8;
1724 ctx.data[61] = ctx.bitlen[0] >> 16;
1725 ctx.data[60] = ctx.bitlen[0] >> 24;
1726 ctx.data[59] = ctx.bitlen[1];
1727 ctx.data[58] = ctx.bitlen[1] >> 8;
1728 ctx.data[57] = ctx.bitlen[1] >> 16;
1729 ctx.data[56] = ctx.bitlen[1] >> 24;
1730 sha256_transform(ctx,ctx.data);
1731
1732 for (i = 0; i < 8; i++) {
1733 _ntl_uint32 w = ctx.state[i];
1734 for (j = 0; j < 4; j++) {
1735 if (hlen <= 0) break;
1736 hash[4*i + j] = w >> (24-j*8);
1737 hlen--;
1738 }
1739 }
1740
1741 }
1742
1743
1744
1745 static
sha256(const unsigned char * data,long dlen,unsigned char * hash,long hlen=SHA256_HASHSIZE)1746 void sha256(const unsigned char *data, long dlen, unsigned char *hash,
1747 long hlen=SHA256_HASHSIZE)
1748 {
1749 if (dlen < 0) dlen = 0;
1750 if (hlen < 0) hlen = 0;
1751
1752 SHA256_CTX ctx;
1753 sha256_init(ctx);
1754
1755 const long BLKSIZE = 4096;
1756
1757 long i;
1758 for (i = 0; i <= dlen-BLKSIZE; i += BLKSIZE)
1759 sha256_update(ctx, data + i, BLKSIZE);
1760
1761 if (i < dlen)
1762 sha256_update(ctx, data + i, dlen - i);
1763
1764 sha256_final(ctx, hash, hlen);
1765 }
1766
1767
1768 static
hmac_sha256(const unsigned char * key,long klen,const unsigned char * data,long dlen,unsigned char * hash,long hlen=SHA256_HASHSIZE)1769 void hmac_sha256(const unsigned char *key, long klen,
1770 const unsigned char *data, long dlen,
1771 unsigned char *hash, long hlen=SHA256_HASHSIZE)
1772 {
1773 if (klen < 0) klen = 0;
1774 if (dlen < 0) dlen = 0;
1775 if (hlen < 0) hlen = 0;
1776
1777 unsigned char K[SHA256_BLOCKSIZE];
1778 unsigned char tmp[SHA256_HASHSIZE];
1779
1780 long i;
1781
1782 if (klen <= SHA256_BLOCKSIZE) {
1783 for (i = 0; i < klen; i++)
1784 K[i] = key[i];
1785 for (i = klen; i < SHA256_BLOCKSIZE; i++)
1786 K[i] = 0;
1787 }
1788 else {
1789 sha256(key, klen, K, SHA256_BLOCKSIZE);
1790 for (i = SHA256_HASHSIZE; i < SHA256_BLOCKSIZE; i++)
1791 K[i] = 0;
1792 }
1793
1794 for (i = 0; i < SHA256_BLOCKSIZE; i++)
1795 K[i] ^= 0x36;
1796
1797 SHA256_CTX ctx;
1798 sha256_init(ctx);
1799 sha256_update(ctx, K, SHA256_BLOCKSIZE);
1800 sha256_update(ctx, data, dlen);
1801 sha256_final(ctx, tmp);
1802
1803 for (i = 0; i < SHA256_BLOCKSIZE; i++)
1804 K[i] ^= (0x36 ^ 0x5C);
1805
1806 sha256_init(ctx);
1807 sha256_update(ctx, K, SHA256_BLOCKSIZE);
1808 sha256_update(ctx, tmp, SHA256_HASHSIZE);
1809 sha256_final(ctx, hash, hlen);
1810 }
1811
1812
1813 // This key derivation uses HMAC with a zero key to derive
1814 // an intermediate key K from the data, and then uses HMAC
1815 // as a PRF in counter mode with key K to derive the final key
1816
DeriveKey(unsigned char * key,long klen,const unsigned char * data,long dlen)1817 void DeriveKey(unsigned char *key, long klen,
1818 const unsigned char *data, long dlen)
1819 {
1820 if (dlen < 0) LogicError("DeriveKey: bad args");
1821 if (klen < 0) LogicError("DeriveKey: bad args");
1822
1823 long i, j;
1824
1825
1826 unsigned char K[SHA256_HASHSIZE];
1827 hmac_sha256(0, 0, data, dlen, K);
1828
1829 // initialize 64-bit counter to zero
1830 unsigned char counter[8];
1831 for (j = 0; j < 8; j++) counter[j] = 0;
1832
1833 for (i = 0; i <= klen-SHA256_HASHSIZE; i += SHA256_HASHSIZE) {
1834 hmac_sha256(K, SHA256_HASHSIZE, counter, 8, key+i);
1835
1836 // increment counter
1837 for (j = 0; j < 8; j++) {
1838 counter[j]++;
1839 if (counter[j] != 0) break;
1840 }
1841 }
1842
1843 if (i < klen)
1844 hmac_sha256(K, SHA256_HASHSIZE, counter, 8, key+i, klen-i);
1845 }
1846
1847
1848
1849
1850 // ******************** ChaCha20 stuff ***********************
1851
1852 // ============= old stuff
1853
1854 #define LE(p) (((_ntl_uint32)((p)[0])) + ((_ntl_uint32)((p)[1]) << 8) + \
1855 ((_ntl_uint32)((p)[2]) << 16) + ((_ntl_uint32)((p)[3]) << 24))
1856
1857 #define FROMLE(p, x) (p)[0] = (x), (p)[1] = ((x) >> 8), \
1858 (p)[2] = ((x) >> 16), (p)[3] = ((x) >> 24)
1859
1860
1861 #define QUARTERROUND(x, a, b, c, d) \
1862 x[a] += x[b], x[d] = ROTLEFT(x[d] ^ x[a], 16), \
1863 x[c] += x[d], x[b] = ROTLEFT(x[b] ^ x[c], 12), \
1864 x[a] += x[b], x[d] = ROTLEFT(x[d] ^ x[a], 8), \
1865 x[c] += x[d], x[b] = ROTLEFT(x[b] ^ x[c], 7)
1866
1867
1868 static
salsa20_core(_ntl_uint32 * data)1869 void salsa20_core(_ntl_uint32* data)
1870 {
1871 long i;
1872
1873 for (i = 0; i < 10; i++) {
1874 QUARTERROUND(data, 0, 4, 8, 12);
1875 QUARTERROUND(data, 1, 5, 9, 13);
1876 QUARTERROUND(data, 2, 6, 10, 14);
1877 QUARTERROUND(data, 3, 7, 11, 15);
1878 QUARTERROUND(data, 0, 5, 10, 15);
1879 QUARTERROUND(data, 1, 6, 11, 12);
1880 QUARTERROUND(data, 2, 7, 8, 13);
1881 QUARTERROUND(data, 3, 4, 9, 14);
1882 }
1883 }
1884
1885
1886 // key K must be exactly 32 bytes
1887 static
salsa20_init(_ntl_uint32 * state,const unsigned char * K)1888 void salsa20_init(_ntl_uint32 *state, const unsigned char *K)
1889 {
1890 static const _ntl_uint32 chacha_const[4] =
1891 { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 };
1892
1893 long i;
1894
1895 for (i = 0; i < 4; i++)
1896 state[i] = chacha_const[i];
1897
1898 for (i = 4; i < 12; i++)
1899 state[i] = LE(K + 4*(i-4));
1900
1901 for (i = 12; i < 16; i++)
1902 state[i] = 0;
1903 }
1904
1905
1906
1907 // state and data are of length 16
1908 static
salsa20_apply(_ntl_uint32 * state,_ntl_uint32 * data)1909 void salsa20_apply(_ntl_uint32 *state, _ntl_uint32 *data)
1910 {
1911 long i;
1912
1913 for (i = 0; i < 16; i++) data[i] = state[i];
1914
1915 salsa20_core(data);
1916
1917 for (i = 0; i < 16; i++) data[i] += state[i];
1918
1919 for (i = 12; i < 14; i++) {
1920 state[i]++;
1921 state[i] = INT32MASK(state[i]);
1922 if (state[i] != 0) break;
1923 }
1924 }
1925
1926
1927
old_RandomStream(const unsigned char * key)1928 old_RandomStream::old_RandomStream(const unsigned char *key)
1929 {
1930 salsa20_init(state, key);
1931 pos = 64;
1932 }
1933
1934
do_get(unsigned char * res,long n)1935 void old_RandomStream::do_get(unsigned char *res, long n)
1936 {
1937 if (n < 0) LogicError("RandomStream::get: bad args");
1938
1939 long i, j;
1940
1941 if (n <= 64-pos) {
1942 for (i = 0; i < n; i++) res[i] = buf[pos+i];
1943 pos += n;
1944 return;
1945 }
1946
1947 // read remainder of buffer
1948 for (i = 0; i < 64-pos; i++) res[i] = buf[pos+i];
1949 n -= 64-pos;
1950 res += 64-pos;
1951 pos = 64;
1952
1953 _ntl_uint32 wdata[16];
1954
1955 // read 64-byte chunks
1956 for (i = 0; i <= n-64; i += 64) {
1957 salsa20_apply(state, wdata);
1958 for (j = 0; j < 16; j++)
1959 FROMLE(res + i + 4*j, wdata[j]);
1960 }
1961
1962 if (i < n) {
1963 salsa20_apply(state, wdata);
1964
1965 for (j = 0; j < 16; j++)
1966 FROMLE(buf + 4*j, wdata[j]);
1967
1968 pos = n-i;
1969 for (j = 0; j < pos; j++)
1970 res[i+j] = buf[j];
1971 }
1972 }
1973
1974 #if defined(NTL_RANDOM_AES256CTR)
1975
1976 /* Size must be a multiple of AES block-size (16 bytes). */
1977 #define BUFSIZE 4096
1978 //#define BUFSIZE 8192
1979
1980 static void
inc32(unsigned char ctr[16])1981 inc32(unsigned char ctr[16])
1982 {
1983 int i, c = 1;
1984
1985 for (i = 0; i < 4; i++) {
1986 c += ctr[15 - i];
1987 ctr[15 - i] = (unsigned char)c;
1988 c >>= 8;
1989 }
1990 }
1991
1992 #if defined(NTL_HAVE_AES_NI) && defined(NTL_HAVE_AVX2)
1993
1994 /*****************************************************************
1995 This optimized AES-256 implementation is derived from public
1996 domain code.
1997
1998 Authors:
1999 Romain Dolbeau
2000
2001 Obtained from:
2002 https://github.com/floodyberry/supercop/blob/master/crypto_stream/aes256ctr/dolbeau/aesenc-int/aesenc-int.c
2003 */
2004
2005 #ifdef __INTEL_COMPILER
2006 #define ALIGN16 __declspec(align(16))
2007 #define ALIGN32 __declspec(align(32))
2008 #define ALIGN64 __declspec(align(64))
2009 #else // assume GCC
2010 #define ALIGN16 __attribute__((aligned(16)))
2011 #define ALIGN32 __attribute__((aligned(32)))
2012 #define ALIGN64 __attribute__((aligned(64)))
2013 #ifndef _bswap64
2014 #define _bswap64(a) __builtin_bswap64(a)
2015 #endif
2016 #ifndef _bswap
2017 #define _bswap(a) __builtin_bswap(a)
2018 #endif
2019 #endif
2020
aesni_key256_expand(const unsigned char * key,__m128i rkeys[16])2021 static inline void aesni_key256_expand(const unsigned char* key, __m128i rkeys[16]) {
2022 __m128i key0 = _mm_loadu_si128((const __m128i *)(key+0));
2023 __m128i key1 = _mm_loadu_si128((const __m128i *)(key+16));
2024 __m128i temp0, temp1, temp2, temp4;
2025 int idx = 0;
2026
2027 rkeys[idx++] = key0;
2028 temp0 = key0;
2029 temp2 = key1;
2030
2031 /* blockshift-based block by Cedric Bourrasset & Romain Dolbeau */
2032 #define BLOCK1(IMM) \
2033 temp1 = _mm_aeskeygenassist_si128(temp2, IMM); \
2034 rkeys[idx++] = temp2; \
2035 temp4 = _mm_slli_si128(temp0,4); \
2036 temp0 = _mm_xor_si128(temp0,temp4); \
2037 temp4 = _mm_slli_si128(temp0,8); \
2038 temp0 = _mm_xor_si128(temp0,temp4); \
2039 temp1 = _mm_shuffle_epi32(temp1,0xff); \
2040 temp0 = _mm_xor_si128(temp0,temp1)
2041
2042 #define BLOCK2(IMM) \
2043 temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \
2044 rkeys[idx++] = temp0; \
2045 temp4 = _mm_slli_si128(temp2,4); \
2046 temp2 = _mm_xor_si128(temp2,temp4); \
2047 temp4 = _mm_slli_si128(temp2,8); \
2048 temp2 = _mm_xor_si128(temp2,temp4); \
2049 temp1 = _mm_shuffle_epi32(temp1,0xaa); \
2050 temp2 = _mm_xor_si128(temp2,temp1)
2051
2052 BLOCK1(0x01);
2053 BLOCK2(0x01);
2054
2055 BLOCK1(0x02);
2056 BLOCK2(0x02);
2057
2058 BLOCK1(0x04);
2059 BLOCK2(0x04);
2060
2061 BLOCK1(0x08);
2062 BLOCK2(0x08);
2063
2064 BLOCK1(0x10);
2065 BLOCK2(0x10);
2066
2067 BLOCK1(0x20);
2068 BLOCK2(0x20);
2069
2070 BLOCK1(0x40);
2071 rkeys[idx++] = temp0;
2072 }
2073
2074 /** single, by-the-book AES encryption with AES-NI */
aesni_encrypt1(unsigned char * out,unsigned char * n,__m128i rkeys[16])2075 static inline void aesni_encrypt1(unsigned char *out, unsigned char *n, __m128i rkeys[16]) {
2076 __m128i nv = _mm_load_si128((const __m128i *)n);
2077 int i;
2078 __m128i temp = _mm_xor_si128(nv, rkeys[0]);
2079 #if 0
2080 // This pragma is not recognized by GCC < 8
2081 #pragma unroll(13)
2082 for (i = 1 ; i < 14 ; i++) {
2083 temp = _mm_aesenc_si128(temp, rkeys[i]);
2084 }
2085 #else
2086 temp = _mm_aesenc_si128(temp, rkeys[ 1]);
2087 temp = _mm_aesenc_si128(temp, rkeys[ 2]);
2088 temp = _mm_aesenc_si128(temp, rkeys[ 3]);
2089 temp = _mm_aesenc_si128(temp, rkeys[ 4]);
2090 temp = _mm_aesenc_si128(temp, rkeys[ 5]);
2091 temp = _mm_aesenc_si128(temp, rkeys[ 6]);
2092 temp = _mm_aesenc_si128(temp, rkeys[ 7]);
2093 temp = _mm_aesenc_si128(temp, rkeys[ 8]);
2094 temp = _mm_aesenc_si128(temp, rkeys[ 9]);
2095 temp = _mm_aesenc_si128(temp, rkeys[10]);
2096 temp = _mm_aesenc_si128(temp, rkeys[11]);
2097 temp = _mm_aesenc_si128(temp, rkeys[12]);
2098 temp = _mm_aesenc_si128(temp, rkeys[13]);
2099 #endif
2100 temp = _mm_aesenclast_si128(temp, rkeys[14]);
2101 _mm_store_si128((__m128i*)(out), temp);
2102 }
2103
2104 /** increment the 16-bytes nonce ;
2105 this really should be improved somehow...
2106 but it's not yet time-critical, because we
2107 use the vector variant anyway */
incle(unsigned char n[16])2108 static inline void incle(unsigned char n[16]) {
2109 /* unsigned long long out; */
2110 /* unsigned char carry; */
2111 unsigned long long *n_ = (unsigned long long*)n;
2112 n_[1]++;
2113 if (n_[1] == 0)
2114 n_[0] ++;
2115 /* perhaps this will be efficient on broadwell ? */
2116 /* carry = _addcarry_u64(0, n_[1], 1ULL, &out); */
2117 /* carry = _addcarry_u64(carry, n_[0], 0ULL, &out); */
2118 }
2119
2120 /** multiple-blocks-at-once AES encryption with AES-NI ;
2121 on Haswell, aesenc as a latency of 7 and a througput of 1
2122 so the sequence of aesenc should be bubble-free, if you
2123 have at least 8 blocks. Let's build an arbitratry-sized
2124 function */
2125 /* Step 1 : loading the nonce */
2126 /* load & increment the n vector (non-vectorized, unused for now) */
2127 #define NVx(a) \
2128 __m128i nv##a = _mm_shuffle_epi8(_mm_load_si128((const __m128i *)n), _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); incle(n)
2129 /* load the incremented n vector (vectorized, probably buggy) */
2130 #define NVxV_DEC(a) \
2131 __m128i nv##a;
2132 #define NVxV_NOWRAP(a) \
2133 nv##a = _mm_shuffle_epi8(_mm_add_epi64(nv0i, _mm_set_epi64x(a,0)), _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7))
2134 #define NVxV_WRAP(a) \
2135 __m128i ad##a = _mm_add_epi64(nv0i, _mm_set_epi64x(a,a>=wrapnumber?1:0)); \
2136 nv##a = _mm_shuffle_epi8(ad##a, _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7))
2137
2138 /* Step 2 : define value in round one (xor with subkey #0, aka key) */
2139 #define TEMPx(a) \
2140 __m128i temp##a = _mm_xor_si128(nv##a, rkeys[0])
2141
2142 /* Step 3: one round of AES */
2143 #define AESENCx(a) \
2144 temp##a = _mm_aesenc_si128(temp##a, rkeys[i]);
2145
2146 /* Step 4: last round of AES */
2147 #define AESENCLASTx(a) \
2148 temp##a = _mm_aesenclast_si128(temp##a, rkeys[14]);
2149
2150 /* Step 5: store result */
2151 #define STOREx(a) \
2152 _mm_store_si128((__m128i*)(out+(a*16)), temp##a);
2153
2154 /* all the MAKE* macros are for automatic explicit unrolling */
2155 #define MAKE4(X) \
2156 X(0);X(1);X(2);X(3)
2157
2158 #define MAKE6(X) \
2159 X(0);X(1);X(2);X(3); \
2160 X(4);X(5)
2161
2162 #define MAKE7(X) \
2163 X(0);X(1);X(2);X(3); \
2164 X(4);X(5);X(6)
2165
2166 #define MAKE8(X) \
2167 X(0);X(1);X(2);X(3); \
2168 X(4);X(5);X(6);X(7)
2169
2170 #define MAKE10(X) \
2171 X(0);X(1);X(2);X(3); \
2172 X(4);X(5);X(6);X(7); \
2173 X(8);X(9)
2174
2175 #define MAKE12(X) \
2176 X(0);X(1);X(2);X(3); \
2177 X(4);X(5);X(6);X(7); \
2178 X(8);X(9);X(10);X(11)
2179
2180 #define MAKE16(X) \
2181 X(0);X(1);X(2);X(3); \
2182 X(4);X(5);X(6);X(7); \
2183 X(8);X(9);X(10);X(11); \
2184 X(12);X(13);X(14);X(15)
2185
2186 /* create a function of unrolling N ; the MAKEN is the unrolling
2187 macro, defined above. The N in MAKEN must match N, obviously. */
2188 #define FUNC(N, MAKEN) \
2189 static inline void aesni_encrypt##N(unsigned char *out, unsigned char *n, __m128i rkeys[16]) { \
2190 __m128i nv0i = _mm_load_si128((const __m128i *)n); \
2191 long long nl = *(long long*)&n[8]; \
2192 MAKEN(NVxV_DEC); \
2193 /* check for nonce wraparound */ \
2194 if ((nl < 0) && (nl + N) >= 0) { \
2195 int wrapnumber = (int)(N - (nl+N)); \
2196 MAKEN(NVxV_WRAP); \
2197 _mm_storeu_si128((__m128i*)n, _mm_add_epi64(nv0i, _mm_set_epi64x(N,1))); \
2198 } else { \
2199 MAKEN(NVxV_NOWRAP); \
2200 _mm_storeu_si128((__m128i*)n, _mm_add_epi64(nv0i, _mm_set_epi64x(N,0))); \
2201 } \
2202 int i; \
2203 MAKEN(TEMPx); \
2204 for (i = 1 ; i < 14 ; i++) { \
2205 MAKEN(AESENCx); \
2206 } \
2207 MAKEN(AESENCLASTx); \
2208 MAKEN(STOREx); \
2209 }
2210
2211 /* and now building our unrolled function is trivial */
2212 FUNC(4, MAKE4)
2213 FUNC(6, MAKE6)
2214 FUNC(7, MAKE7)
2215 FUNC(8, MAKE8)
2216 FUNC(10, MAKE10)
2217 FUNC(12, MAKE12)
2218 FUNC(16, MAKE16)
2219
crypto_stream(unsigned char * out,unsigned long long outlen,unsigned char * n,const unsigned char * k)2220 void crypto_stream(
2221 unsigned char *out,
2222 unsigned long long outlen,
2223 unsigned char *n,
2224 const unsigned char *k
2225 )
2226 {
2227 __m128i rkeys[16];
2228 ALIGN16 unsigned char n2[16];
2229 unsigned long long i, j;
2230 aesni_key256_expand(k, rkeys);
2231 /* n2 is in byte-reversed (i.e., native little endian)
2232 order to make increment/testing easier */
2233 (*(unsigned long long*)&n2[8]) = _bswap64((*(unsigned long long*)&n[8]));
2234 (*(unsigned long long*)&n2[0]) = _bswap64((*(unsigned long long*)&n[0]));
2235
2236 #define LOOP(iter) \
2237 int lb = iter * 16; \
2238 for (i = 0 ; i < outlen ; i+= lb) { \
2239 ALIGN16 unsigned char outni[lb]; \
2240 aesni_encrypt##iter(outni, n2, rkeys); \
2241 unsigned long long mj = lb; \
2242 if ((i+mj)>=outlen) \
2243 mj = outlen-i; \
2244 for (j = 0 ; j < mj ; j++) \
2245 out[i+j] = outni[j]; \
2246 }
2247
2248 LOOP(8);
2249
2250 (*(unsigned long long*)&n[8]) = _bswap64((*(unsigned long long*)&n2[8]));
2251 (*(unsigned long long*)&n[0]) = _bswap64((*(unsigned long long*)&n2[0]));
2252 }
2253
2254 static void
aes256ctr_stream(unsigned char out[BUFSIZE],unsigned char iv[16],const unsigned char key[32])2255 aes256ctr_stream(unsigned char out[BUFSIZE], unsigned char iv[16], const unsigned char key[32])
2256 {
2257 crypto_stream(out, BUFSIZE, iv, key);
2258 }
2259
2260 /*****************************************************************/
2261
2262 #elif defined(NTL_HAVE_KMA)
2263
2264 static void
aes256ctr_stream(unsigned char out[BUFSIZE],unsigned char iv[16],const unsigned char key[32])2265 aes256ctr_stream(unsigned char out[BUFSIZE], unsigned char iv[16], const unsigned char key[32])
2266 {
2267 static const unsigned char zerobuf[BUFSIZE] = {0};
2268 unsigned long fc = CPACF_KMA_GCM_AES_256 | CPACF_KMA_HS | CPACF_KMA_LAAD;
2269 struct {
2270 unsigned char reserved[12];
2271 unsigned int cv;
2272 unsigned char _[48];
2273 unsigned char j0[16];
2274 unsigned char k[32];
2275 } param;
2276
2277 memcpy(¶m.cv, &iv[12], sizeof(param.cv));
2278 param.cv--;
2279 memcpy(¶m.j0[0], &iv[0], sizeof(param.j0) - sizeof(param.cv));
2280 memcpy(¶m.j0[12], ¶m.cv, sizeof(param.cv));
2281 memcpy(param.k, key, sizeof(param.k));
2282
2283 cpacf_kma(fc, ¶m, out, NULL, 0, zerobuf, sizeof(zerobuf));
2284
2285 param.cv++;
2286 memcpy(&iv[12], ¶m.cv, sizeof(param.cv));
2287 }
2288
2289 #else
2290
2291 /*****************************************************************
2292 This AES-256 reference implementation is derived from
2293 public domain code.
2294
2295 Authors:
2296 Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
2297 Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
2298 Paulo Barreto <paulo.barreto@terra.com.br>
2299
2300 Obtained from:
2301 https://github.com/zakird/zdlibc/blob/master/rijndael-alg-fst.c
2302 */
2303
2304 typedef uint8_t u8;
2305 typedef uint32_t u32;
2306
2307 static const u32 Te0[256] = {
2308 0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
2309 0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
2310 0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
2311 0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
2312 0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
2313 0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
2314 0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
2315 0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
2316 0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
2317 0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
2318 0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
2319 0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
2320 0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
2321 0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
2322 0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
2323 0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
2324 0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
2325 0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
2326 0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
2327 0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
2328 0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
2329 0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
2330 0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
2331 0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
2332 0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
2333 0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
2334 0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
2335 0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
2336 0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
2337 0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
2338 0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
2339 0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
2340 0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
2341 0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
2342 0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
2343 0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
2344 0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
2345 0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
2346 0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
2347 0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
2348 0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
2349 0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
2350 0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
2351 0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
2352 0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
2353 0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
2354 0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
2355 0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
2356 0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
2357 0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
2358 0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
2359 0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
2360 0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
2361 0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
2362 0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
2363 0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
2364 0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
2365 0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
2366 0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
2367 0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
2368 0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
2369 0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
2370 0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
2371 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU,
2372 };
2373 static const u32 Te1[256] = {
2374 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
2375 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
2376 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
2377 0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
2378 0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
2379 0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
2380 0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
2381 0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
2382 0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
2383 0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
2384 0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
2385 0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
2386 0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
2387 0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
2388 0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
2389 0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
2390 0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
2391 0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
2392 0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
2393 0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
2394 0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
2395 0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
2396 0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
2397 0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
2398 0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
2399 0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
2400 0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
2401 0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
2402 0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
2403 0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
2404 0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
2405 0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
2406 0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
2407 0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
2408 0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
2409 0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
2410 0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
2411 0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
2412 0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
2413 0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
2414 0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
2415 0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
2416 0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
2417 0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
2418 0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
2419 0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
2420 0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
2421 0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
2422 0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
2423 0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
2424 0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
2425 0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
2426 0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
2427 0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
2428 0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
2429 0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
2430 0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
2431 0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
2432 0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
2433 0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
2434 0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
2435 0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
2436 0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
2437 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U,
2438 };
2439 static const u32 Te2[256] = {
2440 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
2441 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
2442 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
2443 0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
2444 0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
2445 0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
2446 0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
2447 0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
2448 0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
2449 0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
2450 0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
2451 0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
2452 0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
2453 0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
2454 0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
2455 0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
2456 0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
2457 0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
2458 0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
2459 0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
2460 0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
2461 0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
2462 0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
2463 0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
2464 0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
2465 0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
2466 0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
2467 0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
2468 0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
2469 0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
2470 0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
2471 0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
2472 0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
2473 0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
2474 0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
2475 0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
2476 0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
2477 0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
2478 0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
2479 0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
2480 0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
2481 0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
2482 0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
2483 0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
2484 0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
2485 0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
2486 0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
2487 0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
2488 0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
2489 0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
2490 0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
2491 0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
2492 0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
2493 0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
2494 0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
2495 0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
2496 0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
2497 0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
2498 0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
2499 0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
2500 0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
2501 0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
2502 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
2503 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U,
2504 };
2505 static const u32 Te3[256] = {
2506 0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
2507 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
2508 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
2509 0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
2510 0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
2511 0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
2512 0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
2513 0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
2514 0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
2515 0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
2516 0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
2517 0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
2518 0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
2519 0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
2520 0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
2521 0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
2522 0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
2523 0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
2524 0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
2525 0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
2526 0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
2527 0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
2528 0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
2529 0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
2530 0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
2531 0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
2532 0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
2533 0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
2534 0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
2535 0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
2536 0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
2537 0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
2538 0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
2539 0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
2540 0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
2541 0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
2542 0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
2543 0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
2544 0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
2545 0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
2546 0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
2547 0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
2548 0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
2549 0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
2550 0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
2551 0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
2552 0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
2553 0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
2554 0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
2555 0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
2556 0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
2557 0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
2558 0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
2559 0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
2560 0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
2561 0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
2562 0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
2563 0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
2564 0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
2565 0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
2566 0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
2567 0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
2568 0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
2569 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU,
2570 };
2571 static const u32 Te4[256] = {
2572 0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU,
2573 0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U,
2574 0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU,
2575 0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U,
2576 0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU,
2577 0xfafafafaU, 0x59595959U, 0x47474747U, 0xf0f0f0f0U,
2578 0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU,
2579 0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U,
2580 0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U,
2581 0x36363636U, 0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU,
2582 0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U,
2583 0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U,
2584 0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U,
2585 0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU,
2586 0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U,
2587 0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U,
2588 0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU,
2589 0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U,
2590 0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U,
2591 0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U,
2592 0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU,
2593 0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU,
2594 0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U,
2595 0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU,
2596 0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 0xfbfbfbfbU,
2597 0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U,
2598 0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU,
2599 0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U,
2600 0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU,
2601 0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U,
2602 0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U,
2603 0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U,
2604 0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU,
2605 0x5f5f5f5fU, 0x97979797U, 0x44444444U, 0x17171717U,
2606 0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU,
2607 0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U,
2608 0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU,
2609 0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U,
2610 0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U,
2611 0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU,
2612 0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU,
2613 0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU,
2614 0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U,
2615 0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U,
2616 0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU,
2617 0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U,
2618 0x6c6c6c6cU, 0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU,
2619 0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U,
2620 0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU,
2621 0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U,
2622 0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU,
2623 0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU,
2624 0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U,
2625 0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU,
2626 0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U,
2627 0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU,
2628 0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U,
2629 0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U,
2630 0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U,
2631 0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU,
2632 0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU,
2633 0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U,
2634 0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU,
2635 0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U,
2636 };
2637 static const u32 rcon[] = {
2638 0x01000000, 0x02000000, 0x04000000, 0x08000000,
2639 0x10000000, 0x20000000, 0x40000000, 0x80000000,
2640 0x1B000000, 0x36000000,
2641 };
2642
2643 #define SWAP(x) (_lrotl(x, 8) & 0x00ff00ff | _lrotr(x, 8) & 0xff00ff00)
2644
2645 #ifdef _MSC_VER
2646 #define GETU32(p) SWAP(*((u32 *)(p)))
2647 #define PUTU32(ct, st) { *((u32 *)(ct)) = SWAP((st)); }
2648 #else
2649 #define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] << 8) ^ ((u32)(pt)[3]))
2650 #define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >> 8); (ct)[3] = (u8)(st); }
2651 #endif
2652
2653 /**
2654 * Expand the cipher key into the encryption key schedule.
2655 */
AES256KeySetupEnc(u32 rk[60],const u8 cipherKey[32])2656 void AES256KeySetupEnc(u32 rk[60], const u8 cipherKey[32]) {
2657 int i = 0;
2658 u32 temp;
2659
2660 rk[0] = GETU32(cipherKey );
2661 rk[1] = GETU32(cipherKey + 4);
2662 rk[2] = GETU32(cipherKey + 8);
2663 rk[3] = GETU32(cipherKey + 12);
2664 rk[4] = GETU32(cipherKey + 16);
2665 rk[5] = GETU32(cipherKey + 20);
2666 rk[6] = GETU32(cipherKey + 24);
2667 rk[7] = GETU32(cipherKey + 28);
2668
2669 for (;;) {
2670 temp = rk[ 7];
2671
2672 rk[ 8] = rk[ 0] ^
2673 (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
2674 (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
2675 (Te4[(temp ) & 0xff] & 0x0000ff00) ^
2676 (Te4[(temp >> 24) ] & 0x000000ff) ^
2677 rcon[i];
2678
2679 rk[ 9] = rk[ 1] ^ rk[ 8];
2680 rk[10] = rk[ 2] ^ rk[ 9];
2681 rk[11] = rk[ 3] ^ rk[10];
2682
2683 if (++i == 7)
2684 return;
2685
2686 temp = rk[11];
2687
2688 rk[12] = rk[ 4] ^
2689 (Te4[(temp >> 24) ] & 0xff000000) ^
2690 (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^
2691 (Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^
2692 (Te4[(temp ) & 0xff] & 0x000000ff);
2693
2694 rk[13] = rk[ 5] ^ rk[12];
2695 rk[14] = rk[ 6] ^ rk[13];
2696 rk[15] = rk[ 7] ^ rk[14];
2697
2698 rk += 8;
2699 }
2700 }
2701
AES256Encrypt(const u32 rk[60],const u8 pt[16],u8 ct[16])2702 void AES256Encrypt(const u32 rk[60], const u8 pt[16], u8 ct[16]) {
2703 u32 s0, s1, s2, s3, t0, t1, t2, t3;
2704 int r, Nr = 14;
2705
2706 /*
2707 * map byte array block to cipher state
2708 * and add initial round key:
2709 */
2710 s0 = GETU32(pt ) ^ rk[0];
2711 s1 = GETU32(pt + 4) ^ rk[1];
2712 s2 = GETU32(pt + 8) ^ rk[2];
2713 s3 = GETU32(pt + 12) ^ rk[3];
2714
2715 /*
2716 * Nr - 1 full rounds:
2717 */
2718 r = Nr >> 1;
2719
2720 for (;;) {
2721 t0 = Te0[(s0 >> 24) ] ^
2722 Te1[(s1 >> 16) & 0xff] ^
2723 Te2[(s2 >> 8) & 0xff] ^
2724 Te3[(s3 ) & 0xff] ^
2725 rk[4];
2726 t1 = Te0[(s1 >> 24) ] ^
2727 Te1[(s2 >> 16) & 0xff] ^
2728 Te2[(s3 >> 8) & 0xff] ^
2729 Te3[(s0 ) & 0xff] ^
2730 rk[5];
2731 t2 = Te0[(s2 >> 24) ] ^
2732 Te1[(s3 >> 16) & 0xff] ^
2733 Te2[(s0 >> 8) & 0xff] ^
2734 Te3[(s1 ) & 0xff] ^
2735 rk[6];
2736 t3 = Te0[(s3 >> 24) ] ^
2737 Te1[(s0 >> 16) & 0xff] ^
2738 Te2[(s1 >> 8) & 0xff] ^
2739 Te3[(s2 ) & 0xff] ^
2740 rk[7];
2741
2742 rk += 8;
2743
2744 if (--r == 0)
2745 break;
2746
2747 s0 = Te0[(t0 >> 24) ] ^
2748 Te1[(t1 >> 16) & 0xff] ^
2749 Te2[(t2 >> 8) & 0xff] ^
2750 Te3[(t3 ) & 0xff] ^
2751 rk[0];
2752 s1 = Te0[(t1 >> 24) ] ^
2753 Te1[(t2 >> 16) & 0xff] ^
2754 Te2[(t3 >> 8) & 0xff] ^
2755 Te3[(t0 ) & 0xff] ^
2756 rk[1];
2757 s2 = Te0[(t2 >> 24) ] ^
2758 Te1[(t3 >> 16) & 0xff] ^
2759 Te2[(t0 >> 8) & 0xff] ^
2760 Te3[(t1 ) & 0xff] ^
2761 rk[2];
2762 s3 = Te0[(t3 >> 24) ] ^
2763 Te1[(t0 >> 16) & 0xff] ^
2764 Te2[(t1 >> 8) & 0xff] ^
2765 Te3[(t2 ) & 0xff] ^
2766 rk[3];
2767 }
2768 /*
2769 * apply last round and
2770 * map cipher state to byte array block:
2771 */
2772 s0 = (Te4[(t0 >> 24) ] & 0xff000000) ^
2773 (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
2774 (Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
2775 (Te4[(t3 ) & 0xff] & 0x000000ff) ^
2776 rk[0];
2777
2778 PUTU32(ct , s0);
2779
2780 s1 = (Te4[(t1 >> 24) ] & 0xff000000) ^
2781 (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
2782 (Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
2783 (Te4[(t0 ) & 0xff] & 0x000000ff) ^
2784 rk[1];
2785
2786 PUTU32(ct + 4, s1);
2787
2788 s2 = (Te4[(t2 >> 24) ] & 0xff000000) ^
2789 (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
2790 (Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
2791 (Te4[(t1 ) & 0xff] & 0x000000ff) ^
2792 rk[2];
2793
2794 PUTU32(ct + 8, s2);
2795
2796 s3 = (Te4[(t3 >> 24) ] & 0xff000000) ^
2797 (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
2798 (Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
2799 (Te4[(t2 ) & 0xff] & 0x000000ff) ^
2800 rk[3];
2801
2802 PUTU32(ct + 12, s3);
2803 }
2804
2805 /*****************************************************************/
2806
2807 static void
aes256ctr_stream(unsigned char out[BUFSIZE],unsigned char iv[16],const unsigned char key[32])2808 aes256ctr_stream(unsigned char out[BUFSIZE], unsigned char iv[16], const unsigned char key[32])
2809 {
2810 u32 rk[60];
2811 int i;
2812
2813 AES256KeySetupEnc(rk, key);
2814
2815 for (i = 0; i < BUFSIZE; i += 16) {
2816 AES256Encrypt(rk, iv, out + i);
2817 inc32(iv);
2818 }
2819 }
2820
2821 #endif
2822
2823 struct RandomStream_impl {
2824 unsigned char key[32];
2825 unsigned char iv[16];
2826 unsigned char buf[BUFSIZE];
2827
2828 explicit
RandomStream_implRandomStream_impl2829 RandomStream_impl(const unsigned char *k)
2830 {
2831 memcpy(key, k, sizeof(key));
2832 memset(iv, 0, sizeof(iv));
2833 iv[15] = 1; // nonce = 1
2834 }
2835
2836 const unsigned char *
get_bufRandomStream_impl2837 get_buf() const
2838 {
2839 return buf + sizeof(key);
2840 }
2841
2842 long
get_buf_lenRandomStream_impl2843 get_buf_len() const
2844 {
2845 return sizeof(buf) - sizeof(key);
2846 }
2847
get_bytesRandomStream_impl2848 long get_bytes(unsigned char *res, long n, long pos)
2849 {
2850 size_t len;
2851
2852 if (n < 0)
2853 LogicError("RandomStream::get: bad args");
2854
2855 if (n > 0 && sizeof(buf) - sizeof(key) - pos > 0) {
2856 len = min((size_t)n, sizeof(buf) - sizeof(key) - pos);
2857 memcpy(res, buf + sizeof(key) + pos, len);
2858
2859 n -= len;
2860 res += len;
2861 pos += len;
2862 }
2863
2864 while (n > 0) {
2865 aes256ctr_stream(buf, iv, key);
2866 memcpy(key, buf, sizeof(key));
2867
2868 len = min((size_t)n, sizeof(buf) - sizeof(key));
2869 memcpy(res, buf + sizeof(key), len);
2870
2871 n -= len;
2872 res += len;
2873 pos = len;
2874 }
2875
2876 return pos;
2877 }
2878
set_nonceRandomStream_impl2879 void set_nonce(unsigned long nonce)
2880 {
2881 // low-order 8 bytes of iv set to zero
2882 // high-order 8 bytes of iv set to nonce
2883 memset(iv, 0, sizeof(iv));
2884 iv[ 8] = (unsigned char) nonce; nonce >>= 8;
2885 iv[ 9] = (unsigned char) nonce; nonce >>= 8;
2886 iv[10] = (unsigned char) nonce; nonce >>= 8;
2887 iv[11] = (unsigned char) nonce; nonce >>= 8;
2888 iv[12] = (unsigned char) nonce; nonce >>= 8;
2889 iv[13] = (unsigned char) nonce; nonce >>= 8;
2890 iv[14] = (unsigned char) nonce; nonce >>= 8;
2891 iv[15] = (unsigned char) nonce; nonce >>= 8;
2892 }
2893 };
2894
2895 #else // defined(NTL_RANDOM_AES256CTR)
2896
2897 #if (defined(NTL_HAVE_AVX2) || defined(NTL_HAVE_SSSE3))
2898
2899
2900 /*****************************************************************
2901
2902 This AVX2 implementation is derived from public domain code
2903 originally developed by Martin Goll Shay Gueron, and obtained from
2904 here:
2905
2906 https://github.com/floodyberry/supercop/tree/master/crypto_stream/chacha20/goll_gueron
2907
2908 On a Haswell machine, ths code is about 4.x faster than the vanilla
2909 C code.
2910
2911 The following is the README from that page
2912
2913 ==================================================================
2914
2915 This code implements Daniel J. Bernstein's ChaCha stream cipher in C,
2916 targeting architectures with AVX2 and future AVX512 vector extensions.
2917
2918 The implementation improves the slightly modified implementations of Ted Krovetz in the Chromium Project
2919 (http://src.chromium.org/viewvc/chrome/trunk/deps/third_party/nss/nss/lib/freebl/chacha20/chacha20_vec.c and
2920 http://src.chromium.org/viewvc/chrome/trunk/deps/third_party/openssl/openssl/crypto/chacha/chacha_vec.c)
2921 by using the Advanced Vector Extensions AVX2 and, if available in future, AVX512 to widen the vectorization
2922 to 256-bit, respectively 512-bit.
2923
2924 On Intel's Haswell architecture this implementation (using AVX2) is almost ~2x faster than the fastest
2925 implementation here, when encrypting (decrypting) 2 blocks and more. Also, this implementation is expected
2926 to double the speed again, when encrypting (decrypting) 4 blocks and more, running on a future architecture
2927 with support for AVX512.
2928
2929 Further details and our measurement results are provided in:
2930 Goll, M., and Gueron,S.: Vectorization of ChaCha Stream Cipher. Cryptology ePrint Archive,
2931 Report 2013/759, November, 2013, http://eprint.iacr.org/2013/759.pdf
2932
2933 Developers and authors:
2934 *********************************************************
2935 Martin Goll (1) and Shay Gueron (2, 3),
2936 (1) Ruhr-University Bochum, Germany
2937 (2) University of Haifa, Israel
2938 (3) Intel Corporation, Israel Development Center, Haifa, Israel
2939 *********************************************************
2940
2941 Intellectual Property Notices
2942 -----------------------------
2943
2944 There are no known present or future claims by a copyright holder that the
2945 distribution of this software infringes the copyright. In particular, the author
2946 of the software is not making such claims and does not intend to make such
2947 claims.
2948
2949 There are no known present or future claims by a patent holder that the use of
2950 this software infringes the patent. In particular, the author of the software is
2951 not making such claims and does not intend to make such claims.
2952
2953 Our implementation is in public domain.
2954
2955 *****************************************************************/
2956
2957
2958 // round selector, specified values:
2959 // 8: low security - high speed
2960 // 12: mid security - mid speed
2961 // 20: high security - low speed
2962 #ifndef CHACHA_RNDS
2963 #define CHACHA_RNDS 20
2964 #endif
2965
2966
2967 #if (defined(NTL_HAVE_AVX2))
2968
2969 typedef __m256i ivec_t;
2970
2971 #define DELTA _mm256_set_epi64x(0,2,0,2)
2972 #define START _mm256_set_epi64x(0,1,0,0)
2973 #define NONCE(nonce) _mm256_set_epi64x(nonce, 1, nonce, 0)
2974
2975 #define STOREU_VEC(m,r) _mm256_storeu_si256((__m256i*)(m), r)
2976 #define STORE_VEC(m,r) _mm256_store_si256((__m256i*)(m), r)
2977
2978 #define LOAD_VEC(r,m) r = _mm256_load_si256((const __m256i *)(m))
2979 #define LOADU_VEC(r,m) r = _mm256_loadu_si256((const __m256i *)(m))
2980
2981 #define LOADU_VEC_128(r, m) r = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)(m)))
2982
2983
2984
2985 #define ADD_VEC_32(a,b) _mm256_add_epi32(a, b)
2986 #define ADD_VEC_64(a,b) _mm256_add_epi64(a, b)
2987 #define XOR_VEC(a,b) _mm256_xor_si256(a, b)
2988
2989
2990 #define ROR_VEC_V1(x) _mm256_shuffle_epi32(x,_MM_SHUFFLE(0,3,2,1))
2991 #define ROR_VEC_V2(x) _mm256_shuffle_epi32(x,_MM_SHUFFLE(1,0,3,2))
2992 #define ROR_VEC_V3(x) _mm256_shuffle_epi32(x,_MM_SHUFFLE(2,1,0,3))
2993 #define ROL_VEC_7(x) XOR_VEC(_mm256_slli_epi32(x, 7), _mm256_srli_epi32(x,25))
2994 #define ROL_VEC_12(x) XOR_VEC(_mm256_slli_epi32(x,12), _mm256_srli_epi32(x,20))
2995
2996 #define ROL_VEC_8(x) _mm256_shuffle_epi8(x,_mm256_set_epi8(14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3,14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3))
2997
2998
2999 #define ROL_VEC_16(x) _mm256_shuffle_epi8(x,_mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2))
3000
3001
3002
3003 #define WRITEU_VEC(op, d, v0, v1, v2, v3) \
3004 STOREU_VEC(op + (d + 0*4), _mm256_permute2x128_si256(v0, v1, 0x20)); \
3005 STOREU_VEC(op + (d + 8*4), _mm256_permute2x128_si256(v2, v3, 0x20)); \
3006 STOREU_VEC(op + (d +16*4), _mm256_permute2x128_si256(v0, v1, 0x31)); \
3007 STOREU_VEC(op + (d +24*4), _mm256_permute2x128_si256(v2, v3, 0x31));
3008
3009 #define WRITE_VEC(op, d, v0, v1, v2, v3) \
3010 STORE_VEC(op + (d + 0*4), _mm256_permute2x128_si256(v0, v1, 0x20)); \
3011 STORE_VEC(op + (d + 8*4), _mm256_permute2x128_si256(v2, v3, 0x20)); \
3012 STORE_VEC(op + (d +16*4), _mm256_permute2x128_si256(v0, v1, 0x31)); \
3013 STORE_VEC(op + (d +24*4), _mm256_permute2x128_si256(v2, v3, 0x31));
3014
3015 #define SZ_VEC (32)
3016
3017 #define RANSTREAM_NCHUNKS (2)
3018 // leads to a BUFSZ of 512
3019
3020
3021 #elif defined(NTL_HAVE_SSSE3)
3022
3023 typedef __m128i ivec_t;
3024
3025 #define DELTA _mm_set_epi32(0,0,0,1)
3026 #define START _mm_setzero_si128()
3027 #define NONCE(nonce) _mm_set_epi64x(nonce,0)
3028
3029 #define STOREU_VEC(m,r) _mm_storeu_si128((__m128i*)(m), r)
3030 #define STORE_VEC(m,r) _mm_store_si128((__m128i*)(m), r)
3031
3032 #define LOAD_VEC(r,m) r = _mm_load_si128((const __m128i *)(m))
3033 #define LOADU_VEC(r,m) r = _mm_loadu_si128((const __m128i *)(m))
3034
3035 #define LOADU_VEC_128(r, m) r = _mm_loadu_si128((const __m128i*)(m))
3036
3037 #define ADD_VEC_32(a,b) _mm_add_epi32(a, b)
3038 #define ADD_VEC_64(a,b) _mm_add_epi64(a, b)
3039 #define XOR_VEC(a,b) _mm_xor_si128(a, b)
3040
3041
3042 #define ROR_VEC_V1(x) _mm_shuffle_epi32(x,_MM_SHUFFLE(0,3,2,1))
3043 #define ROR_VEC_V2(x) _mm_shuffle_epi32(x,_MM_SHUFFLE(1,0,3,2))
3044 #define ROR_VEC_V3(x) _mm_shuffle_epi32(x,_MM_SHUFFLE(2,1,0,3))
3045 #define ROL_VEC_7(x) XOR_VEC(_mm_slli_epi32(x, 7), _mm_srli_epi32(x,25))
3046 #define ROL_VEC_12(x) XOR_VEC(_mm_slli_epi32(x,12), _mm_srli_epi32(x,20))
3047
3048 #define ROL_VEC_8(x) _mm_shuffle_epi8(x,_mm_set_epi8(14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3))
3049
3050
3051 #define ROL_VEC_16(x) _mm_shuffle_epi8(x,_mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2))
3052
3053
3054 #define WRITEU_VEC(op, d, v0, v1, v2, v3) \
3055 STOREU_VEC(op + (d + 0*4), v0); \
3056 STOREU_VEC(op + (d + 4*4), v1); \
3057 STOREU_VEC(op + (d + 8*4), v2); \
3058 STOREU_VEC(op + (d +12*4), v3);
3059
3060 #define WRITE_VEC(op, d, v0, v1, v2, v3) \
3061 STORE_VEC(op + (d + 0*4), v0); \
3062 STORE_VEC(op + (d + 4*4), v1); \
3063 STORE_VEC(op + (d + 8*4), v2); \
3064 STORE_VEC(op + (d +12*4), v3);
3065
3066 #define SZ_VEC (16)
3067
3068 #define RANSTREAM_NCHUNKS (4)
3069 // leads to a BUFSZ of 512
3070
3071 #else
3072
3073 #error "unsupported architecture"
3074
3075 #endif
3076
3077
3078 #define DQROUND_VECTORS_VEC(a,b,c,d) \
3079 a = ADD_VEC_32(a,b); d = XOR_VEC(d,a); d = ROL_VEC_16(d); \
3080 c = ADD_VEC_32(c,d); b = XOR_VEC(b,c); b = ROL_VEC_12(b); \
3081 a = ADD_VEC_32(a,b); d = XOR_VEC(d,a); d = ROL_VEC_8(d); \
3082 c = ADD_VEC_32(c,d); b = XOR_VEC(b,c); b = ROL_VEC_7(b); \
3083 b = ROR_VEC_V1(b); c = ROR_VEC_V2(c); d = ROR_VEC_V3(d); \
3084 a = ADD_VEC_32(a,b); d = XOR_VEC(d,a); d = ROL_VEC_16(d); \
3085 c = ADD_VEC_32(c,d); b = XOR_VEC(b,c); b = ROL_VEC_12(b); \
3086 a = ADD_VEC_32(a,b); d = XOR_VEC(d,a); d = ROL_VEC_8(d); \
3087 c = ADD_VEC_32(c,d); b = XOR_VEC(b,c); b = ROL_VEC_7(b); \
3088 b = ROR_VEC_V3(b); c = ROR_VEC_V2(c); d = ROR_VEC_V1(d);
3089
3090
3091
3092 #define RANSTREAM_STATESZ (4*SZ_VEC)
3093
3094 #define RANSTREAM_CHUNKSZ (2*RANSTREAM_STATESZ)
3095 #define RANSTREAM_BUFSZ (RANSTREAM_NCHUNKS*RANSTREAM_CHUNKSZ)
3096
3097
3098 struct RandomStream_impl {
3099
3100 AlignedArray<unsigned char> state_store;
3101 AlignedArray<unsigned char> buf_store;
3102 long chunk_count;
3103
allocate_spaceRandomStream_impl3104 void allocate_space()
3105 {
3106 state_store.SetLength(RANSTREAM_STATESZ);
3107 buf_store.SetLength(RANSTREAM_BUFSZ);
3108 }
3109
3110
3111 explicit
RandomStream_implRandomStream_impl3112 RandomStream_impl(const unsigned char *key)
3113 {
3114 allocate_space();
3115
3116 unsigned char *state = state_store.elts();
3117
3118 unsigned int chacha_const[] = {
3119 0x61707865,0x3320646E,0x79622D32,0x6B206574
3120 };
3121
3122
3123 ivec_t d0, d1, d2, d3;
3124 LOADU_VEC_128(d0, chacha_const);
3125 LOADU_VEC_128(d1, key);
3126 LOADU_VEC_128(d2, key+16);
3127
3128 d3 = START;
3129
3130
3131 STORE_VEC(state + 0*SZ_VEC, d0);
3132 STORE_VEC(state + 1*SZ_VEC, d1);
3133 STORE_VEC(state + 2*SZ_VEC, d2);
3134 STORE_VEC(state + 3*SZ_VEC, d3);
3135
3136 chunk_count = 0;
3137 }
3138
RandomStream_implRandomStream_impl3139 RandomStream_impl(const RandomStream_impl& other)
3140 {
3141 allocate_space();
3142 *this = other;
3143 }
3144
operator =RandomStream_impl3145 RandomStream_impl& operator=(const RandomStream_impl& other)
3146 {
3147 std::memcpy(state_store.elts(), other.state_store.elts(), RANSTREAM_STATESZ);
3148 std::memcpy(buf_store.elts(), other.buf_store.elts(), RANSTREAM_BUFSZ);
3149 chunk_count = other.chunk_count;
3150 return *this;
3151 }
3152
3153 const unsigned char *
get_bufRandomStream_impl3154 get_buf() const
3155 {
3156 return buf_store.elts();
3157 }
3158
3159 long
get_buf_lenRandomStream_impl3160 get_buf_len() const
3161 {
3162 return RANSTREAM_BUFSZ;
3163 }
3164
3165 // bytes are generated in chunks of RANSTREAM_BUFSZ bytes, except that
3166 // initially, we may generate a few chunks of RANSTREAM_CHUNKSZ
3167 // bytes. This optimizes a bit for short bursts following a reset.
3168
3169 long
get_bytesRandomStream_impl3170 get_bytes(unsigned char *NTL_RESTRICT res,
3171 long n, long pos)
3172 {
3173 if (n < 0) LogicError("RandomStream::get: bad args");
3174 if (n == 0) return pos;
3175
3176 unsigned char *NTL_RESTRICT buf = buf_store.elts();
3177
3178 if (n <= RANSTREAM_BUFSZ-pos) {
3179 std::memcpy(&res[0], &buf[pos], n);
3180 pos += n;
3181 return pos;
3182 }
3183
3184 unsigned char *NTL_RESTRICT state = state_store.elts();
3185
3186 ivec_t d0, d1, d2, d3;
3187 LOAD_VEC(d0, state + 0*SZ_VEC);
3188 LOAD_VEC(d1, state + 1*SZ_VEC);
3189 LOAD_VEC(d2, state + 2*SZ_VEC);
3190 LOAD_VEC(d3, state + 3*SZ_VEC);
3191
3192
3193 // read remainder of buffer
3194 std::memcpy(&res[0], &buf[pos], RANSTREAM_BUFSZ-pos);
3195 n -= RANSTREAM_BUFSZ-pos;
3196 res += RANSTREAM_BUFSZ-pos;
3197 pos = RANSTREAM_BUFSZ;
3198
3199 long i = 0;
3200 for (; i <= n-RANSTREAM_BUFSZ; i += RANSTREAM_BUFSZ) {
3201
3202 chunk_count |= RANSTREAM_NCHUNKS; // disable small buffer strategy
3203
3204 for (long j = 0; j < RANSTREAM_NCHUNKS; j++) {
3205 ivec_t v0=d0, v1=d1, v2=d2, v3=d3;
3206 ivec_t v4=d0, v5=d1, v6=d2, v7=ADD_VEC_64(d3, DELTA);
3207
3208 for (long k = 0; k < CHACHA_RNDS/2; k++) {
3209 DQROUND_VECTORS_VEC(v0,v1,v2,v3)
3210 DQROUND_VECTORS_VEC(v4,v5,v6,v7)
3211 }
3212
3213 WRITEU_VEC(res+i+j*(8*SZ_VEC), 0, ADD_VEC_32(v0,d0), ADD_VEC_32(v1,d1), ADD_VEC_32(v2,d2), ADD_VEC_32(v3,d3))
3214 d3 = ADD_VEC_64(d3, DELTA);
3215 WRITEU_VEC(res+i+j*(8*SZ_VEC), 4*SZ_VEC, ADD_VEC_32(v4,d0), ADD_VEC_32(v5,d1), ADD_VEC_32(v6,d2), ADD_VEC_32(v7,d3))
3216 d3 = ADD_VEC_64(d3, DELTA);
3217
3218 }
3219
3220 }
3221
3222 if (i < n) {
3223
3224 long nchunks;
3225
3226 if (chunk_count < RANSTREAM_NCHUNKS) {
3227 nchunks = long(cast_unsigned((n-i)+RANSTREAM_CHUNKSZ-1)/RANSTREAM_CHUNKSZ);
3228 chunk_count += nchunks;
3229 }
3230 else
3231 nchunks = RANSTREAM_NCHUNKS;
3232
3233 long pos_offset = RANSTREAM_BUFSZ - nchunks*RANSTREAM_CHUNKSZ;
3234 buf += pos_offset;
3235
3236 for (long j = 0; j < nchunks; j++) {
3237 ivec_t v0=d0, v1=d1, v2=d2, v3=d3;
3238 ivec_t v4=d0, v5=d1, v6=d2, v7=ADD_VEC_64(d3, DELTA);
3239
3240 for (long k = 0; k < CHACHA_RNDS/2; k++) {
3241 DQROUND_VECTORS_VEC(v0,v1,v2,v3)
3242 DQROUND_VECTORS_VEC(v4,v5,v6,v7)
3243 }
3244
3245 WRITE_VEC(buf+j*(8*SZ_VEC), 0, ADD_VEC_32(v0,d0), ADD_VEC_32(v1,d1), ADD_VEC_32(v2,d2), ADD_VEC_32(v3,d3))
3246 d3 = ADD_VEC_64(d3, DELTA);
3247 WRITE_VEC(buf+j*(8*SZ_VEC), 4*SZ_VEC, ADD_VEC_32(v4,d0), ADD_VEC_32(v5,d1), ADD_VEC_32(v6,d2), ADD_VEC_32(v7,d3))
3248 d3 = ADD_VEC_64(d3, DELTA);
3249 }
3250
3251 pos = n-i+pos_offset;
3252 std::memcpy(&res[i], &buf[0], n-i);
3253 }
3254
3255 STORE_VEC(state + 3*SZ_VEC, d3);
3256
3257 return pos;
3258 }
3259
set_nonceRandomStream_impl3260 void set_nonce(unsigned long nonce)
3261 {
3262 unsigned char *state = state_store.elts();
3263 ivec_t d3;
3264 d3 = NONCE(nonce);
3265 STORE_VEC(state + 3*SZ_VEC, d3);
3266 chunk_count = 0;
3267 }
3268
3269 };
3270
3271
3272 #else
3273
3274 struct RandomStream_impl {
3275 _ntl_uint32 state[16];
3276 unsigned char buf[64];
3277
3278 explicit
RandomStream_implRandomStream_impl3279 RandomStream_impl(const unsigned char *key)
3280 {
3281 salsa20_init(state, key);
3282 }
3283
3284 const unsigned char *
get_bufRandomStream_impl3285 get_buf() const
3286 {
3287 return &buf[0];
3288 }
3289
3290 long
get_buf_lenRandomStream_impl3291 get_buf_len() const
3292 {
3293 return 64;
3294 }
3295
get_bytesRandomStream_impl3296 long get_bytes(unsigned char *res, long n, long pos)
3297 {
3298 if (n < 0) LogicError("RandomStream::get: bad args");
3299
3300 long i, j;
3301
3302 if (n <= 64-pos) {
3303 for (i = 0; i < n; i++) res[i] = buf[pos+i];
3304 pos += n;
3305 return pos;
3306 }
3307
3308 // read remainder of buffer
3309 for (i = 0; i < 64-pos; i++) res[i] = buf[pos+i];
3310 n -= 64-pos;
3311 res += 64-pos;
3312 pos = 64;
3313
3314 _ntl_uint32 wdata[16];
3315
3316 // read 64-byte chunks
3317 for (i = 0; i <= n-64; i += 64) {
3318 salsa20_apply(state, wdata);
3319 for (j = 0; j < 16; j++)
3320 FROMLE(res + i + 4*j, wdata[j]);
3321 }
3322
3323 if (i < n) {
3324 salsa20_apply(state, wdata);
3325
3326 for (j = 0; j < 16; j++)
3327 FROMLE(buf + 4*j, wdata[j]);
3328
3329 pos = n-i;
3330 for (j = 0; j < pos; j++)
3331 res[i+j] = buf[j];
3332 }
3333
3334 return pos;
3335 }
3336
set_nonceRandomStream_impl3337 void set_nonce(unsigned long nonce)
3338 {
3339 _ntl_uint32 nonce0, nonce1;
3340
3341 nonce0 = nonce;
3342 nonce0 = INT32MASK(nonce0);
3343
3344 nonce1 = 0;
3345
3346 #if (NTL_BITS_PER_LONG > 32)
3347 nonce1 = nonce >> 32;
3348 nonce1 = INT32MASK(nonce1);
3349 #endif
3350
3351 state[12] = 0;
3352 state[13] = 0;
3353 state[14] = nonce0;
3354 state[15] = nonce1;
3355 }
3356 };
3357
3358
3359 #endif
3360 #endif // defined(NTL_RANDOMSTREAM_AES256CTR)
3361
3362
3363
3364 // Boilerplate PIMPL code
3365
3366 RandomStream_impl *
RandomStream_impl_build(const unsigned char * key)3367 RandomStream_impl_build(const unsigned char *key)
3368 {
3369 UniquePtr<RandomStream_impl> p;
3370 p.make(key);
3371 return p.release();
3372 }
3373
3374 RandomStream_impl *
RandomStream_impl_build(const RandomStream_impl & other)3375 RandomStream_impl_build(const RandomStream_impl& other)
3376 {
3377 UniquePtr<RandomStream_impl> p;
3378 p.make(other);
3379 return p.release();
3380 }
3381
3382 void
RandomStream_impl_copy(RandomStream_impl & x,const RandomStream_impl & y)3383 RandomStream_impl_copy(RandomStream_impl& x, const RandomStream_impl& y)
3384 {
3385 x = y;
3386 }
3387
3388 void
RandomStream_impl_delete(RandomStream_impl * p)3389 RandomStream_impl_delete(RandomStream_impl* p)
3390 {
3391 delete p;
3392 }
3393
3394 const unsigned char *
RandomStream_impl_get_buf(const RandomStream_impl & x)3395 RandomStream_impl_get_buf(const RandomStream_impl& x)
3396 {
3397 return x.get_buf();
3398 }
3399
3400 long
RandomStream_impl_get_buf_len(const RandomStream_impl & x)3401 RandomStream_impl_get_buf_len(const RandomStream_impl& x)
3402 {
3403 return x.get_buf_len();
3404 }
3405
3406 long
RandomStream_impl_get_bytes(RandomStream_impl & impl,unsigned char * res,long n,long pos)3407 RandomStream_impl_get_bytes(RandomStream_impl& impl,
3408 unsigned char *res, long n, long pos)
3409 {
3410 return impl.get_bytes(res, n, pos);
3411 }
3412
3413
3414 void
RandomStream_impl_set_nonce(RandomStream_impl & impl,unsigned long nonce)3415 RandomStream_impl_set_nonce(RandomStream_impl& impl, unsigned long nonce)
3416 {
3417 impl.set_nonce(nonce);
3418 }
3419
3420
3421
3422
3423
3424 NTL_TLS_GLOBAL_DECL(UniquePtr<RandomStream>, CurrentRandomStream);
3425
3426
SetSeed(const RandomStream & s)3427 void SetSeed(const RandomStream& s)
3428 {
3429 NTL_TLS_GLOBAL_ACCESS(CurrentRandomStream);
3430
3431 if (!CurrentRandomStream)
3432 CurrentRandomStream.make(s);
3433 else
3434 *CurrentRandomStream = s;
3435 }
3436
3437
SetSeed(const unsigned char * data,long dlen)3438 void SetSeed(const unsigned char *data, long dlen)
3439 {
3440 if (dlen < 0) LogicError("SetSeed: bad args");
3441
3442 Vec<unsigned char> key;
3443 key.SetLength(NTL_PRG_KEYLEN);
3444 DeriveKey(key.elts(), NTL_PRG_KEYLEN, data, dlen);
3445
3446 SetSeed(RandomStream(key.elts()));
3447 }
3448
SetSeed(const ZZ & seed)3449 void SetSeed(const ZZ& seed)
3450 {
3451 long nb = NumBytes(seed);
3452
3453 Vec<unsigned char> buf;
3454 buf.SetLength(nb);
3455
3456 BytesFromZZ(buf.elts(), seed, nb);
3457
3458 SetSeed(buf.elts(), nb);
3459 }
3460
3461
3462 static
InitRandomStream()3463 void InitRandomStream()
3464 {
3465 const std::string& id = UniqueID();
3466 SetSeed((const unsigned char *) id.c_str(), id.length());
3467 }
3468
3469 static inline
LocalGetCurrentRandomStream()3470 RandomStream& LocalGetCurrentRandomStream()
3471 {
3472 NTL_TLS_GLOBAL_ACCESS(CurrentRandomStream);
3473
3474 if (!CurrentRandomStream) InitRandomStream();
3475 return *CurrentRandomStream;
3476 }
3477
GetCurrentRandomStream()3478 RandomStream& GetCurrentRandomStream()
3479 {
3480 return LocalGetCurrentRandomStream();
3481 }
3482
3483
3484
3485
3486
3487
3488
3489 static inline
WordFromBytes(const unsigned char * buf,long n)3490 unsigned long WordFromBytes(const unsigned char *buf, long n)
3491 {
3492 unsigned long res = 0;
3493 long i;
3494
3495 for (i = n-1; i >= 0; i--)
3496 res = (res << 8) | buf[i];
3497
3498 return res;
3499 }
3500
3501
RandomWord()3502 unsigned long RandomWord()
3503 {
3504 RandomStream& stream = LocalGetCurrentRandomStream();
3505 unsigned char buf[NTL_BITS_PER_LONG/8];
3506
3507 stream.get(buf, NTL_BITS_PER_LONG/8);
3508 return WordFromBytes(buf, NTL_BITS_PER_LONG/8);
3509 }
3510
3511
VectorRandomWord(long k,unsigned long * x)3512 void VectorRandomWord(long k, unsigned long* x)
3513 {
3514 RandomStream& stream = LocalGetCurrentRandomStream();
3515 unsigned char buf[NTL_BITS_PER_LONG/8];
3516
3517 for (long i = 0; i < k; i++) {
3518 stream.get(buf, NTL_BITS_PER_LONG/8);
3519 x[i] = WordFromBytes(buf, NTL_BITS_PER_LONG/8);
3520 }
3521 }
3522
RandomBits_long(long l)3523 long RandomBits_long(long l)
3524 {
3525 if (l <= 0) return 0;
3526 if (l >= NTL_BITS_PER_LONG)
3527 ResourceError("RandomBits: length too big");
3528
3529 RandomStream& stream = LocalGetCurrentRandomStream();
3530 unsigned char buf[NTL_BITS_PER_LONG/8];
3531 long nb = (l+7)/8;
3532 stream.get(buf, nb);
3533
3534 return long(WordFromBytes(buf, nb) & ((1UL << l)-1UL));
3535 }
3536
RandomBits_ulong(long l)3537 unsigned long RandomBits_ulong(long l)
3538 {
3539 if (l <= 0) return 0;
3540 if (l > NTL_BITS_PER_LONG)
3541 ResourceError("RandomBits: length too big");
3542
3543 RandomStream& stream = LocalGetCurrentRandomStream();
3544 unsigned char buf[NTL_BITS_PER_LONG/8];
3545 long nb = (l+7)/8;
3546 stream.get(buf, nb);
3547 unsigned long res = WordFromBytes(buf, nb);
3548 if (l < NTL_BITS_PER_LONG)
3549 res = res & ((1UL << l)-1UL);
3550 return res;
3551 }
3552
RandomLen_long(long l)3553 long RandomLen_long(long l)
3554 {
3555 if (l <= 0) return 0;
3556 if (l == 1) return 1;
3557 if (l >= NTL_BITS_PER_LONG)
3558 ResourceError("RandomLen: length too big");
3559
3560 RandomStream& stream = LocalGetCurrentRandomStream();
3561 unsigned char buf[NTL_BITS_PER_LONG/8];
3562 long nb = ((l-1)+7)/8;
3563 stream.get(buf, nb);
3564 unsigned long res = WordFromBytes(buf, nb);
3565 unsigned long mask = (1UL << (l-1)) - 1UL;
3566 return long((res & mask) | (mask+1UL));
3567 }
3568
3569
RandomBnd(long bnd)3570 long RandomBnd(long bnd)
3571 {
3572 if (bnd <= 1) return 0;
3573
3574 RandomStream& stream = LocalGetCurrentRandomStream();
3575 unsigned char buf[NTL_BITS_PER_LONG/8];
3576 long l = NumBits(bnd-1);
3577 long nb = (l+7)/8;
3578
3579 long tmp;
3580 do {
3581 stream.get(buf, nb);
3582 tmp = long(WordFromBytes(buf, nb) & ((1UL << l)-1UL));
3583 } while (tmp >= bnd);
3584
3585 return tmp;
3586 }
3587
3588
3589
RandomBits(ZZ & x,long l)3590 void RandomBits(ZZ& x, long l)
3591 {
3592 if (l <= 0) {
3593 x = 0;
3594 return;
3595 }
3596
3597 if (NTL_OVERFLOW(l, 1, 0))
3598 ResourceError("RandomBits: length too big");
3599
3600 RandomStream& stream = LocalGetCurrentRandomStream();
3601
3602 long nb = (l+7)/8;
3603 unsigned long mask = (1UL << (8 - nb*8 + l)) - 1UL;
3604
3605 NTL_TLS_LOCAL(Vec<unsigned char>, buf_mem);
3606 Vec<unsigned char>::Watcher watch_buf_mem(buf_mem);
3607
3608 buf_mem.SetLength(nb);
3609 unsigned char *buf = buf_mem.elts();
3610
3611 x.SetSize((l + NTL_ZZ_NBITS - 1)/NTL_ZZ_NBITS);
3612 // pre-allocate to ensure strong ES
3613
3614 stream.get(buf, nb);
3615 buf[nb-1] &= mask;
3616
3617 ZZFromBytes(x, buf, nb);
3618 }
3619
3620
RandomLen(ZZ & x,long l)3621 void RandomLen(ZZ& x, long l)
3622 {
3623 if (l <= 0) {
3624 x = 0;
3625 return;
3626 }
3627
3628 if (l == 1) {
3629 x = 1;
3630 return;
3631 }
3632
3633 if (NTL_OVERFLOW(l, 1, 0))
3634 ResourceError("RandomLen: length too big");
3635
3636 RandomStream& stream = LocalGetCurrentRandomStream();
3637
3638 long nb = (l+7)/8;
3639 unsigned long mask = (1UL << (8 - nb*8 + l)) - 1UL;
3640
3641 NTL_TLS_LOCAL(Vec<unsigned char>, buf_mem);
3642 Vec<unsigned char>::Watcher watch_buf_mem(buf_mem);
3643
3644 buf_mem.SetLength(nb);
3645 unsigned char *buf = buf_mem.elts();
3646
3647 x.SetSize((l + NTL_ZZ_NBITS - 1)/NTL_ZZ_NBITS);
3648 // pre-allocate to ensure strong ES
3649
3650 stream.get(buf, nb);
3651 buf[nb-1] &= mask;
3652 buf[nb-1] |= ((mask >> 1) + 1UL);
3653
3654 ZZFromBytes(x, buf, nb);
3655 }
3656
3657
3658
3659
3660
3661 /**********************************************************
3662
3663 The following implementation of RandomBnd is designed
3664 for speed. It certainly is not resilient against a
3665 timing side-channel attack (but then again, none of these
3666 PRG routines are designed to be).
3667
3668 The naive strategy generates random candidates of the right
3669 bit length until the candidate < bnd.
3670 The idea in this implementation is to generate the high
3671 order two bytes of the candidate first, and compare this
3672 to the high order two bytes of tmp. We can discard the
3673 candidate if this is already too large.
3674
3675 ***********************************************************/
3676
RandomBnd(ZZ & x,const ZZ & bnd)3677 void RandomBnd(ZZ& x, const ZZ& bnd)
3678 {
3679 if (bnd <= 1) {
3680 x = 0;
3681 return;
3682 }
3683
3684 RandomStream& stream = LocalGetCurrentRandomStream();
3685
3686 long l = NumBits(bnd);
3687 long nb = (l+7)/8;
3688
3689 if (nb <= 3) {
3690 long lbnd = conv<long>(bnd);
3691 unsigned char lbuf[3];
3692 long ltmp;
3693
3694 x.SetSize((l + NTL_ZZ_NBITS - 1)/NTL_ZZ_NBITS);
3695 // pre-allocate to ensure strong ES
3696 do {
3697 stream.get(lbuf, nb);
3698 ltmp = long(WordFromBytes(lbuf, nb) & ((1UL << l)-1UL));
3699 } while (ltmp >= lbnd);
3700
3701 conv(x, ltmp);
3702 return;
3703 }
3704
3705 // deal with possible alias
3706 NTL_ZZRegister(tmp_store);
3707 const ZZ& bnd_ref = ((&x == &bnd) ? (tmp_store = bnd) : bnd);
3708
3709
3710 NTL_ZZRegister(hbnd);
3711 RightShift(hbnd, bnd_ref, (nb-2)*8);
3712 long lhbnd = conv<long>(hbnd);
3713
3714 unsigned long mask = (1UL << (16 - nb*8 + l)) - 1UL;
3715
3716 NTL_TLS_LOCAL(Vec<unsigned char>, buf_mem);
3717 Vec<unsigned char>::Watcher watch_buf_mem(buf_mem);
3718 buf_mem.SetLength(nb);
3719 unsigned char *buf = buf_mem.elts();
3720
3721 unsigned char hbuf[2];
3722
3723 x.SetSize((l + NTL_ZZ_NBITS - 1)/NTL_ZZ_NBITS);
3724 // pre-allocate to ensure strong ES
3725 for (;;) {
3726 stream.get(hbuf, 2);
3727 long hpart = long(WordFromBytes(hbuf, 2) & mask);
3728
3729 if (hpart > lhbnd) continue;
3730
3731 stream.get(buf, nb-2);
3732 buf[nb-2] = ((unsigned long) hpart);
3733 buf[nb-1] = ((unsigned long) hpart) >> 8;
3734
3735 ZZFromBytes(x, buf, nb);
3736 if (hpart < lhbnd || x < bnd_ref) break;
3737 }
3738 }
3739
3740
3741
3742
3743 // More prime generation stuff...
3744
3745 static
Log2(double x)3746 double Log2(double x)
3747 {
3748 static const double log2 = log(2.0); // GLOBAL (relies on C++11 thread-safe init)
3749 return log(x)/log2;
3750 }
3751
3752 // Define p(k,t) to be the conditional probability that a random, odd, k-bit
3753 // number is composite, given that it passes t iterations of the
3754 // Miller-Rabin test.
3755 // If this routine returns a non-zero value, then
3756 // p(k,t) <= 2^{-n}.
3757 // This basically encodes the estimates of Damgard, Landrock, and Pomerance;
3758 // it uses floating point arithmetic, but is coded in such a way
3759 // that its results should be correct, assuming that the log function
3760 // is computed with reasonable precision.
3761 //
3762 // It is assumed that k >= 3 and t >= 1; if this does not hold,
3763 // then 0 is returned.
3764
3765 static
ErrBoundTest(long kk,long tt,long nn)3766 long ErrBoundTest(long kk, long tt, long nn)
3767
3768 {
3769 const double fudge = (1.0 + 1024.0/NTL_FDOUBLE_PRECISION);
3770 const double log2_3 = Log2(3.0);
3771 const double log2_7 = Log2(7.0);
3772 const double log2_20 = Log2(20.0);
3773
3774 double k = kk;
3775 double t = tt;
3776 double n = nn;
3777
3778 if (k < 3 || t < 1) return 0;
3779 if (n < 1) return 1;
3780
3781 // the following test is largely academic
3782 if (9*t > NTL_FDOUBLE_PRECISION) LogicError("ErrBoundTest: t too big");
3783
3784 double log2_k = Log2(k);
3785
3786 if ((n + log2_k)*fudge <= 2*t)
3787 return 1;
3788
3789 if ((2*log2_k + 4.0 + n)*fudge <= 2*sqrt(k))
3790 return 2;
3791
3792 if ((t == 2 && k >= 88) || (3 <= t && 9*t <= k && k >= 21)) {
3793 if ((1.5*log2_k + t + 4.0 + n)*fudge <= 0.5*Log2(t) + 2*(sqrt(t*k)))
3794 return 3;
3795 }
3796
3797 if (k <= 9*t && 4*t <= k && k >= 21) {
3798 if ( ((log2_3 + log2_7 + log2_k + n)*fudge <= log2_20 + 5*t) &&
3799 ((log2_3 + (15.0/4.0)*log2_k + n)*fudge <= log2_7 + k/2 + 2*t) &&
3800 ((2*log2_3 + 2 + log2_k + n)*fudge <= k/4 + 3*t) )
3801 return 4;
3802 }
3803
3804 if (4*t >= k && k >= 21) {
3805 if (((15.0/4.0)*log2_k + n)*fudge <= log2_7 + k/2 + 2*t)
3806 return 5;
3807 }
3808
3809 return 0;
3810 }
3811
3812
GenPrime(ZZ & n,long k,long err)3813 void GenPrime(ZZ& n, long k, long err)
3814 {
3815 if (k <= 1) LogicError("GenPrime: bad length");
3816
3817 if (k > (1L << 20)) ResourceError("GenPrime: length too large");
3818
3819 if (err < 1) err = 1;
3820 if (err > 512) err = 512;
3821
3822 if (k == 2) {
3823 if (RandomBnd(2))
3824 n = 3;
3825 else
3826 n = 2;
3827
3828 return;
3829 }
3830
3831
3832 long t;
3833
3834 t = 1;
3835 while (!ErrBoundTest(k, t, err))
3836 t++;
3837
3838 RandomPrime(n, k, t);
3839 }
3840
3841
GenPrime_long(long k,long err)3842 long GenPrime_long(long k, long err)
3843 {
3844 if (k <= 1) LogicError("GenPrime: bad length");
3845
3846 if (k >= NTL_BITS_PER_LONG) ResourceError("GenPrime: length too large");
3847
3848 if (err < 1) err = 1;
3849 if (err > 512) err = 512;
3850
3851 if (k == 2) {
3852 if (RandomBnd(2))
3853 return 3;
3854 else
3855 return 2;
3856 }
3857
3858 long t;
3859
3860 t = 1;
3861 while (!ErrBoundTest(k, t, err))
3862 t++;
3863
3864 return RandomPrime_long(k, t);
3865 }
3866
MultiThreadedGenGermainPrime(ZZ & n,long k,long err)3867 void MultiThreadedGenGermainPrime(ZZ& n, long k, long err)
3868 {
3869 long nt = AvailableThreads();
3870
3871
3872 long prime_bnd = ComputePrimeBound(k);
3873
3874 if (NumBits(prime_bnd) >= k/2)
3875 prime_bnd = (1L << (k/2-1));
3876
3877 ZZ two;
3878 two = 2;
3879
3880 const long LOCAL_ITER_BOUND = 8;
3881 // since resetting the PRG comes at a certain cost,
3882 // we perform a few iterations with each reset to
3883 // amortize the reset cost.
3884
3885 unsigned long initial_counter = 0;
3886 ZZ seed;
3887 RandomBits(seed, 256);
3888
3889
3890 ZZ overflow_counter;
3891
3892 for (;;) {
3893
3894 AtomicLowWaterMark low_water_mark(-1UL);
3895 AtomicCounter counter(initial_counter);
3896
3897 Vec< UniquePtr<ZZ> > result(INIT_SIZE, nt);
3898 Vec<unsigned long> result_ctr(INIT_SIZE, nt, -1UL);
3899
3900 NTL_EXEC_INDEX(nt, index)
3901
3902 RandomStreamPush push;
3903
3904 SetSeed(seed);
3905 RandomStream& stream = GetCurrentRandomStream();
3906
3907 ZZ cand, n1;
3908 PrimeSeq s;
3909
3910 while (low_water_mark == -1UL) {
3911
3912 unsigned long local_ctr = counter.inc();
3913 if (local_ctr >> (NTL_BITS_PER_NONCE-1)) {
3914 // counter overflow...rather academic
3915 break;
3916 }
3917
3918 stream.set_nonce(local_ctr);
3919
3920 for (long iter = 0; iter < LOCAL_ITER_BOUND &&
3921 local_ctr <= low_water_mark; iter++) {
3922
3923
3924 RandomLen(cand, k);
3925 if (!IsOdd(cand)) add(cand, cand, 1);
3926
3927 s.reset(3);
3928 long p;
3929
3930 long sieve_passed = 1;
3931
3932 p = s.next();
3933 while (p && p < prime_bnd) {
3934 long r = rem(cand, p);
3935
3936 if (r == 0) {
3937 sieve_passed = 0;
3938 break;
3939 }
3940
3941 // test if 2*r + 1 = 0 (mod p)
3942 if (r == p-r-1) {
3943 sieve_passed = 0;
3944 break;
3945 }
3946
3947 p = s.next();
3948 }
3949
3950 if (!sieve_passed) continue;
3951
3952
3953 if (MillerWitness(cand, two)) continue;
3954
3955 // n1 = 2*cand+1
3956 mul(n1, cand, 2);
3957 add(n1, n1, 1);
3958
3959
3960 if (MillerWitness(n1, two)) continue;
3961
3962 result[index].make(cand);
3963 result_ctr[index] = local_ctr;
3964 low_water_mark.UpdateMin(local_ctr);
3965 break;
3966 }
3967 }
3968
3969 NTL_EXEC_INDEX_END
3970
3971 // find index of low_water_mark
3972
3973 unsigned long low_water_mark1 = low_water_mark;
3974 long low_water_index = -1;
3975
3976 for (long index = 0; index < nt; index++) {
3977 if (result_ctr[index] == low_water_mark1) {
3978 low_water_index = index;
3979 break;
3980 }
3981 }
3982
3983 if (low_water_index == -1) {
3984 // counter overflow...rather academic
3985 overflow_counter++;
3986 initial_counter = 0;
3987 RandomBits(seed, 256);
3988 continue;
3989 }
3990
3991 ZZ N;
3992 N = *result[low_water_index];
3993
3994 ZZ iter = ((overflow_counter << (NTL_BITS_PER_NONCE-1)) +
3995 conv<ZZ>(low_water_mark1) + 1)*LOCAL_ITER_BOUND;
3996
3997 // now do t M-R iterations...just to make sure
3998
3999 // First compute the appropriate number of M-R iterations, t
4000 // The following computes t such that
4001 // p(k,t)*8/k <= 2^{-err}/(5*iter^{1.25})
4002 // which suffices to get an overall error probability of 2^{-err}.
4003 // Note that this method has the advantage of not requiring
4004 // any assumptions on the density of Germain primes.
4005
4006 long err1 = max(1, err + 7 + (5*NumBits(iter) + 3)/4 - NumBits(k));
4007 long t;
4008 t = 1;
4009 while (!ErrBoundTest(k, t, err1))
4010 t++;
4011
4012 Vec<ZZ> W(INIT_SIZE, t);
4013
4014 for (long i = 0; i < t; i++) {
4015 do {
4016 RandomBnd(W[i], N);
4017 } while (W[i] == 0);
4018 }
4019
4020 AtomicBool tests_pass(true);
4021
4022 NTL_EXEC_RANGE(t, first, last)
4023
4024 for (long i = first; i < last && tests_pass; i++) {
4025 if (MillerWitness(N, W[i])) tests_pass = false;
4026 }
4027
4028 NTL_EXEC_RANGE_END
4029
4030 if (tests_pass) {
4031 n = N;
4032 return;
4033 }
4034
4035 // very unlikey to get here
4036 initial_counter = low_water_mark1 + 1;
4037 }
4038 }
4039
GenGermainPrime(ZZ & n,long k,long err)4040 void GenGermainPrime(ZZ& n, long k, long err)
4041 {
4042 if (k <= 1) LogicError("GenGermainPrime: bad length");
4043
4044 if (k > (1L << 20)) ResourceError("GenGermainPrime: length too large");
4045
4046 if (err < 1) err = 1;
4047 if (err > 512) err = 512;
4048
4049 if (k == 2) {
4050 if (RandomBnd(2))
4051 n = 3;
4052 else
4053 n = 2;
4054
4055 return;
4056 }
4057
4058 if (k >= 192) {
4059 MultiThreadedGenGermainPrime(n, k, err);
4060 return;
4061 }
4062
4063
4064 long prime_bnd = ComputePrimeBound(k);
4065
4066 if (NumBits(prime_bnd) >= k/2)
4067 prime_bnd = (1L << (k/2-1));
4068
4069
4070 ZZ two;
4071 two = 2;
4072
4073 ZZ n1;
4074
4075
4076 PrimeSeq s;
4077
4078 ZZ iter;
4079 iter = 0;
4080
4081
4082 for (;;) {
4083 iter++;
4084
4085 RandomLen(n, k);
4086 if (!IsOdd(n)) add(n, n, 1);
4087
4088 s.reset(3);
4089 long p;
4090
4091 long sieve_passed = 1;
4092
4093 p = s.next();
4094 while (p && p < prime_bnd) {
4095 long r = rem(n, p);
4096
4097 if (r == 0) {
4098 sieve_passed = 0;
4099 break;
4100 }
4101
4102 // test if 2*r + 1 = 0 (mod p)
4103 if (r == p-r-1) {
4104 sieve_passed = 0;
4105 break;
4106 }
4107
4108 p = s.next();
4109 }
4110
4111 if (!sieve_passed) continue;
4112
4113
4114 if (MillerWitness(n, two)) continue;
4115
4116 // n1 = 2*n+1
4117 mul(n1, n, 2);
4118 add(n1, n1, 1);
4119
4120
4121 if (MillerWitness(n1, two)) continue;
4122
4123 // now do t M-R iterations...just to make sure
4124
4125 // First compute the appropriate number of M-R iterations, t
4126 // The following computes t such that
4127 // p(k,t)*8/k <= 2^{-err}/(5*iter^{1.25})
4128 // which suffices to get an overall error probability of 2^{-err}.
4129 // Note that this method has the advantage of not requiring
4130 // any assumptions on the density of Germain primes.
4131
4132 long err1 = max(1, err + 7 + (5*NumBits(iter) + 3)/4 - NumBits(k));
4133 long t;
4134 t = 1;
4135 while (!ErrBoundTest(k, t, err1))
4136 t++;
4137
4138 ZZ W;
4139 long MR_passed = 1;
4140
4141 long i;
4142 for (i = 1; i <= t; i++) {
4143 do {
4144 RandomBnd(W, n);
4145 } while (W == 0);
4146 // W == 0 is not a useful candidate witness!
4147
4148 if (MillerWitness(n, W)) {
4149 MR_passed = 0;
4150 break;
4151 }
4152 }
4153
4154 if (MR_passed) break;
4155 }
4156 }
4157
OldGenGermainPrime(ZZ & n,long k,long err)4158 void OldGenGermainPrime(ZZ& n, long k, long err)
4159 {
4160 if (k <= 1) LogicError("GenGermainPrime: bad length");
4161
4162 if (k > (1L << 20)) ResourceError("GenGermainPrime: length too large");
4163
4164 if (err < 1) err = 1;
4165 if (err > 512) err = 512;
4166
4167 if (k == 2) {
4168 if (RandomBnd(2))
4169 n = 3;
4170 else
4171 n = 2;
4172
4173 return;
4174 }
4175
4176
4177 long prime_bnd = ComputePrimeBound(k);
4178
4179 if (NumBits(prime_bnd) >= k/2)
4180 prime_bnd = (1L << (k/2-1));
4181
4182
4183 ZZ two;
4184 two = 2;
4185
4186 ZZ n1;
4187
4188
4189 PrimeSeq s;
4190
4191 ZZ iter;
4192 iter = 0;
4193
4194
4195 for (;;) {
4196 iter++;
4197
4198 RandomLen(n, k);
4199 if (!IsOdd(n)) add(n, n, 1);
4200
4201 s.reset(3);
4202 long p;
4203
4204 long sieve_passed = 1;
4205
4206 p = s.next();
4207 while (p && p < prime_bnd) {
4208 long r = rem(n, p);
4209
4210 if (r == 0) {
4211 sieve_passed = 0;
4212 break;
4213 }
4214
4215 // test if 2*r + 1 = 0 (mod p)
4216 if (r == p-r-1) {
4217 sieve_passed = 0;
4218 break;
4219 }
4220
4221 p = s.next();
4222 }
4223
4224 if (!sieve_passed) continue;
4225
4226
4227 if (MillerWitness(n, two)) continue;
4228
4229 // n1 = 2*n+1
4230 mul(n1, n, 2);
4231 add(n1, n1, 1);
4232
4233
4234 if (MillerWitness(n1, two)) continue;
4235
4236 // now do t M-R iterations...just to make sure
4237
4238 // First compute the appropriate number of M-R iterations, t
4239 // The following computes t such that
4240 // p(k,t)*8/k <= 2^{-err}/(5*iter^{1.25})
4241 // which suffices to get an overall error probability of 2^{-err}.
4242 // Note that this method has the advantage of not requiring
4243 // any assumptions on the density of Germain primes.
4244
4245 long err1 = max(1, err + 7 + (5*NumBits(iter) + 3)/4 - NumBits(k));
4246 long t;
4247 t = 1;
4248 while (!ErrBoundTest(k, t, err1))
4249 t++;
4250
4251 ZZ W;
4252 long MR_passed = 1;
4253
4254 long i;
4255 for (i = 1; i <= t; i++) {
4256 do {
4257 RandomBnd(W, n);
4258 } while (W == 0);
4259 // W == 0 is not a useful candidate witness!
4260
4261 if (MillerWitness(n, W)) {
4262 MR_passed = 0;
4263 break;
4264 }
4265 }
4266
4267 if (MR_passed) break;
4268 }
4269 }
4270
GenGermainPrime_long(long k,long err)4271 long GenGermainPrime_long(long k, long err)
4272 {
4273 if (k >= NTL_BITS_PER_LONG-1)
4274 ResourceError("GenGermainPrime_long: length too long");
4275
4276 ZZ n;
4277 GenGermainPrime(n, k, err);
4278 return to_long(n);
4279 }
4280
4281
4282 NTL_END_IMPL
4283